# Feature Engineering

### 1. Setup and Processed Data Import

In [283]:
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
from tqdm import tqdm
import pandas as pd

In [284]:
lis_df = pd.read_parquet('data/processed/02_listings.parquet')
rev_df = pd.read_parquet('data/processed/02_reviews.parquet')
print(lis_df.shape,'\n',rev_df.shape)

(20905, 40) 
 (366636, 5)


### 2. Feature Generation

In [285]:
# Store original columns
original_columns = lis_df.columns.copy()

In [286]:
# Location Features
COPENHAGEN_CENTER_LAT = 55.6761
COPENHAGEN_CENTER_LON = 12.5683

# Distance to center in birdeye view
lis_df['distance_to_center_km'] = np.sqrt(
    (lis_df['latitude'] - COPENHAGEN_CENTER_LAT)**2 + 
    (lis_df['longitude'] - COPENHAGEN_CENTER_LON)**2
) * 111  # Rough conversion to kilometers

# Neighborhood density
lis_df['listings_in_neighborhood'] = lis_df.groupby('neighbourhood_cleansed')['id'].transform('count')

In [287]:
# Price Features
# Neighborhood price comparison
lis_df['neighborhood_avg_price'] = lis_df.groupby('neighbourhood_cleansed')['price_DKK'].transform('mean')
lis_df['price_vs_neighborhood'] = lis_df['price_DKK'] / lis_df['neighborhood_avg_price']

# Room type price comparison
lis_df['room_type_avg_price'] = lis_df.groupby('room_type')['price_DKK'].transform('mean')
lis_df['price_vs_room_type'] = lis_df['price_DKK'] / lis_df['room_type_avg_price']

# Value indicators
lis_df['price_per_person'] = lis_df['price_DKK'] / lis_df['accommodates']
lis_df['price_per_bedroom'] = lis_df['price_DKK'].div(lis_df['bedrooms'].where(lis_df['bedrooms'] > 0, 1))

In [288]:
# Create boolean masks for different conditions
mask_diff_dates = lis_df['first_review'] != lis_df['last_review']
mask_same_dates = lis_df['first_review'] == lis_df['last_review']


# Calculate for different first and last review dates
lis_df.loc[mask_diff_dates, 'host_experience_years'] = (
    (lis_df.loc[mask_diff_dates, 'last_review'] - 
     lis_df.loc[mask_diff_dates, 'first_review']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
)

# Calculate for same first and last review date
lis_df.loc[mask_same_dates, 'host_experience_years'] = (
    (pd.Timestamp.now() - lis_df.loc[mask_same_dates, 'last_review']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
)

# Set host_experience_years to 0 where both review dates are missing
mask_no_reviews = lis_df['last_review'].isna() & lis_df['first_review'].isna()
lis_df.loc[mask_no_reviews, 'host_experience_years'] = 0

In [289]:
# How big of a scale does the host operate on relatively to the rest in this neighborhood
lis_df['host_listings_ratio'] = lis_df['host_total_listings_count'] / lis_df['listings_in_neighborhood']

In [290]:
# Avoid division by zero in yearly_review calculation
lis_df['yearly_review'] = lis_df['number_of_reviews'] / lis_df['host_experience_years'].replace(0, 1)
lis_df['yearly_review'] = lis_df['yearly_review'].fillna(0)  # No reviews = 0 reviews per year

# Calculate review score variance; consistency of the host's recieved reviews by guests
review_score_cols = [col for col in lis_df.columns if col.startswith('review_scores_')]
lis_df['review_scores_variance'] = lis_df[review_score_cols].var(axis=1)

# Drop review score columns except rating and variance
cols_to_drop = [col for col in lis_df.columns if 'review_scores_' in col 
                and col != 'review_scores_rating' 
                and col != 'review_scores_variance']
lis_df.drop(columns=cols_to_drop, inplace=True)

In [291]:
# Print summary of new features
print("\nNew Feature Summary:")
new_features = lis_df.columns.difference(original_columns)
for col in new_features:
    print(f"\n{col}:")
    print(lis_df[col].describe())

# Check for any issues in new features
print("\nChecking for issues in new features:")
print(lis_df[new_features].isnull().sum())


New Feature Summary:

distance_to_center_km:
count    20905.000000
mean         3.733406
std          2.014533
min          0.096576
25%          2.259348
50%          3.445544
75%          4.762564
max         13.204881
Name: distance_to_center_km, dtype: float64

host_experience_years:
count    20905.000000
mean         2.159213
std          2.661794
min          0.000000
25%          0.134155
50%          1.013005
75%          3.107461
max         13.859001
Name: host_experience_years, dtype: float64

host_listings_ratio:
count    20905.000000
mean         0.005180
std          0.033082
min          0.000268
25%          0.000334
50%          0.000558
75%          0.001115
max          0.802680
Name: host_listings_ratio, dtype: float64

listings_in_neighborhood:
count    20905.000000
mean      2568.974169
std       1008.924101
min        365.000000
25%       1869.000000
50%       2247.000000
75%       3586.000000
max       3734.000000
Name: listings_in_neighborhood, dtype: float64


#### Drop first & last review date, as we've now created the attribute host_experience_years.

In [292]:
lis_df.drop(['first_review', 'last_review'], axis=1, inplace=True)

### 3. Sentiment Scoring

In [293]:
# Calculate total length of all comments
total_comment_length = rev_df['comments'].str.len().sum()
print(f"Total length of all comments: {total_comment_length:,} characters")

Total length of all comments: 92,513,161 characters


In [294]:
'''# Initialize model and tokenizer
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"  
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=model_name,
    tokenizer=model_name,
    device=0 if torch.cuda.is_available() else -1
)

def get_sentiment_score(text):
    if pd.isna(text) or len(str(text).strip()) < 3:
        return 0
    try:
        # Truncate long texts to 512 tokens
        result = sentiment_analyzer(str(text)[:10000])[0]
        
        # This model returns scores from 1-5 stars
        score = int(result['label'][0])  # Get first character (1-5)
        
        # Convert 1-5 scale to [-1, 1]
        normalized_score = (score - 3) / 2
        
        # Weight by confidence
        final_score = normalized_score * result['score']
        
        return final_score
        
    except Exception as e:
        print(f"Error processing text: {str(e)}")
        return 0

# Take a sample of 1000 reviews
sample_size = 10
print(f"Processing sample of {sample_size} reviews...")
rev_sample = rev_df.sample(sample_size, random_state=42)

# Process reviews with progress bar
print("Calculating sentiment scores...")
rev_sample['sentiment_score'] = [get_sentiment_score(text) for text in tqdm(rev_sample['comments'])]

# Show results
print("\nSample Results by Sentiment Category:")
for sentiment, range_vals in [
    ('Very Positive', (0.5, 1.0)),
    ('Positive', (0.1, 0.5)),
    ('Neutral', (-0.1, 0.1)),
    ('Negative', (-0.5, -0.1)),
    ('Very Negative', (-1.0, -0.5))
]:
    mask = (rev_sample['sentiment_score'] >= range_vals[0]) & (rev_sample['sentiment_score'] <= range_vals[1])
    print(f"\n{sentiment} Reviews Examples (3 random samples):")
    sample = rev_sample[mask].sample(min(3, len(rev_sample[mask])))
    for _, row in sample.iterrows():
        print(f"\nScore: {row['sentiment_score']:.3f}")
        print(f"Review: {row['comments'][:20000]}")

# Print distribution statistics
print("\nSentiment Score Distribution:")
print(rev_sample['sentiment_score'].describe())

# Save results
rev_sample.to_parquet('data/processed/04_sentiment_bert.parquet', index=False)
rev_sample.to_csv('data/processed/04_sentiment_bert.csv', index=False)'''

'# Initialize model and tokenizer\nmodel_name = "nlptown/bert-base-multilingual-uncased-sentiment"  \nsentiment_analyzer = pipeline(\n    "sentiment-analysis",\n    model=model_name,\n    tokenizer=model_name,\n    device=0 if torch.cuda.is_available() else -1\n)\n\ndef get_sentiment_score(text):\n    if pd.isna(text) or len(str(text).strip()) < 3:\n        return 0\n    try:\n        # Truncate long texts to 512 tokens\n        result = sentiment_analyzer(str(text)[:10000])[0]\n        \n        # This model returns scores from 1-5 stars\n        score = int(result[\'label\'][0])  # Get first character (1-5)\n        \n        # Convert 1-5 scale to [-1, 1]\n        normalized_score = (score - 3) / 2\n        \n        # Weight by confidence\n        final_score = normalized_score * result[\'score\']\n        \n        return final_score\n        \n    except Exception as e:\n        print(f"Error processing text: {str(e)}")\n        return 0\n\n# Take a sample of 1000 reviews\nsample

In [295]:
import pandas as pd
rev_s = pd.read_parquet('data/processed/04_sentiment_bert.parquet')
rev_s['sentiment_score_1_5'] = (rev_s['sentiment_score'] * 2) + 3
rev_s = rev_s.drop('sentiment_score', axis=1)
rev_s.shape

(366636, 6)

### 4. Final Status Check

In [296]:
lis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20905 entries, 0 to 20904
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         20905 non-null  int64         
 1   listing_url                20905 non-null  string        
 2   name                       20905 non-null  string        
 3   description                20232 non-null  string        
 4   host_id                    20905 non-null  int64         
 5   host_since                 20904 non-null  datetime64[ns]
 6   host_response_time         20905 non-null  string        
 7   host_response_rate_pct     14439 non-null  float64       
 8   host_acceptance_rate_pct   20905 non-null  float64       
 9   host_neighbourhood         20905 non-null  string        
 10  host_total_listings_count  20905 non-null  float64       
 11  host_has_profile_pic       20905 non-null  bool          
 12  host

In [297]:
print("Listings shape:", lis_df.shape)
print("Reviews shape:", rev_s.shape)

Listings shape: (20905, 44)
Reviews shape: (366636, 6)


### 5. Save Processed Datasets

In [298]:
lis_df.to_parquet('data/processed/03_listings.parquet')
rev_s.to_parquet('data/processed/05_sentiment_bert.parquet')