# Feature Engineering

### 1. Setup and Processed Data Import

In [152]:
import pandas as pd
import numpy as np
from transformers import pipeline
import torch
from tqdm import tqdm
import pandas as pd

In [153]:
lis_df = pd.read_parquet('data/processed/02_listings.parquet')
rev_df = pd.read_parquet('data/processed/02_reviews.parquet')
print(lis_df.shape,'\n',rev_df.shape)

(20905, 40) 
 (366636, 5)


### 2. Feature Generation

In [154]:
# Store original columns
original_columns = lis_df.columns.copy()

In [155]:
# Location Features
COPENHAGEN_CENTER_LAT = 55.6761
COPENHAGEN_CENTER_LON = 12.5683

# Distance to center in birdeye view
lis_df['distance_to_center_km'] = np.sqrt(
    (lis_df['latitude'] - COPENHAGEN_CENTER_LAT)**2 + 
    (lis_df['longitude'] - COPENHAGEN_CENTER_LON)**2
) * 111  # Rough conversion to kilometers

# Neighborhood density
lis_df['listings_in_neighborhood'] = lis_df.groupby('neighbourhood_cleansed')['id'].transform('count')

In [156]:
# Price Features
# Neighborhood price comparison
lis_df['neighborhood_avg_price'] = lis_df.groupby('neighbourhood_cleansed')['price_DKK'].transform('mean')
lis_df['price_vs_neighborhood'] = lis_df['price_DKK'] / lis_df['neighborhood_avg_price']

# Room type price comparison
lis_df['room_type_avg_price'] = lis_df.groupby('room_type')['price_DKK'].transform('mean')
lis_df['price_vs_room_type'] = lis_df['price_DKK'] / lis_df['room_type_avg_price']

# Value indicators
lis_df['price_per_person'] = lis_df['price_DKK'] / lis_df['accommodates']
lis_df['price_per_bedroom'] = lis_df['price_DKK'].div(lis_df['bedrooms'].where(lis_df['bedrooms'] > 0, 1))

In [157]:
# Create boolean masks for different conditions
mask_diff_dates = lis_df['first_review'] != lis_df['last_review']
mask_same_dates = lis_df['first_review'] == lis_df['last_review']


# Calculate for different first and last review dates
lis_df.loc[mask_diff_dates, 'host_experience_years'] = (
    (lis_df.loc[mask_diff_dates, 'last_review'] - 
     lis_df.loc[mask_diff_dates, 'first_review']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
)

# Calculate for same first and last review date
lis_df.loc[mask_same_dates, 'host_experience_years'] = (
    (pd.Timestamp.now() - lis_df.loc[mask_same_dates, 'last_review']).dt.total_seconds() / (365.25 * 24 * 60 * 60)
)

# Set host_experience_years to 0 where both review dates are missing
mask_no_reviews = lis_df['last_review'].isna() & lis_df['first_review'].isna()
lis_df.loc[mask_no_reviews, 'host_experience_years'] = 0

In [158]:
# How big of a scale does the host operate on relatively to the rest in this neighborhood
lis_df['host_listings_ratio'] = lis_df['host_total_listings_count'] / lis_df['listings_in_neighborhood']

In [159]:
# Avoid division by zero in yearly_review calculation
lis_df['yearly_review'] = lis_df['number_of_reviews'] / lis_df['host_experience_years'].replace(0, 1)
lis_df['yearly_review'] = lis_df['yearly_review'].fillna(0)  # No reviews = 0 reviews per year

# Calculate review score variance; consistency of the host's recieved reviews by guests
review_score_cols = [col for col in lis_df.columns if col.startswith('review_scores_')]
lis_df['review_score_variance'] = lis_df[review_score_cols].var(axis=1)
lis_df.drop(columns=[col for col in lis_df.columns if 'review_scores_' in col], inplace=True)

In [160]:
# Print summary of new features
print("\nNew Feature Summary:")
new_features = lis_df.columns.difference(original_columns)
for col in new_features:
    print(f"\n{col}:")
    print(lis_df[col].describe())

# Check for any issues in new features
print("\nChecking for issues in new features:")
print(lis_df[new_features].isnull().sum())


New Feature Summary:

distance_to_center_km:
count    20905.000000
mean         3.733406
std          2.014533
min          0.096576
25%          2.259348
50%          3.445544
75%          4.762564
max         13.204881
Name: distance_to_center_km, dtype: float64

host_experience_years:
count    20905.000000
mean         2.159199
std          2.661796
min          0.000000
25%          0.134155
50%          1.013005
75%          3.107461
max         13.859001
Name: host_experience_years, dtype: float64

host_listings_ratio:
count    20905.000000
mean         0.005180
std          0.033082
min          0.000268
25%          0.000334
50%          0.000558
75%          0.001115
max          0.802680
Name: host_listings_ratio, dtype: float64

listings_in_neighborhood:
count    20905.000000
mean      2568.974169
std       1008.924101
min        365.000000
25%       1869.000000
50%       2247.000000
75%       3586.000000
max       3734.000000
Name: listings_in_neighborhood, dtype: float64


#### Drop first & last review date, as we've now created the attribute host_experience_years.

In [161]:
lis_df.drop(['first_review', 'last_review'], axis=1, inplace=True)

### 3. Sentiment Scoring

In [162]:
# Calculate total length of all comments
total_comment_length = rev_df['comments'].str.len().sum()
print(f"Total length of all comments: {total_comment_length:,} characters")

Total length of all comments: 92,513,161 characters


In [None]:
'''# Initialize model and tokenizer
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"  
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=model_name,
    tokenizer=model_name,
    device=0 if torch.cuda.is_available() else -1
)

def get_sentiment_score(text):
    if pd.isna(text) or len(str(text).strip()) < 3:
        return 0
    try:
        # Truncate long texts to 512 tokens
        result = sentiment_analyzer(str(text)[:10000])[0]
        
        # This model returns scores from 1-5 stars
        score = int(result['label'][0])  # Get first character (1-5)
        
        # Convert 1-5 scale to [-1, 1]
        normalized_score = (score - 3) / 2
        
        # Weight by confidence
        final_score = normalized_score * result['score']
        
        return final_score
        
    except Exception as e:
        print(f"Error processing text: {str(e)}")
        return 0

# Take a sample of 1000 reviews
sample_size = 10
print(f"Processing sample of {sample_size} reviews...")
rev_sample = rev_df.sample(sample_size, random_state=42)

# Process reviews with progress bar
print("Calculating sentiment scores...")
rev_sample['sentiment_score'] = [get_sentiment_score(text) for text in tqdm(rev_sample['comments'])]

# Show results
print("\nSample Results by Sentiment Category:")
for sentiment, range_vals in [
    ('Very Positive', (0.5, 1.0)),
    ('Positive', (0.1, 0.5)),
    ('Neutral', (-0.1, 0.1)),
    ('Negative', (-0.5, -0.1)),
    ('Very Negative', (-1.0, -0.5))
]:
    mask = (rev_sample['sentiment_score'] >= range_vals[0]) & (rev_sample['sentiment_score'] <= range_vals[1])
    print(f"\n{sentiment} Reviews Examples (3 random samples):")
    sample = rev_sample[mask].sample(min(3, len(rev_sample[mask])))
    for _, row in sample.iterrows():
        print(f"\nScore: {row['sentiment_score']:.3f}")
        print(f"Review: {row['comments'][:20000]}")

# Print distribution statistics
print("\nSentiment Score Distribution:")
print(rev_sample['sentiment_score'].describe())

# Save results
rev_sample.to_parquet('data/processed/04_sentiment_bert.parquet', index=False)
rev_sample.to_csv('data/processed/04_sentiment_bert.csv', index=False)'''

Device set to use cpu


Processing sample of 10 reviews...
Calculating sentiment scores...


100%|██████████| 10/10 [00:01<00:00,  5.86it/s]


Sample Results by Sentiment Category:

Very Positive Reviews Examples (3 random samples):

Score: 0.656
Review: Runes flat is a very unique place. We liked the style very much. It was cosy and very well located. We didn't need any public transportation at all.

Score: 0.825
Review: Amazing host

Score: 0.844
Review: Die Wohnung war einfahch klasse. Sehr sauber, alles neu, top ausgestattet und Elise war sehr hilfriech wenn es um Tips und Unterstützung ging. Wir waren sehr zufrieden und werden sicherlich versuchen bei Elise wieder zu übernachten wenn wir das nächste Mal nach Kopenhagen kommen.

Positive Reviews Examples (3 random samples):

Score: 0.296
Review: Die Unterkunft ist sehr zentral gelegen (Hbf und Tivoli in 5 Minuten zu Fuß zu erreichen) und dennoch absolut ruhig im geschützten Hinterhof. Es ist weniger eine Unterkunft zum Verweilen oder gemütlich Essen, sondern vielmehr ideal zum Ausruhen nach einem langen Tag in der wunderschönen Stadt. Die Unterkunft ist sehr sauber und h




In [171]:
rev_sample[:2]

Unnamed: 0,listing_id,id,date,reviewer_id,comments,sentiment_score
355286,999182760952148206,1058546998493773811,2023-12-31,23184612,"Die Wohnung war einfahch klasse. Sehr sauber, ...",0.843871
3787,338992,610427953,2020-02-24,7530520,Bente's beautiful place is located right near ...,0.4774


In [166]:
df = pd.read_parquet('data/processed/04_sentiment_bert.parquet')
df

Unnamed: 0,listing_id,id,date,reviewer_id,comments,sentiment_score
0,999182760952148206,1058546998493773811,2023-12-31,23184612,"Die Wohnung war einfahch klasse. Sehr sauber, ...",0.843871
1,338992,610427953,2020-02-24,7530520,Bente's beautiful place is located right near ...,0.4774
2,39186014,643209693077290041,2022-06-06,47045585,Comfortable and nice. Great communication and ...,0.299731
3,569909,365270425,2019-01-01,56799129,Runes flat is a very unique place. We liked th...,0.656043
4,937157101494923200,1154230602692129699,2024-05-11,502588195,"A great price for Copenhagen, good neighbourho...",0.0
5,26093247,964367199455001036,2023-08-23,456728446,Amazing host,0.825185
6,29085636,870788038695621336,2023-04-16,14102662,Die Unterkunft ist sehr zentral gelegen (Hbf u...,0.295688
7,913101728617102476,973073633815093496,2023-09-04,9266496,Wir waren drei Tage in der sehr schönen Wohnun...,0.894202
8,186454,672315615467612603,2022-07-16,207879695,Great place in a great location.,0.804084
9,17853510,437437060108678517,2021-08-26,30689889,Ces quelques jours chez Tomas ont été incroyab...,0.83822


# Remember to append sentiment score to rev_df!!!

### 4. Final Status Check

In [167]:
print("Listings shape:", lis_df.shape)
print("Reviews shape:", rev_df.shape)

Listings shape: (20905, 43)
Reviews shape: (366636, 5)


### 5. Save Processed Datasets

In [168]:
lis_df.to_parquet('data/processed/03_listings.parquet')
# rev_df.to_parquet('data/processed/03_reviews.parquet')