# Feature Engineering

In [52]:
import pandas as pd
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

In [None]:
# Load the stage-02 listings and reviews tables from parquet.
lis_df, rev_df = map(
    pd.read_parquet,
    ('data/processed/02_listings.parquet', 'data/processed/02_reviews.parquet'),
)
# Quick sanity check: (rows, columns) of each frame.
print(lis_df.shape, '\n', rev_df.shape)

In [54]:
# Snapshot the column labels before any features are added. A later cell
# compares against this snapshot to list the columns this notebook created.
original_columns = lis_df.columns.copy()

In [None]:
# Inspect column dtypes and non-null counts before engineering features.
lis_df.info()

In [56]:
# Listing Quality Indicators (currently disabled; candidate aliases kept for reference)
# lis_df['is_superhost'] = lis_df['host_is_superhost']  # already boolean
# lis_df['total_reviews'] = lis_df['number_of_reviews']  # TODO
# lis_df['avg_rating'] = lis_df['review_scores_rating']  # TODO
# lis_df['review_frequency'] = lis_df['reviews_per_month']


In [57]:
# Location Features
# Reference point used as the Copenhagen city centre (decimal degrees).
COPENHAGEN_CENTER_LAT = 55.6761
COPENHAGEN_CENTER_LON = 12.5683
EARTH_RADIUS_KM = 6371.0  # mean Earth radius


def haversine_km(lat, lon, center_lat, center_lon):
    """Great-circle distance in kilometres from (lat, lon) to a fixed point.

    A plain Euclidean distance on raw degrees treats one degree of longitude
    as being as long as one degree of latitude. At Copenhagen's latitude a
    degree of longitude is only about cos(55.7 deg) ~= 0.56 as long, so that
    shortcut overstates east-west distances. The haversine formula accounts
    for this.

    Args:
        lat: Latitude(s) in decimal degrees (scalar, array or Series).
        lon: Longitude(s) in decimal degrees, same shape as ``lat``.
        center_lat: Latitude of the reference point in decimal degrees.
        center_lon: Longitude of the reference point in decimal degrees.

    Returns:
        Distance(s) in kilometres, same shape as ``lat``/``lon``.
    """
    lat_rad = np.radians(lat)
    center_lat_rad = np.radians(center_lat)
    half_dlat = (lat_rad - center_lat_rad) / 2
    half_dlon = np.radians(lon - center_lon) / 2
    haversine_term = (
        np.sin(half_dlat) ** 2
        + np.cos(lat_rad) * np.cos(center_lat_rad) * np.sin(half_dlon) ** 2
    )
    return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(haversine_term))


# Straight-line ("as the crow flies") distance to the city centre.
lis_df['distance_to_center_km'] = haversine_km(
    lis_df['latitude'],
    lis_df['longitude'],
    COPENHAGEN_CENTER_LAT,
    COPENHAGEN_CENTER_LON,
)

# Neighborhood density: number of listings (non-null ids) sharing this
# listing's neighbourhood.
lis_df['listings_in_neighborhood'] = lis_df.groupby('neighbourhood_cleansed')['id'].transform('count')

In [58]:
# Price Features
price = lis_df['price_DKK']

# For each grouping column, store the group's mean price and this listing's
# price relative to that mean (1.0 == exactly the group average).
for group_col, avg_col, ratio_col in (
    ('neighbourhood_cleansed', 'neighborhood_avg_price', 'price_vs_neighborhood'),
    ('room_type', 'room_type_avg_price', 'price_vs_room_type'),
):
    lis_df[avg_col] = lis_df.groupby(group_col)['price_DKK'].transform('mean')
    lis_df[ratio_col] = price / lis_df[avg_col]

# Value indicators
lis_df['price_per_person'] = price / lis_df['accommodates']

# Bedroom counts that are not > 0 (zero or missing) are treated as 1 so the
# division never hits zero.
bedrooms = lis_df['bedrooms']
effective_bedrooms = bedrooms.where(bedrooms > 0, 1)
lis_df['price_per_bedroom'] = price.div(effective_bedrooms)

# Host

In [59]:
# Host experience, in years, derived from the listing's review dates:
#   * first and last review differ -> time between first and last review
#   * first and last review equal  -> time from that review to the reference date
#   * both review dates missing    -> 0
SECONDS_PER_YEAR = 365.25 * 24 * 60 * 60

first_review = lis_df['first_review']
last_review = lis_df['last_review']

# Use the most recent review in the data as "today" instead of the wall
# clock. This keeps the feature identical no matter when the notebook is
# re-run, and the reference carries the same timezone-awareness as the column.
reference_date = last_review.max()

# Create boolean masks for different conditions.
# Comparisons with NaT are False for == and True for !=, so rows with missing
# dates land in mask_diff_dates and are handled by mask_no_reviews below.
mask_same_dates = first_review == last_review
mask_diff_dates = first_review != last_review
mask_no_reviews = last_review.isna() & first_review.isna()

years_between_reviews = (last_review - first_review).dt.total_seconds() / SECONDS_PER_YEAR
years_since_last_review = (reference_date - last_review).dt.total_seconds() / SECONDS_PER_YEAR

lis_df['host_experience_years'] = (
    years_between_reviews
    .where(mask_diff_dates, years_since_last_review)
    # Set host_experience_years to 0 where both review dates are missing
    .mask(mask_no_reviews, 0)
)

In [60]:
# Response Quality — TODO: encode each `host_response_time` category as an ordinal rating
# lis_df['host_response_time']

In [61]:
# Scale of the host's operation relative to the local market: the host's total
# listing count divided by the number of listings in this listing's neighbourhood.
lis_df['host_listings_ratio'] = lis_df['host_total_listings_count'] / lis_df['listings_in_neighborhood']

In [62]:
# Reviews per year of host experience. An experience of exactly 0 years is
# treated as 1 year to avoid division by zero; results that are still missing
# (e.g. no reviews) are recorded as 0 reviews per year.
years_active = lis_df['host_experience_years'].replace(0, 1)
lis_df['yearly_review'] = (lis_df['number_of_reviews'] / years_active).fillna(0)

# Row-wise variance across the review score columns: a measure of how
# consistent the scores a listing received from guests are.
review_score_cols = [col for col in lis_df.columns if col.startswith('review_scores_')]
lis_df['review_score_variance'] = lis_df[review_score_cols].var(axis=1)

# Drop exactly the columns that fed the variance. Reusing the same list keeps
# selection and removal in sync, so a column that merely contains
# 'review_scores_' somewhere in its name is never dropped by accident.
lis_df = lis_df.drop(columns=review_score_cols)

In [None]:
# Columns present now that were not in the snapshot taken before feature
# engineering, i.e. the features created above.
new_features = lis_df.columns.difference(original_columns)

# Descriptive statistics for each new feature.
print("\nNew Feature Summary:")
for col in new_features:
    stats = lis_df[col].describe()
    print(f"\n{col}:")
    print(stats)

# Missing-value count per new feature.
print("\nChecking for issues in new features:")
null_counts = lis_df[new_features].isnull().sum()
print(null_counts)

## Drop the first/last review dates and latitude/longitude once the derived features are calculated

In [64]:
# Remove the raw review-date columns now that features derived from them exist.
# NOTE(review): the section heading also mentions latitude/longitude, but this
# cell leaves those columns in place — confirm which is intended.
lis_df = lis_df.drop(columns=['first_review', 'last_review'])

### xx. Sentiment Score Generation

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import pandas as pd

# Build the sentiment pipeline; the same checkpoint name supplies both the
# model and its tokenizer.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"  # Using a more stable model

# Device index 0 when CUDA is available, otherwise -1.
pipeline_device = 0 if torch.cuda.is_available() else -1

sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model=model_name,
    tokenizer=model_name,
    device=pipeline_device,
)

def get_sentiment_score(text):
    """Return a confidence-weighted sentiment score in [-1, 1] for one review.

    The star rating predicted by ``sentiment_analyzer`` (the leading digit of
    the returned label, 1-5) is mapped linearly onto [-1, 1] and multiplied by
    the model's confidence for that label.

    Args:
        text: Review text. Missing values and texts shorter than 3
            non-whitespace-trimmed characters are not scored.

    Returns:
        The weighted score, or 0 when the text is missing/too short or when
        scoring raises an exception (the error is printed, not re-raised).
    """
    if pd.isna(text) or len(str(text).strip()) < 3:
        return 0
    try:
        # Let the tokenizer truncate to the model's 512-token limit. Slicing
        # the string to 512 characters is not the same thing: it throws away
        # text that would have fitted, and for scripts with roughly one token
        # per character it can still overflow the limit once the special
        # tokens are added.
        result = sentiment_analyzer(str(text), truncation=True, max_length=512)[0]

        # This model returns scores from 1-5 stars; the label starts with the digit.
        stars = int(result['label'][0])

        # Convert 1-5 scale to [-1, 1]
        normalized_score = (stars - 3) / 2

        # Weight by confidence
        return normalized_score * result['score']

    except Exception as e:
        # Best effort: report the failure and fall back to a neutral score so
        # one bad review does not abort the whole run.
        print(f"Error processing text: {str(e)}")
        return 0

# Take a sample of up to 1000 reviews (fewer if the dataset is smaller, so
# DataFrame.sample never asks for more rows than exist).
sample_size = min(1000, len(rev_df))
print(f"Processing sample of {sample_size} reviews...")
rev_sample = rev_df.sample(sample_size, random_state=42)

# Process reviews with progress bar
print("Calculating sentiment scores...")
rev_sample['sentiment_score'] = [get_sentiment_score(text) for text in tqdm(rev_sample['comments'])]

# Show a few example reviews per sentiment band.
print("\nSample Results by Sentiment Category:")
for sentiment, range_vals in [
    ('Very Positive', (0.5, 1.0)),
    ('Positive', (0.1, 0.5)),
    ('Neutral', (-0.1, 0.1)),
    ('Negative', (-0.5, -0.1)),
    ('Very Negative', (-1.0, -0.5))
]:
    low, high = range_vals
    scores = rev_sample['sentiment_score']
    # Bands are lower-inclusive / upper-exclusive so a boundary score belongs
    # to exactly one band; only the top band includes its upper bound (1.0).
    below_upper = (scores <= high) if high >= 1.0 else (scores < high)
    mask = (scores >= low) & below_upper
    print(f"\n{sentiment} Reviews Examples (3 random samples):")
    matches = rev_sample[mask]
    # Seeded so the printed examples are the same on every run.
    sample = matches.sample(min(3, len(matches)), random_state=42)
    for _, row in sample.iterrows():
        print(f"\nScore: {row['sentiment_score']:.3f}")
        # str() guards against missing comments, which are scored as 0 and
        # would otherwise fail on slicing.
        print(f"Review: {str(row['comments'])[:200]}...")

# Print distribution statistics
print("\nSentiment Score Distribution:")
print(rev_sample['sentiment_score'].describe())

# Save results
rev_sample.to_csv('sentiment_analysis_sample_bert.csv', index=False)

### xx. Final Status Check

In [None]:
# Final (rows, columns) of both frames before they are written out.
listings_shape, reviews_shape = lis_df.shape, rev_df.shape
print("Listings shape:", listings_shape)
print("Reviews shape:", reviews_shape)

### xx. Save Processed Datasets

In [67]:
# Write both frames out as the stage-03 parquet files.
# NOTE(review): no cell shown in this notebook modifies rev_df, so the reviews
# file appears to be written unchanged from the stage-02 input — confirm that
# is intended.
for frame, out_path in (
    (lis_df, 'data/processed/03_listings.parquet'),
    (rev_df, 'data/processed/03_reviews.parquet'),
):
    frame.to_parquet(out_path)