# Feature Engineering

In [33]:
import pandas as pd
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

In [None]:
# Read the listings and reviews parquet files from data/processed
lis_df = pd.read_parquet('data/processed/02_listings.parquet')
rev_df = pd.read_parquet('data/processed/02_reviews.parquet')
# Report both shapes, one per line
print(f"{lis_df.shape} \n {rev_df.shape}")

In [35]:
# Snapshot of the column labels before any engineered feature is added
original_columns = lis_df.columns.copy(deep=False)

In [None]:
# Print the column dtypes and non-null counts of the listings frame
lis_df.info()

In [37]:
# Listing Quality Indicators
# TODO: decide whether to add the aliases below; each one only copies an existing column.
# lis_df['is_superhost'] = lis_df['host_is_superhost']  # already boolean
# lis_df['total_reviews'] = lis_df['number_of_reviews']
# lis_df['avg_rating'] = lis_df['review_scores_rating']
# lis_df['review_frequency'] = lis_df['reviews_per_month']


In [38]:
# Location Features
COPENHAGEN_CENTER_LAT = 55.6761
COPENHAGEN_CENTER_LON = 12.5683
EARTH_RADIUS_KM = 6371.0088  # mean Earth radius

# Great-circle ("as the crow flies") distance to the city centre, via the haversine formula.
# A plain Euclidean distance in degrees multiplied by 111 overstates east-west offsets:
# at Copenhagen's latitude one degree of longitude spans only about cos(55.7 deg) ~ 0.56
# of the ~111 km covered by one degree of latitude.
listing_lat_rad = np.radians(lis_df['latitude'])
listing_lon_rad = np.radians(lis_df['longitude'])
center_lat_rad = np.radians(COPENHAGEN_CENTER_LAT)
center_lon_rad = np.radians(COPENHAGEN_CENTER_LON)

haversine_term = (
    np.sin((listing_lat_rad - center_lat_rad) / 2) ** 2
    + np.cos(center_lat_rad)
    * np.cos(listing_lat_rad)
    * np.sin((listing_lon_rad - center_lon_rad) / 2) ** 2
)
lis_df['distance_to_center_km'] = 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(haversine_term))

# Neighborhood density: number of listings sharing the listing's neighbourhood
lis_df['listings_in_neighborhood'] = lis_df.groupby('neighbourhood_cleansed')['id'].transform('count')

In [39]:
# Price Features
listing_price = lis_df['price_DKK']

# For each grouping column: the mean price of the listing's group, then the listing's
# own price relative to that mean (neighbourhood first, room type second).
for group_col, avg_col, ratio_col in [
    ('neighbourhood_cleansed', 'neighborhood_avg_price', 'price_vs_neighborhood'),
    ('room_type', 'room_type_avg_price', 'price_vs_room_type'),
]:
    lis_df[avg_col] = listing_price.groupby(lis_df[group_col]).transform('mean')
    lis_df[ratio_col] = listing_price / lis_df[avg_col]

# Value indicators
lis_df['price_per_person'] = listing_price / lis_df['accommodates']

# Bedroom counts that are not strictly positive are replaced by 1 before dividing
bedroom_count = lis_df['bedrooms']
lis_df['price_per_bedroom'] = listing_price / bedroom_count.where(bedroom_count > 0, 1)

# Host

In [40]:
# host_experience_years is derived from the review dates, in three cases:
#   * first and last review differ  -> years between the two reviews
#   * first and last review equal   -> years between that review and now
#   * both review dates are missing -> 0
SECONDS_PER_YEAR = 365.25 * 24 * 60 * 60

first_review = lis_df['first_review']
last_review = lis_df['last_review']

mask_same_dates = first_review.eq(last_review)
mask_diff_dates = ~mask_same_dates  # also True when either date is missing
mask_no_reviews = last_review.isna() & first_review.isna()

years_between_reviews = (last_review - first_review).dt.total_seconds() / SECONDS_PER_YEAR
# NOTE(review): this depends on the wall clock, so the value changes from run to run
years_since_last_review = (pd.Timestamp.now() - last_review).dt.total_seconds() / SECONDS_PER_YEAR

lis_df['host_experience_years'] = (
    years_between_reviews
    .where(mask_diff_dates, years_since_last_review)
    .mask(mask_no_reviews, 0)
)

In [41]:
# TODO: Response quality - ordinal-encode each possible response-time category
# of lis_df['host_response_time'].

In [42]:
# Scale of the host's operation relative to the neighbourhood:
# the host's total listing count divided by the number of listings in the neighbourhood
host_listing_count = lis_df['host_total_listings_count']
lis_df['host_listings_ratio'] = host_listing_count.div(lis_df['listings_in_neighborhood'])

In [43]:
# Reviews per year of host experience. A zero experience value is replaced by 1 to
# avoid division by zero; a missing result is treated as 0 reviews per year.
lis_df['yearly_review'] = lis_df['number_of_reviews'] / lis_df['host_experience_years'].replace(0, 1)
lis_df['yearly_review'] = lis_df['yearly_review'].fillna(0)

# Row-wise variance across the review_scores_* columns: consistency of the reviews
# the host received from guests. The score columns are dropped afterwards.
review_score_cols = [col for col in lis_df.columns if col.startswith('review_scores_')]
if review_score_cols:
    lis_df['review_score_variance'] = lis_df[review_score_cols].var(axis=1)
    # Drop exactly the columns that fed the variance
    lis_df = lis_df.drop(columns=review_score_cols)
elif 'review_score_variance' not in lis_df.columns:
    # No score columns were ever present: keep the column, with missing values
    lis_df['review_score_variance'] = np.nan
# Otherwise the cell is being re-run after the score columns were dropped; the
# variance computed earlier is kept rather than overwritten with NaN.

In [None]:
# Describe every column that is present now but was not in `original_columns`
print("\nNew Feature Summary:")
new_features = lis_df.columns.difference(original_columns)
for feature in new_features:
    print(f"\n{feature}:")
    feature_stats = lis_df[feature].describe()
    print(feature_stats)

# Count missing values per new feature
print("\nChecking for issues in new features:")
missing_per_feature = lis_df[new_features].isna().sum()
print(missing_per_feature)

## Drop the first/last review dates now that the derived features are calculated (the code below still keeps latitude/longitude)

In [45]:
# Remove the two review-date columns from the listings frame in place
review_date_cols = ['first_review', 'last_review']
lis_df.drop(columns=review_date_cols, inplace=True)

### xx. Sentiment Score Generation

In [None]:
# Import required libraries for language detection and sentiment analysis
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from transformers import pipeline
import torch

def detect_language(text):
    """Return the language code detected for `text`, or 'unknown'.

    'unknown' is returned when `text` is missing, when its stripped string form
    has fewer than 3 characters, or when detection fails. Unexpected errors are
    printed and also mapped to 'unknown'.

    NOTE(review): langdetect results may vary between runs unless its detector
    seed is fixed - confirm whether that matters for this feature.
    """
    fallback = 'unknown'
    try:
        # Missing or very short texts are not worth sending to the detector
        too_short = pd.isna(text) or len(str(text).strip()) < 3
        language = fallback if too_short else detect(str(text))
    except LangDetectException:
        language = fallback
    except Exception as e:
        print(f"Error detecting language: {str(e)}")
        language = fallback
    return language

# Build the multilingual sentiment pipeline: device 0 when CUDA is available, -1 otherwise
pipeline_device = 0 if torch.cuda.is_available() else -1
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=pipeline_device,
)

def get_multilingual_sentiment(text):
    """Return a sentiment score in the range -1 to 1 for `text`.

    Parameters
    ----------
    text : Any
        The review text. Non-string values are converted with `str()`.

    Returns
    -------
    int or float
        0 when `text` is missing, when its stripped string form has fewer than
        3 characters, or when scoring fails (the error is printed). Otherwise
        the analyzer's 1-5 rating mapped linearly onto -1..1.
    """
    try:
        if pd.isna(text) or len(str(text).strip()) < 3:
            return 0

        # Let the tokenizer truncate to 512 tokens. Slicing the string to 512
        # characters is not the same thing: it discards text that would still fit,
        # and 512 single-character tokens plus special tokens can still exceed the
        # limit, which would then be swallowed as a neutral 0 below.
        result = sentiment_analyzer(str(text), truncation=True, max_length=512)[0]

        # The rating is read from the first character of the returned label
        score = int(result['label'][0])

        # Map 1..5 onto -1..1
        return (score - 3) / 2

    except Exception as e:
        # Best effort: report the failure and fall back to a neutral score
        print(f"Error processing text: {str(e)}")
        return 0

# Calculate sentiment scores and detect language
print("Calculating sentiment scores and detecting languages...")
rev_df['detected_language'] = rev_df['comments'].apply(detect_language)

# Process reviews in batches to show progress
batch_size = 100
n_reviews = len(rev_df)
# Ceiling division: no empty trailing batch when n_reviews is a multiple of batch_size
total_batches = (n_reviews + batch_size - 1) // batch_size

# Create the target column up front so each batch can be written by position
rev_df['sentiment_score'] = np.nan
score_col_pos = rev_df.columns.get_loc('sentiment_score')

for i in range(total_batches):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, n_reviews)

    print(f"Processing batch {i+1}/{total_batches}")

    # Positional, half-open slices: label-based .loc slices include the end label
    # (so consecutive batches overlap by one row) and address the wrong rows when
    # the index is not a default 0..n-1 range.
    batch_scores = rev_df['comments'].iloc[start_idx:end_idx].apply(get_multilingual_sentiment)
    rev_df.iloc[start_idx:end_idx, score_col_pos] = batch_scores.to_numpy(dtype='float64')

# Print language distribution
print("\nLanguage Distribution:")
language_counts = rev_df['detected_language'].value_counts()
print(language_counts)

# Show up to two example reviews, with their sentiment scores, for each of the
# most frequent detected languages
for lang in language_counts.head().index:
    if lang != 'unknown':  # Skip unknown language samples
        print(f"\nSample {lang} reviews:")
        print("-" * 80)
        lang_reviews = rev_df[rev_df['detected_language'] == lang]
        # Cap the sample size: a language with a single review would otherwise raise.
        # A fixed random_state keeps the printed examples the same between runs.
        sample = lang_reviews.sample(min(2, len(lang_reviews)), random_state=42)
        print(sample[['comments', 'sentiment_score', 'detected_language']].to_string())

### xx. Final Status Check

In [None]:
# Report the final dimensions of both frames
print(f"Listings shape: {lis_df.shape}")
print(f"Reviews shape: {rev_df.shape}")

### xx. Save Processed Datasets

In [48]:
# Write both frames to data/processed as parquet
for frame, output_path in (
    (lis_df, 'data/processed/03_listings.parquet'),
    (rev_df, 'data/processed/03_reviews.parquet'),
):
    frame.to_parquet(output_path)