# Feature engineering
This step is about enriching the data. The best feature is not always the one provided by the dataset. We often need to create new variables or transform existing ones so that the model can learn better.

## Why is it important? 
- The performance of a model often depends not on the algorithm, but on the features.
- Good feature engineering = better predictive performance + easier interpretability.

## Types of Features to Create and How to Approach Them
- Creating new features (e.g., from date: month, season, weekend)
- Encoding categorical variables (one-hot encoding, label encoding)
- Scaling/normalization (so that variables are of similar magnitude)
- Processing text variables (e.g., description → length, sentiment)
- Feature selection: removing irrelevant/redundant features (e.g., highly correlated variables)

### Setting Up Libraries and Environment for Feature Engineering

In [None]:
import os
import sys
import re
import nltk

import pandas as pd
import numpy as np
from dotenv import load_dotenv
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# VADER reads the "vader_lexicon" NLTK resource when the analyzer is built.
# On a fresh environment that lookup raises LookupError, so download the
# lexicon once and retry instead of failing the whole notebook.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon")
    sia = SentimentIntensityAnalyzer()

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Project root used to import the local `src` package. It can be overridden
# with the AIRBNB_PROJECT_ROOT environment variable so the notebook runs on
# other machines; the default is the original hard-coded location.
project_root = os.environ.get(
    "AIRBNB_PROJECT_ROOT",
    "/Users/erikvida/PycharmProjects/airbnb-price-prediction",
)
if project_root not in sys.path:
    sys.path.append(project_root)

# Must come after the sys.path tweak above, otherwise `src` is not importable.
from src.db_connection import DatabaseConfig, DatabaseConnection


# Load the `.env` file that sits in the project root (derived from
# `project_root` so both paths cannot drift apart).
dotenv_path = os.path.join(project_root, ".env")
load_dotenv(dotenv_path)

### 1.0 Loading Data and Initial Overview for Processing

In [None]:
# Location of the cleaned listings CSV, relative to the notebook's working
# directory. It is bound to `data_path` because the inspection cell in
# section 1.1 prints that name; without this assignment that cell raises
# NameError.
data_path = "../data/cleaned/amsterdam_airbnbs_clean_data.csv"

amsterdams_airbnbs_cleaned_data = pd.read_csv(data_path)
# `df` is a second name for the same DataFrame object (not a copy), so every
# column added to `df` below is also visible through the long name.
df = amsterdams_airbnbs_cleaned_data
df.head()

### 1.1 Inspecting and Understanding the Loaded Data

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

# NOTE(review): `data_path` is not assigned in this cell; it has to be
# defined by an earlier cell, otherwise the first print raises NameError.
row_count, column_count = df.shape
print(f"Data loaded from: {data_path}")
print(f"Number of rows: {row_count}, columns: {column_count}")

### 2.0 Analyzing and Preparing Host-Level Features

#### 2.1 Converting Percentage Strings to Numeric Values for Modeling.

In [None]:
def _percent_to_fraction(series):
    """Convert strings such as '95%' to floats such as 0.95.

    Values are first cast to ``str`` (which turns missing values into the
    text 'nan'), the trailing '%' is stripped, the 'nan' text is mapped back
    to a real NaN, and the result is divided by 100.
    """
    as_text = series.astype(str).str.rstrip('%')
    as_text = as_text.replace('nan', np.nan)
    return as_text.astype(float) / 100


# Host columns stored as percentage strings.
percent_cols = ['host_response_rate', 'host_acceptance_rate']

for col in percent_cols:
    df[col] = _percent_to_fraction(df[col])

df.head()

#### 2.2  Converting Binary Features (True/False) to 0/1 for Easier Processing

In [None]:
# Host flags stored as 't' / 'f' that should become 1 / 0.
binary_cols = ['host_is_superhost', 'host_has_profile_pic']

for col in binary_cols:
    df[col] = (
        df[col]
        .replace({'t': 1, 'f': 0, 'nan': np.nan})
        # A plain `.astype(int)` raises as soon as one value is missing,
        # because NumPy integers cannot hold NaN. Going through float first
        # and then to pandas' nullable 'Int64' keeps the 0/1 encoding and
        # represents missing flags as <NA> instead of crashing.
        .astype(float)
        .astype('Int64')
    )

df.head()

#### 2.3 Creating Host Experience Feature: Ratio of Total to Active Listings

In [None]:
# Denominator with zeros turned into NaN so the division never produces inf.
listings_count_nonzero = df['host_listings_count'].replace(0, np.nan)

# host_total_listings_count / host_listings_count; rows where the ratio is
# undefined (zero or missing operand) end up as 0 through the final fillna.
df['host_experience_ratio'] = (
    df['host_total_listings_count']
    .div(listings_count_nonzero)
    .fillna(0)
)

df.head()


#### 2.4 Save Processed Host Features to a Separate Table and CSV File

In [None]:
# Host-level columns produced in section 2, in the order they are stored.
host_features_df = df[['host_response_rate',
              'host_acceptance_rate',
              'host_is_superhost',
              'host_listings_count',
              'host_total_listings_count',
              'host_has_profile_pic',
              'host_experience_ratio']]


# Open a database connection through the project's helper classes.
config = DatabaseConfig()
db = DatabaseConnection(config)

TABLE_NAME = "host_features"

# Replace the table on every run so reruns do not append duplicates.
db.write_dataframe(host_features_df, TABLE_NAME, if_exists="replace")

host_features_path = "../data/processed/host_features.csv"
# Write the host subset, not the full `df`: the CSV is meant to mirror the
# `host_features` table written above.
host_features_df.to_csv(host_features_path, index=False)
print(f"Cleaned data saved to CSV: {host_features_path}")

### 3.0 Location and Neighbourhood-Based Features

#### 3.1 Encoding and Ranking Neighbourhood Features for Location-Based Price Patterns

In [None]:
# Distinct neighbourhood names, as returned by `Series.unique()`.
unique_neighbourhoods = df["neighbourhood_cleansed"].unique()

# Give each name a 1-based integer id following its position in
# `unique_neighbourhoods`.
neighbourhood_dict = {}
for position, name in enumerate(unique_neighbourhoods, start=1):
    neighbourhood_dict[name] = position

# Look up the id of every row's neighbourhood.
df['neighbourhood_rank'] = df['neighbourhood_cleansed'].map(neighbourhood_dict)

df.head()

#### 3.2 Save Processed Location Features to a Separate Table and CSV File

In [None]:
# Location columns: the two neighbourhood text columns plus the integer id
# created in this section.
location_features_df = df[['neighbourhood','neighbourhood_cleansed','neighbourhood_rank']]


# Open a database connection through the project's helper classes.
config = DatabaseConfig()
db = DatabaseConnection(config)

TABLE_NAME = "location_features"

# Replace the table on every run so reruns do not append duplicates.
db.write_dataframe(location_features_df, TABLE_NAME, if_exists="replace")

location_features_path = "../data/processed/location_features.csv"
# Write the location subset, not the full `df`: the CSV is meant to mirror
# the `location_features` table written above.
location_features_df.to_csv(location_features_path, index=False)
print(f"Cleaned data saved to CSV: {location_features_path}")

### 4.0 Property Type Features

#### 4.1 Encoding and Ranking Property Type Features 

In [None]:
# Alphabetically sorted distinct values of each categorical column.
unique_property_types = sorted(df["property_type"].unique())
unique_room_types = sorted(df["room_type"].unique())

# 1-based integer ids that follow the sorted order above.
property_type_dict = dict(
    zip(unique_property_types, range(1, len(unique_property_types) + 1))
)
room_type_dict = dict(
    zip(unique_room_types, range(1, len(unique_room_types) + 1))
)

# Attach the ids as new columns next to the original text columns.
df['property_type_id'] = df['property_type'].map(property_type_dict)
df['room_type_id'] = df['room_type'].map(room_type_dict)

df.head()

#### 4.2 Bedroom Bath Ratio

In [None]:
# Bedrooms per bathroom. Zeros in the denominator are turned into NaN first
# (same guard as the host experience ratio in section 2.3) so listings with
# 0 bathrooms yield a missing ratio instead of +inf.
# NOTE(review): section 4.4 later overwrites df['bathrooms'] with values
# parsed from 'bathrooms_text', so this ratio is based on the column as it
# was loaded, not on the re-parsed one - confirm that ordering is intended.
df['bedroom_bath_ratio'] = df['bedrooms'] / df['bathrooms'].replace(0, np.nan)

df.head()

#### 4.3 People per Bed

In [None]:
# Guests per bed. Zeros in the denominator are turned into NaN first (same
# guard as the host experience ratio in section 2.3) so listings with 0 beds
# yield a missing ratio instead of +inf.
df['people_per_bed'] = df['accommodates'] / df['beds'].replace(0, np.nan)

df.head()

#### 4.4 Number of Total Rooms

In [None]:
def parse_bathrooms(text):
    """Extract a numeric bathroom count from a free-text description.

    Args:
        text: Value such as "1.5 baths", "2 shared baths" or "Half-bath".
            Missing values (None / NaN) and non-string values are accepted.

    Returns:
        float: 0.5 when the text mentions a half bath (matched
        case-insensitively, so "Shared half-bath" and "Private half-bath"
        are recognised as well as "Half-bath"); otherwise the first number
        found in the text; ``np.nan`` when the value is missing or contains
        no number.
    """
    if pd.isna(text):
        return np.nan

    # Numeric inputs would make the substring test below raise TypeError.
    text = str(text)

    # Case-insensitive: a check for capitalised 'Half' alone misses the
    # lowercase "half" in "Shared half-bath" and returns NaN for it.
    if 'half' in text.lower():
        return 0.5

    match = re.search(r'\d+(\.\d+)?', text)
    return float(match.group()) if match else np.nan

# Rebuild the numeric bathroom count from its text description, one value at
# a time, replacing whatever the 'bathrooms' column held before.
df['bathrooms'] = df['bathrooms_text'].map(parse_bathrooms)

# Total rooms = bedrooms plus the freshly parsed bathrooms.
df['rooms_total'] = df['bedrooms'].add(df['bathrooms'])

df.head()

#### 4.5 Save Processed Property Features to a Separate Table and CSV File

In [None]:
# Property-level columns: the raw attributes plus the ratios and totals
# derived in section 4.
property_features_df = df[[
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bathrooms_text",
    "bedrooms",
    "beds",
    "bedroom_bath_ratio",
    "people_per_bed",
    "rooms_total"
]]


# Open a database connection through the project's helper classes.
config = DatabaseConfig()
db = DatabaseConnection(config)

TABLE_NAME = "property_features"

# Replace the table on every run so reruns do not append duplicates.
db.write_dataframe(property_features_df, TABLE_NAME, if_exists="replace")

property_features_df_path = "../data/processed/property_features.csv"

# Write the property subset, not the full `df`: the CSV is meant to mirror
# the `property_features` table written above.
property_features_df.to_csv(property_features_df_path, index=False)
print(f"Cleaned data saved to CSV: {property_features_df_path}")

### 5.0 Sentiment Analysis

In [None]:
def get_sentiment_vader(text):
    """Return the 'compound' score the module-level VADER analyzer gives ``text``.

    The value is converted with ``str()`` first, so non-string inputs
    (including missing values) are scored as their string representation.
    """
    scores = sia.polarity_scores(str(text))
    return scores['compound']

# Score every listing's description and amenities text, one value at a time.
df['description_sentiment'] = df['description'].map(get_sentiment_vader)
df['amenities_sentiment'] = df['amenities'].map(get_sentiment_vader)

def sentiment_label_vader(compound):
    """Map a compound sentiment score to a text label.

    Returns "positive" for scores of 0.05 or more, "negative" for scores of
    -0.05 or less, and "neutral" for everything else.
    """
    # Guard clauses: each threshold is checked once, the fall-through is neutral.
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

# Turn each numeric sentiment score into its positive / negative / neutral label.
df['description_sentiment_label'] = df['description_sentiment'].map(sentiment_label_vader)
df['amenities_sentiment_label'] = df['amenities_sentiment'].map(sentiment_label_vader)

df.head()

### 6.0 Reorder Columns and Save to a New Table and a New CSV File

In [None]:
# Final modelling table: pick the columns to keep and put them in a
# logical, grouped order.
featured_df = df[[
    # 1. Basic info
    'id', 'name', 'description', 'description_sentiment', 'description_sentiment_label',

    # 2. Host info
    'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count', 'host_has_profile_pic', 'host_experience_ratio',

    # 3. Location
    'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_rank',

    # 4. Property features
    'property_type', 'room_type', 'property_type_id', 'room_type_id',
    'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds',
    'bedroom_bath_ratio', 'people_per_bed', 'rooms_total',

    # 5. Amenities
    'amenities', 'amenities_sentiment', 'amenities_sentiment_label',

    # 6. Price & availability
    'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',

    # 7. Reviews
    'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d', 'number_of_reviews_ly',
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location', 'review_scores_value', 'instant_bookable',
]]

# Open a database connection through the project's helper classes.
config = DatabaseConfig()
db = DatabaseConnection(config)

# NOTE(review): the table name is misspelled ("eningineered"). It is left
# unchanged here because other code may already read the table under this
# name - rename it in both places together.
TABLE_NAME = "feature_eningineered_data"

# Replace the table on every run so reruns do not append duplicates.
db.write_dataframe(featured_df, TABLE_NAME, if_exists="replace")

featured_df_path = "../data/processed/amsterdam_airbnbs_feature_engineered_data.csv"

# Write the selected, reordered frame, not the full `df`: otherwise the CSV
# ignores the column selection and ordering made above and differs from the
# database table.
featured_df.to_csv(featured_df_path, index=False)
print(f"Cleaned data saved to CSV: {featured_df_path}")
print(f"Cleaned data saved to CSV: {featured_df_path}")