## Table of Contents
- [INTRODUCTION](#introduction)
- [EDA - STATIC PLOTS](#eda-static)
- [EDA - DYNAMIC 3D PLOTS](#eda-dynamic)
- [CORRELATION ANALYSIS](#correlation)
- [MACHINE LEARNING MODEL](#ml-model)
- [HYPER-PARAMETER OPTIMIZATION](#hyperopt)
- [MODEL EVALUATION](#evaluate)

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

<a id="introduction"></a>
## INTRODUCTION

In [None]:
# Load the listings table and its data catalogue from the local data/ folder.
listings_csv = 'data/Case 1 - Airbnb Dataset.csv'
catalogue_xlsx = 'data/Data Catalogue - Case 1 Airbnb Dataset.xlsx'

df = pd.read_csv(listings_csv)
df_catalogue = pd.read_excel(catalogue_xlsx)

# Bare last expression: Jupyter renders a (truncated) preview of the frame.
df

In [None]:
# Exploratory data analysis to understand the dataset.
# A null check is kept here for reference; per the original author's note it
# reported no missing values:
#   df.isnull().sum()
# Display the summary statistics produced by DataFrame.describe().
df.describe()

In [None]:
# Row count for each distinct value of the room_type column.
df['room_type'].value_counts()

In [None]:


# Data preprocessing
# Cast the flag columns to integers so they can take part in the correlation analysis.
boolean_cols = ['room_shared', 'room_private', 'host_is_superhost']
df = df.astype({flag: int for flag in boolean_cols})

# Ordinal encoding for room_type: Shared room < Private room < Entire home/apt.
# A room_type value that is not a key of the mapping becomes NaN.
room_type_mapping = {'Shared room': 0, 'Private room': 1, 'Entire home/apt': 2}
df['room_type_encoded'] = df['room_type'].map(room_type_mapping)

# Ordinal encoding of city: cities are ranked by their mean realSum,
# the city with the lowest mean getting code 0.
city_mean = df.groupby('city')['realSum'].mean()
sorted_cities = city_mean.sort_values()
city_encoding = dict(zip(sorted_cities.index, range(len(sorted_cities))))
df['city_encoded'] = df['city'].map(city_encoding)

df

<a id="eda-static"></a>
# EDA STATIC PLOTS

In [None]:

# 1.2 Room Type Analysis
# Every figure in this notebook is written to 'plots/'. Create the folder up
# front: savefig raises FileNotFoundError when the target directory is missing.
os.makedirs('plots', exist_ok=True)

fig, (ax_box, ax_pie, ax_bar) = plt.subplots(1, 3, figsize=(16, 6))

# Left: price spread per room type.
sns.boxplot(x='room_type', y='realSum', data=df, ax=ax_box)
ax_box.set_title('Price by Room Type')
ax_box.set_xlabel('Room Type')
ax_box.set_ylabel('Price (USD)')

# Middle: share of rows per room type.
room_type_counts = df['room_type'].value_counts()
ax_pie.pie(room_type_counts, labels=room_type_counts.index, autopct='%1.1f%%')
ax_pie.set_title('Room Type Distribution')

# Right: mean price per room type, sorted ascending.
avg_price_by_room = df.groupby('room_type')['realSum'].mean().sort_values()
sns.barplot(x=avg_price_by_room.index, y=avg_price_by_room.values, ax=ax_bar)
ax_bar.set_title('Average Price by Room Type')
ax_bar.set_xlabel('Room Type')
ax_bar.set_ylabel('Average Price (USD)')

fig.tight_layout()
fig.savefig('plots/room_type_analysis.png')
plt.show()

In [None]:

# 1.3 Ratings Analysis
fig, (ax_hist, ax_box, ax_scatter) = plt.subplots(1, 3, figsize=(16, 6))

# Left: distribution of the overall satisfaction score with a KDE overlay.
sns.histplot(df['guest_satisfaction_overall'], kde=True, ax=ax_hist)
ax_hist.set(
    title='Guest Satisfaction Distribution',
    xlabel='Guest Satisfaction Rating (1-100)',
)

# Middle: satisfaction score split by room type.
sns.boxplot(x='room_type', y='guest_satisfaction_overall', data=df, ax=ax_box)
ax_box.set(
    title='Guest Satisfaction by Room Type',
    xlabel='Room Type',
    ylabel='Guest Satisfaction Rating (1-100)',
)

# Right: cleanliness rating against overall satisfaction.
sns.scatterplot(x='cleanliness_rating', y='guest_satisfaction_overall', data=df, ax=ax_scatter)
ax_scatter.set(
    title='Cleanliness vs Overall Satisfaction',
    xlabel='Cleanliness Rating (1-10)',
    ylabel='Guest Satisfaction Rating (1-100)',
)

fig.tight_layout()
fig.savefig('plots/ratings_analysis.png')
plt.show()


In [None]:

# 1.4 Distance and Location Analysis
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
(ax_center, ax_metro), (ax_city, ax_attr) = axes

# Top row: price against the two distance columns.
sns.scatterplot(x='dist', y='realSum', data=df, ax=ax_center)
ax_center.set(
    title='Price vs Distance to City Center',
    xlabel='Distance to City Center (km)',
    ylabel='Price (USD)',
)

sns.scatterplot(x='metro_dist', y='realSum', data=df, ax=ax_metro)
ax_metro.set(
    title='Price vs Distance to Metro',
    xlabel='Distance to Metro (km)',
    ylabel='Price (USD)',
)

# Bottom left: price spread per city; tick labels are rotated for readability.
sns.boxplot(x='city', y='realSum', data=df, ax=ax_city)
ax_city.set(title='Price by City', xlabel='City', ylabel='Price (USD)')
plt.setp(ax_city.get_xticklabels(), rotation=45)

# Bottom right: price against the normalised attraction index, coloured by city.
sns.scatterplot(x='attr_index_norm', y='realSum', hue='city', data=df, ax=ax_attr)
ax_attr.set(
    title='Price vs Attraction Index',
    xlabel='Normalized Attraction Index',
    ylabel='Price (USD)',
)

fig.tight_layout()
fig.savefig('plots/location_analysis.png')
plt.show()

In [None]:
# Same location panels as the previous figure, with points coloured by room type.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
panels = axes.flat

# (panel position, x column, title, x-axis label) for the three scatter panels.
scatter_specs = [
    (0, 'dist', 'Price vs Distance to City Center by Room Type', 'Distance to City Center (km)'),
    (1, 'metro_dist', 'Price vs Distance to Metro by Room Type', 'Distance to Metro (km)'),
    (3, 'attr_index_norm', 'Price vs Attraction Index by Room Type', 'Normalized Attraction Index'),
]
for position, x_col, panel_title, x_label in scatter_specs:
    panel = panels[position]
    sns.scatterplot(x=x_col, y='realSum', hue='room_type', data=df, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Price (USD)')

# Bottom left: price spread per room type, with rotated tick labels.
box_panel = panels[2]
sns.boxplot(x='room_type', y='realSum', data=df, ax=box_panel)
box_panel.set_title('Price by Room Type')
box_panel.set_xlabel('Room Type')
box_panel.set_ylabel('Price (USD)')
plt.setp(box_panel.get_xticklabels(), rotation=45)

fig.tight_layout()
fig.savefig('plots/location_analysis_by_room_type.png')
plt.show()


In [None]:

# 1.5 Amenities and Features Analysis
# One boxplot of price per 0/1 flag column; every panel shares the same layout.
flag_panels = [
    ('host_is_superhost', 'Price by Superhost Status', 'Is Superhost'),
    ('biz', 'Price by Business Facilities', 'Has Business Facilities'),
    ('room_shared', 'Price by Shared Room Status', 'Is Shared'),
    ('room_private', 'Price by Private Room Status', 'Is Private'),
]

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
for panel, (flag_col, panel_title, x_label) in zip(axes.flat, flag_panels):
    sns.boxplot(x=flag_col, y='realSum', data=df, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Price (USD)')
    # Show the two categories as No / Yes instead of 0 / 1.
    panel.set_xticks([0, 1])
    panel.set_xticklabels(['No', 'Yes'])

fig.tight_layout()
fig.savefig('plots/amenities_analysis.png')
plt.show()

In [None]:

# 1.6 Capacity and Bedrooms Analysis
fig, (ax_capacity, ax_bedrooms, ax_heat) = plt.subplots(1, 3, figsize=(20, 6))

# Left: price spread per person_capacity value.
sns.boxplot(x='person_capacity', y='realSum', data=df, ax=ax_capacity)
ax_capacity.set(
    title='Price by Person Capacity',
    xlabel='Person Capacity',
    ylabel='Price (USD)',
)

# Middle: price spread per bedroom count.
sns.boxplot(x='bedrooms', y='realSum', data=df, ax=ax_bedrooms)
ax_bedrooms.set(
    title='Price by Number of Bedrooms',
    xlabel='Number of Bedrooms',
    ylabel='Price (USD)',
)

# Right: mean price for each (capacity, bedrooms) pair laid out as a grid;
# pairs that do not occur in the data stay empty.
capacity_bedrooms = df.groupby(['person_capacity', 'bedrooms'], as_index=False)['realSum'].mean()
pivot_table = capacity_bedrooms.pivot(index='person_capacity', columns='bedrooms', values='realSum')
sns.heatmap(pivot_table, annot=True, cmap='YlGnBu', fmt='.1f', ax=ax_heat)
ax_heat.set(
    title='Average Price by Capacity and Bedrooms',
    xlabel='Number of Bedrooms',
    ylabel='Person Capacity',
)

fig.tight_layout()
fig.savefig('plots/capacity_bedrooms_analysis.png')
plt.show()


<a id="eda-dynamic"></a>
# Dynamic 3D plots using Plotly

In [None]:

# 2. INTERACTIVE VISUALIZATIONS WITH PLOTLY

# 2.1 Price Distribution by Room Type
# Histogram of realSum coloured by room type, with a marginal box plot;
# hovering shows every column of the row.
histogram_labels = {'realSum': 'Price (USD)', 'room_type': 'Room Type'}
fig = px.histogram(
    data_frame=df,
    x='realSum',
    color='room_type',
    marginal='box',
    hover_data=df.columns,
    labels=histogram_labels,
    title='Price Distribution by Room Type',
)

# Save a standalone HTML copy, then render inline.
fig.write_html('plots/price_by_room_type_interactive.html')
fig.show()

In [None]:


# 2.2 Scatter Plot of Price vs. Distance with Multiple Features
# Marker colour encodes room type and marker size encodes person_capacity.
scatter_labels = {
    'realSum': 'Price (USD)',
    'dist': 'Distance to City Center (km)',
    'room_type': 'Room Type',
    'person_capacity': 'Capacity',
}
scatter_hover = ['city', 'guest_satisfaction_overall', 'cleanliness_rating']

fig = px.scatter(
    data_frame=df,
    x='dist',
    y='realSum',
    color='room_type',
    size='person_capacity',
    hover_data=scatter_hover,
    labels=scatter_labels,
    title='Price vs. Distance to City Center',
)

# Save a standalone HTML copy, then render inline.
fig.write_html('plots/price_vs_distance_interactive.html')
fig.show()

In [None]:

# 2.3 Interactive Map
# Each row is placed at its (lat, lng) on an OpenStreetMap base layer;
# colour encodes realSum and marker size encodes person_capacity.
map_hover = ['realSum', 'guest_satisfaction_overall', 'dist']
fig = px.scatter_mapbox(
    data_frame=df,
    lat='lat',
    lon='lng',
    color='realSum',
    size='person_capacity',
    hover_name='room_type',
    hover_data=map_hover,
    color_continuous_scale=px.colors.cyclical.IceFire,
    mapbox_style='open-street-map',
    zoom=10,
    width=1500,
    height=800,
    title='Geographic Distribution of Listings',
)

# Save a standalone HTML copy, then render inline.
fig.write_html('plots/geographic_distribution_interactive.html')
fig.show()

In [None]:

# 2.4 3D Scatter Plot for Price, Distance, and Ratings
# Axes: dist (x), guest_satisfaction_overall (y), realSum (z).
# Marker colour encodes room type and marker size encodes person_capacity.
axis_labels_3d = {
    'realSum': 'Price (USD)',
    'dist': 'Distance to City Center (km)',
    'guest_satisfaction_overall': 'Guest Satisfaction',
    'room_type': 'Room Type',
}
fig = px.scatter_3d(
    data_frame=df,
    x='dist',
    y='guest_satisfaction_overall',
    z='realSum',
    color='room_type',
    size='person_capacity',
    hover_data=['city', 'cleanliness_rating'],
    labels=axis_labels_3d,
    title='3D Relationship: Price, Distance, and Satisfaction',
    width=1200,
    height=1000,
)

# Save a standalone HTML copy, then render inline.
fig.write_html('plots/3d_relationship_interactive.html')
fig.show()

<a id="correlation"></a>
# Outlier removal and correlation analysis

In [None]:
# Columns whose extreme values are trimmed.
cols_to_filter = [
    'realSum', 'person_capacity', 'cleanliness_rating', 'guest_satisfaction_overall',
    'bedrooms', 'dist', 'metro_dist','attr_index_norm','rest_index_norm'
]

# Apply outlier filtering based on 5th and 95th percentile.
# A row is kept only if it lies inside the [5th, 95th] percentile band of EVERY
# listed column. The mask is accumulated across columns: filtering `df` afresh
# on each pass would keep only the last column's filter. Percentiles are taken
# from the unfiltered data, so the result does not depend on column order.
within_bounds = pd.Series(True, index=df.index)
for col in cols_to_filter:
    lower = df[col].quantile(0.05)
    upper = df[col].quantile(0.95)
    within_bounds &= df[col].between(lower, upper)  # inclusive on both ends

# .copy() gives the filtered frame its own data, so columns added to it later
# are not written into a slice of the original frame.
df_new = df[within_bounds].copy()

print(f"Original length of the dataset : {len(df)}")
print(f"Length of the dataset after removing outliers : {len(df_new)} ")

# Keep the unfiltered data reachable, then make the trimmed frame the working one.
# NOTE: because `df` is rebound here, re-running this cell trims the data again.
old_df = df.copy()
df = df_new

In [None]:

# 3. CORRELATION ANALYSIS

# Columns that enter the correlation matrix (the target realSum comes first).
numerical_cols = ['realSum', 'room_type_encoded', 'room_shared', 'room_private', 
                 'person_capacity', 'host_is_superhost', 'multi', 'biz',
                 'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms',
                 'dist', 'metro_dist', 'attr_index_norm',
                 'rest_index_norm', 'weekday', 'city_encoded']

# Pairwise Pearson correlations on the outlier-trimmed frame.
corr_matrix = df_new[numerical_cols].corr(method='pearson')

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# those cells only repeat the lower triangle.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

heatmap_style = dict(
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    annot=True,
    fmt='.2f',
    cbar_kws={'shrink': .8},
)

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)
ax.set_title('Correlation Matrix of Numerical Features', fontsize=16)
fig.tight_layout()
fig.savefig('plots/correlation_matrix.png')
plt.show()


In [None]:

# 4. TOP CORRELATIONS WITH PRICE

# Correlation of every feature with realSum (the target's own entry is dropped),
# ordered from most positive to most negative.
price_correlations = (
    corr_matrix['realSum']
    .drop('realSum')
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
price_correlations.plot.bar(ax=ax)
ax.set_title('Feature Correlations with Price', fontsize=16)
ax.set_xlabel('Features')
ax.set_ylabel('Correlation Coefficient')
# Faint zero line separates positive from negative correlations.
ax.axhline(y=0, color='r', linestyle='-', alpha=0.3)
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
fig.savefig('plots/price_correlations.png')
plt.show()

In [None]:

# 5. PAIRPLOT FOR KEY VARIABLES
key_vars = ['realSum','room_type_encoded', 'person_capacity', 
           'guest_satisfaction_overall', 'dist', 'bedrooms']

# pairplot builds its own figure, so no plt.figure() call precedes it
# (one would only leave an empty extra figure in the cell output).
sns.pairplot(df[key_vars], height=2.5, corner=True)
plt.suptitle('Pairplot of Key Variables', y=1.02, fontsize=16)

# The title sits above the figure area (y > 1); a tight bounding box keeps it
# inside the saved image instead of being clipped.
plt.savefig('plots/key_variables_pairplot.png', bbox_inches='tight')
plt.show()

In [None]:

# 6. ANALYSIS BY CITY

# Average price by city and room type: one row per city, one column per room type.
city_room_price = df.groupby(['city', 'room_type'])['realSum'].mean().reset_index()
pivot_city_room = city_room_price.pivot(index='city', columns='room_type', values='realSum')

# Draw onto an explicit axes. DataFrame.plot opens a new figure of its own when
# no `ax` is given, which would leave a separately created figure empty.
fig, ax = plt.subplots(figsize=(12, 6))
pivot_city_room.plot(kind='bar', ax=ax)
ax.set_title('Average Price by City and Room Type', fontsize=16)
ax.set_xlabel('City')
ax.set_ylabel('Average Price (USD)')
ax.grid(axis='y', linestyle='--', alpha=0.5)
ax.legend(title='Room Type')
fig.tight_layout()
fig.savefig('plots/city_room_type_price.png')
plt.show()

In [None]:

# 7. Feature Importance for Price (using simple linear model coefficients)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Predictors: every column of the correlation analysis except the target.
X = df[numerical_cols].drop(columns='realSum')
y = df['realSum']

# Standardise the predictors so coefficient magnitudes are comparable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Ordinary least squares on the standardised predictors.
# NOTE(review): room_type_encoded, room_shared and room_private presumably carry
# overlapping information, which would make individual coefficients unstable -
# confirm before reading too much into this ranking.
model = LinearRegression().fit(X_scaled, y)

# Importance = absolute coefficient, largest first.
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': np.abs(model.coef_)}
).sort_values('Importance', ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
ax.set_title('Feature Importance for Price Prediction', fontsize=16)
ax.set_xlabel('Absolute Coefficient Value')
ax.grid(axis='x', linestyle='--', alpha=0.5)
fig.tight_layout()
fig.savefig('plots/feature_importance.png')
plt.show()


In [None]:

# 8. SUMMARY OF FINDINGS
# Text recap built from `df`, `corr_matrix` and `price_correlations` computed above.

print("\n\nSUMMARY OF FINDINGS\n")
print("1. Price Distribution Analysis:")
print(f"   - Average Price: ${df['realSum'].mean():.2f}")
print(f"   - Median Price: ${df['realSum'].median():.2f}")
print(f"   - Price Range: ${df['realSum'].min():.2f} - ${df['realSum'].max():.2f}")

print("\n2. Top 5 Factors Correlated with Price:")
# Rank by absolute correlation so strong negative relationships are not left
# out; the signed coefficient is still what gets printed.
strongest_factors = price_correlations.loc[price_correlations.abs().nlargest(5).index]
for i, (feature, corr) in enumerate(strongest_factors.items(), 1):
    print(f"   {i}. {feature}: {corr:.3f}")

print("\n3. Key Insights:")
print("   - Room Type Impact: ", end="")
room_type_avg = df.groupby('room_type')['realSum'].mean().sort_values()
print(", ".join([f"{room}: ${price:.2f}" for room, price in room_type_avg.items()]))

# The wording depends on the sign of the price/distance correlation.
print("   - Location Effect: ", end="")
if corr_matrix['realSum']['dist'] < 0:
    print(f"Properties closer to city center tend to be more expensive (correlation: {corr_matrix['realSum']['dist']:.3f})")
else:
    print(f"Distance to city center shows unusual positive correlation with price (correlation: {corr_matrix['realSum']['dist']:.3f})")

print("   - Capacity and Price: ", end="")
capacity_corr = corr_matrix['realSum']['person_capacity']
print(f"Person capacity has {abs(capacity_corr):.3f} {'positive' if capacity_corr > 0 else 'negative'} correlation with price")

print("   - Ratings Relationship: ", end="")
rating_corr = corr_matrix['realSum']['guest_satisfaction_overall']
print(f"Guest satisfaction has {abs(rating_corr):.3f} {'positive' if rating_corr > 0 else 'negative'} correlation with price")

print("\n4. Recommendations for Feature Engineering:")
print("   - Create interaction features between room type and location")
print("   - Develop proximity scores combining metro_dist and dist")
print("   - Generate price per person ratio (realSum/person_capacity)")
print("   - Create categorical price bands for classification tasks")


<a id="ml-model"></a>
# BUILDING A MACHINE LEARNING MODEL

In [None]:

from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope
import warnings

# Silence every warning from here on.
warnings.filterwarnings('ignore')

# Define features and target variable.
# Per the author's note, the predictors were picked from the correlation analysis.
model_features = [
    'room_type_encoded', 'person_capacity', 'host_is_superhost',
    'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms',
    'dist', 'metro_dist', 'attr_index_norm', 'rest_index_norm',
    'city_encoded',
]
X = df[model_features]
y = df['realSum']

print("\nSelected features:", list(X.columns))
print("Number of features:", len(X.columns))

<a id="hyperopt"></a>
# HYPER-PARAMETER OPTIMIZATION

In [None]:
# Disabled cell: the whole hyperopt search below is wrapped in a triple-quoted
# string, so none of it executes (the cell merely evaluates to that string).
# NOTE(review): to re-run the search, remove the surrounding quotes; the code
# relies on X, y and the hyperopt / scikit-learn names imported in the
# model-setup cell.
'''
# Define a function for hyperopt to optimize
def objective(params):
    """
    Objective function for hyperopt optimization
    """
    # Extract parameters
    n_estimators = int(params['n_estimators'])
    max_depth = int(params['max_depth']) if params['max_depth'] is not None else None
    min_samples_split = float(params['min_samples_split'])
    min_samples_leaf = float(params['min_samples_leaf'])
    max_features = params['max_features']
    bootstrap = params['bootstrap']
    
    # Create RF model with the parameters
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42,
        n_jobs=-1
    )
    
    # Create 5-fold cross validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Calculate negative MAE (we want to maximize this, which means minimizing MAE)
    cv_scores = cross_val_score(rf, X, y, cv=kf, scoring='neg_mean_absolute_error')
    mean_mae = -cv_scores.mean()
    
    # Return the result to hyperopt
    return {'loss': mean_mae, 'status': STATUS_OK}

# Define the search space for hyperopt

space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 500, 10)),  # Integer, 50 to 500
    'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),           # Integer, 5 to 50
    'min_samples_split': hp.uniform('min_samples_split', 0.01, 0.5),       # Float, 0.01 to 0.5
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.01 , 0.5),         # Float, 0.01 to 0.5
    'max_features': hp.choice('max_features', [ 'sqrt', 'log2', None]),
    'bootstrap': hp.choice('bootstrap', [True, False]),
}

# Create a trials object to store the results
trials = Trials()

# Run hyperopt optimization
print("\nStarting hyperparameter optimization...")
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=25,
    trials=trials
)
'''

In [None]:
# Disabled cell: wrapped in a triple-quoted string, so nothing here executes.
# NOTE(review): before re-enabling, reconcile this decoding with the search
# space defined in the (also disabled) optimisation cell:
#   - 'max_depth_int' is not a label used in that space, so
#     best.get('max_depth_int', 10) would always fall back to 10;
#   - max_features_list here is ['auto', 'sqrt', 'log2'], whereas the space
#     offers ['sqrt', 'log2', None], so the index would decode to a different option;
#   - min_samples_split / min_samples_leaf are sampled as floats between 0.01
#     and 0.5, so int(...) would truncate them to 0.
'''
# Print the best parameters
print("\nBest hyperparameters:")
# Convert max_depth parameter
if best.get('max_depth') == 0:
    best_max_depth = None
else:
    best_max_depth = int(best.get('max_depth_int', 10))

max_features_list = ['auto', 'sqrt', 'log2']
bootstrap_list = [True, False]

best_params = {
    'n_estimators': int(best['n_estimators']),
    'max_depth': best_max_depth,
    'min_samples_split': int(best['min_samples_split']),
    'min_samples_leaf': int(best['min_samples_leaf']),
    'max_features': max_features_list[best['max_features']],
    'bootstrap': bootstrap_list[best['bootstrap']]
}

for param, value in best_params.items():
    print(f"{param}: {value}")
'''

In [None]:

# Train the best model
from sklearn.model_selection import cross_validate

# Hyper-parameters are hard-coded here.
# NOTE(review): presumably copied from an earlier hyperopt run - the search
# cells are currently disabled, so this cannot be confirmed from the notebook.
best_rf = RandomForestRegressor(
    n_estimators=130,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=5,
    max_features='sqrt',
    bootstrap=False,
    random_state=42,
    n_jobs=-1
)

# Perform 5-fold cross-validation with the best model.
# All three metrics are scored in a single pass, so each fold is fitted once
# rather than once per metric; splits and model seed are fixed, so the
# per-fold scores match separate per-metric runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    best_rf, X, y, cv=kf,
    scoring={
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error',
        'r2': 'r2',
    },
)

# The error scorers return negated values; flip the sign back.
mae_scores = -cv_results['test_mae']
rmse_scores = np.sqrt(-cv_results['test_mse'])
r2_scores = cv_results['test_r2']

# Print cross-validation results
print("\n5-Fold Cross-Validation Results:")
print(f"Mean Absolute Error (MAE): {mae_scores.mean():.2f} ± {mae_scores.std():.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_scores.mean():.2f} ± {rmse_scores.std():.2f}")
print(f"R² Score: {r2_scores.mean():.4f} ± {r2_scores.std():.4f}")

In [None]:

# Fit the best model on the entire dataset
best_rf.fit(X, y)

# Importances reported by the fitted forest, largest first.
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': best_rf.feature_importances_})
    .sort_values('Importance', ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importances, ax=ax)
ax.set_title('Random Forest Feature Importance', fontsize=16)
ax.set_xlabel('Importance')
fig.tight_layout()
fig.savefig('plots/feature_importance_rf.png')
plt.show()


In [None]:
# Numbered listing of the features, already sorted by importance.
print("\nFeature Importance Ranking:")
for rank, entry in enumerate(feature_importances.itertuples(index=False), start=1):
    print(f"{rank}. {entry.Feature}: {entry.Importance:.4f}")


<a id="evaluate"></a>
## Model Evaluation

In [None]:

# Make predictions on the dataset for visualization
y_pred = best_rf.predict(X)

# Actual vs predicted prices; the dashed red diagonal marks perfect prediction.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(y, y_pred, alpha=0.5)
price_span = [y.min(), y.max()]
ax.plot(price_span, price_span, 'r--')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Prices')
fig.tight_layout()
fig.savefig('plots/actual_vs_predicted.png')
plt.show()


In [None]:


# Calculate error metrics on the entire dataset.
# These are in-sample figures: y_pred comes from best_rf predicting the same X
# it was fitted on, so expect them to look better than the cross-validation results.
mae = mean_absolute_error(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))
r2 = r2_score(y, y_pred)

print("\nFinal Model Performance (on full dataset):")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

# Calculate error by price range.
# Work on an independent copy so the new columns are not written into a
# filtered slice of another frame.
df = df.copy()
df['predicted_price'] = y_pred
df['absolute_error'] = np.abs(df['realSum'] - df['predicted_price'])
df['percent_error'] = 100 * df['absolute_error'] / df['realSum']

# Right-closed bins: (0, 100], (100, 200], ... ; prices outside (0, 10000] get no bin.
price_bins = [0, 100, 200, 500, 1000, 10000]
df['price_bin'] = pd.cut(df['realSum'], bins=price_bins)

# observed=False is spelled out so every bin appears in the table (empty ones
# as NaN) regardless of the pandas version's default for categorical groupers.
error_by_price_range = (
    df.groupby('price_bin', observed=False)[['absolute_error', 'percent_error']].mean()
)
print("\nError by Price Range:")
print(error_by_price_range)

# Plot error by price range: one bar chart per error measure.
error_charts = [
    ('absolute_error', 'skyblue', 'Mean Absolute Error by Price Range',
     'Mean Absolute Error', 'plots/error_by_price_range.png'),
    ('percent_error', 'salmon', 'Mean Percentage Error by Price Range',
     'Mean Percentage Error (%)', 'plots/percent_error_by_price_range.png'),
]
for error_col, bar_color, chart_title, y_label, out_path in error_charts:
    plt.figure(figsize=(12, 6))
    error_by_price_range[error_col].plot(kind='bar', color=bar_color)
    plt.title(chart_title)
    plt.ylabel(y_label)
    plt.xlabel('Price Range')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.show()
