In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [8]:
# Read the listings CSV. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-dtype inference within a column.
listings_csv = '~/Capstone/data/airbnb-listings_cleaned.csv'
df = pd.read_csv(listings_csv, low_memory=False)

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [3]:
# Encode every categorical (object-dtype) column as the mean Price of the
# category each row belongs to, so that it can take part in a numeric correlation.
# NOTE(review): the category means are computed on the full dataset, before any
# train/validation/test split. df_le is later used as the model input, so the
# encoded columns contain target information from the held-out rows (target
# leakage). Consider fitting the encoding on the training split only.
categorical_cols = df.select_dtypes(include=['object']).columns
df_le = df.copy()
for col in categorical_cols:
    df_le[col] = df_le.groupby(col)['Price'].transform('mean')

# Convert datetime columns to integer format (seconds since epoch).
# NOTE(review): the CSV is read without parse_dates, so there may be no
# datetime64 columns and this loop may be a no-op -- confirm.
datetime_column = df.select_dtypes(include=['datetime64']).columns
for col in datetime_column:
    df_le[col] = df_le[col].astype('int64') // 10**9

# Compute the full pairwise correlation matrix of the encoded frame.
corr_matrix = df_le.corr()

# Absolute correlation of every variable with the target, strongest first.
price_corr = corr_matrix['Price'].abs().sort_values(ascending=False)

# Top 30 correlations excluding the target itself. Dropping 'Price' by label
# (instead of slicing off position 0) stays correct even if another column
# ties with the target's self-correlation of 1.0 and sorts ahead of it.
top_30_corr = price_corr.drop('Price').head(30)

# Plot the correlation bar chart. The values are absolute correlations, so the
# axis label says so.
plt.figure(figsize=(12, 8))
top_30_corr.plot(kind='barh', color='lightcoral')
plt.title('Top 30 Correlations with Price', fontsize=16)
plt.xlabel('Absolute correlation', fontsize=12)
plt.ylabel('Variables', fontsize=12)
plt.show()

NameError: name 'df' is not defined

#### Feature Selection

In [5]:
# Selected predictor columns, taken from the encoded frame df_le (any
# object-dtype column among them was replaced there by its per-category mean
# Price). The target is the Price column.
sf = [
    'Amenities',
    'Street',
    'Neighbourhood Cleansed',
    'Host Name',
    'Bedrooms',
    'Accommodates',
    'Room Type',
    'Bathrooms',
]
X, y = df_le[sf], df_le['Price']

#### First Split

In [6]:
# Hold out 10% of the rows as the test set; the remaining 90% is kept together
# as the train+validation pool. The seed makes the split repeatable.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

#### Second Split

In [7]:
# Carve the validation set out of the train+validation pool. 0.1111 of the
# remaining 90% is roughly 10% of all rows, giving an approximate 80/10/10 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.1111,
    random_state=42,
)

#### Linear Regression Model Application and Evaluation

In [8]:
# Linear regression estimator with scikit-learn's default settings; it is
# re-fitted in the cross-validation loop and again on the full training split.
lm_model = LinearRegression()

#### K-Fold Cross Validation

In [9]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [10]:
# K-fold cross-validation of the linear model on the training split only.
# A fresh scaler is fitted inside every fold, so the held-out fold never
# influences the scaling statistics.
# Fold-level scaled arrays use their own names (X_train_f_s / X_val_f_s) so this
# loop does not overwrite the notebook-level X_train_s / X_val_s with
# fold-sized arrays that other cells fit and evaluate on.
cv_r = []
for fold, (train_i, val_i) in enumerate(kf.split(X_train), 1):
    X_train_f, X_val_f = X_train.iloc[train_i], X_train.iloc[val_i]
    y_train_f, y_val_f = y_train.iloc[train_i], y_train.iloc[val_i]

    fold_scaler = StandardScaler()
    X_train_f_s = fold_scaler.fit_transform(X_train_f)
    X_val_f_s = fold_scaler.transform(X_val_f)

    lm_model.fit(X_train_f_s, y_train_f)
    y_pred = lm_model.predict(X_val_f_s)

    mse = mean_squared_error(y_val_f, y_pred)
    mae = mean_absolute_error(y_val_f, y_pred)
    r2 = r2_score(y_val_f, y_pred)

    # One record per fold; averaged in the next cell.
    cv_r.append({'Fold': fold, 'MSE': mse, 'MAE': mae, 'R2': r2})

    print(f"Fold {fold} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

Fold 1 - MSE: 2978.3434, MAE: 27.2430, R2: 0.8657
Fold 2 - MSE: 2896.5145, MAE: 27.1349, R2: 0.8689
Fold 3 - MSE: 2952.9891, MAE: 27.3699, R2: 0.8659
Fold 4 - MSE: 2959.9590, MAE: 27.3386, R2: 0.8654
Fold 5 - MSE: 3038.9513, MAE: 27.3794, R2: 0.8649


In [11]:
# Average each metric over the per-fold records collected in cv_r.
avg_mse, avg_mae, avg_r2 = (
    np.mean([fold_result[metric] for fold_result in cv_r])
    for metric in ('MSE', 'MAE', 'R2')
)
print(f"\nAverage CV - MSE: {avg_mse:.4f}, MAE: {avg_mae:.4f}, R2: {avg_r2:.4f}")


Average CV - MSE: 2965.3514, MAE: 27.2932, R2: 0.8662


#### Train model on whole train set and evaluate on validation set

In [12]:
# Fit the scaler on the full training split only, then apply those same
# statistics to both the training and the validation features.
s_2 = StandardScaler().fit(X_train)
X_train_s = s_2.transform(X_train)
X_val_s = s_2.transform(X_val)

In [13]:
# Re-fit the linear model on the whole scaled training split and predict the
# validation split (fit() returns the estimator, so the calls can be chained).
y_pred_val = lm_model.fit(X_train_s, y_train).predict(X_val_s)

In [14]:
# Error metrics of the validation predictions against the true validation prices.
mse, mae, r2 = (
    mean_squared_error(y_val, y_pred_val),
    mean_absolute_error(y_val, y_pred_val),
    r2_score(y_val, y_pred_val),
)
print(f"\nValidation Results - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Validation Results - MSE: 3033.0690, MAE: 27.3368, R2: 0.8585


#### Final Evaluation on the Test Set

In [15]:
# Scale the held-out test features with the already-fitted scaler s_2 (it is
# only applied here, not re-fitted), then predict with the linear model.
X_test_s = s_2.transform(X_test)
y_pred_test = lm_model.predict(X_test_s)

In [16]:
# Error metrics of the test predictions against the true test prices.
mse_t, mae_t, r2_t = (
    mean_squared_error(y_test, y_pred_test),
    mean_absolute_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)
print(f"\nTest Results - MSE: {mse_t:.4f}, MAE: {mae_t:.4f}, R2: {r2_t:.4f}")


Test Results - MSE: 2977.0707, MAE: 27.2840, R2: 0.8653


#### Random Forest Regression Model Application and Evaluation

In [17]:
# Random forest with 50 trees, a fixed seed for repeatable results, and
# n_jobs=-1 to use all available CPU cores.
rf_model = RandomForestRegressor(
    n_estimators=50,
    random_state=42,
    n_jobs=-1,
)

#### K-Fold Cross Validation

In [18]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [19]:
# K-fold cross-validation of the random forest on the training split only.
# A fresh scaler is fitted inside every fold, so the held-out fold never
# influences the scaling statistics.
# Fold-level scaled arrays use their own names (X_train_f_s / X_val_f_s) so this
# loop does not overwrite the notebook-level X_train_s / X_val_s with
# fold-sized arrays that other cells fit and evaluate on.
cv_r = []
for fold, (train_i, val_i) in enumerate(kf.split(X_train), 1):
    X_train_f, X_val_f = X_train.iloc[train_i], X_train.iloc[val_i]
    y_train_f, y_val_f = y_train.iloc[train_i], y_train.iloc[val_i]

    fold_scaler = StandardScaler()
    X_train_f_s = fold_scaler.fit_transform(X_train_f)
    X_val_f_s = fold_scaler.transform(X_val_f)

    rf_model.fit(X_train_f_s, y_train_f)
    y_pred = rf_model.predict(X_val_f_s)

    mse = mean_squared_error(y_val_f, y_pred)
    mae = mean_absolute_error(y_val_f, y_pred)
    r2 = r2_score(y_val_f, y_pred)

    # One record per fold; averaged in the next cell.
    cv_r.append({'Fold': fold, 'MSE': mse, 'MAE': mae, 'R2': r2})

    print(f"Fold {fold} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

Fold 1 - MSE: 1881.2501, MAE: 15.3614, R2: 0.9152
Fold 2 - MSE: 1837.6162, MAE: 15.4155, R2: 0.9168
Fold 3 - MSE: 1865.8119, MAE: 15.4441, R2: 0.9153
Fold 4 - MSE: 1887.7830, MAE: 15.5338, R2: 0.9142
Fold 5 - MSE: 1973.1052, MAE: 15.4727, R2: 0.9123


In [20]:
# Average each metric over the per-fold records collected in cv_r.
avg_mse, avg_mae, avg_r2 = (
    np.mean([fold_result[metric] for fold_result in cv_r])
    for metric in ('MSE', 'MAE', 'R2')
)
print(f"\nAverage CV - MSE: {avg_mse:.4f}, MAE: {avg_mae:.4f}, R2: {avg_r2:.4f}")


Average CV - MSE: 1889.1133, MAE: 15.4455, R2: 0.9147


#### Train model on whole train set and evaluate on validation set

In [21]:
# Fit the scaler on the full training split only, then apply those same
# statistics to both the training and the validation features.
s_2 = StandardScaler().fit(X_train)
X_train_s = s_2.transform(X_train)
X_val_s = s_2.transform(X_val)

In [22]:
# Re-fit the random forest on the whole scaled training split and predict the
# validation split (fit() returns the estimator, so the calls can be chained).
y_pred_val = rf_model.fit(X_train_s, y_train).predict(X_val_s)

In [23]:
# Error metrics of the validation predictions against the true validation prices.
mse, mae, r2 = (
    mean_squared_error(y_val, y_pred_val),
    mean_absolute_error(y_val, y_pred_val),
    r2_score(y_val, y_pred_val),
)
print(f"\nValidation Results - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Validation Results - MSE: 1840.5930, MAE: 15.1676, R2: 0.9141


#### Final Evaluation on the Test Set

In [24]:
# Scale the held-out test features with the already-fitted scaler s_2 (it is
# only applied here, not re-fitted), then predict with the random forest.
X_test_s = s_2.transform(X_test)
y_pred_test = rf_model.predict(X_test_s)

In [25]:
# Error metrics of the test predictions against the true test prices.
mse_t, mae_t, r2_t = (
    mean_squared_error(y_test, y_pred_test),
    mean_absolute_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)
print(f"\nTest Results - MSE: {mse_t:.4f}, MAE: {mae_t:.4f}, R2: {r2_t:.4f}")


Test Results - MSE: 1837.2659, MAE: 15.0533, R2: 0.9168


#### Gradient Boosting Regression Model Application and Evaluation

In [26]:
# Gradient boosting regressor with library-default hyperparameters.
# random_state is fixed (same seed as the splits and the random forest) so that
# repeated runs of the notebook build the same trees and report the same metrics;
# without it the estimator draws a fresh seed on every fit.
gb_model = GradientBoostingRegressor(random_state=42)

#### K-Fold Cross Validation

In [27]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [28]:
# K-fold cross-validation of the gradient boosting model on the training split
# only. A fresh scaler is fitted inside every fold, so the held-out fold never
# influences the scaling statistics.
# Fold-level scaled arrays use their own names (X_train_f_s / X_val_f_s) so this
# loop does not overwrite the notebook-level X_train_s / X_val_s with
# fold-sized arrays that other cells fit and evaluate on.
cv_r = []
for fold, (train_i, val_i) in enumerate(kf.split(X_train), 1):
    X_train_f, X_val_f = X_train.iloc[train_i], X_train.iloc[val_i]
    y_train_f, y_val_f = y_train.iloc[train_i], y_train.iloc[val_i]

    fold_scaler = StandardScaler()
    X_train_f_s = fold_scaler.fit_transform(X_train_f)
    X_val_f_s = fold_scaler.transform(X_val_f)

    gb_model.fit(X_train_f_s, y_train_f)
    y_pred = gb_model.predict(X_val_f_s)

    mse = mean_squared_error(y_val_f, y_pred)
    mae = mean_absolute_error(y_val_f, y_pred)
    r2 = r2_score(y_val_f, y_pred)

    # One record per fold; averaged in the next cell.
    cv_r.append({'Fold': fold, 'MSE': mse, 'MAE': mae, 'R2': r2})

    print(f"Fold {fold} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

Fold 1 - MSE: 2147.7309, MAE: 20.9766, R2: 0.9032
Fold 2 - MSE: 2086.8547, MAE: 21.0400, R2: 0.9055
Fold 3 - MSE: 2119.3647, MAE: 21.1550, R2: 0.9038
Fold 4 - MSE: 2134.9786, MAE: 20.9059, R2: 0.9029
Fold 5 - MSE: 2216.8537, MAE: 21.2301, R2: 0.9014


In [29]:
# Average each metric over the per-fold records collected in cv_r.
avg_mse, avg_mae, avg_r2 = (
    np.mean([fold_result[metric] for fold_result in cv_r])
    for metric in ('MSE', 'MAE', 'R2')
)
print(f"\nAverage CV - MSE: {avg_mse:.4f}, MAE: {avg_mae:.4f}, R2: {avg_r2:.4f}")


Average CV - MSE: 2141.1565, MAE: 21.0615, R2: 0.9034


#### Train model on whole train set and evaluate on validation set

In [30]:
# Fit the scaler on the full training split only, then apply those same
# statistics to both the training and the validation features.
s_2 = StandardScaler().fit(X_train)
X_train_s = s_2.transform(X_train)
X_val_s = s_2.transform(X_val)

In [31]:
# Re-fit the gradient boosting model on the whole scaled training split and
# predict the validation split (fit() returns the estimator, so the calls chain).
y_pred_val = gb_model.fit(X_train_s, y_train).predict(X_val_s)

In [32]:
# Error metrics of the validation predictions against the true validation prices.
mse, mae, r2 = (
    mean_squared_error(y_val, y_pred_val),
    mean_absolute_error(y_val, y_pred_val),
    r2_score(y_val, y_pred_val),
)
print(f"\nValidation Results - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Validation Results - MSE: 2199.6697, MAE: 21.1132, R2: 0.8974


#### Final Evaluation on the Test Set

In [33]:
# Scale the held-out test features with the already-fitted scaler s_2 (it is
# only applied here, not re-fitted), then predict with the gradient boosting model.
X_test_s = s_2.transform(X_test)
y_pred_test = gb_model.predict(X_test_s)

In [34]:
# Error metrics of the test predictions against the true test prices.
mse_t, mae_t, r2_t = (
    mean_squared_error(y_test, y_pred_test),
    mean_absolute_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)
print(f"\nTest Results - MSE: {mse_t:.4f}, MAE: {mae_t:.4f}, R2: {r2_t:.4f}")


Test Results - MSE: 2168.9048, MAE: 20.9695, R2: 0.9018


#### K Nearest Neighbors Regression Model Application and Evaluation

In [35]:
# K-nearest-neighbours regressor with scikit-learn's default settings (no
# hyperparameters are overridden here).
knn_model = KNeighborsRegressor()

#### K-Fold Cross Validation

In [36]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are repeatable.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [37]:
# K-fold cross-validation of the KNN model on the training split only.
# A fresh scaler is fitted inside every fold, so the held-out fold never
# influences the scaling statistics.
# Fold-level scaled arrays use their own names (X_train_f_s / X_val_f_s) so this
# loop does not overwrite the notebook-level X_train_s / X_val_s with
# fold-sized arrays that other cells fit and evaluate on.
cv_r = []
for fold, (train_i, val_i) in enumerate(kf.split(X_train), 1):
    X_train_f, X_val_f = X_train.iloc[train_i], X_train.iloc[val_i]
    y_train_f, y_val_f = y_train.iloc[train_i], y_train.iloc[val_i]

    fold_scaler = StandardScaler()
    X_train_f_s = fold_scaler.fit_transform(X_train_f)
    X_val_f_s = fold_scaler.transform(X_val_f)

    knn_model.fit(X_train_f_s, y_train_f)
    y_pred = knn_model.predict(X_val_f_s)

    mse = mean_squared_error(y_val_f, y_pred)
    mae = mean_absolute_error(y_val_f, y_pred)
    r2 = r2_score(y_val_f, y_pred)

    # One record per fold; averaged in the next cell.
    cv_r.append({'Fold': fold, 'MSE': mse, 'MAE': mae, 'R2': r2})

    print(f"Fold {fold} - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

Fold 1 - MSE: 2303.0381, MAE: 20.8901, R2: 0.8962
Fold 2 - MSE: 2260.1015, MAE: 20.8061, R2: 0.8977
Fold 3 - MSE: 2308.7543, MAE: 20.9020, R2: 0.8952
Fold 4 - MSE: 2318.1488, MAE: 21.0389, R2: 0.8946
Fold 5 - MSE: 2365.2770, MAE: 20.9024, R2: 0.8948


In [38]:
# Average each metric over the per-fold records collected in cv_r.
avg_mse, avg_mae, avg_r2 = (
    np.mean([fold_result[metric] for fold_result in cv_r])
    for metric in ('MSE', 'MAE', 'R2')
)
print(f"\nAverage CV - MSE: {avg_mse:.4f}, MAE: {avg_mae:.4f}, R2: {avg_r2:.4f}")


Average CV - MSE: 2311.0639, MAE: 20.9079, R2: 0.8957


#### Train model on whole train set and evaluate on validation set

In [39]:
# Fit the scaler on the full training split only, then apply those same
# statistics to both the training and the validation features.
s_2 = StandardScaler().fit(X_train)
X_train_s = s_2.transform(X_train)
X_val_s = s_2.transform(X_val)

In [40]:
# Re-fit the KNN model on the whole scaled training split and predict the
# validation split (fit() returns the estimator, so the calls can be chained).
y_pred_val = knn_model.fit(X_train_s, y_train).predict(X_val_s)

In [42]:
# Error metrics of the validation predictions against the true validation prices.
mse, mae, r2 = (
    mean_squared_error(y_val, y_pred_val),
    mean_absolute_error(y_val, y_pred_val),
    r2_score(y_val, y_pred_val),
)
print(f"\nValidation Results - MSE: {mse:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")


Validation Results - MSE: 2348.9617, MAE: 20.7650, R2: 0.8904


#### Final Evaluation on the Test Set

In [43]:
# Scale the held-out test features with the already-fitted scaler s_2 (it is
# only applied here, not re-fitted), then predict with the KNN model.
X_test_s = s_2.transform(X_test)
y_pred_test = knn_model.predict(X_test_s)

In [44]:
# Error metrics of the test predictions against the true test prices.
mse_t, mae_t, r2_t = (
    mean_squared_error(y_test, y_pred_test),
    mean_absolute_error(y_test, y_pred_test),
    r2_score(y_test, y_pred_test),
)
print(f"\nTest Results - MSE: {mse_t:.4f}, MAE: {mae_t:.4f}, R2: {r2_t:.4f}")


Test Results - MSE: 2361.1094, MAE: 20.5758, R2: 0.8931
