# Airbnb Property Listing - Regression Modelling with Numerical Data

## Library Imports

In [12]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import data_cleaning

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## Import Data

In [2]:
# Read the listings CSV straight out of the archive. Both the archive and the
# member handle are context-managed and the CSV is parsed while they are still
# open, so no file handle is left dangling once the cell has finished.
with ZipFile('./airbnb-property-listings.zip') as myzip:
    with myzip.open("AirbnbDataSci/tabular_data/AirBnbData.csv") as data:
        data_df = pd.read_csv(data)

# Run the project's cleaning routine and preview the first rows of the result.
cleaned_data_df = data_df.pipe(data_cleaning.clean_tabular_data)
cleaned_data_df.head()

Unnamed: 0,id,category,title,description,amenities,location,guests,beds,bathrooms,price_night,cleanliness_rate,accuracy_rate,communication_rate,location_rate,check-in_rate,value_rate,amenities_count,url,bedrooms
0,f9dcbd09-32ac-41d9-a0b1-fdb2793378cf,Treehouses,Red Kite Tree Tent - Ynys Affalon,Escape to one of these two fabulous Tree Tents...,"['What this place offers', 'Bathroom', 'Shampo...",Llandrindod Wells United Kingdom,2.0,1.0,1.0,105.0,4.6,4.7,4.3,5.0,4.3,4.3,13.0,https://www.airbnb.co.uk/rooms/26620994?adults...,1.0
1,1b4736a7-e73e-45bc-a9b5-d3e7fcf652fd,Treehouses,Az Alom Cabin - Treehouse Tree to Nature Cabin,Come and spend a romantic stay with a couple o...,"['What this place offers', 'Bedroom and laundr...",Guyonvelle Grand Est France,3.0,3.0,0.0,92.0,4.3,4.7,4.6,4.9,4.7,4.5,8.0,https://www.airbnb.co.uk/rooms/27055498?adults...,1.0
2,d577bc30-2222-4bef-a35e-a9825642aec4,Treehouses,Cabane Entre Les Pins\n🌲🏕️🌲,"Rustic cabin between the pines, 3 meters high ...","['What this place offers', 'Scenic views', 'Ga...",Duclair Normandie France,4.0,2.0,1.5,52.0,4.2,4.6,4.8,4.8,4.8,4.7,51.0,https://www.airbnb.co.uk/rooms/51427108?adults...,1.0
3,ca9cbfd4-7798-4e8d-8c17-d5a64fba0abc,Treehouses,Tree Top Cabin with log burner & private hot tub,The Tree top cabin is situated in our peaceful...,"['What this place offers', 'Bathroom', 'Hot wa...",Barmouth Wales United Kingdom,2.0,1.0,1.0,132.0,4.8,4.9,4.9,4.9,5.0,4.6,23.0,https://www.airbnb.co.uk/rooms/49543851?adults...,1.0
5,cfe479b9-c8f8-44af-9bc6-46ede9f14bb5,Treehouses,Treehouse near Paris Disney,"Charming cabin nestled in the leaves, real unu...","['What this place offers', 'Bathroom', 'Hair d...",Le Plessis-Feu-Aussoux Île-de-France France,4.0,3.0,1.0,143.0,5.0,4.9,5.0,4.7,5.0,4.7,32.0,https://www.airbnb.co.uk/rooms/935398?adults=1...,2.0


In [3]:
# Count missing values per column, then total them across the whole frame.
missing_per_column = cleaned_data_df.isna().sum()
missing_per_column.sum()

0

## Baseline Model

In [4]:
# Features: every numeric column except the target. Target: the `price_night` column.
numeric_columns = cleaned_data_df.select_dtypes(include='number')
X = numeric_columns.drop(columns=['price_night'])
y = cleaned_data_df['price_night']

In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Linear Regression with No Scaling

In [8]:
# Split the training portion once more so a validation set is available for
# comparing candidate models.
X_train_train, X_valid, y_train_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Linear regression on the raw (unscaled) features, scored on the validation split.
linear_regression = LinearRegression().fit(X_train_train, y_train_train)
y_valid_pred = linear_regression.predict(X_valid)
linear_regression_mse = mean_squared_error(y_valid, y_valid_pred)
linear_regression_mse

13371.730585482395

### Linear Regression with Standard Scaler

In [10]:
# Fit the standard scaler on the training split only, then reuse the fitted
# scaler on the validation split before scoring.
std_scaler = StandardScaler().fit(X_train_train)
X_train_train_std_scale = std_scaler.transform(X_train_train)

linear_regression_std_scale = LinearRegression().fit(X_train_train_std_scale, y_train_train)

X_valid_std_scale = std_scaler.transform(X_valid)
y_valid_pred_std_scale = linear_regression_std_scale.predict(X_valid_std_scale)
linear_regression_std_scale_mse = mean_squared_error(y_valid, y_valid_pred_std_scale)
linear_regression_std_scale_mse

13371.730585482373

### Linear Regression with Min-Max Scaler

In [11]:
# Fit the min-max scaler on the training split only, then reuse the fitted
# scaler on the validation split before scoring.
min_max_scaler = MinMaxScaler().fit(X_train_train)
X_train_train_min_max_scale = min_max_scaler.transform(X_train_train)

linear_regression_min_max_scale = LinearRegression().fit(X_train_train_min_max_scale, y_train_train)

X_valid_min_max_scale = min_max_scaler.transform(X_valid)
y_valid_pred_min_max_scale = linear_regression_min_max_scale.predict(X_valid_min_max_scale)
linear_regression_min_max_scale_mse = mean_squared_error(y_valid, y_valid_pred_min_max_scale)
linear_regression_min_max_scale_mse

13371.730585482375

### SGD Regressor with No Scaling

In [13]:
# SGD on the raw (unscaled) features, scored on the validation split.
# SGDRegressor shuffles the training data, so without a seed the fitted
# coefficients (and this MSE) differ on every run. It is seeded with the same
# value as the train/test splits so the cell is reproducible; re-running will
# therefore give a different number from any output recorded without a seed.
sgd_regression = SGDRegressor(random_state=42)
sgd_regression.fit(X_train_train, y_train_train)
sgd_regression_mse = mean_squared_error(y_valid, sgd_regression.predict(X_valid))
sgd_regression_mse

5.598455051690578e+20

### SGD Regressor with Standard Scaler

In [14]:
# Fit the standard scaler on the training split only; the same fitted scaler
# transforms the validation split before scoring.
std_scaler = StandardScaler()
X_train_train_std_scale = std_scaler.fit_transform(X_train_train)

# SGDRegressor shuffles the training data, so it is seeded (same value as the
# train/test splits) to make this MSE reproducible; re-running will give a
# different number from any output recorded without a seed.
sgd_regression_std_scale = SGDRegressor(random_state=42)
sgd_regression_std_scale.fit(X_train_train_std_scale, y_train_train)
sgd_regression_std_scale_mse = mean_squared_error(y_valid, sgd_regression_std_scale.predict(std_scaler.transform(X_valid)))
sgd_regression_std_scale_mse

13385.845975513213

### SGD Regressor with Min-Max Scaler

In [15]:
# Fit the min-max scaler on the training split only; the same fitted scaler
# transforms the validation split before scoring.
min_max_scaler = MinMaxScaler()
X_train_train_min_max_scale = min_max_scaler.fit_transform(X_train_train)

# SGDRegressor shuffles the training data, so it is seeded (same value as the
# train/test splits) to make this MSE reproducible; re-running will give a
# different number from any output recorded without a seed.
sgd_regression_min_max_scale = SGDRegressor(random_state=42)
sgd_regression_min_max_scale.fit(X_train_train_min_max_scale, y_train_train)
sgd_regression_min_max_scale_mse = mean_squared_error(y_valid, sgd_regression_min_max_scale.predict(min_max_scaler.transform(X_valid)))
sgd_regression_min_max_scale_mse

15793.375873913477

So the best baseline validation MSE is about 13371, achieved by Linear Regression; scaling the features leaves that score unchanged.

In [16]:
# Score the unscaled linear model on the data it was fitted on, so the result
# can be compared with its validation MSE.
y_train_train_pred = linear_regression.predict(X_train_train)
linear_regression_train_mse = mean_squared_error(y_train_train, y_train_train_pred)
linear_regression_train_mse

8867.150982054105

The training MSE (about 8867) is well below the validation MSE (about 13371), so the best model so far is over-fitting.