# Airbnb NYC Linear regression

source: https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data

author: Elvira Dzhuraeva

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Switch on seaborn's default plot styling for every figure below.
sns.set()

In [2]:
# Read the Kaggle export; the listing "id" column becomes the row index.
nyc_airbnb_df = pd.read_csv(
    'AB_NYC_2019.csv',
    index_col="id",
)
nyc_airbnb_df.head()

Unnamed: 0_level_0,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
# (rows, columns) of the raw listings table.
nyc_airbnb_df.shape

(48895, 15)

In [4]:
# Missing values per column. `isnull` and `isna` are aliases and only the last
# expression of a cell is displayed, so one call is sufficient.
nyc_airbnb_df.isna().sum()

name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [5]:
# Remove listings with a price of 0 (the target is log-transformed further down,
# and log(0) is -inf). `.copy()` detaches the result from the raw frame so the
# later in-place edits and column assignments do not raise SettingWithCopyWarning.
nyc_airbnb_df = nyc_airbnb_df[nyc_airbnb_df.price != 0].copy()

In [6]:
# Keep only rows without any missing value. This removes roughly 10k listings
# whose review fields (last_review / reviews_per_month) are empty.
# Reassigning instead of `inplace=True` avoids mutating a filtered frame, which
# pandas reports as SettingWithCopyWarning.
nyc_airbnb_df = nyc_airbnb_df.dropna(how='any')

In [7]:
# Rows that repeat an earlier row in every column (empty result = no duplicates).
duplicates = nyc_airbnb_df.loc[nyc_airbnb_df.duplicated()]
duplicates

Unnamed: 0_level_0,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1


In [8]:
# Column dtypes and non-null counts after the cleaning steps above.
nyc_airbnb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38811 entries, 2539 to 36455809
Data columns (total 15 columns):
name                              38811 non-null object
host_id                           38811 non-null int64
host_name                         38811 non-null object
neighbourhood_group               38811 non-null object
neighbourhood                     38811 non-null object
latitude                          38811 non-null float64
longitude                         38811 non-null float64
room_type                         38811 non-null object
price                             38811 non-null int64
minimum_nights                    38811 non-null int64
number_of_reviews                 38811 non-null int64
last_review                       38811 non-null object
reviews_per_month                 38811 non-null float64
calculated_host_listings_count    38811 non-null int64
availability_365                  38811 non-null int64
dtypes: float64(3), int64(6), object(6)
memory u

In [9]:
# Summary statistics of the numeric columns (note the long right tails in
# price and minimum_nights).
nyc_airbnb_df.describe()

Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,38811.0,38811.0,38811.0,38811.0,38811.0,38811.0,38811.0,38811.0,38811.0
mean,64246110.0,40.728134,-73.951157,142.369199,5.868723,29.288088,1.373135,5.166757,114.881631
std,75903710.0,0.054992,0.046695,197.006883,17.390315,48.184653,1.680276,26.306326,129.535406
min,2438.0,40.50641,-74.24442,10.0,1.0,1.0,0.01,1.0,0.0
25%,7028184.0,40.68864,-73.98247,69.0,1.0,3.0,0.19,1.0,0.0
50%,28370920.0,40.72171,-73.95481,101.0,2.0,9.0,0.72,1.0,55.0
75%,101887200.0,40.762995,-73.93503,170.0,4.0,33.0,2.02,2.0,229.0
max,273841700.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [10]:
import math

def haversine(colms, center_lat=40.71813, center_lon=-73.95677, radius=6372800):
    """Great-circle (haversine) distance from a listing to a reference point.

    Parameters
    ----------
    colms : object
        Row-like object exposing ``latitude`` and ``longitude`` attributes in
        decimal degrees, e.g. the row passed by ``DataFrame.apply(..., axis=1)``.
    center_lat, center_lon : float, optional
        Reference point in decimal degrees. The defaults are the values that
        were previously hard-coded in this function.
        NOTE(review): they differ from the Midtown Manhattan coordinates
        mentioned in the original inline comment (40.7549, -73.9840) - confirm
        which reference point is intended.
    radius : float, optional
        Sphere radius; the default is the Earth radius in meters, so the
        result is in meters.

    Returns
    -------
    float
        Distance between the two points, in the unit of ``radius``.
    """
    lat, lon = colms.latitude, colms.longitude

    phi1, phi2 = math.radians(center_lat), math.radians(lat)
    dphi = math.radians(lat - center_lat)
    dlambda = math.radians(lon - center_lon)

    a = math.sin(dphi / 2) ** 2 + \
        math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2) ** 2

    # Rounding can push `a` marginally above 1 for near-antipodal points, which
    # would make sqrt(1 - a) raise. The comparison is False for NaN, so missing
    # coordinates still propagate as NaN.
    if a > 1.0:
        a = 1.0

    return 2 * radius * math.atan2(math.sqrt(a), math.sqrt(1 - a))

In [11]:
# Distance (meters) of every listing from the reference point, computed row by row.
nyc_airbnb_df['distance_center'] = nyc_airbnb_df.apply(haversine, axis='columns')

In [12]:
# Identifiers, free text, the review date and the raw location columns are not
# used as model features (location is represented by `distance_center`).
unused_columns = [
    'name',
    'host_id',
    'host_name',
    'last_review',
    'neighbourhood',
    'latitude',
    'longitude',
]
nyc_airbnb_df.drop(columns=unused_columns, inplace=True)

In [13]:
# Remaining columns after feature selection, including the new distance_center.
nyc_airbnb_df.head()

Unnamed: 0_level_0,neighbourhood_group,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,distance_center
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2539,Brooklyn,Private room,149,1,9,0.21,6,365,7966.444359
2595,Manhattan,Entire home/apt,225,1,45,0.38,2,355,4556.337024
3831,Brooklyn,Entire home/apt,89,1,270,4.64,1,194,3678.008653
5022,Manhattan,Entire home/apt,80,10,9,0.1,1,0,9004.9769
5099,Manhattan,Entire home/apt,200,3,74,0.59,1,129,3627.133653


In [None]:
# Pairwise scatter plots / histograms of the numeric columns.
sns.pairplot(data=nyc_airbnb_df)

<seaborn.axisgrid.PairGrid at 0x1a1959c8d0>

In [None]:
# Mean and row count of each numeric column per borough. The frame still holds
# the string column `room_type`, for which 'mean' is undefined (current pandas
# raises TypeError), so the aggregation is limited to numeric columns.
numeric_columns = nyc_airbnb_df.select_dtypes(include='number').columns.tolist()
nyc_airbnb_df.groupby('neighbourhood_group')[numeric_columns].agg(['mean', 'count']).head()

In [None]:
# Distribution of every numeric feature (the target is plotted separately).
nyc_airbnb_df.drop(columns='price').hist(figsize=(20, 12));

In [None]:
# Raw price distribution - heavily right-skewed.
nyc_airbnb_df['price'].hist(figsize=(20, 10), bins=100);

In [None]:
# Replace price by its natural log; every model below is fitted on log-price.
# NOTE: the column is overwritten, so running this cell a second time applies
# the log twice - restart and run all instead of re-running it.
nyc_airbnb_df['price'] = np.log(nyc_airbnb_df.price)
nyc_airbnb_df.price.head()

In [None]:
# Distribution of the log-transformed price.
nyc_airbnb_df['price'].hist(bins=30);

In [None]:
# Same frame as before, with `price` now on the log scale.
nyc_airbnb_df.head()

In [None]:
plt.figure(figsize=(12, 10))
# The frame still contains the string columns neighbourhood_group and room_type;
# correlate the numeric columns only, since `corr()` on mixed dtypes fails on
# pandas >= 2.0.
numeric_features = nyc_airbnb_df.select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True);

In [None]:
# One-hot encode the borough; the first level is dropped and acts as baseline.
df_categor = pd.get_dummies(
    nyc_airbnb_df['neighbourhood_group'],
    prefix='code',
    drop_first=True,
)

# Swap the original string column for its indicator columns.
nyc_airbnb_df = pd.concat(
    [nyc_airbnb_df.drop(columns='neighbourhood_group'), df_categor],
    axis='columns',
)
nyc_airbnb_df.head()

In [None]:
# One-hot encode the room type. As with the borough encoding, the first level
# is dropped: keeping every level makes the indicator columns sum to 1, which
# is perfectly collinear with the intercept and leaves the linear-regression
# coefficients (plotted later as "importance") unidentifiable.
df_categor = pd.get_dummies(nyc_airbnb_df['room_type'], drop_first=True, prefix='code')

nyc_airbnb_df = pd.concat([nyc_airbnb_df.drop('room_type', axis=1), df_categor], axis=1)
nyc_airbnb_df.head()

In [None]:
# Split dataset 
from sklearn.model_selection import train_test_split
# 70/30 split. The seed is fixed so that the split - and therefore every metric
# reported below - is identical after Restart & Run All.
df_train, df_test = train_test_split(nyc_airbnb_df, test_size=0.3, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
# Columns to standardize; the one-hot indicator columns are left untouched.
continues_columns = [
    'distance_center',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
scaler = StandardScaler()
# Statistics are fitted on the training rows only, so nothing leaks from the test set.
scaler.fit(df_train[continues_columns])

In [None]:
# Standardize both splits with the statistics fitted on the training split.
df_train_scale, df_test_scale = (
    scaler.transform(split[continues_columns]) for split in (df_train, df_test)
)

In [None]:
# The split frames are derived from nyc_airbnb_df; take explicit copies so the
# assignments below write to independent frames instead of triggering
# SettingWithCopyWarning.
df_train = df_train.copy()
df_test = df_test.copy()

# Overwrite the continuous columns with their standardized values.
df_train[continues_columns] = df_train_scale
df_test[continues_columns] = df_test_scale

In [None]:
# Training rows after standardization of the continuous columns.
df_train.head()

In [None]:
# Sizes of the training and test splits, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Target: the log-transformed price.
y_train = df_train.price
y_test = df_test.price

# Feature matrices as NumPy arrays. `DataFrame.get_values()` was deprecated in
# pandas 0.25 and removed in 1.0; `to_numpy()` is its replacement.
X_train = df_train.drop('price', axis=1).to_numpy()
X_test = df_test.drop('price', axis=1).to_numpy()

# Metrics

In [None]:
from sklearn import metrics

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean of |y_true - y_pred| / |y_true|, expressed in percent.

    Both arguments are converted to NumPy arrays first, so any array-like
    input is accepted. Zeros in ``y_true`` lead to a division by zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

def print_metrics(y_test,y_pred, name):
    """Print MAE, RMSE and MAPE for one model.

    Parameters
    ----------
    y_test, y_pred : array-like
        True and predicted targets on the log scale (the notebook fits its
        models on log-price).
    name : str
        Label printed in front of the metrics.

    MAE and RMSE are computed after ``np.exp``, i.e. in price units. MAPE is
    computed on the values exactly as passed in, matching ``get_metrics``.
    NOTE(review): that makes MAPE a percentage error of log-price - confirm
    this is intended.
    """
    # Undo the log transform once for the metrics reported in price units.
    price_true, price_pred = np.exp(y_test), np.exp(y_pred)

    print('Method: ', name)
    print('MAE:', metrics.mean_absolute_error(price_true, price_pred))
    # The RMSE label used to be printed without a value.
    print('RMSE:', np.sqrt(metrics.mean_squared_error(price_true, price_pred)))
    print('MAPE:', mean_absolute_percentage_error(y_test, y_pred))

def get_metrics(y_test,y_pred, name):
    """Build a one-row DataFrame of MAE, RMSE, MAPE and R2 for one model.

    MAE and RMSE are evaluated on ``np.exp`` of the inputs; MAPE and R2 are
    evaluated on the inputs as passed. The row is indexed by ``name``.
    """
    price_true = np.exp(y_test)
    price_pred = np.exp(y_pred)

    scores = {
        'MAE': metrics.mean_absolute_error(price_true, price_pred),
        'RMSE': np.sqrt(metrics.mean_squared_error(price_true, price_pred)),
        'MAPE': mean_absolute_percentage_error(y_test, y_pred),
        'R2': metrics.r2_score(y_test, y_pred),
    }
    return pd.DataFrame([scores], index=[name], columns=["MAE", "RMSE", "MAPE", "R2"])

# Naive prediction

In [None]:
# Baseline: predict the mean training target for every test row.
y_mean = np.mean(y_train)
y_pred_naive = np.full(len(y_test), y_mean)
y_pred_naive[:5]

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the training split, then predictions for the test split.
mr = LinearRegression().fit(X_train, y_train)
y_pred_regr = mr.predict(X_test)

In [None]:
# Coefficients of the linear model, largest first. Only the continuous columns
# were standardized, so magnitudes are comparable among those but only roughly
# comparable with the indicator columns.
featureImportance = (
    pd.DataFrame({"feature": nyc_airbnb_df.drop('price', axis=1).columns,
                  "importance": mr.coef_})
    .set_index('feature')
    .sort_values("importance", ascending=False)
)
# `kind` is passed by keyword: current pandas rejects positional arguments to
# Series.plot.
featureImportance["importance"].plot(kind='bar', figsize=(10, 6));

# Lasso

In [None]:
from sklearn.linear_model import LassoCV

# Lasso with the regularization strength chosen by 5-fold cross-validation.
lasso_cv = LassoCV(cv=5).fit(X_train, y_train)
print('best', lasso_cv.alpha_)

y_pred_lasso_cv = lasso_cv.predict(X_test)

In [None]:
# Lasso coefficients, largest first (features shrunk to exactly 0 were dropped
# by the L1 penalty).
featureImportance = (
    pd.DataFrame({"feature": nyc_airbnb_df.drop('price', axis=1).columns,
                  "importance": lasso_cv.coef_})
    .set_index('feature')
    .sort_values("importance", ascending=False)
)
# `kind` is passed by keyword: current pandas rejects positional arguments to
# Series.plot.
featureImportance["importance"].plot(kind='bar', figsize=(10, 8));

# Ridge

In [None]:
from sklearn.linear_model import RidgeCV

# Ridge with the regularization strength chosen by 5-fold cross-validation.
ridge_cv = RidgeCV(cv=5).fit(X_train, y_train)
print('best', ridge_cv.alpha_)

y_pred_ridge_cv = ridge_cv.predict(X_test)

In [None]:
# Ridge coefficients, largest first.
featureImportance = (
    pd.DataFrame({"feature": nyc_airbnb_df.drop('price', axis=1).columns,
                  "importance": ridge_cv.coef_})
    .set_index('feature')
    .sort_values("importance", ascending=False)
)
# `kind` is passed by keyword: current pandas rejects positional arguments to
# Series.plot.
featureImportance["importance"].plot(kind='bar', figsize=(10, 6));

# Elastic Net

In [None]:
from sklearn.linear_model import ElasticNetCV

# Elastic Net with the regularization strength chosen by 5-fold cross-validation.
en_cv = ElasticNetCV(cv=5).fit(X_train, y_train)
print('best', en_cv.alpha_)

y_pred_en_cv = en_cv.predict(X_test)

In [None]:
# Elastic Net coefficients, largest first.
featureImportance = (
    pd.DataFrame({"feature": nyc_airbnb_df.drop('price', axis=1).columns,
                  "importance": en_cv.coef_})
    .set_index('feature')
    .sort_values("importance", ascending=False)
)
# `kind` is passed by keyword (current pandas rejects positional arguments to
# Series.plot) and the bar colours are an explicit list instead of the
# ambiguous string 'ycm'; the bars cycle through yellow, cyan and magenta.
featureImportance["importance"].plot(kind='bar', color=['y', 'c', 'm'], figsize=(10, 6));

# Metrics comparison

In [None]:
# One single-row metrics table per model, all scored on the same test targets.
naive_metric, lr_metric, lasso_metric, ridge_metric, en_metric = (
    get_metrics(y_test, predictions, label)
    for label, predictions in [
        ("Naive", y_pred_naive),
        ("Linear Regression", y_pred_regr),
        ("Lasso", y_pred_lasso_cv),
        ("Ridge", y_pred_ridge_cv),
        ("Elastic Net", y_pred_en_cv),
    ]
)




In [None]:
# Stack the per-model rows and rank them by RMSE, best (lowest) first.
summary = pd.concat(
    [naive_metric, lr_metric, lasso_metric, ridge_metric, en_metric]
).sort_values(by="RMSE")
summary