In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image

In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from sklearn.model_selection import GridSearchCV

## EDA

In [3]:
#Dataset airbnb
london = pd.read_csv('clusterai_regresion_dataset_airbnb_london.csv', delimiter=',', parse_dates = True)
london.head(5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,9554,"Cozy, 3 minutes to Piccadilly Line",31655,Guy,,Haringey,51.587767,-0.105666,Private room,35,1,131,2018-08-03,1.71,4,262
1,11076,The Sanctuary,40471,Rosa,,Ealing,51.515645,-0.314508,Private room,70,2,2,2016-11-23,0.07,6,62
2,13913,Holiday London DB Room Let-on going,54730,Alina,,Islington,51.568017,-0.111208,Private room,45,1,14,2018-06-17,0.14,2,364
3,17402,Superb 3-Bed/2 Bath & Wifi: Trendy W1,67564,Liz,,Westminster,51.520982,-0.140024,Entire home/apt,300,3,31,2018-08-01,0.34,12,135
4,24328,Battersea 2 bedroom house & parking,41759,Joe,,Wandsworth,51.472981,-0.163764,Entire home/apt,150,30,92,2016-09-07,0.98,1,362


In [4]:
np.shape(london)

(75213, 16)

In [5]:
london.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [6]:
london.index

RangeIndex(start=0, stop=75213, step=1)

In [7]:
#verifico los nans
london.isnull().sum().sort_values(ascending=False)

neighbourhood_group               75213
reviews_per_month                 20357
last_review                       20353
name                                 31
host_name                            13
availability_365                      0
calculated_host_listings_count        0
number_of_reviews                     0
minimum_nights                        0
price                                 0
room_type                             0
longitude                             0
latitude                              0
neighbourhood                         0
host_id                               0
id                                    0
dtype: int64

In [8]:
# elimino las rows que tienen NaNs en la columna "reviews_per_month" , que otras opciones existen?
london = london.dropna(subset = ["reviews_per_month"])

In [9]:
# reviso como queda mi dataframe luego de eliminar los renglones con NaNs en review per month
london.shape

(54856, 16)

In [10]:
# elimino columnas que no me interesan
london = london.drop(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'last_review', 'latitude', 'longitude'], axis=1)

In [11]:
london.shape

(54856, 8)

In [12]:
london.head(3)

Unnamed: 0,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Haringey,Private room,35,1,131,1.71,4,262
1,Ealing,Private room,70,2,2,0.07,6,62
2,Islington,Private room,45,1,14,0.14,2,364


In [13]:
# valor máximo en el dataframe
np.max(london)

neighbourhood                     Westminster
room_type                         Shared room
price                                   10000
minimum_nights                           1000
number_of_reviews                         536
reviews_per_month                       15.56
calculated_host_listings_count           1034
availability_365                          365
dtype: object

In [14]:
#obtengo el percentil 97.5
price_q97 = london.price.quantile(0.975)
print("el cuantil 0.97 de la feature 'price' es = " + str(price_q97))

el cuantil 0.97 de la feature 'price' es = 300.0


In [15]:
min_nights_q97 = london.minimum_nights.quantile(0.975)
print("el cuantil 0.97 de la feature 'minimum_nights' es = " + str(min_nights_q97))

el cuantil 0.97 de la feature 'minimum_nights' es = 14.0


In [16]:
# aca filtro y conservo los valores que cumplen las dos condiciones al mismo tiempo
london_filt = london.loc[(london.price < price_q97) & (london.minimum_nights < min_nights_q97)]

In [17]:
london_filt.head(3)

Unnamed: 0,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Haringey,Private room,35,1,131,1.71,4,262
1,Ealing,Private room,70,2,2,0.07,6,62
2,Islington,Private room,45,1,14,0.14,2,364


In [18]:
np.shape(london_filt)

(51827, 8)

## Genero los dummies

In [19]:
# genero dummies en la columna neighbourhood
neighs_dummie = pd.get_dummies(london_filt.neighbourhood)
neighs_dummie.head(3)

Unnamed: 0,Barking and Dagenham,Barnet,Bexley,Brent,Bromley,Camden,City of London,Croydon,Ealing,Enfield,...,Merton,Newham,Redbridge,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,Westminster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#idem para room_type
room_dummie = pd.get_dummies(london_filt.room_type)
room_dummie.head(3)

Unnamed: 0,Entire home/apt,Private room,Shared room
0,0,1,0
1,0,1,0
2,0,1,0


In [21]:
# agrego las dummies al dataframe filtrado
london_filt = london_filt.join([neighs_dummie, room_dummie])

In [22]:
london_filt.head(3)

Unnamed: 0,neighbourhood,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Barking and Dagenham,Barnet,...,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,Westminster,Entire home/apt,Private room,Shared room
0,Haringey,Private room,35,1,131,1.71,4,262,0,0,...,0,0,0,0,0,0,0,0,1,0
1,Ealing,Private room,70,2,2,0.07,6,62,0,0,...,0,0,0,0,0,0,0,0,1,0
2,Islington,Private room,45,1,14,0.14,2,364,0,0,...,0,0,0,0,0,0,0,0,1,0


## Creo las variables dependientes (y) e independientes (x)

In [23]:
# la variable dependiente va a ser el precio
y = np.array(london_filt[["price"]])
y

array([[ 35],
       [ 70],
       [ 45],
       ...,
       [280],
       [ 35],
       [ 66]], dtype=int64)

In [24]:
# variable independiente --> el dataframe menos las 3 columnas que se quitan
x = london_filt.drop(['price', 'neighbourhood','room_type'], axis=1)

In [25]:
x.shape

(51827, 41)

### Divido el dataset en train y test

In [26]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.90, random_state=42)
# random state con un determinado número se usa para que siempre tome los mismos valores de train y test

In [27]:
xtrain.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Barking and Dagenham,Barnet,Bexley,Brent,Bromley,...,Richmond upon Thames,Southwark,Sutton,Tower Hamlets,Waltham Forest,Wandsworth,Westminster,Entire home/apt,Private room,Shared room
4109,3,10,0.61,1,321,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8703,1,2,0.05,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
62456,4,1,0.94,1,137,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7200,2,2,0.18,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
44756,1,3,0.25,1,220,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [28]:
xtrain.shape

(5182, 41)

In [29]:
xtest.shape

(46645, 41)

In [30]:
# Step 4: auto scaling train- set (mean = 0, std = 1)
scaler = preprocessing.StandardScaler().fit(xtrain)
scaler

StandardScaler()

In [31]:
# auto escalo mis muestras de train utilizando el scaler fiteado con el xtrain
xtrain_scal = scaler.transform(xtrain)  

In [32]:
# auto escalo mis muestras de test utilizando el scaler fiteado con el xtrain
xtest_scal = scaler.transform(xtest)  

## Creo un modelo de regresión lineal

In [33]:
# 1) creo un modelo generico de regresion lineal
lr = LinearRegression()

In [34]:
# 2) Ajusto el modelo de regresion lineal utilizando el set de train, tanto las features X como las samples Y
lr.fit(xtrain_scal, ytrain)

LinearRegression()

In [35]:
# 3) Obtengo las predicciones que realiza mi modelo con las muestras de test, sin mostrarle las labels (Ytest)
# las predicciones las guardo en el vector "ypred"
ypred = lr.predict(xtest_scal)

In [36]:
np.shape(ypred)

(46645, 1)

In [37]:
# calculo el error de mi modelo con las muestras de train = error de train
np.sqrt(mean_squared_error(ytest, ypred))

39.310451488740775

In [38]:
mean_squared_error(ytest, ypred)

1545.3115962486415

In [39]:
from sklearn.metrics import mean_absolute_error

In [40]:
mean_absolute_error(ytest, ypred)

28.10907127981235

# KNN

In [41]:
from sklearn.neighbors import KNeighborsRegressor as KN

In [42]:
# estimador para el KNN
estimador_knn= KN()
# Determino parametros (dato de clase)
parameters_k = np.arange(20,31,5)
parameters_knn = [{'n_neighbors': parameters_k}]
n_folds=5

# Defino gridsearch
gs = GridSearchCV(estimador_knn,param_grid=parameters_knn,refit=True,
                 cv=n_folds, scoring="neg_mean_squared_error",
                verbose=3, n_jobs=3)

In [43]:
gs.fit(xtrain_scal, ytrain)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed:    2.1s finished


GridSearchCV(cv=5, estimator=KNeighborsRegressor(), n_jobs=3,
             param_grid=[{'n_neighbors': array([20, 25, 30])}],
             scoring='neg_mean_squared_error', verbose=3)

In [44]:
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

KNeighborsRegressor(n_neighbors=20) 

{'n_neighbors': 20} 

-1633.0440172967944 



In [45]:
KNN_prediction = gs.best_estimator_.predict(xtest_scal)
KNN_r2 = r2_score(y_true=ytest, y_pred=KNN_prediction)
KNN_mse = mean_squared_error(y_true=ytest, y_pred=KNN_prediction)
KNN_mae = mean_absolute_error(y_true=ytest, y_pred=KNN_prediction)

In [46]:
print ("R2 score: " +str(KNN_r2)+ "\nMSE: " +str(KNN_mse))


R2 score: 0.49343204328146584
MSE: 1586.6006127130454


In [47]:
print (f'R2 score {KNN_r2}')
print (f'MSE {KNN_mse}')
print (f'MAE {KNN_mae}')

R2 score 0.49343204328146584
MSE 1586.6006127130454
MAE 28.06733840711759


# SVR

In [48]:
# estimador para el KNN
SVR = SVR()
# Determino parametros (dato de clase)
parameters_svr_rbf = [{'kernel':['rbf'] , 'C': [1,100],'gamma': [0.1,0.5] }]
n_folds=5

# Defino gridsearch
gs = GridSearchCV(SVR,param_grid=parameters_svr_rbf,refit=True,
                 cv=n_folds, scoring="neg_mean_squared_error",
                verbose=3, n_jobs=3)

In [49]:
# entreno el modelo con .fit
gs.fit(xtrain_scal, ytrain)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  20 out of  20 | elapsed:    9.8s finished
  return f(**kwargs)


GridSearchCV(cv=5, estimator=SVR(), n_jobs=3,
             param_grid=[{'C': [1, 100], 'gamma': [0.1, 0.5],
                          'kernel': ['rbf']}],
             scoring='neg_mean_squared_error', verbose=3)

In [50]:
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

SVR(C=100, gamma=0.1) 

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'} 

-1680.886555843345 



In [51]:
SVR_prediction = gs.best_estimator_.predict(xtest_scal)
SVR_r2 = r2_score(y_true=ytest, y_pred=SVR_prediction)
SVR_mse = mean_squared_error(y_true=ytest, y_pred=SVR_prediction)
SVR_mae = mean_absolute_error(y_true=ytest, y_pred=SVR_prediction)

In [52]:
print ("R2 score: " +str(KNN_r2)+ "\nMSE: " +str(KNN_mse))

R2 score: 0.49343204328146584
MSE: 1586.6006127130454


In [53]:
print (f'R2 score {SVR_r2}')
print (f'MSE {SVR_mse}')
print (f'MAE {SVR_mae}')

R2 score 0.4796776491625605
MSE 1629.6801834737307
MAE 27.19362211041141


# Random Forest

In [54]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

#estimador para random forest
RF = RandomForestRegressor()

# Determino parametros (dato de clase)
parameters_RF = {'n_estimators': [100, 200, 300, 1000]}
n_folds=5

# Defino gridsearch
gs = GridSearchCV(RF,param_grid=parameters_RF,refit=True,
                 cv=n_folds, scoring="neg_mean_squared_error",
                verbose=3, n_jobs=3)

In [55]:
# entreno el modelo con .fit
gs.fit(xtrain_scal, ytrain)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  20 out of  20 | elapsed:  1.0min finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, estimator=RandomForestRegressor(), n_jobs=3,
             param_grid={'n_estimators': [100, 200, 300, 1000]},
             scoring='neg_mean_squared_error', verbose=3)

In [56]:
print(gs.best_estimator_, "\n")
print(gs.best_params_, "\n")
print(gs.best_score_, "\n")

RandomForestRegressor(n_estimators=1000) 

{'n_estimators': 1000} 

-1600.3414908210468 



In [57]:
RF_prediction = gs.best_estimator_.predict(xtest_scal)
RF_r2 = r2_score(y_true=ytest, y_pred=RF_prediction)
RF_mse = mean_squared_error(y_true=ytest, y_pred=RF_prediction)
RF_mae = mean_absolute_error(y_true=ytest, y_pred=RF_prediction)

In [58]:
print ("R2 score: " +str(RF_r2)+ "\nMSE: " +str(RF_mse))


R2 score: 0.4892577034797252
MSE: 1599.674890308527


In [59]:
print (f'R2 score {RF_r2}')
print (f'MSE {RF_mse}')
print (f'MAE {RF_mae}')

R2 score 0.4892577034797252
MSE 1599.674890308527
MAE 28.123946842860096
