In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import joblib

In [2]:
df = pd.read_csv('../AirbnbData/CleanData/df_modelo_limpio.csv')
df = df.drop(columns = (['Unnamed: 0', 'id', 'neighbourhood_cleansed', 'property_type', 'puntuacion_media']))
df

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_group_cleansed,room_type,bedrooms,beds,price,availability_365,number_of_reviews,calculated_host_listings_count,reviews_per_month,bathrooms
0,1,1,Chamartín,Private room,1.0,1.0,60.0,180,78,1,0.58,1.0
1,0,1,Latina,Private room,1.0,1.0,31.0,364,33,2,0.42,1.0
2,0,1,Arganzuela,Private room,1.0,1.0,26.0,365,149,1,1.12,1.0
3,0,0,Centro,Entire home/apt,1.0,2.0,65.0,351,170,3,1.29,1.0
4,0,0,Centro,Entire home/apt,1.0,1.0,54.0,0,8,1,0.11,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
11535,0,1,Tetuán,Entire home/apt,3.0,4.0,62.0,360,1,38,1.00,1.5
11536,0,0,Centro,Private room,1.0,2.0,60.0,158,2,6,2.00,1.0
11537,0,0,Tetuán,Private room,1.0,1.0,25.0,175,1,1,1.00,1.0
11538,0,1,Centro,Entire home/apt,1.0,1.0,44.0,331,1,2,1.00,1.0


In [3]:
df['bathrooms'] = df['bathrooms'].apply(lambda x: float(x))

In [4]:
df = df.dropna()

In [5]:
df.dtypes

host_is_superhost                   int64
host_identity_verified              int64
neighbourhood_group_cleansed       object
room_type                          object
bedrooms                          float64
beds                              float64
price                             float64
availability_365                    int64
number_of_reviews                   int64
calculated_host_listings_count      int64
reviews_per_month                 float64
bathrooms                         float64
dtype: object

### Datos

In [6]:
X = df.drop(columns = (['price']))

In [7]:
X_enc = pd.get_dummies(X)

In [8]:
X_enc.isna().sum()

host_is_superhost                                     0
host_identity_verified                                0
bedrooms                                              0
beds                                                  0
availability_365                                      0
number_of_reviews                                     0
calculated_host_listings_count                        0
reviews_per_month                                     0
bathrooms                                             0
neighbourhood_group_cleansed_Arganzuela               0
neighbourhood_group_cleansed_Barajas                  0
neighbourhood_group_cleansed_Carabanchel              0
neighbourhood_group_cleansed_Centro                   0
neighbourhood_group_cleansed_Chamartín                0
neighbourhood_group_cleansed_Chamberí                 0
neighbourhood_group_cleansed_Ciudad Lineal            0
neighbourhood_group_cleansed_Fuencarral - El Pardo    0
neighbourhood_group_cleansed_Hortaleza          

In [9]:
y = df['price']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.25, random_state=123)

### Modelo Inicial

In [12]:
model = RandomForestRegressor()

In [13]:
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 25)]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(50, 130, num = 15)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [14]:
random_cv = RandomizedSearchCV(model, params, cv=10, n_iter = 20)

In [15]:
random_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(), n_iter=20,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [50, 55, 61, 67, 72, 78,
                                                      84, 90, 95, 101, 107, 112,
                                                      118, 124, 130],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [500, 562, 625, 687,
                                                         750, 812, 875, 937,
                                                         1000, 1062, 1125, 1187,
                                                         1250, 1312, 1375, 1437,
                                

In [16]:
predictions = random_cv.best_estimator_.predict(X_test)

In [17]:
#Mean Absolute Error 
print('MAE:', metrics.mean_absolute_error(y_test, predictions))

#Mean Squared Error
print('MSE:', metrics.mean_squared_error(y_test, predictions))

#Root Mean Squared Error
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

#R2
print('R2:', metrics.r2_score(y_test, predictions))

MAE: 19.043685475346017
MSE: 756.3915526522962
RMSE: 27.502573564164795
R2: 0.5692405053499139


In [18]:
random_cv.best_estimator_.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 55,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1312,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [21]:
joblib_file = "modelo_RF_baseline.pkl"  
joblib.dump(random_cv, joblib_file)

['modelo_RF_baseline.pkl']

#### Normalizando Datos

In [12]:
CONT_COLUMNS = ['bedrooms', 'beds', 'availability_365', 'number_of_reviews', 'calculated_host_listings_count', 'reviews_per_month', 'bathrooms']

In [13]:
X_enc[CONT_COLUMNS] = (X_enc[CONT_COLUMNS]-X_enc[CONT_COLUMNS].mean())/X_enc[CONT_COLUMNS].std() 

In [15]:
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X_enc, y, test_size=0.2, random_state=123)

In [16]:
model = RandomForestRegressor()

In [17]:
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 25)]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(50, 130, num = 15)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [18]:
random_cv = RandomizedSearchCV(model, params, cv=10, n_iter = 10)

In [20]:
random_cv.fit(X_train_norm, y_train_norm)

RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [50, 55, 61, 67, 72, 78,
                                                      84, 90, 95, 101, 107, 112,
                                                      118, 124, 130],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [500, 562, 625, 687,
                                                         750, 812, 875, 937,
                                                         1000, 1062, 1125, 1187,
                                                         1250, 1312, 1375, 1437,
                                           

In [21]:
random_cv.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 50,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1625,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [22]:
predictions_norm = random_cv.best_estimator_.predict(X_test_norm)

In [23]:
#Mean Absolute Error 
print('MAE:', metrics.mean_absolute_error(y_test_norm, predictions_norm))

#Mean Squared Error
print('MSE:', metrics.mean_squared_error(y_test_norm, predictions_norm))

#Root Mean Squared Error
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test_norm, predictions_norm)))

#R2
print('R2:', metrics.r2_score(y_test_norm, predictions_norm))

MAE: 19.137284555237088
MSE: 765.8269425421662
RMSE: 27.673578419535232
R2: 0.5634611372962846
