In [20]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import joblib

In [3]:
df = pd.read_csv('../AirbnbData/CleanData/df_modelo_limpio.csv')
df = df.drop(columns = (['Unnamed: 0', 'id', 'neighbourhood_cleansed', 'property_type']))
df

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_group_cleansed,room_type,bedrooms,beds,price,availability_365,number_of_reviews,calculated_host_listings_count,reviews_per_month,bathrooms,puntuacion_media
0,1,1,Chamartín,Private room,1.0,1.0,60.0,180,78,1,0.58,1.0,4.583333
1,0,1,Latina,Private room,1.0,1.0,31.0,364,33,2,0.42,1.0,4.653846
2,0,1,Arganzuela,Private room,1.0,1.0,26.0,365,149,1,1.12,1.0,4.612613
3,0,0,Centro,Entire home/apt,1.0,2.0,65.0,351,170,3,1.29,1.0,4.596774
4,0,0,Centro,Entire home/apt,1.0,1.0,54.0,0,8,1,0.11,1.0,4.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11535,0,1,Tetuán,Entire home/apt,3.0,4.0,62.0,360,1,38,1.00,1.5,5.000000
11536,0,0,Centro,Private room,1.0,2.0,60.0,158,2,6,2.00,1.0,4.500000
11537,0,0,Tetuán,Private room,1.0,1.0,25.0,175,1,1,1.00,1.0,5.000000
11538,0,1,Centro,Entire home/apt,1.0,1.0,44.0,331,1,2,1.00,1.0,5.000000


In [4]:
df['bathrooms'] = df['bathrooms'].apply(lambda x: float(x))

In [5]:
df = df.dropna()

In [6]:
df.dtypes

host_is_superhost                   int64
host_identity_verified              int64
neighbourhood_group_cleansed       object
room_type                          object
bedrooms                          float64
beds                              float64
price                             float64
availability_365                    int64
number_of_reviews                   int64
calculated_host_listings_count      int64
reviews_per_month                 float64
bathrooms                         float64
puntuacion_media                  float64
dtype: object

### Datos

In [7]:
X = df.drop(columns = (['price']))

In [8]:
X_enc = pd.get_dummies(X)

In [9]:
X_enc.isna().sum()

host_is_superhost                                     0
host_identity_verified                                0
bedrooms                                              0
beds                                                  0
availability_365                                      0
number_of_reviews                                     0
calculated_host_listings_count                        0
reviews_per_month                                     0
bathrooms                                             0
puntuacion_media                                      0
neighbourhood_group_cleansed_Arganzuela               0
neighbourhood_group_cleansed_Barajas                  0
neighbourhood_group_cleansed_Carabanchel              0
neighbourhood_group_cleansed_Centro                   0
neighbourhood_group_cleansed_Chamartín                0
neighbourhood_group_cleansed_Chamberí                 0
neighbourhood_group_cleansed_Ciudad Lineal            0
neighbourhood_group_cleansed_Fuencarral - El Par

In [10]:
y = df['price']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.25, random_state=123)

### Modelo Inicial

In [41]:
model = RandomForestRegressor()

In [42]:
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 1000, num = 25)]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(5, 150, num = 15)]
min_samples_split = [2, 5, 10, 20]
min_samples_leaf = [1, 2, 4, 10, 15, 20]
bootstrap = [True, False]

params = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [43]:
random_cv = RandomizedSearchCV(model, params, cv=10)

In [44]:
random_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(),
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 15, 25, 36, 46, 56, 67,
                                                      77, 87, 98, 108, 118, 129,
                                                      139, 150],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 10, 15,
                                                             20],
                                        'min_samples_split': [2, 5, 10, 20],
                                        'n_estimators': [50, 89, 129, 168, 208,
                                                         247, 287, 327, 366,
                                                         406, 445, 485, 525,
                                                  

In [45]:
predictions = random_cv.best_estimator_.predict(X_test)

In [48]:
#Mean Absolute Error 
print('MAE:', metrics.mean_absolute_error(y_test, predictions))

#Mean Squared Error
print('MSE:', metrics.mean_squared_error(y_test, predictions))

#Root Mean Squared Error
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

#R2
print('R2:', metrics.r2_score(y_test, predictions))

MAE: 18.930700530102627
MSE: 750.756345793245
RMSE: 27.39993331731384
R2: 0.5724497147208301


In [51]:
random_cv.best_estimator_.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 87,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 920,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Segundo Modelo

In [54]:
model_v2 = RandomForestRegressor()

In [55]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1700, num = 25)]
max_features = ['auto', 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(30, 130, num = 15)]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4, 10]
bootstrap = [True, False]

params_v2 = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [58]:
random_cv_v2 = RandomizedSearchCV(model_v2, params_v2, cv=7, n_iter = 15)

In [59]:
random_cv_v2.fit(X_train, y_train)

RandomizedSearchCV(cv=7, estimator=RandomForestRegressor(), n_iter=15,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [30, 37, 44, 51, 58, 65,
                                                      72, 80, 87, 94, 101, 108,
                                                      115, 122, 130],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [1, 2, 4, 10],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 262, 325, 387,
                                                         450, 512, 575, 637,
                                                         700, 762, 825, 887,
                                                         950, 1012, 1075, 1137,
                                   

In [60]:
predictions_v2 = random_cv_v2.best_estimator_.predict(X_test)

In [62]:
random_cv_v2.best_estimator_.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 80,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 637,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [61]:
#Mean Absolute Error 
print('MAE:', metrics.mean_absolute_error(y_test, predictions_v2))

#Mean Squared Error
print('MSE:', metrics.mean_squared_error(y_test, predictions_v2))

#Root Mean Squared Error
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions_v2)))

#R2
print('R2:', metrics.r2_score(y_test, predictions_v2))

MAE: 18.983830223027567
MSE: 751.5973339030467
RMSE: 27.41527555767125
R2: 0.5719707781014104


### Modelo Final

In [16]:
model_rf_best = RandomForestRegressor(n_estimators = 920, max_depth = 87, min_samples_split=5, 
                                      min_samples_leaf=1, max_features = 'log2') 

In [17]:
model_rf_best.fit(X_train, y_train)

RandomForestRegressor(max_depth=87, max_features='log2', min_samples_split=5,
                      n_estimators=920)

In [18]:
pred_best = model_rf_best.predict(X_test)

In [19]:
#Mean Absolute Error 
print('MAE:', metrics.mean_absolute_error(y_test, pred_best))

#Mean Squared Error
print('MSE:', metrics.mean_squared_error(y_test, pred_best))

#Root Mean Squared Error
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred_best)))

#R2
print('R2:', metrics.r2_score(y_test, pred_best))

MAE: 19.06117517948031
MSE: 750.8951061636155
RMSE: 27.40246533003218
R2: 0.5723706916978883


In [21]:
joblib_file = "modelo_RF_reviews.pkl"  
joblib.dump(model_rf_best, joblib_file)

['modelo_RF_reviews.pkl']