# Prototype
#### Import necessary packages:

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import math

import xgboost as xgb
from sklearn import linear_model
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error

In /Users/cathleenpena1/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/cathleenpena1/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/cathleenpena1/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In /Users/cathleenpena1/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/stylelib/_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In /Users/cathleenpena1/anac

### Load Data

In [2]:
# Read the prepared listings table and discard its 'Unnamed: 0' column,
# then preview the first three rows.
df = pd.read_csv('data/argentina_final.csv').drop(columns=['Unnamed: 0'])
df.head(3)

Unnamed: 0,start_date,end_date,created_on,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,...,property_type_House,property_type_Lot,property_type_Office,property_type_Other,property_type_PH,property_type_Shop,property_type_Warehouse,operation_type_For Rent,operation_type_For Sale,operation_type_For Sublease
0,2020-06-17,2020-06-18,2020-06-17,-34.99534,-58.047423,2.844661,0.0,1.70148,474.884308,6370.15545,...,0,0,0,0,0,0,0,0,1,0
1,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,5.0,2.0,1.0,72.0,72.0,...,0,0,0,0,0,0,0,0,1,0
2,2020-07-14,2020-08-13,2020-07-14,-31.116769,-64.482921,2.844661,3.0,1.0,747.0,244.0,...,1,0,0,0,0,0,0,0,1,0


In [3]:
# Convert each of the three date columns to pandas datetimes.
for date_col in ('start_date', 'end_date', 'created_on'):
    df[date_col] = pd.to_datetime(df[date_col])

### Split data

In [6]:
# Hold out 30% of the rows for testing. The three date columns are excluded
# from the feature matrix and 'price' is the regression target.
# random_state is pinned so the split (and every metric computed from it)
# is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['price', 'start_date', 'end_date', 'created_on'], axis=1),
    df.price,
    test_size=.3,
    random_state=42,
)
# Sanity check: fraction of rows that ended up in each partition.
print(len(X_train)/len(df), "\n", len(X_test)/len(df))

0.6999993770104392 
 0.30000062298956076


## Create Model

In [9]:
%%time
rfr = RandomForestRegressor()

rfr.fit(X_train, y_train)
rfr_preds = rfr.predict(X_test)

CPU times: user 17min 48s, sys: 19.3 s, total: 18min 7s
Wall time: 20min 8s


In [43]:
# Display the hyperparameter settings of the baseline forest.
rfr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Evaluate Model 

In [11]:
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which newer
# scikit-learn releases no longer accept; for a single target the value is the same.
rfr_rmse = np.sqrt(mean_squared_error(y_test, rfr_preds))
# NOTE: this is the *median* absolute error, not the mean absolute error that
# "MAE" usually abbreviates. The variable name is kept for compatibility.
rfr_mae = median_absolute_error(y_test, rfr_preds)

print("RMSE - Random Forest: ", rfr_rmse)
print("MedAE - Random Forest: ", rfr_mae)

RMSE - Random Forest:  369810.62902795215
MAE - Random Forest:  5670.335828825264


#### K-fold cross validation

In [18]:
%%time  1`11`1
all_accuracies = cross_val_score(estimator=rfr, X=X_train, y=y_train, cv=5, scoring="neg_root_mean_squared_error")

CPU times: user 59min 46s, sys: 44.5 s, total: 1h 30s
Wall time: 1h 1min 33s


In [19]:
# Per-fold cross-validation scores: negated RMSE, one entry per fold.
all_accuracies

array([  -331492.82749166, -18250322.8639767 ,   -310183.65359923,
       -27234667.68048456,   -510503.45014636])

## Hyperparameter Tuning
Grid Search CV

In [44]:
# Candidate values for each random-forest hyperparameter; every combination
# of these lists is one candidate for the grid search.
param_grid = dict(
    bootstrap=[True, False],
    max_depth=[80, 100],
    min_samples_leaf=[1, 3, 5],
    min_samples_split=[2, 5],
    n_estimators=[100, 500, 800],
)

In [None]:
%%time
grid_search = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [None]:
%%time
best_grid = grid_search.best_estimator_
best_grid_preds = best_grid.predict(X_test)
best_grid_rmse = mean_squared_error(y_test, best_grid_preds, squared=False)

In [None]:
# Print the test-set RMSE of the untuned forest next to the tuned one.
for label, score in (("Base RMSE: \t", rfr_rmse), ("Tuned RMSE: \t", best_grid_rmse)):
    print(label, score)

# Compare tuned model to baseline and untuned model
Metrics and Visualizations

In [None]:
# Naive baseline: always predict the mean of the training target.
dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)
mean_preds = dummy_mean.predict(X_test)

# sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases no
# longer accept; the result is the same RMSE for a single target.
dme_rmse = np.sqrt(mean_squared_error(y_test, mean_preds))

In [None]:
# One row per model with its test-set RMSE, in the order they should be plotted.
# (The constructor is `pd.DataFrame`; `pd.Dataframe` raises AttributeError.)
evals = pd.DataFrame([['Base Model', dme_rmse],
                      ['Untuned Random Forest', rfr_rmse],
                      ['Tuned Random Forest', best_grid_rmse]], columns=['Model', 'RMSE'])

In [None]:
# Bar chart of RMSE per model, one bar for each row of `evals`.
sns.barplot(x='Model', y='RMSE', data=evals)