# Experiment with Various Models

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

In [2]:
argentina = pd.read_csv('data/argentina_cleaned.csv')
argentina.head(2)

Unnamed: 0.1,Unnamed: 0,start_date,end_date,created_on,lat,lon,l1,l2,l3,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,price_period,property_type,operation_type
0,0,2020-06-17,2020-06-18,2020-06-17,-34.99534,-58.047423,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,2.844661,0.0,1.70148,474.884308,6370.15545,176556.301186,Monthly,Department,For Sale
1,1,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,5.0,2.0,1.0,72.0,72.0,176556.301186,Monthly,Department,For Sale


In [3]:
argentina.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
argentina.head(1)

Unnamed: 0,start_date,end_date,created_on,lat,lon,l1,l2,l3,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,price_period,property_type,operation_type
0,2020-06-17,2020-06-18,2020-06-17,-34.99534,-58.047423,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,2.844661,0.0,1.70148,474.884308,6370.15545,176556.301186,Monthly,Department,For Sale
1,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,5.0,2.0,1.0,72.0,72.0,176556.301186,Monthly,Department,For Sale


In [5]:
argentina.describe()

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price
count,963098.0,963098.0,963098.0,963098.0,963098.0,963098.0,963098.0,963098.0
mean,-34.816634,-59.193616,2.844661,2.084058,1.70148,474.884308,6370.155,176556.3
std,2.711359,2.366182,1.210602,1.48937,0.9459,2499.260252,2188542.0,10198070.0
min,-54.840886,-119.69939,1.0,-16.0,1.0,-136.0,-130.0,0.01
25%,-34.816634,-59.193616,2.844661,2.0,1.0,178.0,120.0,19000.0
50%,-34.649614,-58.498129,2.844661,2.084058,1.70148,474.884308,6370.155,88000.0
75%,-34.54875,-58.373499,3.0,2.084058,2.0,474.884308,6370.155,180000.0
max,85.051129,-1.981231,40.0,900.0,20.0,200000.0,2147484000.0,10000000000.0


## Create Baseline Model

In [6]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error

In [7]:
X_train, X_test, y_train, y_test = train_test_split(argentina.drop('price', axis=1), argentina.price, test_size=.3)

In [8]:
dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)
dummy_median = DummyRegressor(strategy = 'median').fit(X_train, y_train)

mean_preds = dummy_mean.predict(X_test)
median_preds = dummy_median.predict(X_test)

In [9]:
print("MSE - Dummy Regressor w/ Mean: ", mean_squared_error(y_test, mean_preds))
print("MSE - Dummy Regressor w/ Median: ", mean_squared_error(y_test, median_preds))
print()
print("MAE - Dummy Regressor w/ Mean: ", median_absolute_error(y_test, mean_preds))
print("MAE - Dummy Regressor w/ Median: ", median_absolute_error(y_test, median_preds))
print()
print("R2- Dummy Regressor w/ Mean: ", r2_score(y_test, mean_preds))
print("R2 - Dummy Regressor w/ Median: ", r2_score(y_test, median_preds))



MSE - Dummy Regressor w/ Mean:  346317439932974.94
MSE - Dummy Regressor w/ Median:  346329127798157.9

MAE - Dummy Regressor w/ Mean:  111606.71971210578
MAE - Dummy Regressor w/ Median:  78000.0

R2- Dummy Regressor w/ Mean:  -3.846608915614169e-06
R2 - Dummy Regressor w/ Median:  -3.759573267880789e-05


In [51]:
from sklearn.ensemble import GradientBoostingRegressor

Below I get an error for having string columns I believe

In [57]:
gbr = GradientBoostingRegressor()
#gbr.fit(X_train, y_train)
#gbr.predict(X_test)
#gbr.score(X_test, y_test)

## Fix string columns and one-hot encode

In [12]:
argentina.head()

Unnamed: 0,start_date,end_date,created_on,lat,lon,l1,l2,l3,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,price_period,property_type,operation_type
0,2020-06-17,2020-06-18,2020-06-17,-34.99534,-58.047423,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,2.844661,0.0,1.70148,474.884308,6370.15545,176556.301186,Monthly,Department,For Sale
1,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,5.0,2.0,1.0,72.0,72.0,176556.301186,Monthly,Department,For Sale
2,2020-07-14,2020-08-13,2020-07-14,-31.116769,-64.482921,Argentina,Córdoba,Valle Hermoso,2.844661,3.0,1.0,747.0,244.0,176556.301186,Monthly,House,For Sale
3,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,3.0,1.0,1.0,40.0,40.0,176556.301186,Monthly,Department,For Sale
4,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,2.844661,2.084058,1.0,35.0,35.0,176556.301186,Monthly,Office,For Sale


In [13]:
argentina.dtypes

start_date          object
end_date            object
created_on          object
lat                float64
lon                float64
l1                  object
l2                  object
l3                  object
rooms              float64
bedrooms           float64
bathrooms          float64
surface_total      float64
surface_covered    float64
price              float64
price_period        object
property_type       object
operation_type      object
dtype: object

Convert necessary columns to datetime:

In [15]:
argentina['end_date'] = pd.to_datetime(argentina['end_date'])
argentina['start_date'] = pd.to_datetime(argentina['start_date'])
argentina['created_on'] = pd.to_datetime(argentina['created_on'])
argentina.dtypes

start_date         datetime64[ns]
end_date           datetime64[ns]
created_on         datetime64[ns]
lat                       float64
lon                       float64
l1                         object
l2                         object
l3                         object
rooms                     float64
bedrooms                  float64
bathrooms                 float64
surface_total             float64
surface_covered           float64
price                     float64
price_period               object
property_type              object
operation_type             object
dtype: object

#### One-Hot Encode columns: l1, l2, l3, price_period, property_type, operation_type

In [16]:
argentina.l3.nunique()

1210

In [17]:
argentina.l2.nunique()

43

In [18]:
argentina.l1.nunique()

4

In [19]:
argentina.price_period.nunique()

3

In [20]:
argentina.property_type.nunique()

10

In [21]:
argentina.operation_type.nunique()

3

I might exclude l3 from one hot encoding and models for now since it will explode with columns

In [22]:
arg2 = argentina.drop('l3', axis=1)

In [39]:
#categorical data
categorical_cols = ['l1', 'l2', 'price_period', 'property_type', 'operation_type'] 

#import pandas as pd
arg3 = pd.get_dummies(arg2, columns = categorical_cols, dtype=int)
print(arg3.columns, len(arg3.columns))

Index(['start_date', 'end_date', 'created_on', 'lat', 'lon', 'rooms',
       'bedrooms', 'bathrooms', 'surface_total', 'surface_covered', 'price',
       'l1_Argentina', 'l1_Brasil', 'l1_Estados Unidos', 'l1_Uruguay',
       'l2_Bs.As. G.B.A. Zona Norte', 'l2_Bs.As. G.B.A. Zona Oeste',
       'l2_Bs.As. G.B.A. Zona Sur', 'l2_Buenos Aires Costa Atlántica',
       'l2_Buenos Aires Interior', 'l2_Canelones', 'l2_Capital Federal',
       'l2_Catamarca', 'l2_Chaco', 'l2_Chubut', 'l2_Colonia', 'l2_Corrientes',
       'l2_Córdoba', 'l2_Entre Ríos', 'l2_Florida', 'l2_Formosa', 'l2_Jujuy',
       'l2_La Pampa', 'l2_La Rioja', 'l2_Maldonado', 'l2_Maryland',
       'l2_Mendoza', 'l2_Miami', 'l2_Michigan', 'l2_Misiones', 'l2_Montevideo',
       'l2_Neuquén', 'l2_New York', 'l2_Pennsylvania',
       'l2_Rio Grande do Norte', 'l2_Rio de Janeiro', 'l2_Rocha',
       'l2_Río Negro', 'l2_Salta', 'l2_San Juan', 'l2_San Luis',
       'l2_Santa Catarina', 'l2_Santa Cruz', 'l2_Santa Fe',
       'l2_Santiag

In [40]:
arg3.head(3)

Unnamed: 0,start_date,end_date,created_on,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,...,property_type_House,property_type_Lot,property_type_Office,property_type_Other,property_type_PH,property_type_Shop,property_type_Warehouse,operation_type_For Rent,operation_type_For Sale,operation_type_For Sublease
0,2020-06-17,2020-06-18,2020-06-17,-34.99534,-58.047423,2.844661,0.0,1.70148,474.884308,6370.15545,...,0,0,0,0,0,0,0,0,1,0
1,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,5.0,2.0,1.0,72.0,72.0,...,0,0,0,0,0,0,0,0,1,0
2,2020-07-14,2020-08-13,2020-07-14,-31.116769,-64.482921,2.844661,3.0,1.0,747.0,244.0,...,1,0,0,0,0,0,0,0,1,0


In [41]:
arg3.dtypes.value_counts()

int64             63
float64            8
datetime64[ns]     3
dtype: int64

## Back to Modeling...

### Gradient Boosting Regressor

In [75]:
X_train, X_test, y_train, y_test = train_test_split(arg3.drop('price', axis=1), arg3.price, test_size=.3)

GBR apparently doesn't allow datetime objects so we will drop those columns

In [55]:
#gbc = GradientBoostingRegressor()
#gbc.fit(X_train, y_train)
#gbc.predict(X_test)
#gbc.score(X_test, y_test)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(arg3.drop(['price', 'start_date', 'end_date', 'created_on'], axis=1), arg3.price, test_size=.3)

In [62]:
gbc = GradientBoostingRegressor()
gbc.fit(X_train, y_train)
gbc_preds = gbc.predict(X_test)

GradientBoostingRegressor()

In [61]:
print("MSE - Gradient Boosting Regressor: ", mean_squared_error(y_test, gbc_preds)**(1/2))
print("MAE - Gradient Boosting Regressor: ", median_absolute_error(y_test, gbc_preds))
print("R2 - Gradient Boosting Regressor: ", r2_score(y_test, gbc_preds))

('R2 score:', 0.00011462314632548765)

In [None]:
print("R2 score:", gbc.score(X_test, y_test))

### Dummy Mean and Median Regressor on newly transformed data

In [58]:
dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)
mean_preds = dummy_mean.predict(X_test)

print("MSE - Dummy Mean Regressor: ", mean_squared_error(y_test, mean_preds))
print("MAE - Dummy Mean Regressor: ", median_absolute_error(y_test, mean_preds))
print("R2 - Dummy Mean Regressor: ", r2_score(y_test, mean_preds))

In [74]:
dummy_median = DummyRegressor(strategy = 'median').fit(X_train, y_train)
median_preds = dummy_median.predict(X_test)

print("MSE - Dummy Median Regressor: ", mean_squared_error(y_test, median_preds))
print("MAE - Dummy Median Regressor: ", median_absolute_error(y_test, median_preds))
print("R2 - Dummy Median Regressor: ", r2_score(y_test, median_preds))

MSE - Dummy Mean Regressor:  346249920847790.06
MAE - Dummy Mean Regressor:  111789.43146323215
R2- Dummy Mean Regressor:  -3.720030803178176e-06

MSE - Dummy Median Regressor:  346261555689671.56
MAE - Dummy Median Regressor:  78000.0
R2 - Dummy Median Regressor:  -3.7322594913291596e-05


### XGBoost

In [63]:
import xgboost as xgb

In [66]:
xgbr = xgb.XGBRegressor(n_estimators=100, learning_rate=.08, gamma=0, subsample=.75, colsample_bytree=1, max_depth=7)
xgbr.fit(X_train, y_train)
XGpreds = xgbr.predict(X_test)

In [72]:
print("MSE - XGBoost: ", mean_squared_error(y_test, XGpreds)**(1/2))
print("MAE - XGBoost: ", median_absolute_error(y_test, XGpreds))
print("R2 - XGBoost: ", r2_score(y_test, XGpreds))

MSE:  18606325.28085089
MAE:  35751.5234375
R2 score:  0.00015391348970772434


### Lasso Regression

In [None]:
from sklearn import linear_model
las = linear_model.Lasso(max_iter=10000)
las.fit(X_train, y_train)
las_preds = las.predict(X_test)

In [None]:
print("MSE - Lasso: ", mean_squared_error(y_test, las_preds)**(1/2))
print("MAE - Lasso: ", median_absolute_error(y_test, las_preds))
print("R2 - Lasso: ", r2_score(y_test, las_preds))