In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

In [2]:
argentina = pd.read_csv('data/argentina_cleaned.csv')
argentina.head(2)

Unnamed: 0.1,Unnamed: 0,start_date,end_date,created_on,lat,lon,l1,l2,l3,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,price_period,property_type,operation_type
0,0,2020-06-17,2020-06-18,2020-06-17,-34.99534,-58.047423,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,2.844661,0.0,1.70148,474.884308,6370.15545,176556.301186,Monthly,Department,For Sale
1,1,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,5.0,2.0,1.0,72.0,72.0,176556.301186,Monthly,Department,For Sale


In [3]:
argentina.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
argentina.head(2)

Unnamed: 0,start_date,end_date,created_on,lat,lon,l1,l2,l3,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,price_period,property_type,operation_type
0,2020-06-17,2020-06-18,2020-06-17,-34.99534,-58.047423,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,2.844661,0.0,1.70148,474.884308,6370.15545,176556.301186,Monthly,Department,For Sale
1,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,5.0,2.0,1.0,72.0,72.0,176556.301186,Monthly,Department,For Sale


In [5]:
argentina.describe()

Unnamed: 0,lat,lon,rooms,bedrooms,bathrooms,surface_total,surface_covered,price
count,963098.0,963098.0,963098.0,963098.0,963098.0,963098.0,963098.0,963098.0
mean,-34.816634,-59.193616,2.844661,2.084058,1.70148,474.884308,6370.155,176556.3
std,2.711359,2.366182,1.210602,1.48937,0.9459,2499.260252,2188542.0,10198070.0
min,-54.840886,-119.69939,1.0,-16.0,1.0,-136.0,-130.0,0.01
25%,-34.816634,-59.193616,2.844661,2.0,1.0,178.0,120.0,19000.0
50%,-34.649614,-58.498129,2.844661,2.084058,1.70148,474.884308,6370.155,88000.0
75%,-34.54875,-58.373499,3.0,2.084058,2.0,474.884308,6370.155,180000.0
max,85.051129,-1.981231,40.0,900.0,20.0,200000.0,2147484000.0,10000000000.0


## Create Baseline Model

In [6]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error

In [7]:
X_train, X_test, y_train, y_test = train_test_split(argentina.drop('price', axis=1), argentina.price, test_size=.3)

In [8]:
dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)
dummy_median = DummyRegressor(strategy = 'median').fit(X_train, y_train)

mean_preds = dummy_mean.predict(X_test)
median_preds = dummy_median.predict(X_test)

In [9]:
print("MSE - Dummy Regressor w/ Mean: ", mean_squared_error(y_test, mean_preds))
print("MSE - Dummy Regressor w/ Median: ", mean_squared_error(y_test, median_preds))
print()
print("MAE - Dummy Regressor w/ Mean: ", median_absolute_error(y_test, mean_preds))
print("MAE - Dummy Regressor w/ Median: ", median_absolute_error(y_test, median_preds))
print()
print("R2- Dummy Regressor w/ Mean: ", r2_score(y_test, mean_preds))
print("R2 - Dummy Regressor w/ Median: ", r2_score(y_test, median_preds))



MSE - Dummy Regressor w/ Mean:  188373133022.37988
MSE - Dummy Regressor w/ Median:  194119569176.26944

MAE - Dummy Regressor w/ Mean:  122316.86789246171
MAE - Dummy Regressor w/ Median:  78224.0

R2- Dummy Regressor w/ Mean:  -0.0013385511528161143
R2 - Dummy Regressor w/ Median:  -0.03188498821793795


In [10]:
from sklearn.ensemble import GradientBoostingClassifier

In [11]:
gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1).fit(X_train, y_train)
gbc.score(X_test, y_test)

ValueError: could not convert string to float: '2020-11-18'

## Fix string columns and one-hot encode

In [13]:
argentina.head()

Unnamed: 0,start_date,end_date,created_on,lat,lon,l1,l2,l3,rooms,bedrooms,bathrooms,surface_total,surface_covered,price,price_period,property_type,operation_type
0,2020-06-17,2020-06-18,2020-06-17,-34.99534,-58.047423,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,2.844661,0.0,1.70148,474.884308,6370.15545,176556.301186,Monthly,Department,For Sale
1,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,5.0,2.0,1.0,72.0,72.0,176556.301186,Monthly,Department,For Sale
2,2020-07-14,2020-08-13,2020-07-14,-31.116769,-64.482921,Argentina,Córdoba,Valle Hermoso,2.844661,3.0,1.0,747.0,244.0,176556.301186,Monthly,House,For Sale
3,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,3.0,1.0,1.0,40.0,40.0,176556.301186,Monthly,Department,For Sale
4,2020-05-20,2020-05-20,2020-05-20,-34.816634,-59.193616,Argentina,Neuquén,Neuquén,2.844661,2.084058,1.0,35.0,35.0,176556.301186,Monthly,Office,For Sale


In [14]:
argentina.dtypes

start_date          object
end_date            object
created_on          object
lat                float64
lon                float64
l1                  object
l2                  object
l3                  object
rooms              float64
bedrooms           float64
bathrooms          float64
surface_total      float64
surface_covered    float64
price              float64
price_period        object
property_type       object
operation_type      object
dtype: object

In [16]:
argentina['end_date'] = pd.to_datetime(argentina['end_date'])
argentina['start_date'] = pd.to_datetime(argentina['start_date'])
argentina['created_on'] = pd.to_datetime(argentina['created_on'])
argentina.dtypes

start_date         datetime64[ns]
end_date           datetime64[ns]
created_on         datetime64[ns]
lat                       float64
lon                       float64
l1                         object
l2                         object
l3                         object
rooms                     float64
bedrooms                  float64
bathrooms                 float64
surface_total             float64
surface_covered           float64
price                     float64
price_period               object
property_type              object
operation_type             object
dtype: object