In [30]:
import pandas as pd

df = pd.read_csv("./data/teste_indicium_precificacao.csv")

In [31]:
df.head(
)

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


In [32]:
from sklearn.model_selection import train_test_split

def set_creator(data, test_size=0.2):
    train_set, test_set = train_test_split(data, test_size=test_size, stratify=data['bairro_group'], random_state=42)
    return train_set, test_set


In [33]:

from pre_process import PreProcess
train_set, test_set = set_creator(df)
data_prep = PreProcess(
    data=train_set,
    irrelevant_columns=['nome', 'host_id', 'host_name', 'id', 'ultima_review'],
    encoder_type='ordinal_encoder',
    cat_columns=['bairro', 'bairro_group', 'room_type'],
    )

In [34]:
data_prep.pre_process()

In [35]:
data_prep.X_prep

array([[ 2.39983712e-01, -1.01160079e+00, -2.94372689e-01, ...,
         3.40000000e+01,  2.00000000e+00,  0.00000000e+00],
       [ 5.65447308e-03, -1.02639994e+00, -2.45652894e-01, ...,
         9.20000000e+01,  2.00000000e+00,  0.00000000e+00],
       [-6.61945496e-01,  1.04722189e+00, -2.45652894e-01, ...,
         2.80000000e+01,  1.00000000e+00,  1.00000000e+00],
       ...,
       [ 2.31062542e+00,  4.35886488e-01, -2.94372689e-01, ...,
         2.06000000e+02,  2.00000000e+00,  1.00000000e+00],
       [-1.03562546e+00, -6.99295236e-01, -2.94372689e-01, ...,
         1.51000000e+02,  1.00000000e+00,  0.00000000e+00],
       [-3.99196044e-01, -1.08852748e-01, -2.94372689e-01, ...,
         2.14000000e+02,  1.00000000e+00,  1.00000000e+00]],
      shape=(39115, 11))

In [36]:
def display_scores(scores):
    print("Pontuacao: ", scores)
    print("Média: ", scores.mean())
    print("Desvio Padrão: ", scores.std())

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

def create_linreg_model(X, Y):
    lin_reg = LinearRegression()
    lin_reg.fit(X, Y)
    return lin_reg

def create_decision_tree_reg(X, Y):
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(X, Y)
    return tree_reg

def calculate_score(model, X, Y):
    scores = cross_val_score(model, X, Y, scoring="neg_mean_squared_error", cv=10)
    rmse_scores = np.sqrt(-scores)
    return rmse_scores



In [38]:
lin_reg = create_linreg_model(data_prep.X_prep, data_prep.Y)
display_scores(calculate_score(lin_reg, data_prep.X_prep, data_prep.Y))

Pontuacao:  [237.23385206 258.22046387 227.21219891 208.11342809 213.84610278
 231.261233   202.34711    208.33797817 240.35993644 218.93985212]
Média:  224.58721554332442
Desvio Padrão:  16.639274260921177


In [39]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = create_decision_tree_reg(data_prep.X_prep, data_prep.Y)
display_scores(calculate_score(tree_reg, data_prep.X_prep, data_prep.Y))

Pontuacao:  [295.52960372 364.31918921 271.94489764 282.75718562 336.58090373
 308.77319047 309.70026007 236.07237221 277.01984888 282.32584253]
Média:  296.50232940797963
Desvio Padrão:  33.99329372162417


**O modelo de Regressão Linear se saiu melhor**
**Nas proximas linhas, vou testar um modelo com a remoção de outliers**

In [40]:
data_prep_no_outliers = PreProcess(
    data=train_set,
    irrelevant_columns=['nome', 'host_id', 'host_name', 'id', 'ultima_review'],
    encoder_type="ordinal_encoder",
    cat_columns=['bairro', 'bairro_group', 'room_type'],
    remove_outliers=True,
)

In [41]:
data_prep_no_outliers.pre_process()

In [42]:
lin_reg_no_out = create_linreg_model(data_prep_no_outliers.X_prep, data_prep_no_outliers.Y)
display_scores(calculate_score(lin_reg_no_out, data_prep_no_outliers.X_prep, data_prep_no_outliers.Y))

Pontuacao:  [41.86234953 41.84816811 42.14398068 41.64688194 42.04281395 40.2112417
 41.79797737 41.48718201 43.15116523 41.9186896 ]
Média:  41.811045011067826
Desvio Padrão:  0.6831881891199195


In [43]:
tree_reg_no_out = create_decision_tree_reg(data_prep_no_outliers.X_prep, data_prep_no_outliers.Y)
display_scores(calculate_score(tree_reg_no_out, data_prep_no_outliers.X_prep, data_prep_no_outliers.Y))

Pontuacao:  [52.77232165 50.52337932 51.81558217 52.00044932 52.91120118 50.57638277
 50.90160523 51.51391384 51.74394247 51.65916751]
Média:  51.641794545510514
Desvio Padrão:  0.7751931836903021


**Os valores da média diminuíram, porém, isso não indica necessáriamente um modelo melhor**  
**Regressão Linear será o modelo utilizado para aperfeiçoamento**

In [44]:
data_prep_no_outliers.X_prep.tolist()

[[0.012703653704297754,
  -1.0703880728166042,
  -0.23472519725264626,
  -0.5108360778656206,
  -0.07013985866009062,
  -0.1716279788346435,
  0.300107565973677,
  -0.26174054874327257,
  91.0,
  2.0,
  0.0],
 [-0.6432454734144019,
  1.0022197758775877,
  -0.23472519725264626,
  -0.03160732416341221,
  0.7611282698374677,
  -0.030596616980590484,
  1.7909028377630931,
  -0.26174054874327257,
  28.0,
  1.0,
  1.0],
 [0.25969795562136305,
  -0.9790262658843013,
  2.318759749663256,
  -0.2930048261827986,
  0.1207439338097191,
  -0.1716279788346435,
  0.7481185802938124,
  -0.26174054874327257,
  34.0,
  2.0,
  1.0],
 [-0.2969850443229233,
  -0.22855428036829373,
  -0.09018831346495366,
  -0.1840892003413876,
  -0.30412644297792185,
  -0.13637013837113024,
  -0.7504010193287095,
  -0.26174054874327257,
  213.0,
  1.0,
  1.0],
 [-1.4629566869246018,
  -0.247261697978318,
  -0.18654623599008205,
  0.8832839329044401,
  0.5148266021344874,
  -0.13637013837113024,
  1.76000552643067,
  -0.261

In [45]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False],
    'n_jobs': [None, -1]
}
model = create_linreg_model(data_prep_no_outliers.X_prep, data_prep_no_outliers.Y)

grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(data_prep_no_outliers.X_prep, data_prep_no_outliers.Y)

In [46]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

55.045855686267984 {'fit_intercept': True, 'n_jobs': None, 'positive': True}
41.81662625104149 {'fit_intercept': True, 'n_jobs': None, 'positive': False}
55.045855686267984 {'fit_intercept': True, 'n_jobs': -1, 'positive': True}
41.81662625104149 {'fit_intercept': True, 'n_jobs': -1, 'positive': False}
66.02080719167733 {'fit_intercept': False, 'n_jobs': None, 'positive': True}
60.12245238472951 {'fit_intercept': False, 'n_jobs': None, 'positive': False}
66.02080719167733 {'fit_intercept': False, 'n_jobs': -1, 'positive': True}
60.12245238472951 {'fit_intercept': False, 'n_jobs': -1, 'positive': False}


In [47]:
final_model = grid_search.best_estimator_

In [48]:
display_scores(calculate_score(final_model, data_prep_no_outliers.X_prep, data_prep_no_outliers.Y))

Pontuacao:  [41.86234953 41.84816811 42.14398068 41.64688194 42.04281395 40.2112417
 41.79797737 41.48718201 43.15116523 41.9186896 ]
Média:  41.811045011067826
Desvio Padrão:  0.6831881891199195


**Temos um modelo final**
**Vamos agora avaliar no conjunto de teste**

In [49]:
data_test = PreProcess(
    data=test_set,
    irrelevant_columns=['nome', 'host_id', 'host_name', 'id', 'ultima_review'],
    encoder_type="ordinal_encoder",
    cat_columns=['bairro', 'bairro_group', 'room_type'],
)

In [50]:
data_test.pre_process()

In [51]:
test_predictions = final_model.predict(data_test.X_prep)

In [52]:
from sklearn.metrics import mean_squared_error
final_mse = mean_squared_error(data_test.Y, test_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

np.float64(250.3209601301908)

## Resultado final
**O modelo final tem um desempenho semelhate no conjunto de teste e validação**  
**teste = 221.987**  
**validação = 213.805**  
**Isso é um indicativo que o modelo não está subajustado ou sobreajustado**  
**OBS: No conjunto de teste, não houve a remoção de outliers**

**Mas e se a coluna "nome" influenciar no preço?**

In [53]:
train_set.head()

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
23650,19122738,New York City living,10733404,Rob,Manhattan,Chelsea,40.74216,-73.99887,Entire home/apt,275,1,2,2017-08-13,0.08,1,0
46896,35482322,⚡Quiet Home in Center of Village,266797970,Javi,Manhattan,Greenwich Village,40.72938,-73.99955,Entire home/apt,249,2,1,2019-06-24,1.0,1,146
35636,28298405,Great room in Bushwick!,212071658,Brian,Brooklyn,Bushwick,40.69297,-73.90427,Private room,65,2,23,2019-05-05,2.35,5,339
18587,14690331,Unique Designer 1BR in Best NYC Neighborhood,27984357,Patrick,Manhattan,West Village,40.73316,-74.00782,Entire home/apt,300,4,24,2019-06-20,0.7,1,61
34170,27100997,Cozy & Private Bedroom near Highline- CHELSEA,203346157,Bryan,Manhattan,Chelsea,40.74309,-73.99535,Private room,115,55,11,2019-05-20,1.31,1,204


In [54]:
data_with_name = PreProcess(
    data=train_set,
    irrelevant_columns=['id', 'host_id', 'host_name', 'ultima_review'],
    encoder_type="ordinal_encoder",
    cat_columns=['bairro', 'bairro_group', 'room_type'],
    name_columns=['nome'],
    remove_outliers=True

)

In [55]:
data_with_name.X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39115 entries, 23650 to 37162
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             39115 non-null  int64  
 1   nome                           39106 non-null  object 
 2   host_id                        39115 non-null  int64  
 3   host_name                      39097 non-null  object 
 4   bairro_group                   39115 non-null  object 
 5   bairro                         39115 non-null  object 
 6   latitude                       39115 non-null  float64
 7   longitude                      39115 non-null  float64
 8   room_type                      39115 non-null  object 
 9   minimo_noites                  39115 non-null  int64  
 10  numero_de_reviews              39115 non-null  int64  
 11  ultima_review                  31109 non-null  object 
 12  reviews_por_mes                31109 non-null  

In [56]:
data_with_name.pre_process()

In [57]:
lin_reg = create_linreg_model(data_with_name.X_prep, data_with_name.Y)
display_scores(calculate_score(lin_reg, data_with_name.X_prep, data_with_name.Y))


Pontuacao:  [41.81055648 41.55223866 42.00715165 41.3800201  41.82970927 40.11220124
 41.60425597 41.30582384 42.89971503 41.78252456]
Média:  41.62841968082914
Desvio Padrão:  0.6578864369796076


In [58]:
import joblib
joblib.dump(final_model, "linear_regression_model.pkl")

['linear_regression_model.pkl']

**Temos um desempenho levemente melhor com a inclusão da coluna 'nomes'.**