# First Model


## Importing packages and data


In [17]:
from typing import Any

import pickle
import sklearn
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, TargetEncoder

In [18]:
df = pd.read_csv('../dataset/teste_indicium_precificacao.csv')
df.head()

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


## Defining Functions


In [19]:
def predict_data(model: BaseEstimator, data: dict[str, Any] = None) -> float:
    '''
    Takes a model and uses it to predict a value for a set of data.

    data: takes a dict containing the dict to predict, by default `None`
    '''
    if not data:
        data = pd.DataFrame(
            {
                'nome': ['Skylit Midtown Castle'],
                'host_id': ['Jennifer'],
                'bairro_group': ['Manhatan'],
                'bairro': ['Midtown'],
                'latitude': [40.75362],
                'longitude': [-73.98377],
                'room_type': ['Entire home / apt'],
                'minimo_noites': [1],
                'numero_de_reviews': [45],
                'ultima_review': ['2019-05-21'],
                'reviews_por_mes': [0.38],
                'calculado_host_listings_count': [2],
                'disponibilidade_365': [355],
            },
        )
    prediction = model.predict(data)

    return float(prediction[0])

## Exploring data


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48894 entries, 0 to 48893
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             48894 non-null  int64  
 1   nome                           48878 non-null  object 
 2   host_id                        48894 non-null  int64  
 3   host_name                      48873 non-null  object 
 4   bairro_group                   48894 non-null  object 
 5   bairro                         48894 non-null  object 
 6   latitude                       48894 non-null  float64
 7   longitude                      48894 non-null  float64
 8   room_type                      48894 non-null  object 
 9   price                          48894 non-null  int64  
 10  minimo_noites                  48894 non-null  int64  
 11  numero_de_reviews              48894 non-null  int64  
 12  ultima_review                  38842 non-null 

In [21]:
df.nunique()

id                               48894
nome                             47904
host_id                          37457
host_name                        11452
bairro_group                         5
bairro                             221
latitude                         19048
longitude                        14718
room_type                            3
price                              674
minimo_noites                      109
numero_de_reviews                  394
ultima_review                     1764
reviews_por_mes                    937
calculado_host_listings_count       47
disponibilidade_365                366
dtype: int64

In [22]:
df.isna().sum()

id                                   0
nome                                16
host_id                              0
host_name                           21
bairro_group                         0
bairro                               0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimo_noites                        0
numero_de_reviews                    0
ultima_review                    10052
reviews_por_mes                  10052
calculado_host_listings_count        0
disponibilidade_365                  0
dtype: int64

### Treating data


In [23]:
df['ultima_review'] = pd.to_datetime(df['ultima_review'])
df.fillna(
    {
        'nome': df.bairro + ' ' + df.room_type,
        'reviews_por_mes': 0,
        'ultima_review': df['ultima_review'].min(),
    },
    inplace=True,
)
df.drop('host_name', inplace=True, axis=1)

## Encoding


In [24]:
np.random.seed(42)

categorical_features = ['bairro_group', 'bairro', 'room_type']
numerical_features = [
    'minimo_noites',
    'numero_de_reviews',
    'reviews_por_mes',
    'calculado_host_listings_count',
    'disponibilidade_365',
]


categorical_transformer = TargetEncoder(
    categories='auto',
    target_type='continuous',
    smooth=0.2,
)

numerical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaling', StandardScaler()),
    ],
)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features),
    ]
)

model = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor()),
    ],
)

param_grid = {
    'model__n_estimators': [100, 300, 500, 1000],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10, 15],
    'model__min_samples_leaf': [1, 2, 5, 10],
    'model__max_features': ['sqrt', 'log2', None],
    'model__bootstrap': [True, False],
}

In [25]:
X = df.drop('price', axis=1).copy()
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [26]:
rs_model = RandomizedSearchCV(model, param_grid, cv=5, verbose=2, n_jobs=5)
rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END model__bootstrap=True, model__max_depth=None, model__max_features=None, model__min_samples_leaf=5, model__min_samples_split=15, model__n_estimators=300; total time=  19.1s
[CV] END model__bootstrap=True, model__max_depth=None, model__max_features=None, model__min_samples_leaf=5, model__min_samples_split=15, model__n_estimators=300; total time=  19.2s
[CV] END model__bootstrap=True, model__max_depth=None, model__max_features=None, model__min_samples_leaf=5, model__min_samples_split=15, model__n_estimators=300; total time=  19.3s
[CV] END model__bootstrap=True, model__max_depth=None, model__max_features=None, model__min_samples_leaf=5, model__min_samples_split=15, model__n_estimators=300; total time=  19.3s
[CV] END model__bootstrap=True, model__max_depth=None, model__max_features=None, model__min_samples_leaf=5, model__min_samples_split=15, model__n_estimators=300; total time=  19.5s
[CV] END model__bootstrap=False, m



[CV] END model__bootstrap=False, model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=500; total time=  15.7s
[CV] END model__bootstrap=False, model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=500; total time=  15.8s
[CV] END model__bootstrap=False, model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=500; total time=  15.9s
[CV] END model__bootstrap=False, model__max_depth=30, model__max_features=sqrt, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=500; total time=  15.8s
[CV] END model__bootstrap=False, model__max_depth=20, model__max_features=log2, model__min_samples_leaf=5, model__min_samples_split=10, model__n_estimators=100; total time=   3.8s
[CV] END model__bootstrap=False, model__max_depth=20, model__max_features=log2, model__min_samples_leaf=

In [27]:
score_test, score_train = rs_model.score(X_test, y_test), rs_model.score(
    X_train, y_train
)
score_test, score_train

(0.13059541029071786, 0.42802655635896103)

In [28]:
prediction = predict_data(rs_model)
prediction

357.90047857142855

In [30]:
pickle.dump(rs_model, open('../models/model.pickle', 'wb'))