# IESB - CIA035 - Aula 9.1 - Intro Pipeline

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/melbourne-housing-snapshot/melb_data.csv


In [2]:
# Load the Melbourne housing snapshot into a DataFrame.
df = pd.read_csv(
    '/kaggle/input/melbourne-housing-snapshot/melb_data.csv',
)

# Display the (rows, columns) size of the loaded frame.
df.shape

(13580, 21)

In [3]:
# Inspect the data: column names, non-null counts and dtypes.
# The non-null counts show which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [4]:
# Split the columns by dtype into numeric and categorical groups.
# 'number' matches every numeric dtype (any int/float width), so the split
# does not depend on the exact widths pandas inferred when reading the CSV.
num_feats = df.select_dtypes(include='number').columns

# 'Price' is the value being predicted, so it must not be used as a feature.
num_feats = num_feats.drop('Price')

# Columns stored with the 'object' dtype (the text columns of this dataset).
cat_feats = df.select_dtypes(include='object').columns

# Show both groups together with their sizes.
print(num_feats , len(num_feats))
print(cat_feats, len(cat_feats))

Index(['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude',
       'Propertycount'],
      dtype='object') 12
Index(['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea',
       'Regionname'],
      dtype='object') 8


In [5]:
# Number of distinct values in the 'Address' column.
df['Address'].nunique()

13378

In [6]:
# Remove 'Address' from the categorical features: it has too many unique
# values (nearly one per row) to be a useful category.
# errors='ignore' keeps this cell idempotent: re-running it after 'Address'
# has already been removed would otherwise raise a KeyError.
cat_feats = cat_feats.drop('Address', errors='ignore')

cat_feats

Index(['Suburb', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea',
       'Regionname'],
      dtype='object')

In [7]:
# Build the final feature list: numeric columns first, then categorical ones.
feats = [*num_feats, *cat_feats]

feats

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount',
 'Suburb',
 'Type',
 'Method',
 'SellerG',
 'Date',
 'CouncilArea',
 'Regionname']

In [8]:
# Split the DataFrame into train and test partitions.
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default proportion is used;
# random_state pins the shuffle so the split is reproducible.
train, test = train_test_split(
    df,
    random_state=42,
)

train.shape, test.shape

((10185, 21), (3395, 21))

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' lets categories that were not
# seen during fit pass through transform without raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both branches: each group of columns goes to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, num_feats),
    ('cat', categorical_transformer, cat_feats),
])

In [10]:
# Model: random forest regressor.
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(
    n_estimators=200,
    random_state=42,  # fixed seed so repeated runs build the same forest
    n_jobs=-1,        # -1 asks scikit-learn to use all available cores
)

In [11]:
from sklearn.metrics import mean_absolute_error

# Assemble the full pipeline: preprocessing followed by the regressor.
# The step names matter: 'model' is the prefix used to address the
# regressor's hyperparameters from outside the pipeline.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rfr),
])

# Fit preprocessing and model together on the training split only.
my_pipeline.fit(train[feats], train['Price'])

# Evaluate on the held-out split: score() is the estimator's default metric
# (R^2 for scikit-learn regressors); MAE is in the units of 'Price'.
preds = my_pipeline.predict(test[feats])
score = my_pipeline.score(test[feats], test['Price'])
mae = mean_absolute_error(y_true=test['Price'], y_pred=preds)

score, mae

(0.7985634617418413, 166629.11779528717)

In [12]:
# The pipeline can be reused at any point, e.g. as the estimator of a search.

# Hyperparameter tuning: try to improve the result with GridSearchCV.
from sklearn.model_selection import GridSearchCV

# ATTENTION
# When a Pipeline is searched, each parameter name must be prefixed with the
# name given to the pipeline step (here 'model') followed by two
# underscores (__).
rfr_params_grid = dict(
    model__n_estimators=[100, 150, 200],
    model__max_depth=[7],
    model__max_features=[5],
    model__min_samples_leaf=[3],
    model__min_samples_split=[4, 6, 9],
)

# The whole pipeline, not the bare model, is passed as the estimator, so the
# preprocessing is re-fitted inside every cross-validation fold.
gscv_rfr_cv = GridSearchCV(
    estimator=my_pipeline,
    param_grid=rfr_params_grid,
    cv=5,
    n_jobs=-1,
    verbose=5,
)
gscv_rfr_cv.fit(train[feats], train['Price'])

# best_score_ is the mean cross-validated score of the best candidate on the
# training data; it is not the score on the held-out test split.
gscv_rfr_cv_score = gscv_rfr_cv.best_score_

# predict() uses the best candidate found by the search.
gscv_rfr_cv_pred = gscv_rfr_cv.predict(test[feats])

# NOTE(review): this rebinds `mae`, which the baseline pipeline cell also
# assigns; the baseline value is no longer available after this cell runs.
mae = mean_absolute_error(test['Price'], gscv_rfr_cv_pred)

gscv_rfr_cv_score, mae

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   16.2s finished


(0.19477958240807128, 410839.83864060306)

In [13]:
# Parameter combination that achieved the best cross-validated score.
gscv_rfr_cv.best_params_

{'model__max_depth': 7,
 'model__max_features': 5,
 'model__min_samples_leaf': 3,
 'model__min_samples_split': 6,
 'model__n_estimators': 100}