In [108]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [109]:
# Load the Titanic train/test CSV files (paths are relative to the notebook).
df_train = pd.read_csv("../datasets/titanic/train.csv")
df_test = pd.read_csv("../datasets/titanic/test.csv")

# Split the training frame into target (y) and features (X).
y = df_train['Survived']
X = df_train.drop('Survived', axis=1)
# The test frame is used as-is for the final predictions.
X_test = df_test

In [110]:
# Print the dtype of every column, first for the training frame and then for
# the test frame.
for titulo, frame in (
    ("Tipo de dados do dataset de treino:", df_train),
    ("\nTipo de dados do dataset de teste:", df_test),
):
    print(titulo)
    for column, dtype in frame.dtypes.items():
        print(f"{column}: {dtype}")

Tipo de dados do dataset de treino:
PassengerId: int64
Survived: int64
Pclass: int64
Name: object
Sex: object
Age: float64
SibSp: int64
Parch: int64
Ticket: object
Fare: float64
Cabin: object
Embarked: object

Tipo de dados do dataset de teste:
PassengerId: int64
Pclass: int64
Name: object
Sex: object
Age: float64
SibSp: int64
Parch: int64
Ticket: object
Fare: float64
Cabin: object
Embarked: object


In [111]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [112]:
from sklearn.base import BaseEstimator, TransformerMixin

# Selects the desired attributes
class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Transformer that drops columns which should not be used as features.

    Parameters
    ----------
    colunasIndesejadas : sequence of str, optional
        Labels of the columns to remove. Defaults to
        ``('PassengerId', 'Name', 'Ticket', 'Cabin')``, the columns this
        notebook has always dropped.
    """

    def __init__(self, colunasIndesejadas=('PassengerId', 'Name', 'Ticket', 'Cabin')):
        # Stored unmodified under the parameter's own name so that
        # BaseEstimator.get_params / set_params and sklearn.base.clone work.
        self.colunasIndesejadas = colunasIndesejadas

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns.

        Raises ``KeyError`` (from ``DataFrame.drop``) if a configured column
        is not present in ``X``.
        """
        # list(...) so a tuple is treated as several labels, not as one label.
        return X.drop(list(self.colunasIndesejadas), axis=1)

atributosDesejados = AtributosDesejados()
Xdrop = atributosDesejados.fit_transform(X)
Xdrop

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [113]:
# Selects the numeric attributes
class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Keep only the columns that had a numeric dtype in the frame given to ``fit``."""

    def fit(self, X, y=None):
        """Record the labels of the numeric columns of ``X`` and return ``self``."""
        somente_numericos = X.select_dtypes(include='number')
        self.colunasNumericas = somente_numericos.columns
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` whose labels were recorded by ``fit``."""
        colunas = self.colunasNumericas
        return X[colunas]

atributosNumericos = AtributosNumericos()
Xnum = atributosNumericos.fit_transform(Xdrop)
Xnum.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [114]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: select the numeric attributes, fill missing values with the
# median, and standardize the values.
pipenum = Pipeline(
    steps=[
        ('atributos_numericos', AtributosNumericos()),
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

XnumLimpo = pipenum.fit_transform(Xnum)
XnumLimpo

array([[ 0.82737724, -0.56573646,  0.43279337, -0.47367361, -0.50244517],
       [-1.56610693,  0.66386103,  0.43279337, -0.47367361,  0.78684529],
       [ 0.82737724, -0.25833709, -0.4745452 , -0.47367361, -0.48885426],
       ...,
       [ 0.82737724, -0.1046374 ,  0.43279337,  2.00893337, -0.17626324],
       [-1.56610693, -0.25833709, -0.4745452 , -0.47367361, -0.04438104],
       [ 0.82737724,  0.20276197, -0.4745452 , -0.47367361, -0.49237783]])

In [115]:
class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Keep only the columns that had ``object`` dtype in the frame given to ``fit``."""

    def fit(self, X, y=None):
        """Record the labels of the ``object``-dtype columns of ``X`` and return ``self``."""
        somente_objetos = X.select_dtypes(include='object')
        self.colunasCategoricas = somente_objetos.columns
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` whose labels were recorded by ``fit``."""
        colunas = self.colunasCategoricas
        return X[colunas]

atributosCategoricos = AtributosCategoricos()
Xcat = atributosCategoricos.fit_transform(Xdrop)
Xcat

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S
...,...,...
886,male,S
887,female,S
888,female,S
889,male,C


In [116]:
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: select the categorical attributes, fill missing values
# with the most frequent value, and one-hot encode them.
pipecat = Pipeline([
    ('atributos_categoricos', AtributosCategoricos()),
    # NOTE: the step name 'inputer' (sic) is kept because it is part of the
    # pipeline's parameter names.
    ('inputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category that was not seen during fit (e.g.
    # absent from a cross-validation training fold) is encoded as all zeros
    # instead of raising an error at transform time.
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

XcatLimpo = pipecat.fit_transform(Xdrop)
XcatLimpo

<891x5 sparse matrix of type '<class 'numpy.float64'>'
	with 1782 stored elements in Compressed Sparse Row format>

In [117]:
from sklearn.pipeline import FeatureUnion

# Joins the outputs of the numeric and categorical branches side by side.
unecarcteristicas = FeatureUnion(
    transformer_list=[
        ('pipenum', pipenum),
        ('pipecat', pipecat),
    ]
)

Xtratado = unecarcteristicas.fit_transform(Xdrop)
Xtratado

<891x10 sparse matrix of type '<class 'numpy.float64'>'
	with 6237 stored elements in Compressed Sparse Row format>

In [118]:
# Final preprocessing pipeline: drop the unwanted columns, then build the
# combined numeric + categorical feature matrix.
preproc = Pipeline(
    steps=[
        ('atributos_desejados', AtributosDesejados()),
        ('unecaracteristicas', unecarcteristicas),
    ]
)

Xtratado = preproc.fit_transform(X)
Xtratado

<891x10 sparse matrix of type '<class 'numpy.float64'>'
	with 6237 stored elements in Compressed Sparse Row format>

In [119]:
from sklearn.tree import DecisionTreeClassifier

# Full model: preprocessing followed by a decision tree.
# random_state is fixed so that repeated runs (and the cross-validation / grid
# search built on this pipeline) give reproducible results; 4 matches the seed
# used for the XGBoost classifier in this notebook.
pipetotal = Pipeline([ 
    ('preproc', preproc),
    ('arvore', DecisionTreeClassifier(random_state=4))
])


In [120]:
from sklearn.metrics import accuracy_score

# Fit on the full training set and score on the very same rows: this is
# training-set accuracy, not an estimate of performance on unseen data.
ypred = pipetotal.fit(X, y).predict(X)
accuracy_score(y, ypred)

0.9797979797979798

In [121]:
from sklearn.model_selection import cross_validate

# Cross-validate the whole pipeline and show the per-fold results together
# with the mean test score.
scores = cross_validate(pipetotal, X, y)
scores, scores['test_score'].mean()

({'fit_time': array([0.02292633, 0.01546431, 0.02206302, 0.01509213, 0.01514792]),
  'score_time': array([0.00539589, 0.00578547, 0.00792599, 0.00530243, 0.00586867]),
  'test_score': array([0.75418994, 0.78089888, 0.80898876, 0.73595506, 0.80337079])},
 0.7766806854560291)

## Modelo: Decision Tree

In [122]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the 'arvore' step of the pipeline.
parametros = dict(
    arvore__max_depth=[None, *range(1, 20, 2)],
    arvore__criterion=['gini', 'entropy'],
)

modelo = GridSearchCV(estimator=pipetotal, param_grid=parametros)

# The grid search itself is cross-validated, so the search is re-run on each
# outer training fold.
scores = cross_validate(modelo, X, y)
scores, scores['test_score'].mean()

({'fit_time': array([1.99056792, 1.95237136, 1.96606541, 2.34236288, 1.99717236]),
  'score_time': array([0.0047121 , 0.00468636, 0.00472212, 0.00460124, 0.00470662]),
  'test_score': array([0.82122905, 0.79775281, 0.80898876, 0.78089888, 0.8258427 ])},
 0.8069424392693489)

In [123]:
# Run the grid search on the full training set, then predict the test set.
modelo.fit(X, y).predict(X_test)

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Modelo: XGBoost

In [125]:
# Requires the xgboost package in the kernel's environment
# (e.g. `%pip install xgboost`).
import xgboost as xgb
from sklearn.base import clone

xgboostclassifier = xgb.XGBClassifier(random_state=4)

# Silence XGBoost's own log messages.
xgb.set_config(verbosity=0)

# Use an unfitted copy of the preprocessing pipeline rather than the same
# object that `pipetotal` holds, so fitting one pipeline directly cannot
# overwrite the fitted preprocessing state of the other.
pipetotal2 = Pipeline([
    ('preproc', clone(preproc)),
    ('boost', xgboostclassifier)
])

# Hyper-parameter grid for the 'boost' step; the commented-out entries are
# further dimensions that are currently not searched.
params = {
        'boost__n_estimators': [10, 20, 30, 100, 500, 600, 700],
        'boost__min_child_weight': [1, 5, 10],
        # 'boost__gamma': [0.5, 1, 1.5, 2, 5],
        # 'boost__subsample': [0.6, 0.8, 1.0],
        # 'boost__colsample_bytree': [0.6, 0.8, 1.0],
        'boost__max_depth': [3, 4, 5]
        }

modelo2 = GridSearchCV(pipetotal2, param_grid=params, verbose=1)

# The grid search itself is cross-validated, so the search is re-run on each
# outer training fold.
scores = cross_validate(modelo2, X, y)
scores, np.mean(scores['test_score'])

Fitting 5 folds for each of 63 candidates, totalling 315 fits
Fitting 5 folds for each of 63 candidates, totalling 315 fits
Fitting 5 folds for each of 63 candidates, totalling 315 fits
Fitting 5 folds for each of 63 candidates, totalling 315 fits
Fitting 5 folds for each of 63 candidates, totalling 315 fits


({'fit_time': array([38.771029  , 36.15739107, 38.51706982, 35.44217253, 35.83389425]),
  'score_time': array([0.00866175, 0.00965762, 0.00901008, 0.00756073, 0.00756979]),
  'test_score': array([0.81005587, 0.81460674, 0.81460674, 0.80337079, 0.83146067])},
 0.8148201619484026)

In [126]:
# Run the grid search on the full training set and display the best pipeline found.
modelo2.fit(X, y)
modelo2.best_estimator_

Fitting 5 folds for each of 63 candidates, totalling 315 fits


Pipeline(steps=[('preproc',
                 Pipeline(steps=[('atributos_desejados', AtributosDesejados()),
                                 ('unecaracteristicas',
                                  FeatureUnion(transformer_list=[('pipenum',
                                                                  Pipeline(steps=[('atributos_numericos',
                                                                                   AtributosNumericos()),
                                                                                  ('imputer',
                                                                                   SimpleImputer(strategy='median')),
                                                                                  ('scaler',
                                                                                   StandardScaler())])),
                                                                 ('pipecat',
                                                            

In [127]:
# Predict the test set with the fitted XGBoost grid search.
y_pred = modelo2.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [128]:
# Kaggle submission file: the test PassengerId column plus the predictions.
my_submission = df_test[['PassengerId']].assign(Survived=y_pred)
my_submission.to_csv('../reports/titanic-submission.csv', index=False)