# Este notebook é uma continuação do notebook "Competição Kaggle - Titanic - Data Understanding"

## Dentro desse segundo caderno, vamos começar a desenvolver o modelo de machine learning e criar o dataframe para submissão do desafio

In [2]:
#Criação e manipulação dos dataframes
import pandas as pd 

#Operações matemáticas
import numpy as np 

#Biblioteca de normalização
from sklearn.preprocessing import StandardScaler, RobustScaler

#Classificadores
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

#Separar os dataframes em treino e teste
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

#Métricas de qualidade do modelo 
from sklearn.metrics import accuracy_score

#Verificar o run time dos modelos
from time import time

import seaborn as sns
import matplotlib.pyplot as plt

## Importando os dados e realizando o tratamento deles

In [3]:
# Load the labelled training set and the unlabelled test set from the
# local Data/ folder (paths are relative to the notebook's working directory).
df_train, df_test = (
    pd.read_csv(caminho) for caminho in ('Data/train.csv', 'Data/test.csv')
)

def limpar_dataframe(df):
    """Clean a Titanic passenger frame and add engineered features, in place.

    The frame passed in is modified directly and nothing is returned.

    Added columns:
        Tamanho_Familia -- SibSp + Parch
        Sozinho         -- 1 when Tamanho_Familia is 0, otherwise 0
        Letra_Cabine    -- first character of Cabin (missing stays missing)
        Titulo          -- text between the first ', ' and the first '.' of
                           Name; only the 4 most frequent values are kept,
                           everything else becomes 'Misc'

    Filled columns:
        Embarked -- missing values replaced by the column mode
        Age, Fare -- missing values replaced by the column median

    Dropped columns: PassengerId, Cabin, Name, Ticket.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns SibSp, Parch, Embarked, Age, Fare, Cabin,
        Name, PassengerId and Ticket.

    Returns
    -------
    None
    """
    # Family size: siblings/spouses plus parents/children
    df['Tamanho_Familia'] = df['SibSp'] + df['Parch']

    # Flag passengers with no family aboard (vectorized instead of apply)
    df['Sozinho'] = (df['Tamanho_Familia'] == 0).astype('int64')

    # Fill missing embarkation ports with the most frequent one.
    # Assigning the result back (instead of a chained
    # `df[col].fillna(..., inplace=True)`) keeps this working under pandas
    # copy-on-write, where the chained form no longer updates `df`.
    moda_embarque = df['Embarked'].mode()
    if not moda_embarque.empty:  # mode() is empty when every value is missing
        df['Embarked'] = df['Embarked'].fillna(moda_embarque[0])

    # Fill missing ages and fares with the column median
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Keep only the first character of the cabin code
    df['Letra_Cabine'] = df['Cabin'].str[:1]

    # Extract the title: the text after the first ', ' and before the first '.'
    titulo = df['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]
    # Keep the four most frequent titles and group the rest under 'Misc'
    titulos = titulo.value_counts(ascending=False)[:4]
    df['Titulo'] = titulo.where(titulo.isin(titulos.index), 'Misc')

    # Drop the columns that are not used as features
    df.drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket'], inplace=True)
    
# Clean both frames; limpar_dataframe modifies its argument in place.
limpar_dataframe(df_train)
limpar_dataframe(df_test)

# One-hot encode the categorical columns of each frame.
colunas_categoricas = ['Titulo', 'Embarked', 'Sex', 'Letra_Cabine']
df_train = pd.get_dummies(df_train, columns=colunas_categoricas)
df_test = pd.get_dummies(df_test, columns=colunas_categoricas)

# Encoding the two frames separately can yield different dummy columns when a
# category (e.g. a cabin letter or title) occurs in only one of them. Align the
# test frame to the training feature columns: dummies missing from the test set
# are added as all-zero columns, dummies never seen in training are dropped,
# and the column order matches what a model fitted on df_train expects.
colunas_features = df_train.columns.drop('Survived', errors='ignore')
df_test = df_test.reindex(columns=colunas_features, fill_value=0)

## Separando em treino/teste

In [4]:
# Separate the target column from the feature matrix.
y = df_train['Survived']
X = df_train.drop(columns=['Survived'])

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

## Testando os diferentes classificadores

In [202]:
# Same seed the train/validation split uses, so the whole comparison is
# repeatable from a fresh kernel.
seed = 42

# Candidate classifiers, all with default hyperparameters. The scikit-learn
# estimators constructed here with random_state are seeded so that the ranking
# below does not change from one run to the next.
# NOTE(review): Perceptron, SVC, GaussianNB, XGBClassifier and
# CatBoostClassifier are left exactly as they were — confirm they are
# deterministic in the installed versions.
models = [
    RandomForestClassifier(random_state=seed),
    BaggingClassifier(random_state=seed),
    ExtraTreesClassifier(random_state=seed),
    Perceptron(),
    MLPClassifier(random_state=seed),
    SVC(),
    GaussianNB(),
    DecisionTreeClassifier(random_state=seed),
    XGBClassifier(),
    CatBoostClassifier(verbose=0)
]

accuracy = []
name = []
parameters = []
tempo = []

for model in models:

    tempo_inicial = time()

    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Stop the clock right after predict so "Run Time" covers fit + predict
    # only, not the scoring and bookkeeping below.
    tempo_final = time() - tempo_inicial
    tempo.append(tempo_final)

    accuracy.append(accuracy_score(y_test, y_pred))
    name.append(model.__class__.__name__)
    parameters.append(model.get_params())

# One row per classifier: hold-out accuracy, wall-clock seconds, parameters used.
resultado = pd.DataFrame({'Nome': name,
                         'Accuracy': accuracy,
                         'Run Time': tempo,
                         'Parameters': parameters})

# Best model first; as the cell's last expression this is rendered as a table.
resultado.sort_values(by='Accuracy', ascending=False)



Unnamed: 0,Nome,Accuracy,Run Time,Parameters
9,CatBoostClassifier,0.826816,1.247144,{'verbose': 0}
8,XGBClassifier,0.810056,0.073017,"{'objective': 'binary:logistic', 'use_label_en..."
0,RandomForestClassifier,0.804469,0.12705,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w..."
4,MLPClassifier,0.804469,0.444064,"{'activation': 'relu', 'alpha': 0.0001, 'batch..."
2,ExtraTreesClassifier,0.798883,0.10562,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_..."
1,BaggingClassifier,0.782123,0.021002,"{'base_estimator': None, 'bootstrap': True, 'b..."
6,GaussianNB,0.77095,0.003,"{'priors': None, 'var_smoothing': 1e-09}"
7,DecisionTreeClassifier,0.765363,0.004001,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit..."
5,SVC,0.664804,0.034003,"{'C': 1.0, 'break_ties': False, 'cache_size': ..."
3,Perceptron,0.486034,0.003002,"{'alpha': 0.0001, 'class_weight': None, 'early..."


## Realizando o tuning dos hiperparâmetros do Random Forest Classifier

In [223]:
# Exhaustive search space for the random forest:
# 19 * 2 * 18 * 2 * 3 * 3 = 12312 parameter combinations.
parameters = {
    'n_estimators': np.arange(100,2000,100),
    'max_features': ['sqrt', 'log2'],
    'max_depth': np.arange(2,20),
    'bootstrap': [True, False],
    'min_samples_leaf': [1,2,4],
    'min_samples_split': [2,5,10]
}

# Seed the forest itself: with an unseeded estimator every candidate is fitted
# with fresh randomness, so best_params_ and the score below would change from
# run to run. n_jobs=-1 uses all available cores.
classifier = GridSearchCV(RandomForestClassifier(random_state=42), parameters, n_jobs=-1, verbose=1)
classifier.fit(X_train, y_train)

# Score the best configuration found on the hold-out split.
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classifier.best_params_)

Fitting 5 folds for each of 12312 candidates, totalling 61560 fits
0.8212290502793296
{'bootstrap': True, 'max_depth': 9, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}


In [222]:
# Same search space as the exhaustive grid search, sampled instead of enumerated.
parameters = {
    'n_estimators': np.arange(100,2000,100),
    'max_features': ['sqrt', 'log2'],
    'max_depth': np.arange(2,20),
    'bootstrap': [True, False],
    'min_samples_leaf': [1,2,4],
    'min_samples_split': [2,5,10]

}

# random_state on the search only fixes WHICH 100 candidates are sampled; the
# forest needs its own seed for the fitted models (and therefore best_params_
# and the score below) to be reproducible.
classifier = RandomizedSearchCV(RandomForestClassifier(random_state=42), parameters, n_iter=100,
                               verbose=2, n_jobs=-1, random_state=42)

classifier.fit(X_train, y_train)

# Score the best sampled configuration on the hold-out split.
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classifier.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
0.8212290502793296
{'n_estimators': 1400, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 6, 'bootstrap': True}


## Tuning dos hiperparâmetros do CatBoost Classifier

In [None]:
# Fit CatBoost with 5000 boosting iterations and training logs silenced.
classifier = CatBoostClassifier(verbose=0, iterations=5000)
classifier.fit(X_train, y_train)

# Report accuracy on the hold-out split.
y_pred = classifier.predict(X_test)
acuracia = accuracy_score(y_test, y_pred)
print(acuracia)