## IMPORTS

In [208]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import zipfile
import os
import matplotlib as ml
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, RandomTreesEmbedding
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, precision_recall_curve, roc_curve, classification_report, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit

## FUNCTIONS TO LOAD DATASETS

In [191]:
PATH_TRAIN = '../datasets/train.csv'
PATH_TEST = '../datasets/test.csv'

def load_dataset(path):
    return pd.read_csv(path)

## PREPARANDO DATAFRAME

In [192]:
def find_pronouns(text):
    if ', ' in text:
        text = text.split(', ')[1]
        return text.split('.')[0]

In [193]:
# Como a obtenção de amostras estratificadas não funciona se houver categorias com apenas uma instância, é necessário incluir essas instâncias
# em uma classe chamada Another

def adder_pronouns_cat(input_df):
    input_df['Pronouns'] = input_df['Name'].apply(lambda x: find_pronouns(x))
    s = input_df['Pronouns'].value_counts()
    input_df['Pronouns'] = input_df['Pronouns'].apply(lambda x: 'another' if s[x]<=2 else x)
    return input_df

In [194]:
def adder_age_cat(dataframe):
    bins = [0, 19, 40, 65, 100]
    labels= [0, 1, 2, 3]
    dataframe['AgeCat'] = pd.cut(dataframe['Age'], bins=bins, labels=labels)
    return dataframe

In [195]:
def get_prepare_dataframe(dataframe):
    dataframe = adder_pronouns_cat(dataframe)
    dataframe = adder_age_cat(dataframe)
    return dataframe

## SETS

In [196]:
def get_datasets_train_split():
    train = get_prepare_dataframe(load_dataset(PATH_TRAIN))
    X, y = train.drop('Survived', axis=1), train['Survived']
    return train_test_split(X, y, test_size=0.3, random_state=42)

In [197]:
def get_datasets_train_strat(column_name):
    treino, valid = [], []
    train = get_prepare_dataframe(load_dataset(PATH_TRAIN))
    sp = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
    for train_index, valid_index in sp.split(train, train['Pronouns']):
        treino = train.loc[train_index]
        valid = train.loc[valid_index]   
    
    return treino.drop("Survived", axis=1), valid.drop("Survived", axis=1), treino['Survived'], valid['Survived']

## PIPELINES

In [198]:
numerical_labels = ['Fare', 'Age']
categorical_labels = ['Pclass', 'Sex', 'SibSp', 'Embarked', 'Pronouns', 'AgeCat']

In [199]:
num_pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [200]:
cat_pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder())
])

In [201]:
preprocess = ColumnTransformer([
    ('num', num_pipe, numerical_labels),
    ('ohe', cat_pipe, categorical_labels)
])

## AVALIANDO MODELOS

In [202]:
models = [DecisionTreeClassifier(), RandomForestClassifier(), LogisticRegression(), SVC(), SGDClassifier(), KNeighborsClassifier()]

In [214]:
X_treino, X_valid, y_treino, y_valid = get_datasets_train_strat('Pronouns')
#X_treino, X_valid, y_treino, y_valid = get_datasets_train_split()

def display_cross_validate(model):
   estimator = make_pipeline(preprocess, model)
   scores = cross_val_score(estimator, X_treino, y_treino, cv=5, scoring="accuracy")
   print("Model Name: ", type(model).__name__)
   print(scores.mean())
   print("--------------------------------------------------------------------------")

for model in models:
   display_cross_validate(model)

Model Name:  DecisionTreeClassifier
0.7544903225806452
--------------------------------------------------------------------------
Model Name:  RandomForestClassifier
0.8042451612903226
--------------------------------------------------------------------------
Model Name:  LogisticRegression
0.8057935483870967
--------------------------------------------------------------------------
Model Name:  SVC
0.8122193548387097
--------------------------------------------------------------------------
Model Name:  SGDClassifier
0.719341935483871
--------------------------------------------------------------------------
Model Name:  KNeighborsClassifier
0.8025935483870968
--------------------------------------------------------------------------


In [211]:
X_treino, X_valid, y_treino, y_valid = get_datasets_train_strat('Pronouns')
#X_treino, X_valid, y_treino, y_valid = get_datasets_train_split()

def display_stats(model):
    estimator = make_pipeline(preprocess, model)
    estimator.fit(X_treino, y_treino)
    y_predict = estimator.predict(X_valid)
    
    print("Model Name: ", type(model).__name__)
    print('\nPRECISION: {}\n'.format(precision_score(y_valid, y_predict)))
    print('RECALL: {}\n'.format(recall_score(y_valid, y_predict)))
    print('CONFUSION MATRIX: \n{}'.format(confusion_matrix(y_valid, y_predict)))
    print(classification_report(y_valid, y_predict))
    print("--------------------------------------------------------------------------")


for model in models:
    display_stats(model)

Model Name:  DecisionTreeClassifier

PRECISION: 0.7169811320754716

RECALL: 0.6846846846846847

CONFUSION MATRIX: 
[[127  30]
 [ 35  76]]
              precision    recall  f1-score   support

           0       0.78      0.81      0.80       157
           1       0.72      0.68      0.70       111

    accuracy                           0.76       268
   macro avg       0.75      0.75      0.75       268
weighted avg       0.76      0.76      0.76       268

--------------------------------------------------------------------------
Model Name:  RandomForestClassifier

PRECISION: 0.801980198019802

RECALL: 0.7297297297297297

CONFUSION MATRIX: 
[[137  20]
 [ 30  81]]
              precision    recall  f1-score   support

           0       0.82      0.87      0.85       157
           1       0.80      0.73      0.76       111

    accuracy                           0.81       268
   macro avg       0.81      0.80      0.80       268
weighted avg       0.81      0.81      0.81       2