## IMPORTS

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import zipfile
import os
import matplotlib as ml
import matplotlib.pyplot as plt

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, RandomTreesEmbedding
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

## GET DATA

In [3]:
def load_train():
    return pd.read_csv('datasets/train.csv')

In [4]:
def load_test():
    return pd.read_csv('datasets/test.csv')

In [32]:
train = load_train()
test = load_test()
df = load_train()

## INICIAL DATA ANALYSIS

##### TRANSFORMAÇÕES TEMPORÁRIAS

In [33]:
def encontrar_pronome_tratamento(texto):
    if ', ' in texto:
        texto = texto.split(', ')[1]
        return texto.split('.')[0]

In [34]:
# A extração estratificada por categoria não funciona se as categorias possuirem apenas um elemento. 
def ajuste(cat):
    s = df['Tratamento'].value_counts()
    if s[cat] <= 1:
        return 'another'
    else:
        return cat

In [35]:
# Transformando o sexo em valor binário
df['SexBin'] = df['Sex'].map({'male': 0, 'female': 1})

In [36]:
df['Tratamento'] = df['Name'].apply(lambda x: encontrar_pronome_tratamento(x))
df['Tratamento'] = df['Tratamento'].apply(lambda x: ajuste(x))
df['TratamentoCat'], mapping = df['Tratamento'].factorize()

In [37]:
df.corr()['Survived']

PassengerId     -0.005007
Survived         1.000000
Pclass          -0.338481
Age             -0.077221
SibSp           -0.035322
Parch            0.081629
Fare             0.257307
SexBin           0.543351
TratamentoCat    0.357823
Name: Survived, dtype: float64

##### SEX

In [38]:
df['Sex'].value_counts() 

male      577
female    314
Name: Sex, dtype: int64

In [39]:
# Percentual de sobreviventes por sexo.
sobreviventes = df[df['Survived']==1]
sobreviventes['Sex'].value_counts() / df['Sex'].value_counts()

female    0.742038
male      0.188908
Name: Sex, dtype: float64

In [13]:
sobreviventes['Tratamento'].value_counts()

Miss       127
Mrs         99
Mr          81
Master      23
another      5
Dr           3
Mlle         2
Major        1
Col          1
Name: Tratamento, dtype: int64

In [None]:
sns.catplot(data=df, kind='swarm', x="Sex", y='Age', hue='Survived')

In [None]:
sns.catplot(data=df[df['Survived']==1], kind='bar', x="Sex", y='Age', hue='Tratamento')

In [None]:
sns.catplot(data=df, kind='bar', x="Pclass", y='Age', hue='Survived')

In [None]:
df['Pclass'].value_counts() / len(train)

## SETS

In [77]:
labels = ['Age', 'Fare', 'Pclass', 'Sex', 'Embarked', 'TratamentoCat']

In [89]:
from sklearn.model_selection import train_test_split

treino, validacao = train_test_split(df, test_size=0.3, random_state=42, shuffle=True)

X_treino = treino.drop("Survived", axis=1)
y_treino = treino['Survived']

X_valid = validacao.drop("Survived", axis=1)
y_valid = validacao['Survived']

In [65]:
# AMOSTRA ESTRATIFICADA POR CATEGORIA
from sklearn.model_selection import StratifiedShuffleSplit

strat_train_set, strat_valid_set = [], []

sp = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
for train_index, valid_index in sp.split(df, df['TratamentoCat']):
    strat_train_set = df.loc[train_index]
    strat_valid_set = df.loc[valid_index]


X_treino = strat_train_set.drop("Survived", axis=1)
y_treino = strat_train_set['Survived']

X_valid = strat_valid_set.drop("Survived", axis=1)
y_valid = strat_valid_set['Survived']

## TRASFORMING DATA

#### Pipelines

In [80]:
pipe_fill_scaler = Pipeline([
    ('imp', SimpleImputer(strategy='median')), 
    ('scale', StandardScaler())
])

In [81]:
pipe_cat_attribs = Pipeline([
    ('imp', SimpleImputer(strategy='most_frequent'))
    #('num', OneHotEncoder()) 
])

In [86]:
transforms = ColumnTransformer([
    ('fill_scaler', pipe_fill_scaler, ['Age', 'Fare']), 
    ('cat_attribs', pipe_cat_attribs, ['Pclass', 'Sex', 'Embarked', 'TratamentoCat'])
])

## UTIL

In [23]:
def display_scores(y_valid, y_predict):
    print("precision_score: {}\n".format(precision_score(y_valid, y_predict)))
    print("recall_score: {}\n".format(recall_score(y_valid, y_predict)))
    print("confusion_matrix: \n{}\n".format(confusion_matrix(y_valid, y_predict)))
    print("classification_report: \n{}".format(classification_report(y_valid, y_predict)))


## CROS-VAL-SCORE

In [24]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_predict

In [25]:
models = [SVC()]#[DecisionTreeClassifier(), RandomForestClassifier(), SGDClassifier(), KNeighborsClassifier(), SVC(), LogisticRegression()]

for model in models:
  pipe = make_pipeline(transforms, model)
  val_scores = cross_val_score(pipe, X_treino[labels], y_treino, cv=3)
  nome_modelo = type(model).__name__ # somente para exibição
  print('Modelo: {} | Média: {:.2} | Desvio: {:.2}'.format(nome_modelo, np.mean(val_scores), np.std(val_scores)))

Modelo: SVC | Média: 0.81 | Desvio: 0.028


## SVC

In [87]:
model = SVC()
model.fit(transforms.fit_transform(X_treino[labels]), y_treino)

SVC()

In [88]:
y_predict = model.predict(transforms.fit_transform(X_valid[labels]))

ValueError: X.shape[1] = 18 should be equal to 19, the number of features at training time

In [70]:
display_scores(y_valid, y_predict)

precision_score: 0.9066666666666666

recall_score: 0.6126126126126126

confusion_matrix: 
[[150   7]
 [ 43  68]]

classification_report: 
              precision    recall  f1-score   support

           0       0.78      0.96      0.86       157
           1       0.91      0.61      0.73       111

    accuracy                           0.81       268
   macro avg       0.84      0.78      0.79       268
weighted avg       0.83      0.81      0.80       268



## SUBMISSÃO KAGGLE