## IMPORTS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import zipfile
import os
import matplotlib as ml
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, RandomTreesEmbedding
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, precision_recall_curve, roc_curve, classification_report

## DATA

In [None]:
# Locations of the training and test CSV files, relative to this notebook.
PATH_TRAIN = "../datasets/train.csv"
PATH_TEST = "../datasets/test.csv"

In [None]:
def load_dataset(path):
    """Read the CSV file at *path* and return it as a pandas DataFrame."""
    dataset = pd.read_csv(path)
    return dataset

## FUNCTIONS AND HELPERS

In [None]:
def find_pronouns(text):
    """Extract the title that follows the surname in a passenger name.

    The name is expected to look like ``'Surname, Title. Given names'``. The
    text between the first ``', '`` and the next ``'.'`` is returned, e.g.
    ``'Mr'`` for ``'Braund, Mr. Owen Harris'``.

    Args:
        text: The raw name. Non-string values (e.g. NaN for a missing name)
            are accepted and treated as "no title".

    Returns:
        The extracted title as a string, or ``None`` when *text* is not a
        string or contains no ``', '`` separator.
    """
    # Guard first: the `in` test below raises TypeError on non-strings such
    # as float NaN.
    if not isinstance(text, str) or ', ' not in text:
        return None
    after_surname = text.split(', ')[1]
    return after_surname.split('.')[0]

In [None]:
def create_col_pronouns(dataframe):
    """Add a 'Pronouns' column derived from the 'Name' column.

    Each name is passed through ``find_pronouns``. The column is written
    onto *dataframe* itself (in place), and the same object is returned.
    """
    extracted = dataframe['Name'].apply(find_pronouns)
    dataframe['Pronouns'] = extracted
    return dataframe

## TRANSFORMING DATA

#### DATAFRAME PANDAS TRANSFORMS

In [None]:
# Columns kept for modelling; 'Survived' is the target.
labels = ['Survived', 'Age', 'Fare', 'Parch', 'Pclass', 'Sex', 'SibSp', 'Embarked']

# Feature groups routed to the numeric and categorical preprocessing pipelines.
numerical_labels = ['Age', 'Fare']
categorical_labels = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

# Columns removed by drop_columns (each listed once; 'Cabin' used to be duplicated).
cols_to_drop = ['Name', 'Cabin', 'PassengerId', 'Ticket']

In [None]:
# Custom transformer for applying a function to pandas DataFrames
class DataFrameFunctionTransformer(BaseEstimator, TransformerMixin):
    """Transformer whose ``transform`` simply calls a user-supplied function.

    ``fit`` learns nothing; the wrapped callable is stored as ``func`` and is
    invoked with the input every time ``transform`` is called.
    """

    def __init__(self, func):
        self.func = func

    def fit(self, X, y=None):
        """Do nothing and return ``self``."""
        return self

    def transform(self, input_df):
        """Return the result of calling ``self.func`` on *input_df*."""
        transformed = self.func(input_df)
        return transformed

In [None]:
# Stratified sampling does not work when a category has only a single
# instance, so such instances are grouped into a shared 'another' class.

def adder_pronouns_cat(input_df):
    """Return a copy of *input_df* with a bucketed 'Pronouns' column added.

    The title of each passenger is extracted from 'Name' with
    ``find_pronouns``. Titles that occur at most once in *input_df*, and
    names for which no title could be extracted, are replaced by the string
    ``'another'``.

    Args:
        input_df: DataFrame containing a 'Name' column.

    Returns:
        A new DataFrame; *input_df* itself is left unmodified.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    output_df = input_df.copy()
    titles = output_df['Name'].apply(find_pronouns)

    # value_counts() ignores missing titles, so look frequencies up through a
    # membership test instead of indexing the counts with each value (which
    # raised KeyError for names without a title).
    counts = titles.value_counts()
    frequent_titles = counts.index[counts > 1]
    output_df['Pronouns'] = titles.where(titles.isin(frequent_titles), 'another')
    return output_df

In [None]:
def drop_columns(input_df):
    """Return a new DataFrame without the columns listed in ``cols_to_drop``.

    The drop uses pandas' default ``errors='raise'``, so a listed column that
    is absent from *input_df* results in a ``KeyError``.
    """
    return input_df.drop(columns=cols_to_drop)

In [None]:
# Single-step pipeline that adds the bucketed 'Pronouns' column.
adder_pipe = Pipeline(steps=[
    ("create-column-pronoun-cat", DataFrameFunctionTransformer(func=adder_pronouns_cat)),
])

# Single-step pipeline that removes the columns listed in cols_to_drop.
drop_pipe = Pipeline(steps=[
    ("drop-columns", DataFrameFunctionTransformer(func=drop_columns)),
])

#transformer = ColumnTransformer([
#    ('add', adder_pipe, train.columns.values),
#    ('drop', drop_pipe, train.columns.values)
#])

#### PREPROCESSING

In [None]:
# Numeric features: impute missing values with the median, then standardize.
num_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

In [None]:
# Categorical features: impute missing values with the most frequent value,
# then one-hot encode.
cat_pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as all zeros instead of raising. Rare values in the categorical columns
    # can otherwise end up only in a CV fold or in the validation split and
    # abort the evaluation.
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Route the numeric and categorical columns to their respective pipelines.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numerical_labels),
        ("cat", cat_pipe, categorical_labels),
    ],
)

## CROSS-VAL-SCORE

In [None]:
# Splitting the dataset into training and validation parts

train = load_dataset(path=PATH_TRAIN)
test = load_dataset(path=PATH_TEST)

# A fixed seed makes the split (and therefore the model comparison below)
# reproducible across runs; stratifying on the target keeps the class
# proportions equal in both parts.
# NOTE(review): stratify assumes 'Survived' has no missing values - confirm.
treino, validacao = train_test_split(
    train[labels],
    test_size=0.3,
    random_state=42,
    stratify=train['Survived'],
)

X_treino = treino.drop(columns=['Survived'])
y_treino = treino['Survived']

X_valid = validacao.drop(columns=['Survived'])
y_valid = validacao['Survived']

In [None]:
# Candidate classifiers, all with their default hyper-parameters.
models = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SVC(),
    SGDClassifier(),
    KNeighborsClassifier(),
]

In [None]:
def display_cross_validate(model):
    """Print the mean 5-fold cross-validated accuracy of *model*.

    The model is chained after the shared ``preprocess`` transformer and
    scored on ``X_treino`` / ``y_treino``. Nothing is returned.
    """
    pipeline = make_pipeline(preprocess, model)
    fold_scores = cross_val_score(
        pipeline, X_treino, y_treino, cv=5, scoring="accuracy"
    )
    mean_accuracy = fold_scores.mean()

    print("Model Name: ", type(model).__name__)
    print(mean_accuracy)
    print("--------------------------------------------------------------------------")

#for model in models:
   #display_cross_validate(model)

In [None]:
def display_stats(model):
    """Fit *model* on the training split and print its validation report.

    The model is chained after the shared ``preprocess`` transformer, fitted
    on ``X_treino`` / ``y_treino`` and evaluated on ``X_valid`` / ``y_valid``
    with ``classification_report``. Nothing is returned.
    """
    pipeline = make_pipeline(preprocess, model)
    pipeline.fit(X_treino, y_treino)
    predictions = pipeline.predict(X_valid)
    report = classification_report(y_valid, predictions)

    print("Model Name: ", type(model).__name__)
    print(report)
    print("--------------------------------------------------------------------------")

# Fit every candidate model and print its validation classification report.
for model in models:
    display_stats(model)