## IMPORTS

In [99]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import zipfile
import os
import matplotlib as ml
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, RandomTreesEmbedding
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score, precision_recall_curve, roc_curve, classification_report

## GETTING DATA

In [100]:
def load_train(path='datasets/train.csv'):
    """Read the training CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'datasets/train.csv'
        Location of the CSV file. The default is the location that used to be
        hard-coded, so existing ``load_train()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(path)

def load_test(path='datasets/test.csv'):
    """Read the test CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'datasets/test.csv'
        Location of the CSV file. The default is the location that used to be
        hard-coded, so existing ``load_test()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(path)

# Load both CSV files once so the following cells can reuse the frames.
train, test = load_train(), load_test()

## FUNCTIONS AND HELPERS

In [117]:
def find_pronouns(text):
    """Extract the honorific (e.g. ``'Mr'``) from a ``'Surname, Title. Rest'`` string.

    The result is the segment that follows the first ``', '`` in ``text``,
    cut off at that segment's first ``'.'``.

    Parameters
    ----------
    text : object
        Normally a name string. Non-string values are tolerated.

    Returns
    -------
    str or None
        The extracted honorific, or ``None`` when ``text`` is not a string or
        contains no ``', '`` separator.
    """
    # A missing value reaches this function as float NaN (or None), and
    # `', ' in <non-string>` would raise TypeError, so guard on the type first.
    if not isinstance(text, str) or ', ' not in text:
        return None
    after_comma = text.split(', ')[1]
    return after_comma.split('.')[0]

In [119]:
def create_col_pronouns(dataframe):
    """Add a ``'Pronouns'`` column computed from the ``'Name'`` column.

    Every value of ``dataframe['Name']`` is passed through ``find_pronouns``.
    The new column is written onto the object that was passed in (no copy is
    made), and that same object is returned.
    """
    names = dataframe['Name']
    dataframe['Pronouns'] = names.apply(find_pronouns)
    return dataframe

In [114]:
# Stratified sampling by category does not work when a category has only one
# member, so such categories are folded into a shared 'another' bucket.
def ajuste(cat, dataframe=None, column='Tratamento'):
    """Return ``cat`` unchanged, or ``'another'`` when it occurs at most once.

    Parameters
    ----------
    cat : hashable
        Category value to check.
    dataframe : pandas.DataFrame, optional
        Frame whose ``column`` supplies the category counts. When omitted, the
        notebook-level ``df`` is used, which is what the function originally
        always did.
    column : str, default 'Tratamento'
        Name of the column holding the categories.

    Returns
    -------
    object
        ``'another'`` if ``cat`` appears zero or one time in the column,
        otherwise ``cat`` itself.
    """
    if dataframe is None:
        # Backward-compatible fallback to the global the original relied on.
        dataframe = df
    counts = dataframe[column].value_counts()
    # Look up with a default of 0: a category that is absent from the counts
    # used to raise KeyError; it is now treated like any other rare category.
    if counts.get(cat, 0) <= 1:
        return 'another'
    return cat

## SETS

In [101]:
# Split the labelled data into a training subset and a validation subset.

# Fixed seed: without it every kernel restart produces a different split, so
# scores computed from the split cannot be compared between runs.
RANDOM_STATE = 42

treino, validacao = train_test_split(train, test_size=0.3, random_state=RANDOM_STATE)

# Features are every column except the target; both subsets use the same
# `drop(columns=...)` form.
X_treino = treino.drop(columns=['Survived'])
y_treino = treino['Survived']

X_valid = validacao.drop(columns=['Survived'])
y_valid = validacao['Survived']


In [None]:
# TODO: split the dataset into stratified samples.

## TRANSFORMING DATA

In [102]:
# Column-name groups used when building the preprocessing steps.
# NOTE(review): `labels` does not contain 'Age' although `numerical_labels`
# does — confirm whether that omission is intentional.
labels = [
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

# Columns handled by the numeric pipeline.
numerical_labels = [
    'Age',
    'Fare',
]

# Columns handled by the categorical pipeline.
categorical_labels = [
    'Pclass',
    'Sex',
    'SibSp',
    'Parch',
    'Embarked',
]

In [98]:
# Numeric branch: impute with the median strategy, then apply StandardScaler.
num_pipe = Pipeline([
    ('imp', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [107]:
# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see during fit instead of raising at transform time; without it a
# rare value that lands only in the validation/test data breaks prediction.
cat_pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

In [108]:
# Send each column group through its own pipeline: `numerical_labels` go to
# `num_pipe`, `categorical_labels` go to `cat_pipe`.
preprocess = ColumnTransformer([
    ('num_pipe', num_pipe, numerical_labels),
    ('cat_pipe', cat_pipe, categorical_labels),
])

## CROSS-VAL-SCORE

In [116]:
# Plan:
# 1. Create a column holding the honorific taken from the passenger name.
# 2. Merge the categories that contain a single individual into one category called "another".
# 3. Remove the honorific column.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

# Pass the function object itself. The previous code *called*
# create_col_pronouns on the `pd.DataFrame` class while building the
# transformer, which fails before any data is seen.
# validate=False leaves the input as a DataFrame, which create_col_pronouns
# needs for its 'Name' column lookup; validate=True would convert the input to
# an array first.
adder_pronoums = FunctionTransformer(create_col_pronouns, validate=False)

In [109]:
def cross_validate_models(model):
    """Fit ``model`` behind the shared preprocessing and print its validation precision.

    The estimator is chained after the notebook-level ``preprocess``
    transformer, trained on ``X_treino``/``y_treino`` and evaluated once on
    ``X_valid``/``y_valid`` (a single hold-out evaluation, not k-fold).
    The precision score is printed; nothing is returned.
    """
    full_pipeline = Pipeline([
        ('preprocess', preprocess),
        ('classifier', model),
    ])
    # Pipeline.fit returns the fitted pipeline, so prediction can be chained.
    predictions = full_pipeline.fit(X_treino, y_treino).predict(X_valid)
    print(precision_score(y_valid, predictions))

In [110]:
# Seed the forest so the model itself is deterministic for a given data split.
cross_validate_models(RandomForestClassifier(random_state=42))

0.8539325842696629
