In [13]:
import pandas as pd
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings raised while
# fitting the models below — confirm that is intended.
warnings.filterwarnings('ignore')

df = pd.read_csv('titanic.csv')

# Target vector, taken from every row. The previous `df.loc[1:, ...]` was a
# label-based slice and therefore silently discarded the row labelled 0.
y = df['Survived']

# Feature matrix: the same 'PassengerId'..'Embarked' column range as before,
# but with the target removed. If 'Survived' sits inside that range (as it does
# in the usual Titanic column order — verify against the CSV) it would
# otherwise be fed to the models as a feature, i.e. label leakage.
# errors='ignore' keeps this valid when 'Survived' is outside the range.
X = df.loc[:, 'PassengerId':'Embarked'].drop(columns='Survived', errors='ignore')


In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    """Extract the honorific (title) from a passenger name.

    Names are expected in the form ``"Surname, Title. Given names"``; the text
    between the comma and the next period is returned with surrounding
    whitespace removed, e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Args:
        nome: Full name. Non-string values (such as NaN for a missing name)
            are returned unchanged instead of raising ``AttributeError``.

    Returns:
        The stripped title. When the name contains no comma, the whole string
        is used as the search text instead of raising ``IndexError``.
    """
    if not isinstance(nome, str):
        # Let missing values flow through untouched so a later step can handle them.
        return nome
    partes = nome.split(',')
    # With a comma present, keep the original rule: use the second piece.
    trecho = partes[1] if len(partes) > 1 else partes[0]
    return trecho.split('.')[0].strip()

class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Transformer that removes unwanted columns from the input frame.

    'PassengerId', 'Ticket' and 'Cabin' are always removed. 'Name' is removed
    as well when ``excluirName`` is true; otherwise it is kept and each value
    is replaced by the result of ``extraiPronome``.
    """

    def __init__(self, excluirName=True):
        self.excluirName = excluirName

    def fit(self, X, y=None):
        """Build the list of columns to drop; ``X`` and ``y`` are not inspected."""
        descartar = ['PassengerId', 'Ticket', 'Cabin']
        if self.excluirName:
            descartar = descartar + ['Name']
        self.colunasIndesejadas = descartar
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the columns chosen in ``fit``."""
        restante = X.drop(columns=self.colunasIndesejadas)
        if 'Name' in self.colunasIndesejadas:
            return restante
        # 'Name' was kept: reduce every full name to its title.
        restante['Name'] = restante['Name'].apply(extraiPronome)
        return restante


In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the numeric columns of the input frame."""

    def fit(self, X, y=None):
        """Remember the labels of the columns whose dtype is numeric in ``X``."""
        somente_numericas = X.select_dtypes(include='number')
        self.colunasNumericas = somente_numericas.columns
        return self

    def transform(self, X, y=None):
        """Return the remembered columns of ``X`` as a NumPy array."""
        selecionadas = X[self.colunasNumericas]
        return selecionadas.to_numpy()


In [16]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the object-dtype columns of the input frame."""

    def fit(self, X, y=None):
        """Remember the labels of the columns whose dtype is ``object`` in ``X``."""
        somente_objetos = X.select_dtypes(include='object')
        self.colunasCategoricas = somente_objetos.columns
        return self

    def transform(self, X, y=None):
        """Return the remembered columns of ``X`` as a NumPy array."""
        selecionadas = X[self.colunasCategoricas]
        return selecionadas.to_numpy()


In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

# Numeric branch: select numeric columns, fill gaps with the median, standardise.
_pipe_numerico = Pipeline(steps=[
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: select object columns, fill gaps with the most frequent
# value, one-hot encode (categories unseen during fit are ignored).
_pipe_categorico = Pipeline(steps=[
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Both branches run on the same input and their outputs are concatenated.
# Step names are kept as-is because grid-search parameter paths depend on them.
trataAtributos = Pipeline(steps=[
    ('unecaracteristicas', FeatureUnion(transformer_list=[
        ('pipenum', _pipe_numerico),
        ('pipecat', _pipe_categorico),
    ])),
])


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

# Voting ensemble over k-NN, a random forest and logistic regression.
voting = VotingClassifier(estimators=[
    ('knn', KNeighborsClassifier()),
    ('rf', RandomForestClassifier(random_state=42)),
    ('lr', LogisticRegression()),
])

# Full pipeline: column selection -> preprocessing -> ensemble.
pipetotal = Pipeline(steps=[
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('voting', voting),
])

# Hyper-parameter grid explored by the inner search.
# Candidates currently left disabled:
#   voting__rf__max_depth: [None, 5, 10]
#   voting__rf__criterion: ['gini', 'entropy']
#   voting__rf__random_state: [None, 42]
#   voting__lr__penalty: ['l1', 'l2']
parametros = dict(
    atributosDesejados__excluirName=[True, False],
    voting__voting=['hard', 'soft'],
    voting__knn__n_neighbors=[5, 8],
)

# Nested evaluation: the grid search is itself scored with 10-fold CV.
modelo = GridSearchCV(
    pipetotal,
    param_grid=parametros,
    n_jobs=-1,
)

scores = cross_validate(
    modelo,
    X,
    y,
    cv=10,
    n_jobs=-1,
)

acuracia_voting = np.mean(scores['test_score'])
print(f"Acurácia Voting: {acuracia_voting}")

In [None]:
# Stacking ensemble over AdaBoost (randomised deep trees), extra-trees and bagging.
# AdaBoost is now seeded like its siblings: with splitter='random' and no
# random_state, every run produced a different score.
# The base tree stays positional because the keyword name for AdaBoost's base
# estimator differs between scikit-learn versions.
stack = StackingClassifier([
    ('ada', AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=25, splitter='random'),
        random_state=42,
    )),
    ('ext', ExtraTreesClassifier(random_state=42)),
    ('bag', BaggingClassifier(n_jobs=-1, random_state=42)),
])

# Full pipeline: column selection -> preprocessing -> stacked ensemble.
pipetotal2 = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('stack', stack),
])

# Hyper-parameter grid explored by the inner search.
# Candidates currently left disabled:
#   stack__ada__random_state: [None, 42]
#   stack__ada__learning_rate: [0.15, 0.25]
#   stack__ext__criterion: ['gini', 'entropy']
#   stack__ext__random_state: [None, 42]
#   stack__bag__random_state: [None, 42]
parametros2 = {
    'atributosDesejados__excluirName': [True, False],
    'stack__cv': [3, 5],
    'stack__passthrough': [True, False],
    'stack__final_estimator': [LogisticRegression(), KNeighborsClassifier()],
}

# Nested evaluation: the grid search is itself scored with 10-fold CV.
modelo2 = GridSearchCV(pipetotal2, param_grid=parametros2, n_jobs=-1)

scores2 = cross_validate(modelo2, X, y, cv=10, n_jobs=-1)

print(f"Acurácia Stacking: {np.mean(scores2['test_score'])}")