In [200]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X = train[list(test.columns)]
y = train[train.columns[~train.columns.isin(test.columns)]]

y = np.squeeze(y)


In [201]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    return nome.split(',')[1].split('.')[0].strip()

def extraiAgeBin(age):
    return pd.cut(age, bins=[0,12,20,40,120], labels=['Children','Teenage','Adult','Elder'])

def extraiFareBin(fare):
    return pd.cut(fare, bins=[0,7.91,14.45,31,120], labels=['Low_fare','median_fare','Average_fare','high_fare'])        

class AtributosDesejados(BaseEstimator, TransformerMixin):
    def __init__(self, excluirName=True,incluirAgeBin=True,incluirFareBin=True):
        self.excluirName = excluirName
        self.incluirAgeBin = incluirAgeBin
        self.incluirFareBin = incluirFareBin
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId', 'Ticket', 'Cabin']
        if self.excluirName:
            self.colunasIndesejadas.append('Name')          
        return self
    def transform(self, X, y=None):
        Xdrop = X.drop(self.colunasIndesejadas,axis=1)
        if 'Name' not in self.colunasIndesejadas:
            Xdrop['Name'] = Xdrop['Name'].apply(extraiPronome)
        if self.incluirAgeBin:
            Xdrop['Age_Bin'] = extraiAgeBin(Xdrop['Age'])
        if self.incluirFareBin:
            Xdrop['Fare_bin'] = extraiFareBin(Xdrop['Fare'])
        return Xdrop

In [202]:
from sklearn.base import BaseEstimator, TransformerMixin

       

class RemoveAtributosTransformados(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['Age', 'Fare']       
        return self
    def transform(self, X, y=None):
        Xdrop = X.drop(self.colunasIndesejadas,axis=1)
        return Xdrop

In [203]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasNumericas].to_numpy()


In [204]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasCategoricas].to_numpy()


In [205]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

trataAtributos = Pipeline([
    ('unecaracteristicas', FeatureUnion([
        ('pipenum', Pipeline([
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])),
        ('pipecat', Pipeline([
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
])


In [206]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, make_scorer, precision_score

pipetotal = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('removeAtributosTransformados',RemoveAtributosTransformados()),
    ('trataAtributos', trataAtributos),
    ('variance',VarianceThreshold()),
    ('classificador', RandomForestClassifier())
])

parametros = {
    'atributosDesejados__excluirName': [True, False],
    'classificador__max_depth': [5],
    'trataAtributos__unecaracteristicas__pipenum__imputer__strategy': ['mean','most_frequent'],
    'trataAtributos__unecaracteristicas__pipecat__imputer__strategy': ['mean','most_frequent','median'],
    'trataAtributos__unecaracteristicas__pipenum__scaler__copy': [True,False],
    'variance__threshold': [0,3]    
}
scoring = {'precision': make_scorer(precision_score, average='macro'), 'accuracy': make_scorer(accuracy_score)}

modelo = GridSearchCV(pipetotal, param_grid=parametros,n_jobs=-1,scoring=scoring,refit="accuracy")

scores = cross_validate(modelo, X, y, cv=RepeatedKFold())
scores['test_score'], np.mean(scores['test_score']), np.std(scores['test_score'])

(array([0.83798883, 0.85393258, 0.83707865, 0.79775281, 0.8258427 ,
        0.83240223, 0.84269663, 0.84269663, 0.80898876, 0.79775281,
        0.88268156, 0.78651685, 0.84831461, 0.80898876, 0.82022472,
        0.84357542, 0.84269663, 0.82022472, 0.83146067, 0.80898876,
        0.75977654, 0.83146067, 0.83707865, 0.86516854, 0.83707865,
        0.79888268, 0.85393258, 0.81460674, 0.80898876, 0.85955056,
        0.82681564, 0.80337079, 0.84269663, 0.87640449, 0.79775281,
        0.79888268, 0.82022472, 0.82022472, 0.81460674, 0.88202247,
        0.8547486 , 0.80337079, 0.84269663, 0.76404494, 0.82022472,
        0.81564246, 0.79213483, 0.83146067, 0.82022472, 0.83146067]),
 0.8259268093653884,
 0.02621255330541184)

In [207]:
modelo.fit(X,y)
y_pred = modelo.predict(test)
result = test[['PassengerId']]
result['Survived'] = y_pred
result.to_csv('submission.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['Survived'] = y_pred
