In [23]:
import pandas as pd
# import warnings
# warnings.filterwarnings('ignore')

train = pd.read_csv('../datasets/titanic/train.csv')
test = pd.read_csv('../datasets/titanic/test.csv')
X = train[list(test.columns)]
y = train[train.columns[~train.columns.isin(test.columns)]]
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [24]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    return nome.split(',')[1].split('.')[0].strip()

class AtributosDesejados(BaseEstimator, TransformerMixin):
    def __init__(self, excluirName=True):
        self.excluirName = excluirName
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId', 'Ticket', 'Cabin']
        if self.excluirName:
            self.colunasIndesejadas.append('Name')
        return self
    def transform(self, X, y=None):
        Xdrop = X.drop(self.colunasIndesejadas,axis=1)
        if 'Name' not in self.colunasIndesejadas:
            Xdrop['Name'] = Xdrop['Name'].apply(extraiPronome)
        return Xdrop


In [25]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasNumericas].to_numpy()


In [26]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasCategoricas].to_numpy()


In [27]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

trataAtributos = Pipeline([
    ('unecaracteristicas', FeatureUnion([
        ('pipenum', Pipeline([
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])),
        ('pipecat', Pipeline([
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
])


In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
#from sklearn.decomposition import PCA
import numpy as np

#pca = PCA(whiten=True,svd_solver='full')

pipetotal = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    #('pca', pca),
    ('classificador', RandomForestClassifier())
])

parametros = {
    'atributosDesejados__excluirName': [True, False],
    'classificador__bootstrap': [True, False],
    'classificador__max_depth': [5,10,15,20], 
    'classificador__min_samples_leaf': [1, 2, 4],
    'classificador__min_samples_split': [2, 5, 10],
    'classificador__n_estimators': [200, 400, 600, 800],
    'classificador__max_features': ['auto', 'sqrt']
    #"pca__n_components": [5, 15, 30, 45, 64],
}
modelo = GridSearchCV(pipetotal, param_grid=parametros)

scores = cross_validate(modelo, X, y, cv=RepeatedKFold(), verbose=3)
scores['test_score'], np.mean(scores['test_score']), np.std(scores['test_score'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ......................................., score=0.799 total time=   7.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.6s remaining:    0.0s


[CV] END ......................................., score=0.820 total time=   7.6s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   15.3s remaining:    0.0s


[CV] END ......................................., score=0.854 total time=   7.7s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   23.1s remaining:    0.0s


[CV] END ......................................., score=0.854 total time=   7.8s


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   31.0s remaining:    0.0s


[CV] END ......................................., score=0.831 total time=   7.5s
[CV] END ......................................., score=0.821 total time=   7.5s
[CV] END ......................................., score=0.848 total time=   7.8s
[CV] END ......................................., score=0.848 total time=   7.6s
[CV] END ......................................., score=0.820 total time=   7.5s
[CV] END ......................................., score=0.775 total time=   7.5s
[CV] END ......................................., score=0.866 total time=   7.6s
[CV] END ......................................., score=0.831 total time=   7.5s
[CV] END ......................................., score=0.837 total time=   7.8s
[CV] END ......................................., score=0.775 total time=   8.0s
[CV] END ......................................., score=0.826 total time=   8.5s
[CV] END ......................................., score=0.872 total time=   7.7s
[CV] END ...................

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  6.5min finished


(array([0.79888268, 0.82022472, 0.85393258, 0.85393258, 0.83146067,
        0.82122905, 0.84831461, 0.84831461, 0.82022472, 0.7752809 ,
        0.86592179, 0.83146067, 0.83707865, 0.7752809 , 0.8258427 ,
        0.87150838, 0.84831461, 0.80337079, 0.7752809 , 0.84269663,
        0.79888268, 0.83146067, 0.83707865, 0.83146067, 0.8258427 ,
        0.81564246, 0.83707865, 0.82022472, 0.80898876, 0.84269663,
        0.84916201, 0.8258427 , 0.84831461, 0.79213483, 0.79775281,
        0.8547486 , 0.83707865, 0.79775281, 0.80337079, 0.80898876,
        0.83240223, 0.84831461, 0.83146067, 0.8258427 , 0.76966292,
        0.84357542, 0.84831461, 0.80898876, 0.7752809 , 0.86516854]),
 0.8252413533362626,
 0.02546689969131843)

In [None]:
modelo.fit(X,y)
y_pred = modelo.predict(test)
result = test[['PassengerId']]
result['Survived'] = y_pred
result.to_csv('submission.csv',index=False)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
