<a href="https://colab.research.google.com/github/glima91/mestrado-reconhecimento-de-padroes/blob/main/ajuste_de_caracteristicas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load the labelled training set and the unlabelled test set.
train = pd.read_csv('/content/sample_data/train.csv')
test = pd.read_csv('/content/sample_data/test.csv')
# Features are the columns train shares with test. Take an explicit copy so X
# is an independent frame: later steps assign columns to it, and writing to a
# selection of `train` triggers pandas' SettingWithCopyWarning.
X = train[list(test.columns)].copy()
# Target is whatever train has that test does not.
# NOTE(review): this is a DataFrame, not a 1-D Series; scikit-learn classifiers
# expect 1-D targets and warn on column vectors -- confirm whether downstream
# code relies on the DataFrame shape before squeezing it.
y = train[train.columns[~train.columns.isin(test.columns)]]

In [79]:
# Show the feature frame as the cell output.
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [80]:
# Inspect the distinct raw values of the Cabin column (missing entries show up as nan).
X["Cabin"].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [81]:
from sklearn.base import BaseEstimator, TransformerMixin
import math

def getCabinLetter(x):
    """Return the first character of a cabin code, or 'X' when there is none.

    Parameters
    ----------
    x : object
        Raw cabin value; it is converted with ``str`` before inspection.

    Returns
    -------
    str
        The first non-blank character of the cabin text, or ``'X'`` when the
        value is ``None``, NaN (its text form is ``'nan'``), empty or blank.
    """
    if x is None:
        return 'X'
    # Strip so that padding is never returned as the "letter" and so a blank
    # string is treated as missing instead of raising IndexError.
    texto = str(x).strip()
    if not texto or texto == 'nan':
        return 'X'
    return texto[0]

def hasCabin(x):
    """Return 1 when a cabin value is present and 0 when it is missing.

    Parameters
    ----------
    x : object
        Raw cabin value (a scalar).

    Returns
    -------
    int
        ``0`` for ``None``, NaN or the literal string ``'nan'``; ``1`` otherwise.
    """
    # Missing cells arrive as a float NaN, which never equals the string
    # 'nan', so the null check has to be explicit. The string form is still
    # accepted for values that were already passed through ``str``.
    if pd.isnull(x) or x == 'nan':
        return 0
    return 1

def criaColunaHasFamily(sibsp, parch):
    """Return 1 when ``sibsp + parch`` is greater than zero, otherwise 0."""
    return 1 if (sibsp + parch) > 0 else 0

def criaColunaTicketLen(ticket):
    """Return ``len(ticket)``, i.e. the length of the ticket value."""
    tamanho = len(ticket)
    return tamanho


def extraiPronome(nome):
    """Extract the title from a name written as ``"Surname, Title. Rest"``.

    Takes the segment after the first comma, keeps what precedes its first
    period and strips surrounding whitespace. Raises ``IndexError`` when
    ``nome`` contains no comma.
    """
    depois_da_virgula = nome.split(',')[1]
    titulo, _, _ = depois_da_virgula.partition('.')
    return titulo.strip()

class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Add derived feature columns and drop the columns the model should not see.

    ``transform`` adds ``hasFamily``, ``TicketLen`` and ``hasCabin`` and then
    removes the columns listed in ``colunasIndesejadas``. When ``Name`` is
    kept, it is replaced by the title returned by ``extraiPronome``.

    Parameters
    ----------
    excluirName : bool, default=True
        When True, ``Name`` is added to the dropped columns. When False it is
        kept and reduced to its title.
    """

    def __init__(self, excluirName=True):
        self.excluirName = excluirName

    def fit(self, X, y=None):
        """Build the list of columns to drop; ``X`` and ``y`` are not inspected."""
        self.colunasIndesejadas = ['PassengerId', 'Ticket', 'Embarked', "SibSp", "Parch", "Cabin"]
        if self.excluirName:
            self.colunasIndesejadas.append('Name')
        return self

    def transform(self, X, y=None):
        """Return a new frame with the derived columns added and unwanted ones removed.

        The frame passed in is left untouched.
        """
        # Work on a copy. Writing the new columns (and the rewritten 'Cabin')
        # into the caller's frame made a second transform of the same frame
        # see no nulls in 'Cabin' and set hasCabin to 1 for every row.
        X = X.copy()
        # 'hasFamily' is 1 when SibSp + Parch is greater than zero.
        X['hasFamily'] = X.apply(lambda x: criaColunaHasFamily(x["SibSp"], x["Parch"]), axis=1)
        # 'TicketLen' is the length of the ticket value.
        X['TicketLen'] = X["Ticket"].apply(criaColunaTicketLen)
        # 'hasCabin' must be computed before 'Cabin' is rewritten below,
        # because the rewrite replaces missing values with 'X'.
        X['hasCabin'] = X['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)
        # Reduce 'Cabin' to its letter. With the current drop list 'Cabin' is
        # removed right after, so this only matters if that list changes.
        X["Cabin"] = X["Cabin"].apply(getCabinLetter)
        # Drop the unwanted columns.
        Xdrop = X.drop(self.colunasIndesejadas, axis=1)
        # When 'Name' is kept, reduce it to the title.
        if 'Name' not in self.colunasIndesejadas:
            Xdrop['Name'] = Xdrop['Name'].apply(extraiPronome)

        return Xdrop


class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Select the numeric columns of a frame and return them as a NumPy array."""

    def fit(self, X, y=None):
        """Remember which columns of ``X`` have a numeric dtype."""
        numericas = X.select_dtypes(include='number')
        self.colunasNumericas = numericas.columns
        return self

    def transform(self, X, y=None):
        """Return the remembered columns of ``X`` as a NumPy array."""
        selecionadas = X[self.colunasNumericas]
        return selecionadas.to_numpy()



class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Select the object-dtype columns of a frame and return them as a NumPy array."""

    def fit(self, X, y=None):
        """Remember which columns of ``X`` have the ``object`` dtype."""
        categoricas = X.select_dtypes(include='object')
        self.colunasCategoricas = categoricas.columns
        return self

    def transform(self, X, y=None):
        """Return the remembered columns of ``X`` as a NumPy array."""
        selecionadas = X[self.colunasCategoricas]
        return selecionadas.to_numpy()

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

# Preprocessing stage: numeric and categorical columns are handled by two
# independent branches whose outputs FeatureUnion concatenates side by side.
trataAtributos = Pipeline([
    ('unecaracteristicas', FeatureUnion([
        # Numeric branch: pick numeric columns, fill gaps with the median,
        # then standardise.
        ('pipenum', Pipeline([
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])),
        # Categorical branch: pick object-dtype columns, fill gaps with the
        # most frequent value, then one-hot encode; handle_unknown='ignore'
        # keeps categories unseen during fit from raising at transform time.
        ('pipecat', Pipeline([
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
])


In [82]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np


# End-to-end pipeline: feature engineering -> preprocessing -> classifier.
# The step names are the prefixes used by the keys of the parameter grid.
pipetotal = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    # Fixed seed so repeated runs of the notebook grow the same forest and
    # the reported scores can be reproduced.
    ('classificador', RandomForestClassifier(random_state=10))
])

# Hyper-parameter grid for the search; each key is '<pipeline step>__<parameter>'.
parametros = {
    'atributosDesejados__excluirName': [False, True],
    'classificador__criterion': ['entropy'],
    'classificador__max_depth': [None],
    'classificador__min_samples_leaf': [1, 2],
    'classificador__min_samples_split': [2, 5],
    'classificador__n_estimators': [30],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3 for
    # RandomForestClassifier; 'sqrt' is what 'auto' meant for classifiers,
    # so the fitted model is unchanged.
    'classificador__max_features': ['sqrt'],
    'classificador__bootstrap': [True]
}
# Grid search over `parametros` for the full pipeline; n_jobs=-1 asks for
# every available core.
modelo = GridSearchCV(
    estimator=pipetotal,
    param_grid=parametros,
    n_jobs=-1,
)

# Evaluate the grid search itself with 5 folds repeated 5 times (25 fits),
# using a fixed seed for the splits.
scores = cross_validate(
    modelo,
    X,
    y,
    cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=10),
    verbose=10,
)
# Per-fit scores followed by their mean and standard deviation.
scores['test_score'], scores['test_score'].mean(), scores['test_score'].std()

[CV] START .....................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ......................................., score=0.866 total time=   5.1s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.1s remaining:    0.0s


[CV] END ......................................., score=0.809 total time=   4.0s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    9.1s remaining:    0.0s


[CV] END ......................................., score=0.809 total time=   3.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   13.0s remaining:    0.0s


[CV] END ......................................., score=0.742 total time=   3.7s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   16.8s remaining:    0.0s


[CV] END ......................................., score=0.837 total time=   3.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   20.5s remaining:    0.0s


[CV] END ......................................., score=0.810 total time=   3.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   24.3s remaining:    0.0s


[CV] END ......................................., score=0.809 total time=   3.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   28.2s remaining:    0.0s


[CV] END ......................................., score=0.837 total time=   3.8s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   32.1s remaining:    0.0s


[CV] END ......................................., score=0.848 total time=   3.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   36.0s remaining:    0.0s


[CV] END ......................................., score=0.809 total time=   3.9s
[CV] START .....................................................................
[CV] END ......................................., score=0.838 total time=   3.8s
[CV] START .....................................................................
[CV] END ......................................., score=0.815 total time=   4.0s
[CV] START .....................................................................
[CV] END ......................................., score=0.865 total time=   3.9s
[CV] START .....................................................................
[CV] END ......................................., score=0.787 total time=   3.9s
[CV] START .....................................................................
[CV] END ......................................., score=0.837 total time=   4.0s
[CV] START .....................................................................
[CV] END ...................

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.6min finished


(array([0.86592179, 0.80898876, 0.80898876, 0.74157303, 0.83707865,
        0.81005587, 0.80898876, 0.83707865, 0.84831461, 0.80898876,
        0.83798883, 0.81460674, 0.86516854, 0.78651685, 0.83707865,
        0.84357542, 0.84269663, 0.85393258, 0.81460674, 0.82022472,
        0.84916201, 0.7752809 , 0.84831461, 0.82022472, 0.83707865]),
 0.824897369907727,
 0.028021273413885172)

In [83]:
# Fit on the whole training set and write the predictions for the test set.
# Copies are passed so that fitting and predicting cannot alter X or test,
# which keeps this cell safe to re-run.
modelo.fit(X.copy(), y)
y_pred = modelo.predict(test.copy())
# Build the submission as a new frame: assigning a column to the selection
# test[['PassengerId']] would write into a slice of `test`.
result = test[['PassengerId']].assign(Survived=y_pred)
result.to_csv('submission.csv', index=False)