In [10]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.transformation import YeoJohnsonTransformer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import joblib

import myPreprocessors as mypp
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [11]:
dataTrain = pd.read_csv('train.csv')
dataTrain.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [12]:
dataTrain.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [13]:
dataTrain['Pclass'] = dataTrain['Pclass'].astype('O')
dataTrain['Sex'] = dataTrain['Sex'].astype('O')
dataTrain['SibSp'] = dataTrain['SibSp'].astype('O')
dataTrain['Parch'] = dataTrain['Parch'].astype('O')
dataTrain['Embarked'] = dataTrain['Embarked'].astype('O')
dataTrain['Survived'] = dataTrain['Survived'].astype('O')
dataTrain.dtypes

PassengerId      int64
Survived        object
Pclass          object
Name            object
Sex             object
Age            float64
SibSp           object
Parch           object
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## 2. Split para entrenamiento y prueba

In [14]:
X_train, X_test, y_train, y_test = train_test_split(dataTrain.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis=1,), dataTrain['Survived'], test_size=0.3, random_state=2022)

## 3. Configuración del Pipeline

In [15]:
#Imputación de variables categoricas
CATEGORICAL_VARS_WITH_NA_FREQUENT = ['Pclass', 'Sex', 'SibSp', 'Parch']

#Imputación de variables categoricas con indicador de faltante (Missing)
CATEGORICAL_VARS_WITH_NA_MISSING = ['Embarked']

#Imputación de variables númericas
NUMERICAL_VARS_WITH_NA = ['Age']

#Variables para transformación logaritmica
NUMERICAL_LOG_VARS=['Age', 'Fare']

#Variables para codificación ordinal
SEX_VARS = ['Sex']
EMBARKED_VARS = ['Embarked']

#Variables especiales

#Variables para codificacion por frecuencia
CATEGORICAL_VARS=['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

#Mapeo para variables categoricas
QUAL_SEX_MAPPINGS={'male': 0, 'female': 1}
QUAL_Embarked_MAPPINGS={'S': 1, 'C': 2, 'Q': 3, 'nan': 0, 'NA': 0, 'Missing': 0}

#Variables a utilizar en el entrenamiento
FEATURES = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
]

In [16]:
dataTrain['Sex'].unique()

array(['male', 'female'], dtype=object)

In [17]:
dataTrain['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

## 4. Construcción del Pipeline

In [30]:
titanic_pipeline_v27112022 = Pipeline([
    #Imputaciones
    
    # 1. Imputacion de variables categoricas con indicador de faltante
    ('missing_imputation',
         CategoricalImputer(imputation_method='missing', variables=CATEGORICAL_VARS_WITH_NA_MISSING)
    ),
    
    # 2. Imputacion de variables categoricas basada en frecuencia
    ('frequent_imputation',
         CategoricalImputer(imputation_method='frequent', variables=CATEGORICAL_VARS_WITH_NA_FREQUENT)
    ),
    
    # 3. Indicador faltante en variables numericas para imputación
    ('missing_indicator_numeric',
         AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    # 4. Imputación de variables númericas
    ('mean_imputation',
         MeanMedianImputer(imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)
    ),
    
    #Variables temporales
    
    #Codificación de variables categoricas
    ('sex_mapper',
         mypp.Mapper(variables=SEX_VARS, mappings=QUAL_SEX_MAPPINGS)
    ),
    
    
    ('embarked_mapper',
         mypp.Mapper(variables=EMBARKED_VARS, mappings=QUAL_Embarked_MAPPINGS)
    ),
    
    #Codificación de variables categoricas nominales
    #('rare_label_encoder',
         #RareLabelEncoder(n_categories=1, tol=0.01, variables=CATEGORICAL_VARS)
    #),
    
    #('categorical_encoder',
         #OrdinalEncoder(encoding_method='ordered', variables=CATEGORICAL_VARS)
    #),
    
    #Transformación de variables continuas
    ('log_transformer',
         YeoJohnsonTransformer(variables=NUMERICAL_LOG_VARS)
    ),
    
    #Scaler
    ('scaler',
         MinMaxScaler()
    ),
    
    ('modelo_regresion_logistica',
    LogisticRegression(max_iter=1000))
])

In [31]:
#X_train_transform = titanic_pipeline_v27112022.fit_transform(X_train, y_train.astype('int'))
#X_test_transform = titanic_pipeline_v27112022.fit_transform(X_test, y_test.astype('int'))
titanic_pipeline_v27112022.fit(X_train, y_train.astype('int'))
joblib.dump(titanic_pipeline_v27112022, 'model_titanic_pipeline_v27112022.pkl')
joblib.dump(FEATURES, 'FEATURES.pkl')
#titanic_pipeline_v27112022.fit(X_train, y_train.astype('int'))

['FEATURES.pkl']

In [22]:
X_train_transform[0]

array([0.5       , 1.        , 0.35880462, 0.125     , 0.4       ,
       0.71845574, 0.33333333, 0.        ])

## Regresion logistica

In [23]:
lr=LogisticRegression(max_iter=1000)
lr.fit(X_train_transform, y_train.astype('int'))
lr_preds = lr.predict(X_test_transform)
lr_preds

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0])

In [37]:
lr_cm = accuracy_score(y_test.astype('int'), lr_preds)
lr_cm

0.8208955223880597

## K-Nearest Neighbours

In [38]:
knn = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)
knn.fit(X_train_transform, y_train.astype('int'))
knn_preds = knn.predict(X_test_transform)
knn_preds

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [39]:
knn_cm = accuracy_score(y_test.astype('int'), knn_preds)
knn_cm

0.7947761194029851

## SVM

In [40]:
svm = LinearSVC(C=0.0001)
svm.fit(X_train_transform, y_train.astype('int'))
svm_preds = svm.predict(X_test_transform)
svm_preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [41]:
svm_cm = accuracy_score(y_test.astype('int'), svm_preds)
svm_cm

0.6268656716417911

## Decision Tree

In [42]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transform, y_train.astype('int'))
clf_preds = clf.predict(X_test_transform)
clf_preds

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [43]:
clf_cm = accuracy_score(y_test.astype('int'), clf_preds)
clf_cm

0.7649253731343284

## Random Forest

In [44]:
rf = RandomForestClassifier(n_estimators=30, max_depth=9)
rf.fit(X_train_transform, y_train.astype('int'))
rf_preds = rf.predict(X_test_transform)
rf_preds

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [45]:
rf_cm = accuracy_score(y_test.astype('int'), rf_preds)
rf_cm

0.7985074626865671