In [1]:
#TODOS LOS IMPORTS
# data manipulation and plotting
import pandas as pd
import numpy as np

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import (
    MeanMedianImputer,
    AddMissingIndicator
)
from feature_engine.selection import DropFeatures
from feature_engine.encoding import OneHotEncoder
#to separate training and test
from sklearn.model_selection import train_test_split
#the model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score



In [2]:
import os
os.chdir('D:/UNI/Data Science/9-AE/PEC2')

In [3]:
# CARGAR DATA
file_name = "datasets/titanic.csv"
df = pd.read_csv(file_name, delimiter = ";")
df.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,2113375.0,B5,S,1
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,1
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,0
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,0
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,0


In [4]:
columns = ["pclass", "name", "sex", "age","sibsp", "parch", "ticket", "fare", "cabin", "embarked"]
target = ["survived"]
for i in columns:
    print(f"{i} NaN = {df[i].isna().sum()}")

pclass NaN = 0
name NaN = 0
sex NaN = 0
age NaN = 263
sibsp NaN = 0
parch NaN = 0
ticket NaN = 0
fare NaN = 0
cabin NaN = 1013
embarked NaN = 2


In [5]:
df["sex"] = df.apply(lambda row: 0 if row.sex == "male" else 1, axis=1)

In [6]:
X = df[columns]
Y = df[target[0]]

In [7]:
X

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,29.0000,0,0,24160,2113375.00,B5,S
1,1,"Allison, Master. Hudson Trevor",0,0.9167,1,2,113781,151.55,C22 C26,S
2,1,"Allison, Miss. Helen Loraine",1,2.0000,1,2,113781,151.55,C22 C26,S
3,1,"Allison, Mr. Hudson Joshua Creighton",0,30.0000,1,2,113781,151.55,C22 C26,S
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,25.0000,1,2,113781,151.55,C22 C26,S
...,...,...,...,...,...,...,...,...,...,...
1303,3,"Zabour, Miss. Hileni",1,14.5000,1,0,2665,144542.00,,C
1304,3,"Zabour, Miss. Thamine",1,,1,0,2665,144542.00,,C
1305,3,"Zakarian, Mr. Mapriededer",0,26.5000,0,0,2656,7225.00,,C
1306,3,"Zakarian, Mr. Ortin",0,27.0000,0,0,2670,7225.00,,C


In [8]:
# Let's separate into train and test set

X_train, X_test, Y_train, Y_test = train_test_split(
    X, #features
    Y, #labels
    test_size=0.3, #portion to test
    random_state=0 #seed definition
)

X_train = pd.DataFrame(X_train, columns = columns)
X_test = pd.DataFrame(X_test, columns = columns)
Y_train = pd.DataFrame(Y_train, columns = target)
Y_test = pd.DataFrame(Y_test, columns = target)


In [9]:
#CONFIGURATION

# numerical variables with NA in train set
NUMERICAL_VARS_WITH_NA = ["age"]

# the selected variables
FEATURES = [
    "pclass",
    "sex",
    "age",
    "fare"
]

DROP_FEATURES = [
    "name",
    "sibsp",
    "parch",
    "ticket",
    "cabin",
    "embarked"
    
]

In [10]:
X_train

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
587,2,"Wells, Master. Ralph Lester",0,2.0000,1,1,29103,23.00,,S
427,2,"Hamalainen, Master. Viljo",0,0.6667,1,1,250649,14.50,,S
478,2,"Laroche, Miss. Louise",1,1.0000,1,2,SC/Paris 2123,415792.00,,C
1228,3,"Strilic, Mr. Ivan",0,27.0000,0,0,315083,86625.00,,S
453,2,"Hood, Mr. Ambrose Jr",0,21.0000,0,0,S.O.C. 14879,73.50,,S
...,...,...,...,...,...,...,...,...,...,...
763,3,"Dean, Miss. Elizabeth Gladys ""Millvina""",1,0.1667,1,2,C.A. 2315,20575.00,,S
835,3,"Guest, Mr. Robert",0,,0,0,376563,8.05,,S
1216,3,"Smyth, Miss. Julia",1,,0,0,335432,77333.00,,Q
559,2,"Sincock, Miss. Maude",1,20.0000,0,0,C.A. 33112,36.75,,S


In [12]:
# set up the pipeline
pipe = Pipeline([

    # ===== IMPUTATION =====
    ( "drop_features", DropFeatures(features_to_drop=DROP_FEATURES)),
    ('missing_indicator', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),

    # impute numerical variables with the mean
    ('mean_imputation', MeanMedianImputer(
        imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA
    )),
    ('GaussianNB', GaussianNB())
   
   ])

In [13]:
# train the pipeline
pipe.fit(X_train, Y_train.values.ravel())


Pipeline(steps=[('drop_features',
                 DropFeatures(features_to_drop=['name', 'sibsp', 'parch',
                                                'ticket', 'cabin',
                                                'embarked'])),
                ('missing_indicator', AddMissingIndicator(variables=['age'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['age'])),
                ('GaussianNB', GaussianNB())])

In [14]:
ejemplo = pd.DataFrame(df.loc[1]).T
ejemplo

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
1,1,"Allison, Master. Hudson Trevor",0,0.9167,1,2,113781,151.55,C22 C26,S,1


In [15]:
ejemplo = pd.DataFrame(df.loc[1, columns]).T
pred = pipe.predict(ejemplo)
print(pred[0])
print(ejemplo)

0
  pclass                            name sex     age sibsp parch  ticket  \
1      1  Allison, Master. Hudson Trevor   0  0.9167     1     2  113781   

     fare    cabin embarked  
1  151.55  C22 C26        S  


In [16]:
joblib.dump(pipe, open('pipe_m.pkl', 'wb'))

In [17]:
y_test_pred = pipe.predict(X_test)

In [18]:
print(y_test_pred)

[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0]


In [19]:
print("Modelo de Regresión Logistica:")
print(f'Precisión: { accuracy_score(Y_test, y_test_pred) }')
print(f'Exactitud: { precision_score(Y_test, y_test_pred)}')
print(f'Exhaustividad: { recall_score(Y_test, y_test_pred) }')
print(f'F1: { f1_score(Y_test, y_test_pred) }')

Modelo de Regresión Logistica:
Precisión: 0.6564885496183206
Exactitud: 0.71875
Exhaustividad: 0.15436241610738255
F1: 0.2541436464088398


In [21]:
test = pd.concat([X_test, Y_test], axis = 1)
test

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
1278,3,"Vestrom, Miss. Hulda Amanda Adolfina",1,14.0,0,0,350406,78542.00,,S,0
412,2,"Fynney, Mr. Joseph J",0,35.0,0,0,239865,26.00,,S,0
528,2,"Parkes, Mr. Francis ""Frank""",0,,0,0,239853,0.00,,S,0
1149,3,"Riordan, Miss. Johanna ""Hannah""",1,,0,0,334915,77208.00,,Q,1
722,3,"Coleff, Mr. Satio",0,24.0,0,0,349209,74958.00,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
579,2,"Ware, Mr. William Jeffery",0,23.0,1,0,28666,10.50,,S,0
1079,3,"Ohman, Miss. Velin",1,22.0,0,0,347085,7775.00,,S,1
1142,3,"Rice, Master. Arthur",0,4.0,4,1,382652,29125.00,,Q,0
88,1,"Daniels, Miss. Sarah",1,33.0,0,0,113781,151.55,,S,1


In [22]:
train = pd.concat([X_train, Y_train], axis = 1)
train

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
587,2,"Wells, Master. Ralph Lester",0,2.0000,1,1,29103,23.00,,S,1
427,2,"Hamalainen, Master. Viljo",0,0.6667,1,1,250649,14.50,,S,1
478,2,"Laroche, Miss. Louise",1,1.0000,1,2,SC/Paris 2123,415792.00,,C,1
1228,3,"Strilic, Mr. Ivan",0,27.0000,0,0,315083,86625.00,,S,0
453,2,"Hood, Mr. Ambrose Jr",0,21.0000,0,0,S.O.C. 14879,73.50,,S,0
...,...,...,...,...,...,...,...,...,...,...,...
763,3,"Dean, Miss. Elizabeth Gladys ""Millvina""",1,0.1667,1,2,C.A. 2315,20575.00,,S,1
835,3,"Guest, Mr. Robert",0,,0,0,376563,8.05,,S,0
1216,3,"Smyth, Miss. Julia",1,,0,0,335432,77333.00,,Q,1
559,2,"Sincock, Miss. Maude",1,20.0000,0,0,C.A. 33112,36.75,,S,1


In [25]:
train.to_csv("train.csv", index = False)
test.to_csv("test.csv", index = False)