Cesar Garcia Data Science

In [1]:
# Import libraries
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the training data.
# The directory defaults to the original location; set the TITANIC_DATA_DIR
# environment variable to run this notebook on another machine without
# editing the cell.
DATA_DIR = Path(os.environ.get('TITANIC_DATA_DIR', '/Users/gcesar/Desktop/DataScience'))
df = pd.read_csv(DATA_DIR / 'train.csv')
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written to the CSV -- confirm, and consider index_col=0.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Create column Deck from Cabin
def get_deck(Cabin):
    """Return the deck letter (first character) of a cabin code.

    Parameters
    ----------
    Cabin : str or missing value
        Cabin code such as ``'C85'``; NaN/None when no cabin was recorded.

    Returns
    -------
    str or float
        The first non-blank character of the cabin code, or ``np.nan`` when
        the value is missing or contains no characters.
    """
    if pd.isna(Cabin):
        return np.nan
    # An empty code would make `[0]` raise IndexError, and a padded code would
    # yield a space as its "deck"; strip first and treat blank codes as missing.
    code = str(Cabin).strip()
    return code[0] if code else np.nan
# Derive one deck value per passenger by mapping get_deck over the Cabin column.
df['Deck'] = df['Cabin'].map(get_deck)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [4]:
# Target vector: the Survived column.
y = df['Survived']
# Feature matrix: every column of df except the target.
X = df.drop(columns=['Survived'])


In [5]:
# Split X and y into train and test partitions (20% held out for testing;
# random_state is fixed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [6]:
# Numeric branch: fill missing values in the numeric feature(s) with the median.
num_feat = ['Age']

num_trans = Pipeline(
    steps=[
        ('si', SimpleImputer(strategy='median', missing_values=np.nan)),
    ]
)


In [7]:
# Set pipeline for categorical data

cat_feat = ['Pclass', 'Sex', 'Deck']


def _dense_one_hot_encoder():
    """Build a dense, integer-valued OneHotEncoder on any scikit-learn version.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old name, so ``OneHotEncoder(sparse=False)`` raises a
    TypeError on current releases. Try the new keyword first and fall back to
    the old one for releases that predate the rename.

    Returns
    -------
    OneHotEncoder
        Encoder configured for dense output, ``dtype=int`` and
        ``handle_unknown='ignore'``.
    """
    options = {'dtype': int, 'handle_unknown': 'ignore'}
    try:
        return OneHotEncoder(sparse_output=False, **options)
    except TypeError:
        # scikit-learn < 1.2 only accepts the original keyword name.
        return OneHotEncoder(sparse=False, **options)


# Missing categories are replaced by the placeholder 'X' before encoding, so
# "unknown" becomes its own one-hot column.
cat_trans = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')),
    ('ohe', _dense_one_hot_encoder()),
])


In [8]:
# Combine the numeric and categorical branches into one preprocessing step.
# remainder='drop' discards every column not listed in num_feat / cat_feat
# (remainder='passthrough' would keep them unchanged instead).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_trans, num_feat),
        ('cat', cat_trans, cat_feat),
    ],
    remainder='drop',
)


In [9]:
# Full model: preprocessing ('pp') followed by logistic regression ('lr').
# The step names are what the grid-search parameter keys refer to.
clf = Pipeline(
    steps=[
        ('pp', preprocessor),
        ('lr', LogisticRegression(solver='liblinear')),
    ]
)


In [10]:
# Hyperparameter search: compare the L1 and L2 penalties of the 'lr' step
# using 5-fold cross-validation.
param_grid = {
    'lr__penalty': ['l1', 'l2'],
}

gscv = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)



In [11]:

# Run the grid search on the training split, then print each result
# (best estimator, best score, best params, full CV results) between rules.
gscv.fit(X_train, y_train)

rule = '-' * 100
print(rule)
for attr in ('best_estimator_', 'best_score_', 'best_params_', 'cv_results_'):
    print(getattr(gscv, attr), "\n")
    print(rule)




----------------------------------------------------------------------------------------------------
Pipeline(steps=[('pp',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('si',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['Age']),
                                                 ('cat',
                                                  Pipeline(steps=[('si',
                                                                   SimpleImputer(fill_value='X',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(dtype=<class 'int'>,
                                                                           

