Cesar Garcia Data Science

In [13]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_validate


In [5]:
# Load the training data into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook on another machine.
train_csv_path = '/Users/gcesar/Desktop/DataScience/train.csv'
df = pd.read_csv(train_csv_path)
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Create column Deck from Cabin
def get_deck(Cabin):
    """Return the deck letter (first character) of a cabin code.

    Parameters
    ----------
    Cabin : str or missing value
        Cabin code such as ``'C85'``. May be NaN/None when the cabin is
        unknown.

    Returns
    -------
    str or float
        The first non-blank character of the cabin code, or ``np.nan`` when
        the value is missing, empty, or whitespace only.
    """
    if pd.isna(Cabin):
        return np.nan
    # Normalise to stripped text so that blank entries yield NaN instead of
    # raising IndexError on [0], and leading spaces are not mistaken for a deck.
    cabin_text = str(Cabin).strip()
    return cabin_text[0] if cabin_text else np.nan
# Derive the Deck feature by passing every Cabin value through get_deck,
# then preview the result.
deck_values = df['Cabin'].map(get_deck)
df['Deck'] = deck_values
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Deck
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,


In [7]:
# Separate the target column (y) from the remaining feature columns (X)
target_col = 'Survived'
y = df[target_col]
X = df.drop(columns=[target_col])


In [8]:
# Split X and y into train and test sets: 20% of the rows are held out for
# testing, and random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [9]:
# Numerical branch: fill missing values in the numeric features with the
# column median.
num_feat = ['Age']

median_imputer = SimpleImputer(strategy='median', missing_values=np.nan)
num_trans = Pipeline(steps=[('si', median_imputer)])


In [10]:
# Categorical branch: replace missing values with the placeholder category
# 'X', then one-hot encode into a dense integer array.
cat_feat = ['Pclass', 'Sex', 'Deck']

# Encoder settings shared by both keyword spellings below.
ohe_kwargs = dict(dtype=int, handle_unknown='ignore')
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4 and raises TypeError there.
    cat_encoder = OneHotEncoder(sparse_output=False, **ohe_kwargs)
except TypeError:
    # Older scikit-learn releases only accept the original `sparse` keyword.
    cat_encoder = OneHotEncoder(sparse=False, **ohe_kwargs)

cat_trans = Pipeline(steps=[
    ('si', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='X')),
    ('ohe', cat_encoder),
])


In [11]:
# Column transformer: route the numeric and categorical feature lists through
# their respective pipelines.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_trans, num_feat),
        ('cat', cat_trans, cat_feat),
    ],
    # Columns not listed in either feature list are discarded
    # (the alternative, remainder='passthrough', would keep them unchanged).
    remainder='drop',
)


In [12]:
# Full model pipeline: preprocessing step followed by a logistic regression
# classifier using the liblinear solver.
logreg = LogisticRegression(solver='liblinear')
clf = Pipeline(
    steps=[
        ('pp', preprocessor),
        ('lr', logreg),
    ]
)


In [16]:
# 5-fold cross-validation of the full pipeline on the training split, keeping
# the per-fold training scores as well; the cell displays the mean test score.
score = cross_validate(
    estimator=clf,
    X=X_train,
    y=y_train,
    cv=5,
    return_train_score=True,
)

score['test_score'].mean()




0.8020289569585344

In [17]:
# Display the full cross_validate result: fit/score times and the per-fold
# test and train scores.
score


{'fit_time': array([0.01991105, 0.01162314, 0.01105881, 0.01044917, 0.01083589]),
 'score_time': array([0.00525188, 0.00582409, 0.00540709, 0.00470114, 0.00474524]),
 'test_score': array([0.77622378, 0.78321678, 0.81690141, 0.80985915, 0.82394366]),
 'train_score': array([0.82425308, 0.81370826, 0.81754386, 0.81578947, 0.80701754])}