In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import set_config


In [2]:
# Configure scikit-learn to show estimators as diagrams when they are displayed.
set_config(display='diagram')

In [4]:
# Columns removed before modelling; the numeric Pclass is replaced by the
# labelled Pclass_new column created below.
unused_columns = ['Pclass', 'PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
class_labels = {1: 'First', 2: 'Second', 3: 'Third'}

# Load, relabel the passenger class, and drop the unused columns in one chain
# so the cell always rebuilds `df` from the CSV without in-place mutation.
df = (
    pd.read_csv('data/titanic.csv')
    .assign(Pclass_new=lambda frame: frame['Pclass'].map(class_labels))
    .drop(columns=unused_columns)
)
df.head()

Unnamed: 0,Survived,Sex,Age,Fare,Embarked,Pclass_new
0,0,male,22.0,7.25,S,Third
1,1,female,38.0,71.2833,C,First
2,1,female,26.0,7.925,S,Third
3,1,female,35.0,53.1,S,First
4,0,male,35.0,8.05,S,Third


In [5]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    int64  
 1   Sex         891 non-null    object 
 2   Age         714 non-null    float64
 3   Fare        891 non-null    float64
 4   Embarked    889 non-null    object 
 5   Pclass_new  891 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 41.9+ KB


In [6]:
# Count the missing values in each column.
df.isnull().sum()

Survived        0
Sex             0
Age           177
Fare            0
Embarked        2
Pclass_new      0
dtype: int64

In [7]:
# Frequency of each embarkation code.
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [8]:
# Frequency of each passenger-class label.
df['Pclass_new'].value_counts()

Third     491
First     216
Second    184
Name: Pclass_new, dtype: int64

In [9]:
# Frequency of each value in the Sex column.
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
# Separate the target from the predictor columns.
target_column = 'Survived'
y = df[target_column]
X = df.drop(columns=target_column)

# Hold out 20% of the rows for testing; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Numeric columns: impute missing values with the median, then standardise.
num_features = ['Age', 'Fare']
num_trans = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Nominal columns: impute with the most frequent value, then one-hot encode.
# The encoder is configured with handle_unknown="ignore".
cat_ohe_features = ["Embarked", "Sex"]
cat_ohe_trans = Pipeline(
    steps=[
        ("cat1_impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Ordered column: impute, map the labels to integers in the order given below,
# then standardise the resulting codes.
cat_ord_features = ["Pclass_new"]
# One inner list per ordinal column, listing its categories in encoding order.
# A plain nested list is used directly instead of building it through a
# reshaped numpy array.
ord_cats = [["Third", "Second", "First"]]
cat_ord_trans = Pipeline(
    steps=[
        ("cat2_imputer", SimpleImputer(strategy="most_frequent")),
        ("ord", OrdinalEncoder(categories=ord_cats)),
        ("scaler", StandardScaler()),
    ]
)

In [12]:
# Route each group of columns to its own preprocessing pipeline.
column_routes = [
    ("num", num_trans, num_features),
    ("cat_ohe", cat_ohe_trans, cat_ohe_features),
    ("cat_ord", cat_ord_trans, cat_ord_features),
]
ct = ColumnTransformer(transformers=column_routes)

In [13]:
# Full model: the column preprocessing followed by a logistic-regression classifier.
model_steps = [
    ('transformers', ct),
    ('classifier', LogisticRegression()),
]
log_reg = Pipeline(steps=model_steps)

In [14]:
# Display the assembled pipeline (preprocessing steps plus classifier).
log_reg

In [15]:
# Fit the pipeline on the training split.
log_reg.fit(X_train, y_train)

# Mean cross-validated score, computed on the training split only
# (no cv or scoring arguments are passed, so the library defaults apply).
fold_scores = cross_val_score(log_reg, X_train, y_train)
cv_score = fold_scores.mean()

# Scores of the fitted pipeline on the training and held-out splits.
train_score = log_reg.score(X_train, y_train)
test_score = log_reg.score(X_test, y_test)

print(f"Train Score: {train_score}")
print(f"Cross Val Score: {cv_score}")
print(f"Test Score: {test_score}")

Train Score: 0.7935393258426966
Cross Val Score: 0.7864670540726879
Test Score: 0.7988826815642458
