In [9]:
import pickle
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the preprocessed Titanic CSV and preview its first five rows
csv_path = "../data/titanic_preprocessed.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,male,Q,S
0,0,22.0,1,0,7.25,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,1,26.0,0,0,7.925,0,0,1
3,1,35.0,1,0,53.1,0,0,1
4,0,35.0,0,0,8.05,1,0,1


In [None]:
# Count the missing values in each column; all zeros means nothing is left to impute
missing_per_column = df.isna().sum()
missing_per_column

Survived    0
Age         0
SibSp       0
Parch       0
Fare        0
male        0
Q           0
S           0
dtype: int64

In [5]:
# Separate the features from the target, then hold out 20% of the rows for testing
target_column = "Survived"
X = df.drop(columns=[target_column])
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible across runs
)
X_train

Unnamed: 0,Age,SibSp,Parch,Fare,male,Q,S
707,22.0,0,0,151.5500,0,0,1
239,24.0,1,0,14.4542,0,0,0
381,32.0,0,0,7.9250,1,0,1
791,24.0,8,2,69.5500,0,0,1
682,14.0,5,2,46.9000,1,0,1
...,...,...,...,...,...,...,...
106,24.0,0,0,7.7750,1,0,1
270,25.0,0,0,0.0000,1,0,1
860,48.0,0,0,25.9292,0,0,1
435,21.0,2,2,34.3750,0,0,1


In [10]:
# Standardize the Age and Fare columns; every other column passes through unchanged
scaled_columns = ['Age', 'Fare']
transformer = ColumnTransformer(
    transformers=[("num", StandardScaler(), scaled_columns)],
    remainder="passthrough",
)

# Chain the preprocessing step with the logistic-regression classifier
pipeline = Pipeline(
    steps=[
        ("transformer", transformer),
        ("classifier", LogisticRegression()),
    ]
)

In [11]:
# Fit the preprocessing step and the classifier on the training split
pipeline.fit(X=X_train, y=y_train)


In [13]:
# Score the fitted pipeline on the held-out test split
pipeline.score(X=X_test, y=y_test)

0.797752808988764

In [15]:
# Persist the fitted pipeline twice: once with pickle and once with joblib.
# NOTE: both files are pickle-based, so only ever load them from a trusted source.
models_dir = Path("../models")

# Create the output directory up front; without this, opening the files for
# writing raises FileNotFoundError when "../models" does not exist yet.
models_dir.mkdir(parents=True, exist_ok=True)

with open(models_dir / "model.pkl", "wb") as f:
    pickle.dump(pipeline, f)

with open(models_dir / "model_joblib.pkl", "wb") as f:
    joblib.dump(pipeline, f)