In [1]:
%load_ext autoreload
%autoreload 2

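# autoreload re-imports the project's local .py modules before each cell runs; without it,
# edits to those modules are not picked up until the kernel is restarted or the module is
# reloaded manually.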

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import data_loader

from sklearn.model_selection import  train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import common.columns as columns

from features.title_adder import TitleAdder
from features.person_type_adder import (
  PersonTypeAdder,
  CHILD_TYPE,
  MAN_TYPE,
  WOMAN_TYPE
)

from features.column_dropper import ColumnDropper

In [2]:
# Load the raw training data; the second value returned by get_data() is not used here.
loader = data_loader.DataLoader()
raw_train_set, _ = loader.get_data()

# Experiment: drop the rows whose EMBARKED value is missing.
# Surprisingly, Logistic Regression gives better predictions when those 2 rows
# are kept (included) rather than removed.
complete_train_set = raw_train_set.dropna(subset=[columns.EMBARKED])

# Separate the target from the features. `labels` is deliberately kept as a
# one-column DataFrame (double brackets), matching what later cells receive.
labels = complete_train_set[[columns.SURVIVED]]
train_set = complete_train_set.drop(columns=columns.SURVIVED)

In [3]:
# Hold out a test split using train_test_split's default test size;
# the fixed seed keeps the split reproducible across runs.
split_parts = train_test_split(train_set, labels, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [4]:
# Column groups routed to each preprocessing branch.
numerical_columns = [columns.AGE, columns.FARE]
categorical_columns = [columns.SEX, columns.EMBARKED]
dropped_columns = [columns.CABIN, columns.PASSENGER_ID, columns.NAME, columns.TICKET]

# Numerical features: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline(steps=[
  ('imputer', SimpleImputer(strategy='mean')),
  ('scaler', StandardScaler()),
])

# Categorical features: replace missing values with the literal 'N/A', then one-hot
# encode. Categories not seen during fit are ignored at transform time.
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 -
# confirm the pinned scikit-learn version before upgrading.
categorical_pipeline = Pipeline(steps=[
  ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
  ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])

# Columns not listed in either branch are passed through unchanged.
trasformer = ColumnTransformer(
  transformers=[
    ('numerical', numerical_pipeline, numerical_columns),
    ('categorical', categorical_pipeline, categorical_columns),
  ],
  remainder='passthrough',
)

# Full preparation: remove the unused columns first, then apply the per-column transforms.
preparation_pipeline = Pipeline(steps=[
  ('column_dropper', ColumnDropper(dropped_columns)),
  ('column_transformer', trasformer),
])

In [5]:
# Fit the preparation steps on the training split and transform it in one pass.
X_train_prepared = preparation_pipeline.fit_transform(X_train)

# Wrap the result in a DataFrame purely for a readable tabular display.
pd.DataFrame(data=X_train_prepared)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.606747,-0.500108,1.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0
1,1.522942,-0.435393,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
2,0.000000,-0.644473,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0
3,0.000000,-0.115799,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-1.139170,-0.356656,1.0,0.0,1.0,0.0,0.0,3.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
661,0.000000,-0.489654,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0
662,-0.378566,-0.644473,0.0,1.0,0.0,0.0,1.0,3.0,0.0,0.0
663,1.370821,-0.128161,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
664,-0.682808,0.040015,1.0,0.0,0.0,0.0,1.0,3.0,2.0,2.0


In [6]:
# End-to-end model: the preparation pipeline followed by a logistic-regression classifier.
# max_iter is set to 1000 and the seed is fixed for reproducibility.
logistion_regression_model = Pipeline([
  ('preparation', preparation_pipeline),
  ('model', LogisticRegression(max_iter=1000, random_state=42))
])

# `y_train` is a single-column DataFrame, i.e. shape (n_samples, 1). Passing it as-is makes
# scikit-learn emit a DataConversionWarning (via column_or_1d) because a 1-D target is
# expected. Flatten it explicitly; the label values themselves are unchanged.
logistion_regression_model.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


In [7]:
# Predict labels for the held-out split; the pipeline applies the fitted
# preparation steps to X_test before the classifier runs.
y_preds = logistion_regression_model.predict(X=X_test)

In [8]:
# Fraction of held-out rows whose predicted label matches the true label.
score = accuracy_score(y_true=y_test, y_pred=y_preds)

score

0.7757847533632287