In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Load the training data, using the "id" column as the row index.
df = pd.read_csv(
    '../data/train.csv',
    index_col=["id"],
)

# Transposed preview of the first rows: one row per column of `df`,
# one column per record, so a wide frame stays readable.
df.head().transpose()


id,0,1,2,3,4
age,55.0,70.0,20.0,35.0,30.0
height(cm),165.0,165.0,170.0,180.0,165.0
weight(kg),60.0,65.0,75.0,95.0,60.0
waist(cm),81.0,89.0,81.0,105.0,80.5
eyesight(left),0.5,0.6,0.4,1.5,1.5
eyesight(right),0.6,0.7,0.5,1.2,1.0
hearing(left),1.0,2.0,1.0,1.0,1.0
hearing(right),1.0,2.0,1.0,1.0,1.0
systolic,135.0,146.0,118.0,131.0,121.0
relaxation,87.0,83.0,75.0,88.0,76.0


# Section 1: logistic regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [6]:
# Separate the 'smoking' target column from the remaining feature columns.
# Both results are independent of `df`, which is left unmodified.
X = df.drop(columns=['smoking'])
y = df['smoking'].copy()

# Sanity check: feature matrix and target should have the same row count.
(X.shape, y.shape)

((159256, 22), (159256,))

In [7]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

(X_train.shape, y_train.shape)

((119442, 22), (119442,))

In [10]:
# Fit the scaler on the training split only, then apply it to that same
# split. The fitted `scaler` is reused later for the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Inspect the first scaled training row.
X_train_scaled[0]

array([ 2.17463457, -1.73044212, -1.76029025, -2.01136156, -0.51268345,
       -0.25912768, -0.15589628, -0.15436815,  0.59147198, -0.76107355,
        0.174706  , -0.90681591, -0.83823003,  0.29616726, -0.68938495,
       -0.41331228, -0.21414886, -1.62396111,  0.78527155, -0.46685371,
       -0.45331834, -0.49673527])

In [11]:
# Train a logistic regression with default settings on the scaled
# training features.
logistic = LogisticRegression()
logistic.fit(X_train_scaled, y_train)

# Transform the held-out rows with the already-fitted scaler (no refit),
# then predict class labels for them.
X_test_scaled = scaler.transform(X_test)
y_pred = logistic.predict(X_test_scaled)

# Fraction of held-out rows whose predicted label matches the target.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7459687547093987

In [24]:
# ROC AUC on the held-out rows, scored with the second predict_proba
# column (presumably the probability of the positive 'smoking' class --
# confirm against logistic.classes_).
test_proba = logistic.predict_proba(X_test_scaled)
roc_auc_score(y_test, test_proba[:, 1])

0.8324749502894868

## Pipeline


In [16]:
# Bundle scaling and classification into a single estimator so the raw
# features can be passed straight to fit/predict.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)

# Fit both steps on the unscaled training split.
pipe.fit(X_train, y_train)


In [17]:
# Predict labels for the raw held-out features through the pipeline.
y_pipe = pipe.predict(X_test)

# Accuracy of the pipeline's predictions on the held-out rows.
accuracy_score(y_true=y_test, y_pred=y_pipe)

0.7459687547093987

# Saving the model


In [12]:
import pickle

In [19]:
# Serialize the fitted pipeline to disk; pickle requires binary mode.
with open('../models/default_logistic.bin', mode='wb') as model_file:
    pickle.dump(pipe, model_file)


## Loading the model

In [20]:
# Restore the pipeline from the file written above.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
with open('../models/default_logistic.bin', mode='rb') as model_file:
    model = pickle.load(model_file)


In [21]:
# Score the reloaded model on the same held-out rows to check that it
# survived the save/load round trip.
y_loaded = model.predict(X_test)
accuracy_score(y_test, y_loaded)

0.7459687547093987