## 2.3.2. Example using Scikit-learn

In [1]:
# pip install --no-input sklearn

In [2]:
# !pip install sklearn

In [3]:
# pip list sklearn

In [4]:
from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression

X, y = make_moons(n_samples=200, noise=0.3)

m = LogisticRegression(C=1e5)
m.fit(X, y)

print(m.score(X, y))

0.84


## 2.3.3. Example using Statsmodels

In [6]:
# Question: sklearn and statsmodels - are they just different interfaces or they also calculate differently ?

import statsmodels.discrete.discrete_model as sm

logit = sm.Logit(y, X)
f = logit.fit()
print(f.params)
print(f.summary())

Optimization terminated successfully.
         Current function value: 0.365604
         Iterations 7
[ 1.23000372 -3.03390435]
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  200
Model:                          Logit   Df Residuals:                      198
Method:                           MLE   Df Model:                            1
Date:                Tue, 05 Oct 2021   Pseudo R-squ.:                  0.4725
Time:                        13:36:53   Log-Likelihood:                -73.121
converged:                       True   LL-Null:                       -138.63
Covariance Type:            nonrobust   LLR p-value:                 2.454e-30
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             1.2300      0.210      5.848      0.000       0.818       1.642
x2 

## Challenge :: Build a Logistic Regression model

In [8]:
import pandas as pd

#### Start with the Pclass column only (because it is numerical and complete).

In [14]:
df = pd.read_csv("./data/train.csv", sep=",")
df = df[['Pclass', 'Survived']]
df

Unnamed: 0,Pclass,Survived
0,3,0
1,1,1
2,3,1
3,1,1
4,3,0
...,...,...
886,2,0
887,1,1
888,3,0
889,1,1


#### Print the coefficients calculated by the model.

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="Survived"), 
    df["Survived"], 
    test_size=0.25, 
    random_state = 101
)

In [19]:
# normalize the data
X_train = (X_train-X_train.mean())/X_train.std()
X_test = (X_test-X_test.mean())/X_test.std()

In [20]:
# instantiate the model
m = LogisticRegression()

In [21]:
# train the model
m.fit(X_train, y_train)

LogisticRegression()

In [22]:
m.coef_, m.intercept_

(array([[-0.69498637]]), array([-0.58814192]))

#### Calculate the probabilities for your data points belonging to the positive class.

In [50]:
pd.DataFrame(m.predict_proba(X_train)[:,1], columns=["probability to survive"])

Unnamed: 0,probability to survive
0,0.238939
1,0.238939
2,0.238939
3,0.238939
4,0.238939
...,...
663,0.238939
664,0.238939
665,0.627128
666,0.627128


In [52]:
# Suppose you classify all points with a probability > 0.9 as positive.
pd.DataFrame(m.predict_proba(X_train)[:,1] > 0.9, columns=["Survived"])

Unnamed: 0,Survived
0,False
1,False
2,False
3,False
4,False
...,...
663,False
664,False
665,False
666,False


In [53]:
# How does the result of your prediction change?
sum(m.predict_proba(X_train)[:,1] > 0.9)

0

In [54]:
# How does it change if you change the threshold to > 0.1?
sum(m.predict_proba(X_train)[:,1] > 0.1)

668

## Challenge :: Balance Classes

In [55]:
# Add the option class_weight='balanced' to even out the bias caused by the non-survivors outnumbering the survivors.

In [56]:
# instantiate the model
m = LogisticRegression(class_weight='balanced')

In [57]:
# train the model
m.fit(X_train, y_train)

LogisticRegression(class_weight='balanced')

In [58]:
m.coef_, m.intercept_

(array([[-0.70078082]]), array([-0.04928286]))

In [59]:
pd.DataFrame(m.predict_proba(X_train)[:,1], columns=["probability to survive"])

Unnamed: 0,probability to survive
0,0.348782
1,0.348782
2,0.348782
3,0.348782
4,0.348782
...,...
663,0.348782
664,0.348782
665,0.744219
666,0.744219


In [66]:
sum(m.predict_proba(X_train)[:,1] > 0.5)

298