This is an introduction to classification with sklearn and the LogisticRegression model.

In [61]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

Load passengers data into a DataFrame

In [62]:
passengers = pd.read_csv('passengers.csv')

Analyse which features can be useful for a survival prediction. Here we can see that the Pclass, Age and Sex features are relevant for our prediction.

In [63]:
passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


Mapping the sex of the passengers to either 0 for males or 1 for females (the framework needs to work with numerical data)

In [64]:
passengers['Sex'] = passengers['Sex'].map({'male':0, 'female':1})

Fill the empty cells with the mean age of the passengers.

In [65]:
passengers['Age'].fillna(value=np.mean(passengers['Age']), inplace=True)

Map the FirstClass passengers to their own column. Do the same with the SecondClass 

In [69]:
passengers['FirstClass'] = passengers['Pclass'].map({1:1, 2:0, 3:0})
passengers['SecondClass'] = passengers['Pclass'].map({1:0, 2:1, 3:0})

In [70]:
passengers

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FirstClass,SecondClass
0,1,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,A/5 21171,7.2500,,S,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,PC 17599,71.2833,C85,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,STON/O2. 3101282,7.9250,,S,0,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,113803,53.1000,C123,S,1,0
4,5,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,373450,8.0500,,S,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,211536,13.0000,,S,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,112053,30.0000,B42,S,1,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,W./C. 6607,23.4500,,S,0,0
889,890,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,111369,30.0000,C148,C,1,0


Create two news arrays: features and labels

In [77]:
features = passengers[['Sex', 'Age', 'FirstClass', 'SecondClass']]
survival = passengers[['Survived']]

Split the data into training and testing sets

In [122]:
features_train, features_test, labels_train, labels_test = train_test_split(features, survival, test_size=0.2, random_state=100)

The Logistic Regression implementation from sklearn uses Regularization so we need to scale the feature data.

In [123]:
scaler = StandardScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)

Now we can create the Logistic Regression Model

In [124]:
model = LogisticRegression()
model.fit(features_train, labels_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Let's check the accuracy scores for our model

In [126]:
model.score(features_train, labels_train)

0.7963483146067416

And compare that against our test data

In [127]:
model.score(features_test, labels_test)

0.7932960893854749

Listing the coefficients will give us the most important feature for the survival

In [129]:
print(list(zip(['Sex', 'Age', 'FirstClass', 'SecondClass'], model.coef_[0])))

[('Sex', 1.2343640053517189), ('Age', -0.43343958110690295), ('FirstClass', 1.0032324177546368), ('SecondClass', 0.5360914810423328)]


We have our model, we should try out some predictions with our favorites figures.

In [134]:
Jack = np.array([0.0,20.0,0.0,0.0])
Rose = np.array([1.0,17.0,1.0,0.0])
David = np.array([0.0,34.0,1.0,0.0])

Concatenate our passengers into a single array

In [141]:
sample_passengers = np.array((Jack, Rose, David))
sample_passengers

array([[ 0., 20.,  0.,  0.],
       [ 1., 17.,  1.,  0.],
       [ 0., 34.,  1.,  0.]])

As before, we need the scale the feature data we are making predictions on

In [142]:
sample_passengers = scaler.transform(sample_passengers)

Now we can do our prediction on who's gonna survive or not.
First column is the probability of perishing, second columns is the probability of surviving.

In [144]:
model.predict_proba(sample_passengers)

array([[0.89307026, 0.10692974],
       [0.0531742 , 0.9468258 ],
       [0.57046712, 0.42953288]])