In [4]:
import pandas as pd

In [5]:
# Location of the preprocessed Titanic dataset on disk.
TITANIC_CSV = 'c:/projects/datasets/titanic_processed.csv'

# Read the CSV into a DataFrame and preview its first rows.
titanic_df = pd.read_csv(TITANIC_CSV)
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,28.0,0,0,7.8958,0,0,1
1,0,3,1,26.0,1,2,20.575,0,0,1
2,1,2,0,25.0,1,1,30.0,0,0,1
3,0,3,1,28.0,0,0,7.8958,0,0,1
4,0,3,1,29.0,1,0,7.0458,0,0,1


In [6]:
# Dimensions of the loaded frame as (rows, columns).
titanic_df.shape

(712, 10)

In [7]:
from sklearn.model_selection import train_test_split

# Separate the predictor columns from the target column.
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df['Survived']

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# the split -- and therefore every prediction, confusion matrix and metric
# computed from it -- is identical on each fresh run of the notebook.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [5]:
# Shapes of the training features and the training labels.
x_train.shape, y_train.shape

((569, 9), (569,))

In [8]:
# Shapes of the held-out test features and test labels.
x_test.shape, y_test.shape

((143, 9), (143,))

In [9]:
from sklearn.linear_model import LogisticRegression

# Hyperparameters for the classifier:
#   penalty - regularizes the model by penalizing overly complex fits so it is
#             more robust; options are 'l1' and 'l2'
#   C       - inverse of regularization strength; smaller values mean stronger
#             regularization
#   solver  - algorithm used for the optimization problem; liblinear works
#             well on small datasets
logistic_params = {
    'penalty': 'l2',
    'C': 1.0,
    'solver': 'liblinear',
}

# fit() starts the training and hands back the trained estimator.
logistic_model = LogisticRegression(**logistic_params).fit(x_train, y_train)

In [10]:
# Predict a label for every row of the held-out test features.
y_pred = logistic_model.predict(x_test)

In [11]:
# Put the actual and predicted labels side by side in one frame.
result_columns = {
    'y_test': y_test,
    'y_pred': y_pred,
}
pred_results = pd.DataFrame(data=result_columns)

In [12]:
# Preview the first rows of actual vs. predicted labels.
pred_results.head()

Unnamed: 0,y_test,y_pred
256,0,0
483,1,0
485,0,0
269,0,1
155,1,1


In [13]:
# Cross-tabulate predicted labels (rows) against actual labels (columns).
titanic_crosstab = pd.crosstab(
    index=pred_results['y_pred'],
    columns=pred_results['y_test'],
)
titanic_crosstab

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,78,19
1,10,36


In [14]:
# we get to know how well our model works by using the right metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [16]:
# Score the test-set predictions with scikit-learn's metric functions.
acc = accuracy_score(y_test, y_pred)    # how many of the predicted values did the model get right?
prec = precision_score(y_test, y_pred)  # how many of the passengers that the model thought survived actually did survive?
recall = recall_score(y_test, y_pred)   # how many of the actual survivors did the model correctly predict?

# Print each metric on its own line as "<label> <value>".
metric_report = (
    ("accuracy_score :", acc),
    ("precision_score :", prec),
    ("recall_score :", recall),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

accuracy_score : 0.7972027972027972
precision_score : 0.782608695652174
recall_score : 0.6545454545454545


In [17]:
# Redisplay the predicted-vs-actual table before reading its cells.
titanic_crosstab

y_test,0,1
y_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
0,78,19
1,10,36


In [19]:
# Read the four confusion-matrix cells as .loc[predicted label, actual label].
TP = titanic_crosstab.loc[1, 1]  # true positives: predicted 1, actually 1
TN = titanic_crosstab.loc[0, 0]  # true negatives: predicted 0, actually 0
FP = titanic_crosstab.loc[1, 0]  # false positives: predicted 1, actually 0
FN = titanic_crosstab.loc[0, 1]  # false negatives: predicted 0, actually 1

In [20]:
# Recompute by hand the values the scikit-learn library calculated earlier.
# Accuracy: correct predictions divided by all predictions.
correct_predictions = TP + TN
all_predictions = TP + FP + TN + FN
accuracy_score_verified = correct_predictions / all_predictions
accuracy_score_verified

0.7972027972027972

In [22]:
# Precision: true positives divided by everything predicted positive.
predicted_positives = TP + FP
precision_score_verified = TP / predicted_positives
precision_score_verified

0.782608695652174

In [23]:
# Recall: true positives divided by all actual positives.
# Named *_verified to match accuracy_score_verified / precision_score_verified.
recall_score_verified = TP / (TP + FN)
# Original name kept as an alias so any later cell that uses it still works.
recall_score_survived = recall_score_verified
recall_score_verified

0.6545454545454545