In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#ignore warning
import warnings
warnings.filterwarnings("ignore")

from aquire import get_titanic_data
from prepare import prep_titanic_data

df=prep_titanic_data(get_titanic_data())
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,embarked_encode
0,0,0,3,male,22.0,1,0,7.25,S,Third,Southampton,0,3
1,1,1,1,female,38.0,1,0,71.2833,C,First,Cherbourg,0,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,Southampton,1,3
3,3,1,1,female,35.0,1,0,53.1,S,First,Southampton,0,3
4,4,0,3,male,35.0,0,0,8.05,S,Third,Southampton,1,3


In [4]:
# Handle missing values in the `age` column.
df.dropna(inplace=True)

In [5]:
X = df[['pclass','age','fare','sibsp','parch']]
y = df[['survived']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state = 123)

X_train.head()

Unnamed: 0,pclass,age,fare,sibsp,parch
60,3,22.0,7.2292,0,0
348,3,3.0,15.9,1,1
606,3,30.0,7.8958,0,0
195,1,58.0,146.5208,0,0
56,2,21.0,10.5,0,0


In [6]:
# from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(C=1, class_weight={1:2}, random_state = 123, solver='saga')

In [7]:
logit.fit(X_train, y_train)

LogisticRegression(C=1, class_weight={1: 2}, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [8]:
print('Coefficient: \n', logit.coef_)
print('Intercept: \n', logit.intercept_)

Coefficient: 
 [[-0.03183343 -0.00819677  0.01882492 -0.02030296  0.01317287]]
Intercept: 
 [0.00230414]


In [9]:
y_pred = logit.predict(X_train)

In [10]:
y_pred_proba = logit.predict_proba(X_train)

In [15]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_train, y_train)))

Accuracy of Logistic Regression classifier on training set: 0.65


In [16]:
print(confusion_matrix(y_train, y_pred))

[[181 112]
 [ 64 142]]


In [17]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.62      0.67       293
           1       0.56      0.69      0.62       206

   micro avg       0.65      0.65      0.65       499
   macro avg       0.65      0.65      0.65       499
weighted avg       0.66      0.65      0.65       499

