In [216]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import exp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve
import seaborn as sns
from scipy.stats import ttest_ind
import matplotlib.colors as mcolors

<h2>Data Preparation</h2>

In [217]:
# Read the training and test CSV files from the current working directory.
df, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# Print each frame's column names, non-null counts, and dtypes.
for frame in (df, df_test):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

In [218]:
# Count fully duplicated rows in each frame.
# A notebook cell only displays its last expression, so both counts are
# gathered into one labelled Series instead of two bare expressions (the
# first of which would be silently discarded).
pd.Series(
    {'train': df.duplicated().sum(), 'test': df_test.duplicated().sum()},
    name='duplicate_rows',
)

0

Remove unwanted features: <br>
<ul>
    <li>Name (irrelevant to survival rate)</li>
    <li>Ticket Number (irrelevant to survival rate)</li>
    <li>Cabin Number (too much missing data)</li>
    <li>PassengerId (only for indexing purposes)</li>
</ul>

In [219]:
# Drop, in place and one at a time, the training columns that are not used as features.
for unwanted in ('Name', 'Cabin', 'Ticket', 'PassengerId'):
    df.drop(columns=unwanted, inplace=True)

In [220]:
# Drop the same four columns from the test frame so its remaining columns
# match the training features.
for unwanted in ('Name', 'Cabin', 'Ticket', 'PassengerId'):
    df_test.drop(columns=unwanted, inplace=True)

Remove all rows with blank/missing data from both dataframes

In [221]:
# Remember the row counts before cleaning so the number of dropped rows can be reported.
num_rows_train = len(df)
num_rows_test = len(df_test)

# Remove, in place, every row that has a missing value in any remaining column.
for frame in (df, df_test):
    frame.dropna(inplace=True)

num_rows_train_clean = len(df)
num_rows_test_clean = len(df_test)

# (label, value) pairs printed in order: rows removed, then rows remaining.
report = (
    ("Num of removed rows from train dataset: ", num_rows_train - num_rows_train_clean),
    ("Num of removed rows from test dataset: ", num_rows_test - num_rows_test_clean),
    ("Num of rows of clean train dataset: ", num_rows_train_clean),
    ("Num of rows of clean test dataset: ", num_rows_test_clean),
)
for label, count in report:
    print(label + str(count))

Num of removed rows from train dataset: 179
Num of removed rows from test dataset: 87
Num of rows of clean train dataset: 712
Num of rows of clean test dataset: 331


Replace "Sex" and "Embarked" values with integer codes

In [222]:
# Integer codes for the two string-valued feature columns.
sex_codes = {'male': 0, 'female': 1}
# NOTE(review): these codes impose an order C < Q < S on the embarkation port;
# confirm that is intended for a linear model, or consider one-hot encoding.
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

# Apply the same encoding to the training frame first, then the test frame.
for frame in (df, df_test):
    frame['Sex'] = frame['Sex'].replace(sex_codes)
    frame['Embarked'] = frame['Embarked'].replace(embarked_codes)

In [223]:
# Target vector: the Survived column of the cleaned training frame.
y_train = df['Survived']
# Feature matrix: every other column of the training frame.
X_train = df.drop(columns='Survived')

# Fit the standardizer on the training features only, then apply that same
# transformation to both the training and the test features.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# transform() returns a plain array, so wrap it back into a DataFrame to keep
# the column names. The wrapped frames get a fresh 0..n-1 row index, whereas
# y_train keeps df's row labels; avoid label-based joins between them.
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(df_test), columns=df_test.columns)

<h2>Building the Model, Training, and Prediction</h2>

In [224]:
# Train a logistic-regression classifier with scikit-learn's default settings.
# fit() returns the fitted estimator, so construction and training are chained.
model = LogisticRegression().fit(X_train_scaled, y_train)

# Mean accuracy on the same rows the model was fitted on (not a held-out estimate).
train_accuracy = model.score(X_train_scaled, y_train)
print(f'The accuracy for the training set is {100 * train_accuracy:.2f}%')

The accuracy for the training set is 80.20%


In [225]:
# Set NumPy's global print options so the array below is shown with four
# decimals and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

# Predicted class probabilities for the first 20 scaled test rows
# (one row per passenger, one column per class).
prob = model.predict_proba(X_test_scaled.head(20))
prob

array([[0.9087, 0.0913],
       [0.6844, 0.3156],
       [0.9059, 0.0941],
       [0.8961, 0.1039],
       [0.4417, 0.5583],
       [0.8327, 0.1673],
       [0.385 , 0.615 ],
       [0.7848, 0.2152],
       [0.2409, 0.7591],
       [0.9293, 0.0707],
       [0.6335, 0.3665],
       [0.0606, 0.9394],
       [0.9429, 0.0571],
       [0.1553, 0.8447],
       [0.1458, 0.8542],
       [0.7543, 0.2457],
       [0.8257, 0.1743],
       [0.4822, 0.5178],
       [0.4978, 0.5022],
       [0.7058, 0.2942]])

In [226]:
# Predict a class label and class probabilities for every cleaned test row.
y_pred = model.predict(X_test_scaled)
prob = model.predict_proba(X_test_scaled)
np.set_printoptions(precision=4, suppress=True)
# Column 1 of predict_proba corresponds to the second entry of model.classes_,
# i.e. Survived == 1 for a 0/1 target.
rounded_prob = np.round(prob[:, 1], 3)

# Build the export on a NEW frame rather than decoding df_test in place:
#  - df_test keeps its numeric Sex/Embarked codes, so earlier cells (e.g. the
#    scaling cell) can be re-run after this one without failing on strings;
#  - assign() returns an independent DataFrame, so adding columns below cannot
#    alias or modify df_test.
# Existing columns keep their position; Survived and Survival Rate are appended.
output_df = df_test.assign(
    Sex=df_test['Sex'].replace({0: 'male', 1: 'female'}),
    Embarked=df_test['Embarked'].replace({0: 'C', 1: 'Q', 2: 'S'}),
    Survived=y_pred,
)
output_df['Survival Rate'] = rounded_prob
output_df.to_csv('output.csv', index=False)

<h2>Most Significant Features</h2>

In [227]:
# Rank the features by the magnitude of their fitted logistic-regression coefficients.
# The model was trained on standardized inputs, so the magnitudes are on a common scale.
coef = model.coef_
coef_abs = np.abs(coef)
# Feature names come from X_train, the frame the scaler and model were fitted on.
# The earlier reference to `X` was never defined in this notebook and raised
# NameError on a fresh kernel.
# coef_ has a single row for this binary target, hence the [0] index.
coef_df = pd.DataFrame({'Feature': X_train.columns, 'Coef': coef[0], 'Abs_Coef': coef_abs[0]})
coef_df = coef_df.sort_values('Abs_Coef', ascending=False)
print(coef_df)

    Feature      Coef  Abs_Coef
1       Sex  1.239641  1.239641
0    Pclass -0.995533  0.995533
2       Age -0.611144  0.611144
3     SibSp -0.325458  0.325458
6  Embarked -0.135475  0.135475
5      Fare  0.085026  0.085026
4     Parch -0.047146  0.047146
