In [313]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import exp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_curve
import seaborn as sns
from scipy.stats import ttest_ind
import matplotlib.colors as mcolors

<h2>Data Preparation</h2>

In [314]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

In [315]:
sum(df.duplicated())
sum(df_test.duplicated())

0

Remove unwanted features: <br>
<ul>
    <li>Name (irrelevant to survival rate)</li>
    <li>Ticket Number (irrelevant to survival rate)</li>
    <li>Cabin Number (too many missing data)</li>
    <li>PassengerId (only for indexing purposes)</li>
</ul>

In [316]:
df.drop('Name', axis = 1, inplace = True)
df.drop('Cabin', axis = 1, inplace = True)
df.drop('Ticket', axis = 1, inplace = True)

In [317]:
df_test.drop('Name', axis = 1, inplace = True)
df_test.drop('Cabin', axis = 1, inplace = True)
df_test.drop('Ticket', axis = 1, inplace = True)

Remove all rows with blank/ missing data from both dataframe

In [329]:
mean_age_train = df['Age'].mean()
mean_age_test = df['Age'].mean()

#fill missing age with mean
df.fillna(mean_age_train, inplace=True)
df_test.fillna(mean_age_test, inplace=True)

#check for any rows with missing data
if df.isnull().sum().sum() == 0:
    print("There is no NaN values")
else:
    print("There is NaN values")
    
if df_test.isnull().sum().sum() == 0:
    print("There is no NaN values")
else:
    print("There is NaN values")

There is no NaN values
There is no NaN values


Replace "Sex" and "Embarked" values to integer values

In [330]:
df['Sex'] = df['Sex'].replace({'male': 0, 'female': 1})
df['Embarked']= df['Embarked'].replace({'C': 0, 'Q': 1, 'S': 2})

df_test['Sex'] = df_test['Sex'].replace({'male': 0, 'female': 1})
df_test['Embarked']= df_test['Embarked'].replace({'C': 0, 'Q': 1, 'S': 2})

In [331]:
y_train = df.Survived;
X_train = df.iloc[:, df.columns != 'Survived']
X_test = df_test

X_train.drop('PassengerId', axis = 1)
X_test.drop('PassengerId', axis = 1)

scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

<h2>Building the Model, Training, and Prediction</h2>

In [332]:
model = LogisticRegression();
model.fit(X_train_scaled, y_train)

train_accuracy = model.score(X_train_scaled, y_train)
print(f'The accuracy for the training set is {100 * train_accuracy:.2f}%')

The accuracy for the training set is 80.06%


In [333]:
prob = model.predict_proba(X_test_scaled[:20])
np.set_printoptions(precision=4, suppress=True)
prob

array([[0.8953, 0.1047],
       [0.6457, 0.3543],
       [0.8925, 0.1075],
       [0.8807, 0.1193],
       [0.4009, 0.5991],
       [0.8098, 0.1902],
       [0.3467, 0.6533],
       [0.7575, 0.2425],
       [0.2121, 0.7879],
       [0.9168, 0.0832],
       [0.7023, 0.2977],
       [0.5977, 0.4023],
       [0.0514, 0.9486],
       [0.9335, 0.0665],
       [0.1338, 0.8662],
       [0.1255, 0.8745],
       [0.7245, 0.2755],
       [0.802 , 0.198 ],
       [0.4374, 0.5626],
       [0.456 , 0.544 ]])

In [326]:
y_pred = model.predict(X_test_scaled)

output_df = pd.DataFrame(data=df_test['PassengerId'])
output_df['Survived'] = y_pred
output_df.to_csv('output.csv', index=False)