In [136]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

In [100]:
# Import train and test datasets

train = pd.read_csv('titanic_train.csv')
print(train.head(5))

test = pd.read_csv('titanic_test.csv')
print(test.head(5))



   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [101]:
# Examine the data

print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pcl

In [102]:
# check if 'age' has any null

# age_null = train['Age'].isnull().reset_index().groupby('Age').count()
# print(age_null)


age_null = train['Age'].isnull().sum()
print(age_null)

# There are 177 rows that do not have 'age' info. We will replace it by the average age

# Replace the missing ages with average age
train = train.fillna(value = {'Age': train['Age'].mean()})
test = test.fillna(value = {'Age': test['Age'].mean()})

print(train['Age'].isnull().sum())
print(test['Age'].isnull().sum())

177
0
0


In [103]:
# Encoding 'male' and 'female' in Sex column 

train['Sex'] = pd.get_dummies(train['Sex']) # 1 is male, 0 is female
test['Sex'] = pd.get_dummies(test['Sex'])


print(train.head(10))
print(test.head(10))

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name  Sex        Age  SibSp  \
0                            Braund, Mr. Owen Harris    0  22.000000      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.000000      1   
2                             Heikkinen, Miss. Laina    1  26.000000      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.000000      1   
4                           Allen, Mr. William Henry    0  35.000000      0   
5                                   Moran, Mr. James    0  29.699118      0   
6                            McCarthy, Mr. Timothy

In [104]:
# Inspect the Pclass column 

pclass_null = train['Pclass'].isnull().sum()
print(pclass_null)

# No null values 

# Onehot encoding Pclass with 1 as 2nd-class and 0 otherwise. 
        
train['SecondClass'] =  train['Pclass'].apply(lambda x: 1 if x == 2 else 0)
test['SecondClass'] =  test['Pclass'].apply(lambda x: 1 if x == 2 else 0)
                                            
train['FirstClass'] = train['Pclass'].apply(lambda x: 1 if x == 1 else 0)
test['FirstClass'] = test['Pclass'].apply(lambda x: 1 if x == 1 else 0)
print(train.head(10))
print(test.head(10))

0
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   
5            6         0       3   
6            7         0       1   
7            8         0       3   
8            9         1       3   
9           10         1       2   

                                                Name  Sex        Age  SibSp  \
0                            Braund, Mr. Owen Harris    0  22.000000      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.000000      1   
2                             Heikkinen, Miss. Laina    1  26.000000      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.000000      1   
4                           Allen, Mr. William Henry    0  35.000000      0   
5                                   Moran, Mr. James    0  29.699118      0   
6                            McCarthy, Mr. Timot

In [105]:
# Select the features columns 

train_features = train[['Sex', 'Age', 'SecondClass', 'FirstClass']]
train_survival = train['Survived']

# Split train test 

x_train, x_test, y_train, y_test = train_test_split(train_features, train_survival, train_size = 0.8, test_size = 0.2) 

In [107]:
# Scale the feature columns 

scaler = StandardScaler()
scaler.fit_transform(x_train)
scaler.transform(x_test)

array([[-7.42427274e-01, -8.23020038e-01, -5.07888305e-01,
        -5.75187901e-01],
       [ 1.34693328e+00, -1.31662899e-01,  1.96893685e+00,
        -5.75187901e-01],
       [-7.42427274e-01, -1.14099695e-03, -5.07888305e-01,
        -5.75187901e-01],
       [-7.42427274e-01,  1.40468630e+00,  1.96893685e+00,
        -5.75187901e-01],
       [-7.42427274e-01, -1.14099695e-03, -5.07888305e-01,
        -5.75187901e-01],
       [ 1.34693328e+00,  8.66964080e-01, -5.07888305e-01,
        -5.75187901e-01],
       [ 1.34693328e+00, -5.92567658e-01,  1.96893685e+00,
        -5.75187901e-01],
       [ 1.34693328e+00, -8.99837497e-01, -5.07888305e-01,
         1.73856230e+00],
       [ 1.34693328e+00,  6.36511700e-01, -5.07888305e-01,
         1.73856230e+00],
       [ 1.34693328e+00, -1.14099695e-03,  1.96893685e+00,
        -5.75187901e-01],
       [ 1.34693328e+00,  9.87894810e-02,  1.96893685e+00,
        -5.75187901e-01],
       [-7.42427274e-01, -1.14099695e-03, -5.07888305e-01,
      

In [108]:
# Apply the logistic regression to train the model 

model = LogisticRegression()
model.fit(x_train, y_train)

# Score the model 
print('train score: ', model.score(x_train, y_train))
print('test score: ', model.score(x_test, y_test))

train score:  0.7949438202247191
test score:  0.8156424581005587


In [135]:
# test['predicted survival'] = test[['Sex', 'Age', 'SecondClass', 'FirstClass']].apply(lambda x: model.predict(scaler.transform(x), axis = 1))


test_features = test[['Sex', 'Age', 'SecondClass', 'FirstClass']]
scaler.transform(test_features)

gender_submission = pd.DataFrame(test['PassengerId'])

gender_submission['Survival'] = model.predict(test_features)

print(gender_submission.head(10))
print(gender_submission.info())

print(gender_submission.groupby('Survival').count())

   PassengerId  Survival
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Survival     418 non-null    int64
dtypes: int64(2)
memory usage: 6.7 KB
None
          PassengerId
Survival             
0                 262
1                 156


In [139]:
# ROC and AUC scores 

auc = roc_auc_score(y_test, model.predict_proba(x_test)[:, 1])
print(auc)

0.8716310086135037
