


*   In this assignment, you are going to use your model to predict survival of the Titanic disaster. For this assignment, send a link to a Jupyter notebook containing solutions to the following tasks.
*   Download Titanic data from Kaggle. The data in the train.csv file meets your need.
*   Split your data into training and test sets.
*   Predict the survival based on the test data you split by creating your model.
*   Is your model's performance satisfactory? Explain.
*   Try to improve your model's performance by adding or subtracting some variables.
*   Explore the advantages and disadvantages of Logistic Regression and discuss with your mentor.







In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df['Age'] = df['Age'].fillna(int(df['Age'].mean())).astype('int64')
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].value_counts().index[0])
df.drop('PassengerId', axis=1, inplace=True) #since all unique and it does not affect survival rate 
df.drop('Cabin', axis=1, inplace=True) #almost all are empty and again no effect
df.drop('Name', axis=1, inplace=True)  #no effect so drop
df.drop('Embarked', axis=1, inplace=True)

df['Sex'] = pd.get_dummies(df['Sex'], drop_first=True).rename(columns={'male':'Sex'})
df['Fare'] = pd.get_dummies(df['Fare'], drop_first=True)
df['Ticket'] = pd.get_dummies(df['Ticket'], drop_first=True)
df['Fare'] = pd.get_dummies(df['Fare'], drop_first=True)

In [5]:
X = df.drop('Survived', axis=1)
Y = df['Survived']

In [6]:
log_reg = LogisticRegression()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [8]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
train_accuracy = log_reg.score(X_train, y_train)
test_accuracy = log_reg.score(X_test, y_test)
print('One-vs-rest', '-'*25, 
      "Train accuracy: {:.2f}".format(train_accuracy), 
      "Test accuracy: {:.2f}".format(test_accuracy), sep='\n')

One-vs-rest
-------------------------
Train accuracy: 0.79
Test accuracy: 0.82


In [10]:
log_reg_mnm = LogisticRegression(multi_class='multinomial', solver='lbfgs')
log_reg_mnm.fit(X_train, y_train)
train_accuracy = log_reg_mnm.score(X_train, y_train)
test_accuracy = log_reg_mnm.score(X_train, y_train)
print('Multinomial (Softmax)', '-'*25, 
      "Train accuracy: {:.2f}".format(train_accuracy), 
      "Test accuracy: {:.2f}".format(test_accuracy), sep='\n')

Multinomial (Softmax)
-------------------------
Train accuracy: 0.79
Test accuracy: 0.79


In [11]:
C_values = [0.001,0.01, 0.1,1,10,100, 1000]

accuracy_values = pd.DataFrame(columns=['C_values', 'Train Accuracy', 'Test Accuracy'])

for c in C_values:
    lr = LogisticRegression(max_iter=1000, penalty = 'l2', C = c, random_state = 0, solver='lbfgs')
    lr.fit(X_train, y_train)
    accuracy_values = accuracy_values.append({'C_values': c,
                                              'Train Accuracy': lr.score(X_train, y_train),
                                              'Test Accuracy': lr.score(X_test, y_test)
                                             }, ignore_index=True)
display(accuracy_values)

Unnamed: 0,C_values,Train Accuracy,Test Accuracy
0,0.001,0.623596,0.586592
1,0.01,0.755618,0.765363
2,0.1,0.786517,0.821229
3,1.0,0.785112,0.815642
4,10.0,0.786517,0.810056
5,100.0,0.786517,0.810056
6,1000.0,0.786517,0.810056


In [12]:
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]
Y = df['Survived']

In [13]:
log_reg = LogisticRegression()

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [15]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
train_accuracy = log_reg.score(X_train, y_train)
test_accuracy = log_reg.score(X_test, y_test)
print('One-vs-rest', '-'*25, 
      "Train accuracy: {:.2f}".format(train_accuracy), 
      "Test accuracy: {:.2f}".format(test_accuracy), sep='\n')

One-vs-rest
-------------------------
Train accuracy: 0.79
Test accuracy: 0.82


In [17]:
log_reg_mnm = LogisticRegression(multi_class='multinomial', solver='lbfgs')
log_reg_mnm.fit(X_train, y_train)
train_accuracy = log_reg_mnm.score(X_train, y_train)
test_accuracy = log_reg_mnm.score(X_train, y_train)
print('Multinomial (Softmax)', '-'*25, 
      "Train accuracy: {:.2f}".format(train_accuracy), 
      "Test accuracy: {:.2f}".format(test_accuracy), sep='\n')

Multinomial (Softmax)
-------------------------
Train accuracy: 0.79
Test accuracy: 0.79


In [18]:
C_values = [0.001,0.01, 0.1,1,10,100, 1000]

accuracy_values = pd.DataFrame(columns=['C_values', 'Train Accuracy', 'Test Accuracy'])

for c in C_values:
    lr = LogisticRegression(max_iter=1000, penalty = 'l2', C = c, random_state = 0, solver='lbfgs')
    lr.fit(X_train, y_train)
    accuracy_values = accuracy_values.append({'C_values': c,
                                              'Train Accuracy': lr.score(X_train, y_train),
                                              'Test Accuracy': lr.score(X_test, y_test)
                                             }, ignore_index=True)
display(accuracy_values)

Unnamed: 0,C_values,Train Accuracy,Test Accuracy
0,0.001,0.623596,0.586592
1,0.01,0.755618,0.765363
2,0.1,0.786517,0.821229
3,1.0,0.785112,0.815642
4,10.0,0.785112,0.810056
5,100.0,0.786517,0.810056
6,1000.0,0.786517,0.810056


#Normalization

In [19]:
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_normalized, Y, test_size=0.20, random_state=42)

log_reg  = LogisticRegression(max_iter=1000)
log_reg .fit(X_train, y_train)

train_accuracy_norm = log_reg .score(X_train, y_train)
test_accuracy_norm = log_reg .score(X_test, y_test)

print("Train accuracy: {}".format(train_accuracy))
print("Test accuracy: {}\n".format(test_accuracy))
print("Train accuracy (Normalized): {}".format(train_accuracy_norm))
print("Test accuracy (Normalized): {}".format(test_accuracy_norm))

Train accuracy: 0.7851123595505618
Test accuracy: 0.7851123595505618

Train accuracy (Normalized): 0.6446629213483146
Test accuracy (Normalized): 0.5921787709497207


In [20]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.20, random_state=42)

log_reg  = LogisticRegression(max_iter=1000)
log_reg .fit(X_train, y_train)

train_accuracy_std = lr.score(X_train, y_train)
test_accuracy_std = log_reg .score(X_test, y_test)

print("Train accuracy: {}".format(train_accuracy))
print("Test accuracy: {}\n".format(test_accuracy))
print("Train accuracy (Standardized): {}".format(train_accuracy_std))
print("Test accuracy (Standardized): {}".format(test_accuracy_std))

Train accuracy: 0.7851123595505618
Test accuracy: 0.7851123595505618

Train accuracy (Standardized): 0.38202247191011235
Test accuracy (Standardized): 0.8100558659217877
