In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Reading the dataset

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

### EDA

In [3]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data Dictionary
Variable	Definition	Key
survival	Survival	0 = No, 1 = Yes
pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
sex	Sex	
Age	Age in years	
sibsp	# of siblings / spouses aboard the Titanic	
parch	# of parents / children aboard the Titanic	
ticket	Ticket number	
fare	Passenger fare	
cabin	Cabin number	
embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
Variable Notes
pclass: A proxy for socio-economic status (SES)
1st = Upper
2nd = Middle
3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...
Sibling = brother, sister, stepbrother, stepsister
Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...
Parent = mother, father
Child = daughter, son, stepdaughter, stepson
Some children travelled only with a nanny, therefore parch=0 for them.

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Let's see and handle null values

In [6]:
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Firstly let's handle age
Since the number of records not consisting of the age value is quite large ( close to 20% we simply cannot remove those records). As it would lead to data loss

In [7]:
mean = train_df['Age'].mean()
median = train_df['Age'].median()
print(mean)
print(median)

29.69911764705882
28.0


substituting null values with mean, u could also have substituted with the median values

In [8]:
train_df['Age'].fillna(mean, inplace=True)
test_df['Age'].fillna(mean, inplace=True)
print(train_df['Age'].isnull().sum())

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Age'].fillna(mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Age'].fillna(mean, inplace=True)


### Handle Embarked
As the value is only 2 we can delete those records or simply put there embarked location as the mode of the embarked values

In [9]:
train_df[train_df['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [10]:
train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)
print(train_df['Embarked'].isnull().sum())

0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Embarked'].fillna(train_df['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df['Embarked'].fillna(test_df['Embarked'].mode()[0], inplace=True)


### Handle Cabin empty values

In [11]:
train_df['Cabin'].value_counts()

Cabin
G6             4
C23 C25 C27    4
B96 B98        4
F2             3
D              3
              ..
E17            1
A24            1
C50            1
B42            1
C148           1
Name: count, Length: 147, dtype: int64

#### Since there are a large number of Cabin types and also since i feel it does not have a significant effect on the output. I'll remove the entire column

In [12]:
train_df.drop('Cabin', axis=1, inplace=True)
test_df.drop('Cabin', axis=1, inplace=True)

In [13]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


# Feature Engineering

### let's drop unnecessary columns according to me

In [14]:
unnecessary = ['PassengerId', 'Name', 'Ticket', 'Embarked']
train_df.drop(columns=unnecessary, axis=1, inplace=True)
test_passenger = test_df['PassengerId']
test_df.drop(columns=unnecessary, axis=1, inplace=True)

In [15]:
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


### Since pclass has 3 output classes 1st, 2nd, 3rd we can divide it into diff columns using OneHotEncoding

In [16]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

encoder.fit_transform(train_df[['Pclass']]).toarray()
encoder.fit_transform(test_df[['Pclass']]).toarray()

new_train_df = pd.DataFrame(encoder.fit_transform(train_df[['Pclass']]).toarray(),columns=encoder.get_feature_names_out())
new_test_df = pd.DataFrame(encoder.fit_transform(test_df[['Pclass']]).toarray(),columns=encoder.get_feature_names_out())

In [17]:
df_encoded_train = pd.concat([train_df, new_train_df], axis=1)
df_encoded_test = pd.concat([test_df, new_test_df], axis=1)
train_df = df_encoded_train.drop('Pclass', axis=1)    ## Removing the Pclass column now
test_df = df_encoded_test.drop('Pclass', axis=1)


In [18]:
train_df.head()
test_df.head()


Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
0,male,34.5,0,0,7.8292,0.0,0.0,1.0
1,female,47.0,1,0,7.0,0.0,0.0,1.0
2,male,62.0,0,0,9.6875,0.0,1.0,0.0
3,male,27.0,0,0,8.6625,0.0,0.0,1.0
4,female,22.0,1,1,12.2875,0.0,0.0,1.0


### For Gender I am encoding it to binary classification

In [19]:
train_df['Sex'] = train_df['Sex'].map({'male': 1, 'female': 0})
test_df['Sex'] = test_df['Sex'].map({'male': 1, 'female': 0})


In [20]:
train_df.head()

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3
0,0,1,22.0,1,0,7.25,0.0,0.0,1.0
1,1,0,38.0,1,0,71.2833,1.0,0.0,0.0
2,1,0,26.0,0,0,7.925,0.0,0.0,1.0
3,1,0,35.0,1,0,53.1,1.0,0.0,0.0
4,0,1,35.0,0,0,8.05,0.0,0.0,1.0


### Merging sibsp and parch to a single column named family size

In [21]:
train_df['FamilySize'] = train_df['SibSp'] + train_df['Parch'] + 1
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1
train_df.drop(columns=['Parch', 'SibSp'], axis = 1, inplace=True)
test_df.drop(columns=['Parch', 'SibSp'], axis = 1, inplace=True)


In [22]:
train_df.head()

Unnamed: 0,Survived,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,FamilySize
0,0,1,22.0,7.25,0.0,0.0,1.0,2
1,1,0,38.0,71.2833,1.0,0.0,0.0,2
2,1,0,26.0,7.925,0.0,0.0,1.0,1
3,1,0,35.0,53.1,1.0,0.0,0.0,2
4,0,1,35.0,8.05,0.0,0.0,1.0,1


In [23]:
test_df.head()

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,FamilySize
0,1,34.5,7.8292,0.0,0.0,1.0,1
1,0,47.0,7.0,0.0,0.0,1.0,2
2,1,62.0,9.6875,0.0,1.0,0.0,1
3,1,27.0,8.6625,0.0,0.0,1.0,1
4,0,22.0,12.2875,0.0,0.0,1.0,3


## Now let's go to Model Training

In [24]:
x_train = train_df.drop(columns=['Survived']) ## independent features
y_train = train_df['Survived'] ## dependent feature
x_test = test_df ## independent features as it is test data it does not consist of survived feature we need to predict it




In [25]:
x_test.isnull().sum()

Sex           0
Age           0
Fare          1
Pclass_1      0
Pclass_2      0
Pclass_3      0
FamilySize    0
dtype: int64

In [26]:
x_test['Fare'].fillna(x_test['Fare'].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x_test['Fare'].fillna(x_test['Fare'].mean(), inplace=True)


In [27]:
x_test.isnull().sum()

Sex           0
Age           0
Fare          0
Pclass_1      0
Pclass_2      0
Pclass_3      0
FamilySize    0
dtype: int64

In [28]:
x_train

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,FamilySize
0,1,22.000000,7.2500,0.0,0.0,1.0,2
1,0,38.000000,71.2833,1.0,0.0,0.0,2
2,0,26.000000,7.9250,0.0,0.0,1.0,1
3,0,35.000000,53.1000,1.0,0.0,0.0,2
4,1,35.000000,8.0500,0.0,0.0,1.0,1
...,...,...,...,...,...,...,...
886,1,27.000000,13.0000,0.0,1.0,0.0,1
887,0,19.000000,30.0000,1.0,0.0,0.0,1
888,0,29.699118,23.4500,0.0,0.0,1.0,4
889,1,26.000000,30.0000,1.0,0.0,0.0,1


In [29]:
y_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [30]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test) 


 

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

model.fit(x_train_scaled, y_train)
test_pred = model.predict(x_test_scaled)



In [32]:
output_df = pd.DataFrame({'PassengerId': test_passenger.to_numpy(), 'Survived': test_pred})
output_df.to_csv('submission.csv', index=False)