In [217]:
#Standard imports

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [218]:
#Model Evaluation libraries

from sklearn.model_selection import train_test_split

In [219]:
#Machine Learning libraries

from sklearn.linear_model import LinearRegression

#Performance validation libraries

from sklearn.metrics import mean_squared_error

In [220]:
#Reading the .csv file of the data

df_titanic = pd.read_csv("titanic.csv")

df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [221]:
#DATA CLEANING
#Finding the missing values in the data

df_titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [222]:
#Name and Titanic is unique for each passenger

df_titanic.drop(["Name", "Ticket"], axis=1, inplace=True)

In [223]:
#More than 77% missing values

df_titanic.drop(["Cabin"], axis=1, inplace=True)

In [224]:
#Filling the missing values by the mode of the data

df_titanic.Embarked = df_titanic.Embarked.fillna(value='S')

In [225]:
#Filling the missing values by the mean age

mean = df["Age"].mean()

df_titanic.Age= df_titanic.Age.fillna(value=mean)


In [226]:
#Cleaned Data

df_titanic.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [227]:
df_titanic

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.000000,1,0,7.2500,S
1,2,1,1,female,38.000000,1,0,71.2833,C
2,3,1,3,female,26.000000,0,0,7.9250,S
3,4,1,1,female,35.000000,1,0,53.1000,S
4,5,0,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.000000,0,0,13.0000,S
887,888,1,1,female,19.000000,0,0,30.0000,S
888,889,0,3,female,29.699118,1,2,23.4500,S
889,890,1,1,male,26.000000,0,0,30.0000,C


In [228]:
#Categorizing the data

numerical_columns = ['Age', 'Fare']

categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']

In [None]:
#Converting sex to a numerical attribute

for i in range(len(df_titanic.Sex)):
  if(df_titanic['Sex'][i] == 'male'):
    df_titanic['Sex'][i] = 0
  else:
    df_titanic['Sex'][i] = 1

In [None]:
#Converting Embarked to a numerical attribute

for i in range(len(df_titanic.Embarked)):
  if(df_titanic['Embarked'][i] == 'S'):
    df_titanic['Embarked'][i] = 0
  if(df_titanic['Embarked'][i] == 'C'):
    df_titanic['Embarked'][i] = 1
  else:
    df_titanic['Embarked'][i] = 2

In [231]:
#Converting Age from float to int

for i in range(len(df_titanic.Age)):
    df_titanic["Age"] = df_titanic["Age"].astype(int)

In [232]:
#Converting Fare from float to int

for i in range(len(df_titanic.Fare)):
    df_titanic["Fare"] = df_titanic["Fare"].astype(int)

In [233]:
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,0,22,1,0,7,2
1,2,1,1,1,38,1,0,71,1
2,3,1,3,1,26,0,0,7,2
3,4,1,1,1,35,1,0,53,2
4,5,0,3,0,35,0,0,8,2


In [208]:
#Selecting the features of the data

X = df_titanic.drop(['Survived'], axis=1)

X

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,22,1,0,7,2
1,2,1,1,38,1,0,71,1
2,3,3,1,26,0,0,7,2
3,4,1,1,35,1,0,53,2
4,5,3,0,35,0,0,8,2
...,...,...,...,...,...,...,...,...
886,887,2,0,27,0,0,13,2
887,888,1,1,19,0,0,30,2
888,889,3,1,29,1,2,23,2
889,890,1,0,26,0,0,30,1


In [209]:
#Selecting the target class of the data

y = df_titanic.Survived

y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [211]:
#Performing a train test split on the data

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=40)



In [212]:
#Model Build and fitting

reg = LinearRegression().fit(X_train, y_train)

In [213]:
#Model Predictions

y_pred = reg.predict(X_test)

y_pred[y_pred > 0.5] = 1
y_pred[y_pred < 0.5] = 0

In [214]:
#Model Validation

from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[109,  19],
       [ 26,  69]])