# Titanic - Machine Learning from Disaster

Dataset: [Available on Kaggle](https://www.kaggle.com/competitions/titanic)

# Part 1: Dataset Analysis

In [78]:
import pandas as pd

In [79]:
# Load the training split from the working directory and preview the first
# three rows (the preview shows it carries the `Survived` target column).
train = pd.read_csv('train.csv')
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [80]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [81]:
train.dtypes.value_counts()

int64      5
object     5
float64    2
dtype: int64

In [82]:
train.isnull().sum().sort_values(ascending=False).head(5)

Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
dtype: int64

In [83]:
# Load the test split and preview the first three rows; unlike `train`,
# the preview shows no `Survived` column.
test = pd.read_csv('test.csv')
test.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [84]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [85]:
test.isnull().sum().sort_values(ascending=False).head(5)

Cabin          327
Age             86
Fare             1
PassengerId      0
Pclass           0
dtype: int64

# Part 2: Data Cleaning

In [86]:
train.nunique().sort_values(ascending=False)

PassengerId    891
Name           891
Ticket         681
Fare           248
Cabin          147
Age             88
SibSp            7
Parch            7
Pclass           3
Embarked         3
Survived         2
Sex              2
dtype: int64

In [87]:
# Remove the free-text / high-cardinality columns from the training frame.
train = train.drop(columns=['Name', 'Ticket', 'Cabin'])

In [88]:
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S


In [89]:
# Remove the same free-text / high-cardinality columns from the test frame.
test = test.drop(columns=['Name', 'Ticket', 'Cabin'])

In [90]:
test.head(3)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q


In [91]:
train.Age.mean()

29.69911764705882

In [92]:
# Replace missing training ages with the mean of the observed ages.
train['Age'] = train['Age'].fillna(train['Age'].mean())

In [93]:
# Impute missing test ages with the TRAINING-set mean age so both splits are
# preprocessed with the same fill value (previously the test set's own mean
# was used, so train and test rows received different imputed ages).
# Filling train's gaps with its own mean does not move that mean, so this
# value is the same whether or not train.Age has already been imputed.
test['Age'] = test['Age'].fillna(train['Age'].mean())

In [94]:
train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [95]:
train.Embarked.mode()[0]

'S'

In [96]:
# Replace missing embarkation ports with the most frequent port.
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])

In [97]:
# Impute the missing test fare with the TRAINING-set mean fare, keeping the
# fill value consistent with the data the models are fitted on (previously
# the test set's own mean was used).
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

In [98]:
train.isnull().sum().sort_values(ascending=False).head(5)

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
dtype: int64

In [99]:
test.isnull().sum().sort_values(ascending=False).head(5)

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
dtype: int64

In [100]:
# Names of the numeric columns in the training frame.
# select_dtypes(include='number') states the intent directly; the previous
# `dtypes != 'object'` test would also let through any non-numeric,
# non-object dtype (e.g. category or datetime) if one were ever present.
col_train_nr = train.select_dtypes(include='number').columns
col_train_nr

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [101]:
# Numeric-only view of the training data.
train_nr = train[col_train_nr]

In [102]:
# Names of the numeric columns in the test frame.
# select_dtypes(include='number') states the intent directly; the previous
# `dtypes != 'object'` test would also let through any non-numeric,
# non-object dtype (e.g. category or datetime) if one were ever present.
col_test_nr = test.select_dtypes(include='number').columns
col_test_nr

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [103]:
# Numeric-only view of the test data.
test_nr = test[col_test_nr]

# Part 3: Splitting the dataset into training and validation sets

In [104]:
from sklearn.model_selection import train_test_split

In [105]:
# Feature matrix: numeric columns minus the row identifier and the target.
X = train_nr.drop(columns=['PassengerId', 'Survived'])
# Target vector.
y = train['Survived']

In [106]:
# Hold out 33% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Part 4: Machine Learning

# Decision Tree

In [107]:
from sklearn import tree

In [108]:
clf_dt = tree.DecisionTreeClassifier(random_state=42)

In [109]:
clf_dt = clf_dt.fit(X_train,y_train)

In [110]:
y_pred_dt = clf_dt.predict(X_val)

# KNeighborsClassifier

In [111]:
from sklearn.neighbors import KNeighborsClassifier

In [112]:
# k-nearest-neighbours classifier with k fixed at 3.
# NOTE(review): no feature scaling is applied to X before fitting, so
# distance computations use the raw column values — confirm this is intended.
clf_knc = KNeighborsClassifier(n_neighbors=3)

In [113]:
clf_knc = clf_knc.fit(X_train,y_train)

In [114]:
y_pred_knc = clf_knc.predict(X_val)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# Logistic Regression

In [154]:
from sklearn.linear_model import LogisticRegression

In [155]:
clf_lr = LogisticRegression(random_state=42)

In [156]:
clf_lr = clf_lr.fit(X_train,y_train)

In [157]:
y_pred_lr = clf_lr.predict(X_val)

# Accuracy Score 

In [119]:
from sklearn.metrics import accuracy_score

###  Decision Tree

In [120]:
accuracy_score(y_val, y_pred_dt)

0.6169491525423729

### KNeighborsClassifier

In [121]:
accuracy_score(y_val, y_pred_knc)

0.6542372881355932

### Logistic Regression

In [159]:
accuracy_score(y_val, y_pred_lr)

0.7254237288135593

# Confusion Matrix

In [123]:
from sklearn.metrics import confusion_matrix

### Decision Tree

In [124]:
confusion_matrix(y_val, y_pred_dt)

array([[125,  50],
       [ 63,  57]])

### KNeighborsClassifier

In [125]:
confusion_matrix(y_val, y_pred_knc)

array([[133,  42],
       [ 60,  60]])

### Logistic Regression

In [158]:
confusion_matrix(y_val, y_pred_lr)

array([[156,  19],
       [ 62,  58]])

# Prediction for the test data

###### Logistic regression had the best validation accuracy, so it is the model used to predict on the test set

In [127]:
X_train.head(3)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
6,1,54.0,0,0,51.8625
718,3,29.699118,0,0,15.5
685,2,25.0,1,2,41.5792


In [128]:
test_nr.head(3)

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,892,3,34.5,0,0,7.8292
1,893,3,47.0,1,0,7.0
2,894,2,62.0,0,0,9.6875


In [129]:
# Test features: drop the identifier so the columns match those of X_train.
X_test = test_nr.drop(columns="PassengerId")

In [130]:
y_pred = clf_lr.predict(X_test)

In [131]:
test['Survived'] = y_pred

In [132]:
# Keep only the two columns written to the submission file.
base_for_submission = test.loc[:, ['PassengerId', 'Survived']]

In [133]:
# Write the submission file without the DataFrame index column.
base_for_submission.to_csv('results.csv', index=False)