In [1]:
import pandas as pd # data processing
import numpy as np # linear algebra
import datetime as dt # to convert dates

# data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# some functions that we'll need
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier()

from sklearn.ensemble import RandomForestClassifier

r = RandomForestClassifier() 

from sklearn.tree import DecisionTreeClassifier

d = DecisionTreeClassifier()

from sklearn.linear_model import LogisticRegression

log = LogisticRegression()

from sklearn.neighbors import KNeighborsClassifier

k = KNeighborsClassifier()

from sklearn.naive_bayes import GaussianNB

from sklearn.naive_bayes import BernoulliNB

g = GaussianNB()

b = BernoulliNB()

from xgboost import XGBClassifier

xgbc = XGBClassifier()

In [3]:
# import data and merge them into one (which we'll split later)
train=pd.read_csv('ttrain.csv')
test=pd.read_csv('ttest.csv')
df =train.append(test)
df.shape # dataset contains 1309 rows and 12 columns

(1309, 12)

In [4]:
train.shape, test.shape

((891, 12), (418, 11))

In [5]:
df.head() # first 5 rows of the dataset

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
df.isnull().sum() # there are some empty data we'll fill the blanks of course

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [7]:
# Values in 'Title' column sorted by counts
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
df['Title'].value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Major         2
Ms            2
Lady          1
Sir           1
Mme           1
Don           1
Capt          1
Countess      1
Jonkheer      1
Dona          1
Name: Title, dtype: int64

In [8]:
# The column 'Title' contains so many different names we don't need this variety
df["Title"] = df["Name"].str.extract(" ([A-Za-z]+)\.")
df["Title"] = df["Title"].replace(["Ms","Mlle"] , "miss")
df["Title"] = df["Title"].replace(["Mme","Countess","Lady","Dona"],"Mrs") # evli kadın
df["Title"] = df["Title"].replace(["Dr","Major","Col","Sir","Rev","Jonkheer","Capt","Don"],"Mr")

In [9]:
df["FamilySize"] = df["SibSp"] + df["Parch"] + 1  
df.FamilySize.value_counts() # siblings, parents and himself/herself values makes a new column 'FamilySize'

1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: FamilySize, dtype: int64

In [10]:
df['Age'].fillna(df.groupby("Title")["Age"].transform("median"), inplace=True)
df.groupby("Title")["Age"].transform("median")

0      30.0
1      35.5
2      22.0
3      35.5
4      30.0
       ... 
413    30.0
414    35.5
415    30.0
416    30.0
417     4.0
Name: Age, Length: 1309, dtype: float64

In [11]:
df.Embarked.value_counts() # Ports where passengers embarked

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [12]:
df['Embarked'] = df['Embarked'].fillna('S')

In [13]:
del df['Cabin']

In [14]:
del df['Fare']

In [15]:
df = pd.get_dummies(df, drop_first=True) 

In [16]:
# split data as train and test
df_train = df[:891]
df_test = df[891:]
df_train.shape , df_test.shape

((891, 2248), (418, 2248))

In [17]:
del df_train["Survived"]

In [18]:
y=train["Survived"]

In [19]:
algorithms = [g,b,k,log,gbc,r,d,xgbc]
names = ['GaussianNB', 'BernoulliNB','K Nearest', 'Logistic', 'Gradient Boosting', 'RandomForest', 'Decision Tree',"XGBC"]

In [20]:
def regression(x, y, algorithms = algorithms, names = names):

    #fit the data
    for i in range(len(algorithms)):
        algorithms[i] = algorithms[i].fit(x, y)

    accuracy = []
    precision = [] 
    recall = []
    f1 = []

    for i in range(len(algorithms)):

        accuracy.append(accuracy_score(y, algorithms[i].predict(x))) 
        precision.append(precision_score (y, algorithms[i].predict(x))) 
        recall.append(recall_score (y, algorithms[i].predict(x)))
        f1.append(f1_score (y, algorithms[i].predict(x))) 

    metrics = pd.DataFrame(columns = ['Accuracy', 'Precision', 'Recall',"F1"], index = names)
    metrics['Accuracy'] = accuracy
    metrics['Precision'] = precision
    metrics['Recall'] = recall
    metrics['F1'] = f1 

    return metrics.sort_values('F1', ascending = False)

In [21]:
regression(df_train , y)

Unnamed: 0,Accuracy,Precision,Recall,F1
GaussianNB,1.0,1.0,1.0,1.0
Decision Tree,1.0,1.0,1.0,1.0
RandomForest,0.998878,0.997085,1.0,0.99854
XGBC,0.995511,1.0,0.988304,0.994118
Gradient Boosting,0.857464,0.847896,0.766082,0.804916
Logistic,0.843996,0.797654,0.795322,0.796486
BernoulliNB,0.811448,0.77707,0.71345,0.743902
K Nearest,0.695847,0.641434,0.47076,0.543002


In [22]:
del df_test["Survived"]

In [23]:
prediction = gbc.predict(df_test)
prediction

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [24]:
final = test[["PassengerId"]]
final["Survived"] = prediction

In [25]:
final.to_csv("final_titanic.csv" , index = False)