In [None]:
#1. Importing libraries
import pandas as pd
import numpy as np
import statistics as st
from matplotlib import pyplot as plt
import seaborn as sb

#machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Read the training split into a DataFrame and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer="train.csv", sep=',')
dataset.head(n=5)

In [None]:
# Display the describe() summary statistics of the training frame.
dataset.describe()

In [None]:
#2. Cleaning "train.csv"

#Treating empty values

dataset.isnull().sum()

#Age column: missing ages are replaced by the median of the known ages
empty = dataset['Age'].isna().sum()
median_age = dataset['Age'].median()
dataset['Age'] = dataset['Age'].fillna(median_age)
dataset.isnull().sum()

#Cabin column: missing cabins are drawn at random from the most frequent cabin values
cabin_grouped = dataset.groupby(dataset['Cabin']).size().sort_values(ascending=False)
most_common_values = ["C23 C25 C27","G6","B96 B98"] #group of most shown values on the dataset for Cabin
#A fixed seed keeps the imputed cabins (and every model trained on them) identical
#across Restart & Run All. The fill Series carries dataset's own index so that
#fillna aligns each drawn value with its row by label.
cabin_rng = np.random.default_rng(42)
cabin_fill = pd.Series(
    cabin_rng.choice(most_common_values, size=len(dataset.index)),
    index=dataset.index,
)
dataset['Cabin'] = dataset['Cabin'].fillna(cabin_fill)
dataset.isnull().sum()

#Embarked: missing ports take the most frequent value, then ports become integer codes
embarked_grouped = dataset.groupby(dataset['Embarked']).size() #S is the higher
dataset['Embarked'] = dataset['Embarked'].fillna('S')
dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2}).astype(int)

dataset.head()

In [None]:
#Checking and treating the Titles

#The title is the word that directly precedes a '.' in the name.
#Raw string: '\.' is not a valid escape sequence in a normal string literal and
#newer Python versions emit a warning for it; the pattern itself is unchanged.
dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.crosstab(dataset['Title'],dataset['Sex'])

#Infrequent titles are pooled into a single 'Rare' category
dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')

#Spelling variants are folded into their common form
dataset['Title'] = dataset['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

pd.crosstab(dataset['Title'],dataset['Sex'])
dataset[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

#Setting titles to ints (0 is reserved for any title missing from the mapping)
title_numbers = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
dataset['Title'] = dataset['Title'].map(title_numbers)
dataset['Title'] = dataset['Title'].fillna(0)

In [None]:
#Grouping ages in ranges

# Ages are truncated to whole years, then replaced by one ordinal code per band:
#   0: <=18, 1: 19-30, 2: 31-40, 3: 41-50, 4: 51-60, 5: >=61
age_years = dataset['Age'].astype(int)
age_edges = [-np.inf, 18, 30, 40, 50, 60, np.inf]
dataset['Age'] = pd.cut(age_years, bins=age_edges, labels=False).astype(int)

age_grouped = dataset.groupby('Age').size()
age_grouped

In [None]:
#Treating 'Sibsp' and 'Parch

# Family = SibSp + Parch + 1 (the passenger's own row is counted as well).
dataset['Family'] = dataset['SibSp'].add(dataset['Parch']).add(1)

# IsAlone is 1 exactly when the family size is 1, otherwise 0.
dataset['IsAlone'] = dataset['Family'].eq(1).astype(int)

dataset.head()

In [None]:
#Treating Fare

fare_size = dataset.groupby('Fare').size()
fare_size

# Every mask is built from the raw fares before any band code is written back.
raw_fare = dataset['Fare']
fare_bands = [
    (raw_fare <= 8, 0),
    ((raw_fare > 8) & (raw_fare <= 15), 1),
    ((raw_fare > 15) & (raw_fare <= 31), 2),
    (raw_fare > 31, 3),
]
for band_mask, band_code in fare_bands:
    dataset.loc[band_mask, 'Fare'] = band_code

# Band counts are taken before the integer cast below.
fare_size = dataset.groupby('Fare').size()
fare_size

dataset['Fare'] = dataset['Fare'].astype(int)

fare_size

In [None]:
#Treating Cabin letter

#Transforming Cabin in only letters

# Keep only the first character of each cabin string.
dataset['Cabin_letter'] = dataset['Cabin'].str.get(0)
grouped_cabin = dataset.groupby('Cabin_letter').size()
grouped_cabin

# One integer code per letter; astype(int) fails loudly if a letter is not mapped.
cabin_mapping = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6,
    'T': 7,
}
cabin_codes = dataset['Cabin_letter'].map(cabin_mapping)
dataset['Cabin_letter'] = cabin_codes.astype(int)

In [None]:
#Treating sex

# Binary encoding: female -> 1, male -> 0.
sex_codes = {'female': 1, 'male':0}
dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)

sex_size = dataset.groupby('Sex').size()
sex_size

In [None]:
#Checking for duplicated data in id
# duplicated() is called without a column subset, so a row is only counted when it
# repeats an earlier row in every column; the sum is the number of such rows.
duplicated = dataset.duplicated().sum()

In [None]:
#Checking for wrong values on columns we have values specified:

# Row counts per distinct value; an unexpected category would show up as an extra row.
checking_survived = dataset.groupby('Survived').size()
checking_survived

checking_pclass = dataset.groupby('Pclass').size()
checking_pclass

checking_sex = dataset.groupby('Sex').size()
checking_sex

In [None]:
#Checking for correlations - pivoting features against each other

# Mean of 'Survived' per feature value, highest first. Only the last expression
# of the cell is rendered as output.
dataset.groupby('Pclass', as_index=False)['Survived'].mean().sort_values(by='Survived', ascending=False)
dataset.groupby('Sex', as_index=False)['Survived'].mean().sort_values(by='Survived', ascending=False)
dataset.groupby('Age', as_index=False)['Survived'].mean().sort_values(by='Survived', ascending=False)

In [None]:
#3. Creating some graphs to understand our dataset
dataset.head()

'''Not interesting for graphs:
Name, Ticket, SibSp, Parch, PassengerId
'''

#Graph1: Sex x Survived
#Graph2: Ages x Survived
#Graph3: Fare x Survived
#Graph4: PClass x Survived

#Explicit copy: helper columns are added below and must not be written into a
#filtered slice of `dataset`
survived = dataset[dataset['Survived'] == 1].copy()
survived_by_sex = survived.groupby('Sex').size()

#At this point 'Age' and 'Fare' no longer hold years / prices: the earlier cells
#replaced them with band codes (Age 0-5, Fare 0-3). Cutting those codes again with
#year or price bins would place every survivor in the first bin, so each code is
#translated back to the band it stands for instead.
labels = ['0-18','19-30','31-40','41-50','51-60','61+']
survived['age_1'] = survived['Age'].map(dict(enumerate(labels)))
#reindex keeps the bands in order and shows empty bands as 0
survived_by_age = survived.groupby('age_1').size().reindex(labels, fill_value=0)

labels_fare = ['0-8','8-15','15-31','31+']
survived['fare_bins'] = survived['Fare'].map(dict(enumerate(labels_fare)))
survived_by_fare = survived.groupby('fare_bins').size().reindex(labels_fare, fill_value=0)

pclass_survived = survived.groupby('Pclass').size()

#One figure with a 2x2 grid; suptitle titles the whole figure, whereas plt.title
#called before any subplot exists would title an implicit extra axes.
fig, axes = plt.subplots(2, 2, figsize=(15,9))
fig.suptitle('Informations of survivors')
survived_by_sex.plot(kind='bar', color='black', ax=axes[0, 0], title='Survivors by sex')
survived_by_age.plot(kind='bar', color='black', ax=axes[0, 1], title='Survivors by age band')
survived_by_fare.plot(kind='bar', color='black', ax=axes[1, 0], title='Survivors by fare band')
pclass_survived.plot(kind='bar', color='black', ax=axes[1, 1], title='Survivors by passenger class')

plt.tight_layout()

In [None]:
#4. Creating our x and y for train.csv

#Columns that are not passed to the models, each listed once
#('Survived' is removed here because it is used as the target).
drop_from_main = ["Name","Ticket","PassengerId","Cabin","Survived","SibSp","Parch","Family"]
dataset_clean = dataset.drop(columns=drop_from_main)

In [None]:
#4. Importing and testing the "test.csv" dataset to predict "Survived"

# Read the test split; only the last expression (null counts per column) is rendered.
test_dataset = pd.read_csv(filepath_or_buffer="test.csv", sep=',')
test_dataset.head(n=5)
test_dataset.describe()
test_dataset.isna().sum()

In [None]:
#Treating data for our test dataset:

#Age: missing ages take the median age of the test frame

td_median_age = test_dataset['Age'].median()
td_median_age
test_dataset['Age'] = test_dataset['Age'].fillna(td_median_age)

#Cabin: missing cabins take one fixed placeholder value
cabin_grouped_test = test_dataset.groupby(test_dataset['Cabin']).size().sort_values(ascending=False)
test_dataset['Cabin'] = test_dataset['Cabin'].fillna("B57 B59 B63 B66")
test_dataset['Cabin_letter'] = test_dataset['Cabin'].str[0]
#Letter counts must come from the test frame, not from the training frame
grouped_cabin_test = test_dataset.groupby('Cabin_letter').size()

#Fare: missing fares take the median fare of the test frame
td_fare_median = test_dataset['Fare'].median()
td_fare_median
test_dataset['Fare'] = test_dataset['Fare'].fillna(td_fare_median)

#Checking for wrong values

checking_pclass = test_dataset.groupby(test_dataset['Pclass']).size()
checking_pclass

checking_sex = test_dataset.groupby(test_dataset['Sex']).size()
checking_sex


In [None]:
#Treating all columns of our test.csv
#The steps below repeat, on test_dataset, the encodings applied to the training frame.

#Embarked: missing ports take 'S', then ports become integer codes
embarked_grouped_test = test_dataset.groupby(test_dataset['Embarked']).size() #S is the higher
test_dataset['Embarked'] = test_dataset['Embarked'].fillna('S')
test_dataset['Embarked'] = test_dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2}).astype(int)

#Checking and treating the Titles

#Raw string: '\.' is not a valid escape sequence in a normal string literal and
#newer Python versions emit a warning for it; the pattern itself is unchanged.
test_dataset['Title'] = test_dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.crosstab(test_dataset['Title'],test_dataset['Sex'])

#Infrequent titles are pooled into a single 'Rare' category
test_dataset['Title'] = test_dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col','Don','Dr','Major','Rev','Sir','Jonkheer','Dona'], 'Rare')

#Spelling variants are folded into their common form
test_dataset['Title'] = test_dataset['Title'].replace('Mlle','Miss')
test_dataset['Title'] = test_dataset['Title'].replace('Ms', 'Miss')
test_dataset['Title'] = test_dataset['Title'].replace('Mme', 'Mrs')

pd.crosstab(test_dataset['Title'],test_dataset['Sex'])

#Setting titles to ints (0 is reserved for any title missing from the mapping)
title_numbers_test = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
test_dataset['Title'] = test_dataset['Title'].map(title_numbers_test)
test_dataset['Title'] = test_dataset['Title'].fillna(0)

#Grouping ages in ranges

test_dataset['Age'] = test_dataset['Age'].astype(int)

#The band codes (0-5) are all below 19, so a row that was already recoded can
#never match one of the later conditions.
test_dataset.loc[test_dataset['Age'] <= 18, 'Age'] = 0
test_dataset.loc[(test_dataset['Age'] >= 18) & (test_dataset['Age'] <= 30), 'Age'] = 1
test_dataset.loc[(test_dataset['Age'] >= 31) & (test_dataset['Age'] <= 40), 'Age'] = 2
test_dataset.loc[(test_dataset['Age'] >= 41) & (test_dataset['Age'] <= 50), 'Age'] = 3
test_dataset.loc[(test_dataset['Age'] >= 51) & (test_dataset['Age'] <= 60), 'Age'] = 4
test_dataset.loc[test_dataset['Age'] >= 61, 'Age'] = 5

age_grouped_test = test_dataset.groupby(test_dataset['Age']).size()
age_grouped_test

#Treating 'Sibsp' and 'Parch

#Family = SibSp + Parch + 1; IsAlone flags rows whose family size is exactly 1
test_dataset['Family'] = test_dataset['SibSp'] + test_dataset['Parch'] + 1

test_dataset['IsAlone'] = 0
test_dataset.loc[test_dataset['Family'] == 1, 'IsAlone'] = 1

#Treating Fare

fare_size = test_dataset.groupby(test_dataset['Fare']).size()
fare_size

#The band codes (0-3) are all <= 8, so the later '> 8' conditions cannot re-match them
test_dataset.loc[ test_dataset['Fare'] <= 8, 'Fare'] = 0
test_dataset.loc[(test_dataset['Fare'] > 8) & (test_dataset['Fare'] <= 15), 'Fare'] = 1
test_dataset.loc[(test_dataset['Fare'] > 15) & (test_dataset['Fare'] <= 31), 'Fare']   = 2
test_dataset.loc[ test_dataset['Fare'] > 31, 'Fare'] = 3

fare_size = test_dataset.groupby(test_dataset['Fare']).size()
fare_size

test_dataset['Fare'] = test_dataset['Fare'].astype(int)

#Treating Cabin letter

#Transforming Cabin in only letters

test_dataset['Cabin_letter'] = test_dataset['Cabin'].str[0]
grouped_cabin_test = test_dataset.groupby('Cabin_letter').size()
grouped_cabin_test

#astype(int) fails loudly if a cabin letter is missing from the mapping
cabin_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
test_dataset['Cabin_letter'] = test_dataset['Cabin_letter'].map(cabin_mapping).astype(int)

#Treating sex

test_dataset['Sex'] = test_dataset['Sex'].map({'female': 1, 'male':0}).astype(int)

sex_size = test_dataset.groupby(test_dataset['Sex']).size()
sex_size

In [None]:
#Dropping some columns that we don't need

# Raw text columns and the intermediate family columns are removed.
keys_to_drop = ['Name','Ticket','Cabin','SibSp','Parch','Family']

test_dataset_clean = test_dataset.drop(columns=keys_to_drop)
test_dataset_clean.head()

In [None]:
#Reordering test table

# Select exactly these columns, in this order (any column not listed is left out).
new_order = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'Embarked',
    'Title',
    'IsAlone',
    'Cabin_letter',
]
test_dataset_clean = test_dataset_clean.loc[:, new_order]

test_dataset_clean.head()

In [None]:
#TRAINING AND PREDICTING

# x_tr: training features (dataset_clean)
# y_tr: training target   (dataset['Survived'])
# x_te: test features     (test_dataset_clean)
x_tr, y_tr, x_te = dataset_clean, dataset['Survived'], test_dataset_clean

# Shapes of the three objects, in the same order.
tuple(part.shape for part in (x_tr, y_tr, x_te))

In [None]:
'''
Logistic Regression
KNN or k-Nearest Neighbors
Support Vector Machines
Naive Bayes classifier
Decision Tree
Random Forrest
Perceptron
Artificial neural network
RVM or Relevance Vector Machine
'''

#a. Logistic Regression
# Fit on the training matrix, predict the test matrix, and report the score on the
# training data itself as a percentage (not a held-out estimate).
l_reg = LogisticRegression().fit(x_tr, y_tr)
y_pred = l_reg.predict(x_te)
score_log = round(100 * l_reg.score(x_tr, y_tr), 2)
score_log

In [None]:
#b. Support Vector Machines
# Fit, predict the test matrix, and report the training-set score as a percentage.
svc = SVC().fit(x_tr, y_tr)
y_pred = svc.predict(x_te)
score_svc = round(100 * svc.score(x_tr, y_tr), 2)
score_svc

In [None]:
#c. k-Nearest Neighbors

# Three-neighbour classifier; the score is computed on the training data itself.
knn = KNeighborsClassifier(n_neighbors=3).fit(x_tr, y_tr)
y_pred = knn.predict(x_te)
score_knn = round(100 * knn.score(x_tr, y_tr), 2)
score_knn

In [None]:
#d. Gaussian Naive Bayes

# Fit, predict the test matrix, and report the training-set score as a percentage.
gaussian = GaussianNB().fit(x_tr, y_tr)
y_pred = gaussian.predict(x_te)
score_gaussian = round(100 * gaussian.score(x_tr, y_tr), 2)
score_gaussian

In [None]:
#e. Perceptron

# Fit, predict the test matrix, and report the training-set score as a percentage.
perceptron = Perceptron().fit(x_tr, y_tr)
y_pred = perceptron.predict(x_te)
score_perceptron = round(100 * perceptron.score(x_tr, y_tr), 2)
score_perceptron

In [None]:
#f. Linear SVC

# Fit, predict the test matrix, and report the training-set score as a percentage.
linear_svc = LinearSVC().fit(x_tr, y_tr)
y_pred = linear_svc.predict(x_te)
score_linear_svc = round(100 * linear_svc.score(x_tr, y_tr), 2)
score_linear_svc

In [None]:
#g. Stochastic Gradient Descent

# SGD shuffles the training data while fitting; a fixed random_state makes the
# fitted model, and therefore the reported score, identical on every run.
sgd = SGDClassifier(random_state=42)
sgd.fit(x_tr,y_tr)
y_pred = sgd.predict(x_te)
# Score on the training data itself, as a percentage (not a held-out estimate).
score_sgd = round(sgd.score(x_tr,y_tr)*100,2)
score_sgd

In [None]:
#h. Decision Tree
# A fixed random_state makes the fitted tree, and therefore the reported score,
# reproducible across runs.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(x_tr, y_tr)
y_pred = decision_tree.predict(x_te)
# Score on the training data itself, as a percentage (not a held-out estimate).
score_decision_tree = round(decision_tree.score(x_tr, y_tr) * 100, 2)
score_decision_tree

In [None]:
#i. Random Forest

# The forest is built from random resamples and feature subsets; a fixed
# random_state keeps the fitted model, its score and its test predictions
# identical across Restart & Run All.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(x_tr, y_tr)
y_pred = random_forest.predict(x_te)
# Score on the training data itself, as a percentage (not a held-out estimate).
score_random_forest = round(random_forest.score(x_tr, y_tr) * 100, 2)
score_random_forest

In [None]:
#Models evaluation

# Each model name is paired with its own score on the same line, so a label can
# never be matched with another model's score. Scores are training-set accuracies
# in percent, as computed in the model cells.
model_scores = [
    ('Support Vector Machines', score_svc),
    ('KNN', score_knn),
    ('Logistic Regression', score_log),
    ('Random Forest', score_random_forest),
    ('Naive Bayes', score_gaussian),
    ('Perceptron', score_perceptron),
    ('Stochastic Gradient Decent', score_sgd),
    ('Linear SVC', score_linear_svc),
    ('Decision Tree', score_decision_tree),
]
models_evaluation = pd.DataFrame(model_scores, columns=['Model', 'Score'])

models_evaluation.sort_values(by='Score',ascending=False)

In [None]:
#Creating our submission dataset

# Predictions are taken explicitly from the random forest (the last model fitted
# when the notebook runs top to bottom) rather than from the shared `y_pred`
# variable, which is overwritten by whichever model cell was executed last.
submission = pd.DataFrame({
    "PassengerId": test_dataset['PassengerId'],
    "Survived": random_forest.predict(x_te)
})

In [None]:
# Write the submission frame to disk; index=False leaves the row index out of the file.
submission.to_csv('Submission file.csv',index=False)