In [None]:
#1. Importing libraries
import pandas as pd
import numpy as np
import statistics as st
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sb

In [None]:
dataset = pd.read_csv("train.csv",sep=',')
dataset.head()

In [None]:
dataset.describe()

In [None]:
#2. Cleaning "train.csv"

#Treating empty values

dataset.isnull().sum()

#Age column
empty = dataset['Age'].isna().sum()
median_age = dataset['Age'].median()
dataset['Age'] = dataset['Age'].fillna(median_age)
dataset.isnull().sum()

#Cabin column
cabin_grouped = dataset.groupby(dataset['Cabin']).size().sort_values(ascending=False)
most_common_values = ["C23 C25 C27","G6","B96 B98"] #group of most shown values on the dataset for Cabin
dataset['Cabin'] = dataset['Cabin'].fillna(pd.Series(np.random.choice(most_common_values,size=len(dataset.index))))
dataset.isnull().sum()

#Embarked
embarked_grouped = dataset.groupby(dataset['Embarked']).size() #S is the higher 
dataset['Embarked'] = dataset['Embarked'].fillna("S")
dataset.isnull().sum()

In [None]:
#Checking for duplicated data in id
duplicated = dataset.duplicated().sum()

In [None]:
#Checking for wrong values on columns we have values specified:

checking_survived = dataset.groupby(dataset['Survived']).size()
checking_survived

checking_pclass = dataset.groupby(dataset['Pclass']).size()
checking_pclass

checking_sex = dataset.groupby(dataset['Sex']).size()
checking_sex

In [None]:
#3. Creating some graphs to understand our dataset
dataset.head()

'''Not interesting for graphs:
Name, Ticket, SibSp, Parch, PassengerId
'''

#Graph1: Sex x Survived
#Graph2: Ages x Survived
#Graph3: Fare x Survived
#Graph4: PClass x Survived

survived = (dataset[dataset['Survived'] == 1])
survived_by_sex = (survived).groupby('Sex').size()

bins = [0, 18, 30, 40, 50, 60, 80, float('inf')]
labels = ['0-18','19-30','31-40','41-50','51-60','61-80','81+']

survived['age_1']=pd.cut(survived['Age'],bins=bins, labels=labels, right=False)
survived_by_age = survived.groupby('age_1').size()

bins_fare = [0, 100, 200, 300, 400, 500, 600]
labels_fare = ['0-100','101-200','201-300','301-400','401-500','501-600']

survived['fare_bins'] = pd.cut(survived['Fare'],bins=bins_fare, labels=labels_fare, right=False)
survived_by_fare = survived.groupby('fare_bins').size()

pclass_survived = (survived).groupby('Pclass').size()

plt.figure(figsize=(15,9))
plt.title('Informations of survivors')
plt.subplot(2,2,1)
survived_by_sex.plot(kind='bar',color='black')
plt.subplot(2,2,2)
survived_by_age.plot(kind='bar',color='black')
plt.subplot(2,2,3)
survived_by_fare.plot(kind='bar',color='black')
plt.subplot(2,2,4)
pclass_survived.plot(kind='bar',color='black')

plt.tight_layout()



In [None]:
#Transforming Cabin in only letters

dataset['Cabin_letter'] = dataset['Cabin'].str[0]
grouped_cabin = dataset.groupby('Cabin_letter').size()

In [None]:
#4. Train our predict

drop_from_main = ["Name","Ticket","PassengerId","Cabin","Survived"]
dataset_clean = dataset.drop(drop_from_main,axis=1)

#Preparing category columns
data_encoded = pd.get_dummies(dataset_clean, columns=['Sex','Cabin_letter','Embarked'])

x = data_encoded
y = dataset['Survived']

In [None]:
#Create train and test splits with 20% for test
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [None]:
scaler = StandardScaler()
scaler.fit(x_train)
x_test = scaler.transform(x_test)
x_train = scaler.transform(x_train)

In [None]:
#Creating our MLP 

mlp = MLPClassifier(hidden_layer_sizes=(9), activation='logistic',max_iter=10000,alpha=1e-6, solver='sgd', verbose=10,tol = 1e-9,
                    random_state=1,learning_rate='adaptive', momentum=0.3,learning_rate_init=0.8 )

In [None]:
#training
mlp.fit(x_train,y_train)

In [None]:
#predicting with test
predictions = mlp.predict(x_test)
predictions

In [None]:
#confusion matrix
print(confusion_matrix(y_test,predictions))

In [None]:
#classification report
print(classification_report(y_test,predictions))

In [None]:
#4. Importing and testing the "test.csv" dataset to predict "Survived"

test_dataset = pd.read_csv("test.csv", sep=',')
test_dataset.head()
test_dataset.describe()
test_dataset.isnull().sum()

In [None]:
#Treating data for our test dataset:

#Age

td_median_age = test_dataset['Age'].median()
td_median_age
test_dataset['Age'] = test_dataset['Age'].fillna(td_median_age)

#Cabin
cabin_grouped_test = test_dataset.groupby(test_dataset['Cabin']).size().sort_values(ascending=False)
test_dataset['Cabin'] = test_dataset['Cabin'].fillna("B57 B59 B63 B66")
test_dataset['Cabin_letter'] = test_dataset['Cabin'].str[0]
grouped_cabin_test = dataset.groupby('Cabin_letter').size()

#Fare
td_fare_median = test_dataset['Fare'].median()
td_fare_median
test_dataset['Fare'] = test_dataset['Fare'].fillna(td_fare_median)

#Checking for wrong values

checking_pclass = test_dataset.groupby(test_dataset['Pclass']).size()
checking_pclass

checking_sex = test_dataset.groupby(test_dataset['Sex']).size()
checking_sex


In [None]:
#Dropping some columns that we don't need

keys_to_drop = ['Name','Ticket','Cabin', 'PassengerId']

test_dataset_clean = test_dataset.drop(keys_to_drop, axis=1)
test_dataset_clean.head()

test_data_encoded = pd.get_dummies(test_dataset_clean, columns=['Sex','Cabin_letter','Embarked'])

test_data_encoded['Cabin_letter_T'] = False

x_new = test_data_encoded
x_new

In [70]:
#Predict 'Survived'
test_dataset_clean['Survived'] = mlp.predict(x_new)



In [71]:
survived_counts = test_dataset_clean['Survived'].value_counts()
print(survived_counts)

Survived
0    216
1    202
Name: count, dtype: int64


In [75]:
test_dataset_clean['PassengerId'] = test_dataset['PassengerId']
test_dataset_clean

columns_to_drop = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Cabin_letter']

final = test_dataset_clean.drop(columns_to_drop,axis=1)

In [77]:
#Saving our final table with the predictions
final.to_csv('final_prediction.csv', index=False)