In [None]:
#1. Importing libraries
import pandas as pd
import numpy as np
import statistics as st
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sb

In [None]:
# Load the training data; the file is comma-separated, which is read_csv's default.
dataset = pd.read_csv("train.csv")
# Preview the first rows (cell output).
dataset.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns;
# a `count` lower than the number of rows points at missing values in that column.
dataset.describe()

In [None]:
#2. Cleaning "train.csv"

#Treating empty values
# Each step fills one column in place on `dataset`; the per-column count of
# remaining missing values is shown once, as the cell output, at the end.

#Age column: missing ages are replaced by the median age
empty = dataset['Age'].isna().sum()  # number of missing ages before filling
median_age = dataset['Age'].median()
dataset['Age'] = dataset['Age'].fillna(median_age)

#Cabin column: missing cabins are replaced by a random pick among frequent cabins
cabin_grouped = dataset.groupby(dataset['Cabin']).size().sort_values(ascending=False)
most_common_values = ["C23 C25 C27","G6","B96 B98"] #group of most shown values on the dataset for Cabin
# Seeded generator: without a fixed seed every run imputes different cabins, so
# the features (and everything trained on them) change between runs.
cabin_rng = np.random.default_rng(42)
# Build the filler on dataset's own index so fillna aligns row by row even if
# the frame is not indexed 0..n-1.
cabin_fill = pd.Series(
    cabin_rng.choice(most_common_values, size=len(dataset.index)),
    index=dataset.index,
)
dataset['Cabin'] = dataset['Cabin'].fillna(cabin_fill)
# NOTE(review): imputing with random frequent cabins invents information for
# every row that had no cabin — consider a dedicated "unknown" value instead.

#Embarked: missing ports are replaced by "S"
embarked_grouped = dataset.groupby(dataset['Embarked']).size() #S is the most frequent value (per the original note)
dataset['Embarked'] = dataset['Embarked'].fillna("S")

# Remaining missing values per column (cell output).
dataset.isnull().sum()

In [None]:
#Checking for duplicated data
# Rows that are identical to an earlier row in every column.
duplicated = dataset.duplicated().sum()
# Passenger ids that appear more than once. The row-level check above cannot
# detect two different rows sharing the same id, so it is counted separately.
duplicated_ids = dataset['PassengerId'].duplicated().sum()

# Show both counts as the cell output (the original computed but never displayed them).
pd.Series({'duplicated_rows': duplicated, 'duplicated_ids': duplicated_ids})

In [None]:
#Checking for wrong values on columns we have values specified:
# Each Series holds the number of rows per distinct value of one column, so an
# unexpected value would show up as an extra entry.
checking_survived = dataset.groupby(dataset['Survived']).size()
checking_pclass = dataset.groupby(dataset['Pclass']).size()
checking_sex = dataset.groupby(dataset['Sex']).size()

# A notebook cell renders only its last expression, so the three counts are
# stacked into one Series keyed by column name to make all of them visible.
# Value labels are cast to str only for this combined view, so the integer and
# text labels can share one index level; the three Series above are unchanged.
pd.concat(
    [
        checking_survived.rename(index=str),
        checking_pclass.rename(index=str),
        checking_sex.rename(index=str),
    ],
    keys=['Survived', 'Pclass', 'Sex'],
    names=['column', 'value'],
)

In [None]:
#3. Creating some graphs to understand our dataset

# Not interesting for graphs: Name, Ticket, SibSp, Parch, PassengerId

#Graph1: Sex x Survived
#Graph2: Ages x Survived
#Graph3: Fare x Survived
#Graph4: PClass x Survived

# .copy() gives an independent frame: the helper bin columns added below are
# then written to `survived` itself instead of to a slice of `dataset`
# (which raises SettingWithCopyWarning and may silently not stick).
survived = dataset[dataset['Survived'] == 1].copy()
survived_by_sex = survived.groupby('Sex').size()

bins = [0, 18, 30, 40, 50, 60, 80, float('inf')]
labels = ['0-18','19-30','31-40','41-50','51-60','61-80','81+']

# Right-closed intervals so the edges agree with the labels: age 18 belongs to
# '0-18', not '19-30'. include_lowest keeps a value of exactly 0 in the first bin.
survived['age_1'] = pd.cut(
    survived['Age'], bins=bins, labels=labels, right=True, include_lowest=True
)
# observed=False keeps empty bins in the result so every label appears on the axis.
survived_by_age = survived.groupby('age_1', observed=False).size()

bins_fare = [0, 100, 200, 300, 400, 500, 600]
labels_fare = ['0-100','101-200','201-300','301-400','401-500','501-600']

# Same interval convention as for age: a fare of exactly 100 is counted in '0-100'.
survived['fare_bins'] = pd.cut(
    survived['Fare'], bins=bins_fare, labels=labels_fare, right=True, include_lowest=True
)
survived_by_fare = survived.groupby('fare_bins', observed=False).size()

pclass_survived = survived.groupby('Pclass').size()

# Explicit figure/axes: the figure-level title goes on the figure (suptitle)
# rather than on an implicit extra axes created by calling plt.title first.
fig, axes = plt.subplots(2, 2, figsize=(15, 9))
fig.suptitle('Informations of survivors')

panels = [
    (survived_by_sex, axes[0, 0], 'Survivors by sex'),
    (survived_by_age, axes[0, 1], 'Survivors by age group'),
    (survived_by_fare, axes[1, 0], 'Survivors by fare range'),
    (pclass_survived, axes[1, 1], 'Survivors by passenger class'),
]
for counts, ax, panel_title in panels:
    counts.plot(kind='bar', color='black', ax=ax)
    ax.set(title=panel_title, ylabel='Number of survivors')

fig.tight_layout()



In [64]:
# Preview of the cleaned frame.
# NOTE(review): the saved output of this cell already lists a `Cabin_letter`
# column, yet that column is only created in a later cell — the cell appears to
# have been executed out of order. Presumably a fresh Restart & Run All would
# not show `Cabin_letter` here; confirm and reorder or re-run.
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin_letter
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S,B
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,G6,S,G
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S,B


In [None]:
# Keep only the first character of each cabin value in a new column.

dataset['Cabin_letter'] = dataset['Cabin'].str.get(0)
# Number of rows per cabin letter (kept for inspection).
grouped_cabin = dataset.groupby(by='Cabin_letter').size()

In [65]:
#4. Preparing the features and the target for training

# Columns left out of the feature matrix: free-text / identifier columns, the
# raw Cabin value (its first letter is kept in Cabin_letter) and the target itself.
drop_from_main = ["Name","Ticket","PassengerId","Cabin","Survived"]
dataset_clean = dataset.drop(columns=drop_from_main)

# One-hot encode the categorical columns; the remaining columns pass through unchanged.
data_encoded = pd.get_dummies(
    dataset_clean,
    columns=['Sex','Cabin_letter','Embarked'],
)

# Target vector and feature matrix.
y = dataset['Survived']
x = data_encoded

In [66]:
#Create train and test splits with 20% for test
# random_state pins the shuffle: without it every run produces a different
# split, so downstream results cannot be reproduced or compared between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [67]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transformation is then applied to the test split.
# Note: this cell overwrites x_train / x_test with their scaled versions.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)