In [54]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [55]:
# Get the data from training dataset
train_data = pd.read_csv("./Trab1/Data/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [56]:
# Remove data that will not be utilized
train_data = train_data.drop(["Name", "PassengerId","Ticket", "Fare", "Cabin"],axis="columns")
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [57]:
train_data.count()

Survived    891
Pclass      891
Sex         891
Age         714
SibSp       891
Parch       891
Embarked    889
dtype: int64

In [58]:
# Removing the line that has some error, like Age and Embarked
train_data.isnull().sum()
train_data = train_data.dropna()

In [59]:
#Verify if there is no null values
train_data.count()

Survived    712
Pclass      712
Sex         712
Age         712
SibSp       712
Parch       712
Embarked    712
dtype: int64

In [60]:
# The age has a high deviation, for now I will mantain and later will be changed
train_data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
count,712.0,712.0,712.0,712.0,712.0
mean,0.404494,2.240169,29.642093,0.514045,0.432584
std,0.491139,0.836854,14.492933,0.930692,0.854181
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,1.0,20.0,0.0,0.0
50%,0.0,2.0,28.0,0.0,0.0
75%,1.0,3.0,38.0,1.0,1.0
max,1.0,3.0,80.0,5.0,6.0


In [61]:
# Some method doesnd accepted string, like sex and embarked
train_data['Sex'].replace('female', 0,inplace=True)
train_data['Sex'].replace('male', 1,inplace=True)
train_data['Embarked'].replace('C', 0,inplace=True)
train_data['Embarked'].replace('S', 1,inplace=True)
train_data['Embarked'].replace('Q', 2,inplace=True)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,22.0,1,0,1
1,1,1,0,38.0,1,0,0
2,1,3,0,26.0,0,0,1
3,1,1,0,35.0,1,0,1
4,0,3,1,35.0,0,0,1


In [62]:
# Separated the label from training dataset
label = train_data["Survived"]
train_data_in = train_data.drop(["Survived"], axis="columns")
X_train, X_test, y_train, y_test = train_test_split( train_data_in, label, test_size=0.3, stratify=label)

In [63]:
#Prepare the training data, the same way then the training
test_data = pd.read_csv("./Trab1/Data/test.csv")
test_data = test_data.drop(["Name", "PassengerId","Ticket", "Fare", "Cabin"],axis="columns")
test_data.isnull().sum()
test_data = test_data.dropna()
test_data['Sex'].replace('female', 0,inplace=True)
test_data['Sex'].replace('male', 1,inplace=True)
test_data['Embarked'].replace('C', 0,inplace=True)
test_data['Embarked'].replace('S', 1,inplace=True)
test_data['Embarked'].replace('Q', 2,inplace=True)
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,1,34.5,0,0,2
1,3,0,47.0,1,0,1
2,2,1,62.0,0,0,2
3,3,1,27.0,0,0,1
4,3,0,22.0,1,1,1


In [64]:
# Training, First Method - Perceptron
ppn = Perceptron(max_iter=1000, eta0=0.9, random_state=2)
ppn.fit(X_train, y_train)

In [65]:
y_pred_ppn = ppn.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_ppn))

Accuracy: 0.72


In [66]:
# Training, Second Method - SVM
svm = SVC(kernel='rbf', random_state=1, gamma=1.4, C=20.0)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_svm))


Accuracy: 0.73


In [67]:
# Training, Third Method - Decision Tree
tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=3, 
                              random_state=1)
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_tree))

Accuracy: 0.80


In [68]:
# It was observed that, running some times the code the perceptron give instable results, while the SVM and decision tree stabilized around 0.75 and 0.8.
# Now I will classify the age in groups and test again
bins= [0,12,24,36,48,60,100]
labels = [0,1,2,3,4,5]
train_data_in['Age'] = pd.cut(train_data_in['Age'], bins=bins, labels=labels, right=False)
train_data_in.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,1,1,1,0,1
1,1,0,3,1,0,0
2,3,0,2,0,0,1
3,1,0,2,1,0,1
4,3,1,2,0,0,1


In [69]:
X_train, X_test, y_train, y_test = train_test_split( train_data_in, label, test_size=0.3, stratify=label)

ppn.fit(X_train, y_train)
y_pred_ppn2 = ppn.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_ppn2))

svm.fit(X_train, y_train)
y_pred_svm2 = svm.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_svm2))

tree.fit(X_train, y_train)
y_pred_tree2 = tree.predict(X_test)
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred_tree2))

Accuracy: 0.78
Accuracy: 0.78
Accuracy: 0.79


In [20]:
# The results with the age group is more stable for all the methods, around 0.8, the perceptron still with a little instability.