# 13. Machine learning techniques

In [None]:
import pandas
import matplotlib.pyplot as plt

In [None]:
import random as rd
# Seed Python's built-in `random` module.
# NOTE(review): the scikit-learn calls below take their own `random_state`
# argument, so this seed presumably does not control them - confirm.
rd.seed(0)

## 13.1 Loading and exploring the dataset

First, we use pandas to load the dataset from a csv file.

In [None]:
# Read the raw passenger records into a DataFrame; the bare name on the last
# line makes the notebook display it.
raw_data = pandas.read_csv('./titanic.csv')
raw_data

Next, we can explore the dataset.

In [None]:
# Examining the length of the dataset
print("The dataset has", len(raw_data), "rows")

In [None]:
# Examining the columns in the dataset
print("Columns (features of the dataset)")
raw_data.columns

In [None]:
# Examining the labels
print("Labels")
raw_data["Survived"]

In [None]:
# Count the survivors (the label column is summed, so it is treated as 0/1)
n_survivors = raw_data['Survived'].sum()
n_passengers = len(raw_data)
print(n_survivors,'passengers survived out of',n_passengers)

In [None]:
# One can look at several columns together
raw_data[["Name", "Age"]]

## 13.2. Cleaning up the data

Now, let's look at how many values are missing in each column.

In [None]:
raw_data.isna().sum()

The Cabin column is missing too many values to be useful. Let's drop it altogether.

In [None]:
raw_data['Cabin']

In [None]:
# Number of rows whose Cabin entry is NaN, reported against the column length
missing_cabins = raw_data['Cabin'].isna().sum()
print("The Cabin column is missing", missing_cabins, "values out of",len(raw_data['Cabin']))

In [None]:
# Work on a copy without the Cabin column; raw_data itself is left untouched
clean_data = raw_data.drop(columns=['Cabin'])

In [None]:
clean_data.head()

Other columns such as Age or Embarked are missing some values, but they can still be useful.

For the age column, let's fill in the missing values with the median of all ages.

For the Embarked column, let's make a new category called 'U', for Unknown port of embarkation.

In [None]:
clean_data['Age']

In [None]:
# Median of the Age column, taken from the raw data; used in the next cell to
# fill in the missing ages.
median_age = raw_data["Age"].median()
median_age

In [None]:
# Replace missing ages with the median; all other columns are left as they are
clean_data = clean_data.fillna({"Age": median_age})

In [None]:
# Missing ports of embarkation become their own category, 'U' (unknown)
clean_data = clean_data.fillna({"Embarked": 'U'})

In [None]:
clean_data.isna().sum()

In [None]:
clean_data.head()

### 13.2.3 Saving our data for the future

In [None]:
# Persist the cleaned data without the row index. `index` is documented as a
# bool; `None` only worked because it is falsy, so pass `False` explicitly.
clean_data.to_csv('./clean_titanic_data.csv', index=False)

## 13.3 Manipulating the features

- One-hot encoding
- Binning
- Feature selection

### 13.3.1 One-hot encoding

In [None]:
# Start the feature-engineering stage from the cleaned file written in the
# previous section, and preview its first rows.
preprocessed_data = pandas.read_csv('clean_titanic_data.csv')
preprocessed_data.head()

One-hot encoding the gender and port-of-embarkation features

In [None]:
# Build both sets of indicator columns (one column per category, named with
# the given prefix), then show them.
gender_columns = pandas.get_dummies(preprocessed_data['Sex'], prefix='Sex')
embarked_columns = pandas.get_dummies(preprocessed_data["Embarked"], prefix="Embarked")

print(gender_columns)
print(embarked_columns)

In [None]:
# Append the gender columns, then the embarked columns, to the right of the frame
preprocessed_data = pandas.concat(
    [preprocessed_data, gender_columns, embarked_columns], axis=1)

In [None]:
# The original categorical columns are now redundant with their one-hot versions
preprocessed_data = preprocessed_data.drop(columns=['Sex', 'Embarked'])

In [None]:
preprocessed_data.head()

### A rule of thumb for when to one-hot encode or not

In [None]:
# Survival rate per passenger class. Each class's survivor count must be
# divided by the size of that same class (the second- and third-class rates
# were previously divided by the number of first-class passengers).
class_survived = preprocessed_data[['Pclass', 'Survived']]

first_class = class_survived[class_survived['Pclass'] == 1]
second_class = class_survived[class_survived['Pclass'] == 2]
third_class = class_survived[class_survived['Pclass'] == 3]

print("In first class", sum(first_class['Survived'])/len(first_class)*100, "% of passengers survived")
print("In second class", sum(second_class['Survived'])/len(second_class)*100, "% of passengers survived")
print("In third class", sum(third_class['Survived'])/len(third_class)*100, "% of passengers survived")

In [None]:
# Swap the numeric Pclass column for one indicator column per class,
# appended at the right-hand side of the frame.
categorized_pclass_columns = pandas.get_dummies(preprocessed_data['Pclass'], prefix='Pclass')
preprocessed_data = pandas.concat(
    [preprocessed_data.drop(columns=['Pclass']), categorized_pclass_columns], axis=1)

In [None]:
preprocessed_data.head()

### 13.3.3 Binning

In [None]:
# Decade-wide age bins with edges 0, 10, ..., 80
bins = list(range(0, 81, 10))
categorized_age = pandas.cut(preprocessed_data['Age'], bins)

# Add the binned column at the end and remove the raw ages
preprocessed_data = (
    preprocessed_data
    .assign(Categorized_age=categorized_age)
    .drop(columns=["Age"])
)

In [None]:
preprocessed_data.head()

In [None]:
# One indicator column per age bin replaces the Categorized_age column
cagegorized_age_columns = pandas.get_dummies(preprocessed_data['Categorized_age'], prefix='Categorized_age')
preprocessed_data = pandas.concat(
    [preprocessed_data.drop(columns=['Categorized_age']), cagegorized_age_columns], axis=1)

In [None]:
preprocessed_data.head()

### 13.3.4 Feature selection

In [None]:
# Remove the Name, Ticket and PassengerId columns from the feature set
preprocessed_data = preprocessed_data.drop(columns=['Name', 'Ticket', 'PassengerId'])

In [None]:
preprocessed_data.head()

### 13.3.5 Saving for future use

In [None]:
# Persist the engineered features without the row index. `index` is documented
# as a bool; `None` only worked because it is falsy, so pass `False` explicitly.
preprocessed_data.to_csv('preprocessed_titanic_data.csv', index=False)

# 13.4 Training models

In [None]:
# Load the preprocessed features written at the end of the previous section
# and preview the first rows.
data = pandas.read_csv('./preprocessed_titanic_data.csv')
data.head()

### 13.4.1 Features-labels split and train-validation split

In [None]:
# "Survived" is the label to predict; every remaining column is a feature
labels = data["Survived"]
features = data.drop(columns=["Survived"])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Remark: we fix random_state so that we always get the same split.
# test_size=0.4 holds out 40% of the rows; that held-out part is divided into
# validation and test sets in the next cell.
features_train, features_validation_test, labels_train, labels_validation_test = train_test_split(
    features, labels, test_size=0.4, random_state=100)

In [None]:
# Split the held-out rows in half (test_size=0.5): one half for validation,
# the other for the final test. random_state is fixed here as well.
features_validation, features_test, labels_validation, labels_test = train_test_split(
    features_validation_test, labels_validation_test, test_size=0.5, random_state=100)

In [None]:
# Sanity check: sizes of the three feature sets, then of the three label sets
for split in (features_train, features_validation, features_test,
              labels_train, labels_validation, labels_test):
    print(len(split))

### 13.4.2 Training different models on our dataset

We'll train seven models:
- Logistic regression (perceptron)
- Decision tree
- Naive Bayes
- Support vector machine (SVM)
- Random forest
- Gradient boosting
- AdaBoost

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings, fit on the training split
lr_model = LogisticRegression()
lr_model.fit(features_train, labels_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings, fit on the training split.
# NOTE(review): no random_state is passed, so the fitted tree may differ
# between runs - consider fixing it for reproducibility.
dt_model = DecisionTreeClassifier()
dt_model.fit(features_train, labels_train)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fit on the training split
nb_model = GaussianNB()
nb_model.fit(features_train, labels_train)

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, fit on the training split
svm_model = SVC()
svm_model.fit(features_train, labels_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings, fit on the training split.
# NOTE(review): no random_state is passed, so results may differ between
# runs - consider fixing it for reproducibility.
rf_model = RandomForestClassifier()
rf_model.fit(features_train, labels_train)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default settings, fit on the training split.
# NOTE(review): no random_state is passed, so results may differ between
# runs - consider fixing it for reproducibility.
gb_model = GradientBoostingClassifier()
gb_model.fit(features_train, labels_train)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings, fit on the training split.
# NOTE(review): no random_state is passed, so results may differ between
# runs - consider fixing it for reproducibility.
ab_model = AdaBoostClassifier()
ab_model.fit(features_train, labels_train)

### 13.4.3 Evaluating the models

#### Accuracy

In [None]:
# Report each fitted model's `score` on the validation split, one line per model
print("Scores of the models")
for label, model in (
    ("Logistic regression:", lr_model),
    ("Decision tree:", dt_model),
    ("Naive Bayes:", nb_model),
    ("SVM:", svm_model),
    ("Random forest:", rf_model),
    ("Gradient boosting:", gb_model),
    ("AdaBoost:", ab_model),
):
    print(label, model.score(features_validation, labels_validation))

#### F1-score

In [None]:
from sklearn.metrics import f1_score

print("F1-scores of the models:")

# Predict on the validation split with every model first ...
lr_predicted_labels = lr_model.predict(features_validation)
dt_predicted_labels = dt_model.predict(features_validation)
nb_predicted_labels = nb_model.predict(features_validation)
svm_predicted_labels = svm_model.predict(features_validation)
rf_predicted_labels = rf_model.predict(features_validation)
gb_predicted_labels = gb_model.predict(features_validation)
ab_predicted_labels = ab_model.predict(features_validation)

# ... then report the F1-score of each set of predictions against the true labels
for label, predicted_labels in (
    ("Logistic regression:", lr_predicted_labels),
    ("Decision Tree:", dt_predicted_labels),
    ("Naive Bayes:", nb_predicted_labels),
    ("Support Vector Machine:", svm_predicted_labels),
    ("Random Forest:", rf_predicted_labels),
    ("Gradient boosting:", gb_predicted_labels),
    ("AdaBoost:", ab_predicted_labels),
):
    print(label, f1_score(labels_validation, predicted_labels))

### 13.4.4 Testing the model

Finding the accuracy and the F1-score of the model in the testing set.

In [None]:
gb_model.score(features_test, labels_test)

In [None]:
# F1-score of the gradient boosting model on the held-out test split
gb_predicted_test_labels = gb_model.predict(features_test)
f1_score(labels_test, gb_predicted_test_labels)

# 13.5 Grid search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Grid search with an rbf kernel

def _fit_and_score_rbf_svm(C, gamma, description):
    """Fit an RBF-kernel SVC on the training split and print its validation score.

    Args:
        C: value passed to SVC's ``C`` parameter.
        gamma: value passed to SVC's ``gamma`` parameter.
        description: text printed in front of the score.

    Returns:
        The fitted SVC, so each configuration can still be bound to its own name.
    """
    model = SVC(kernel='rbf', C=C, gamma=gamma)
    model.fit(features_train, labels_train)
    print(description, model.score(features_validation, labels_validation))
    return model


print("SVM grid search with a radial basis function kernel")

# Same six (C, gamma) combinations as before, fitted and reported in the same order
svm_1_01 = _fit_and_score_rbf_svm(1, 0.1, "C=1, gamma=0.1")
svm_1_1 = _fit_and_score_rbf_svm(1, 1, "C=1, gamma=1")
svm_1_10 = _fit_and_score_rbf_svm(1, 10, "C=1, gamma=10")
svm_10_01 = _fit_and_score_rbf_svm(10, 0.1, "C=10, gamma=0.1")
svm_10_1 = _fit_and_score_rbf_svm(10, 1, "C=10, gamma=1")
svm_10_10 = _fit_and_score_rbf_svm(10, 10, "C=10, gamma=10")

In [None]:
# Candidate hyperparameters: every C/gamma pair is tried with the rbf kernel
svm_parameters = {
    'kernel': ['rbf'],
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 10, 100],
}
svm = SVC()
svm_gs = GridSearchCV(estimator=svm, param_grid=svm_parameters)
svm_gs.fit(features_train, labels_train)

# Best estimator found by the search, scored on the validation split
svm_winner = svm_gs.best_estimator_
svm_winner.score(features_validation, labels_validation)

In [None]:
svm_winner

# 13.6 Cross validation

In [None]:
svm_gs.cv_results_

# Exercise 13.1

In [None]:
# Load the separate test file; it is cleaned and preprocessed in the cells below
test_data = pandas.read_csv('test.csv')

In [None]:
test_data.isna().sum()

In [None]:
# Cleaning the data
test_data = test_data.drop('Cabin', axis=1)
# NOTE(review): 28.0 is a hard-coded fill value, presumably the median age of
# the training data - confirm it matches `median_age`.
test_data["Age"] = test_data["Age"].fillna(28.0)

# Catch! The test data has one missing fare. Let's fix that
average_fare = test_data["Fare"].mean()
test_data['Fare'] = test_data['Fare'].fillna(average_fare)

# Preprocessing the data
test_gender_columns = pandas.get_dummies(test_data['Sex'], prefix='Sex')
test_embarked_columns = pandas.get_dummies(test_data["Embarked"], prefix="Embarked")
test_data = pandas.concat([test_data, test_gender_columns], axis=1)
test_data = pandas.concat([test_data, test_embarked_columns], axis=1)
test_data = test_data.drop(['Sex', 'Embarked'], axis=1)

# Another small catch, the test data has no missing 'Embarked' fields. Therefore, the processed test data will not
# have an 'Embarked_U' column. We need to artificially add one filled with zeros.
# A scalar is broadcast to every row, so no throwaway DataFrame is needed.
test_data['Embarked_U'] = 0

test_categorized_pclass_columns = pandas.get_dummies(test_data['Pclass'], prefix='Pclass')
test_data = pandas.concat([test_data, test_categorized_pclass_columns], axis=1)
test_data = test_data.drop(['Pclass'], axis=1)

bins = [0, 10, 20, 30, 40, 50, 60, 70, 80]
test_categorized_age = pandas.cut(test_data['Age'], bins)
# Use the bins computed from the TEST ages. The training-set `categorized_age`
# was assigned here before, which gave test passengers the age bins of
# unrelated training passengers.
test_data['Categorized_age'] = test_categorized_age
test_data = test_data.drop(["Age"], axis=1)

test_cagegorized_age_columns = pandas.get_dummies(test_data['Categorized_age'], prefix='Categorized_age')
test_data = pandas.concat([test_data, test_cagegorized_age_columns], axis=1)
test_data = test_data.drop(['Categorized_age'], axis=1)

# Drop the same identifier columns that were removed from the training features
test_data = test_data.drop(['Name', 'Ticket', 'PassengerId'], axis=1)
test_data

Now, to check how many survivors were predicted by each model

In [None]:
# Logistic regression
sum(lr_model.predict(test_data))

In [None]:
# Decision tree
sum(dt_model.predict(test_data))

In [None]:
# Naive Bayes
sum(nb_model.predict(test_data))

In [None]:
# Support vector machine
sum(svm_model.predict(test_data))

In [None]:
# Random forest
sum(rf_model.predict(test_data))

In [None]:
# Gradient boosting
sum(gb_model.predict(test_data))

In [None]:
# AdaBoost
sum(ab_model.predict(test_data))

Since the three strongest models in terms of accuracy were random forests, gradient boosting, and adaboost, and they predicted that 154, 156, and 155 passengers survived out of the 418 in the test set, a good estimate for the number of survivors is the average of these three predictions, or 155.