**Import necessary libraries for data wrangling**

In [1]:
import pandas as pd
import numpy as np

**Import Naive Bayes implementation**

In [2]:
from gaussian_nb import GaussianNaiveBayes
from evaluation import Evaluator

**Import Titanic dataset and wrangle it into shape**

In [3]:
# Titanic dataset
df = pd.read_csv("data/titanic/train.csv")

# Extract the relevant columns and get rid of missing values (for simplicity).
# The result is materialised with .copy() so that the column assignments below
# write to an independent DataFrame instead of a slice of `df`; assigning to
# the slice is what raises pandas' SettingWithCopyWarning.
df_cols = (
    df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Survived']]
    .dropna(axis=0)
    .copy()
)

# Turn categorical features into numbers: every value is replaced by its
# position in the sorted list of that column's unique values.
gender = list(np.unique(df_cols['Sex'].values))
df_cols['Sex'] = df_cols['Sex'].map(gender.index)

embarked = list(np.unique(df_cols['Embarked'].values))
df_cols['Embarked'] = df_cols['Embarked'].map(embarked.index)

# extract features and targets into 2 different variables
targets = df_cols['Survived']
features = df_cols.drop(['Survived'], axis=1)

# Extract training and test sets with a positional (unshuffled) split: the
# first N_TRAIN remaining rows are for training, the rest for testing.
# .iloc is used for both features and targets so the split is explicitly
# positional even though the index has gaps after dropna().
N_TRAIN = 500

x_train = features.iloc[:N_TRAIN, :]
y_train = targets.iloc[:N_TRAIN]

x_test = features.iloc[N_TRAIN:, :]
y_test = targets.iloc[N_TRAIN:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


**Instantiate Naive Bayes class and train on training data**

Options include:
* specifying, for each feature, whether it is a categorical variable (**categoricals**)
* supplying prior class probabilities, e.g. **priors=[0.7, 0.3]**
* enabling debug mode (**debug_mode=True**) to print calculations for individual data points

In [4]:
# One flag per feature column, in the column order of x_train
# (Pclass, Sex, Age, SibSp, Parch, Embarked): only Age is flagged
# as non-categorical.
is_categorical = [True, True, False, True, True, True]

nb = GaussianNaiveBayes(
    x_train,
    y_train,
    categoricals=is_categorical,
    debug_mode=False,
)
nb.train()

**Now make predictions on new data**

In [5]:
# Run the trained model on the held-out test features.
# NOTE(review): presumably returns one predicted label per row of x_test --
# confirm against the gaussian_nb module.
predictions = nb.predict(x_test)

**Calculate accuracy metrics**

Note:
* for binary classification, the BinaryMetrics class is used, which reports accuracy, precision, recall and F-score
* for multi-class classification, the MultiClassMetrics class is used, which only reports accuracy (% of correctly classified items)

In [8]:
# Score the predictions against the true test labels in binary mode and
# print the four resulting metrics to three decimal places.
ev = Evaluator()
metrics = ev.evaluate(predictions, y_test, binary=True)

report_template = "Accuracy: {:.3f}\nPrecision: {:.3f}\nRecall: {:.3f}\nF-Score: {:.3f}"
report = report_template.format(
    metrics.accuracy,
    metrics.precision,
    metrics.recall,
    metrics.f_score,
)
print(report)

Accuracy: 0.802
Precision: 0.760
Recall: 0.704
F-Score: 0.731


**To check the implementation, run it through sklearn**

In [9]:
# Benchmark: fit scikit-learn's GaussianNB on the same train/test split and
# report its accuracy for comparison.
from sklearn import metrics as sk_metrics
from sklearn.naive_bayes import GaussianNB

# fit() returns the fitted estimator, so construction and fitting can be chained
gnb = GaussianNB().fit(x_train, y_train)
gnb_predictions = gnb.predict(x_test)

sk_accuracy = sk_metrics.accuracy_score(y_test, gnb_predictions)
print("\nSKLearn accuracy: {:.3f}".format(sk_accuracy))


SKLearn accuracy: 0.802
