In [11]:
# Import pandas library
import pandas as pd

In [12]:
# Load the Titanic data set from disk into a DataFrame
# The feature values in this file are raw, i.e. they have not been standardized
df = pd.read_csv(filepath_or_buffer='data/titanic_unscaled.csv')

In [13]:
# View the dimensions of the data as a (rows, columns) tuple
df.shape

(887, 7)

In [14]:
# Preview the first 5 rows of the raw data (5 is also the default of head())
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare
0,0,3,1,22.0,1,0,7.25
1,1,1,0,38.0,1,0,71.2833
2,1,3,0,26.0,0,0,7.925
3,1,1,0,35.0,1,0,53.1
4,0,3,1,35.0,0,0,8.05


In [15]:
# Summary statistics of every column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare
count,887.0,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,0.645998,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,0.47848,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,0.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [6]:
# Work on an independent copy so that the new columns are not added to df
df_new = df.copy(deep=True)

# Quantile-based binning: 4 bins that each hold roughly the same number of rows
# precision = 0 stores and displays the bin edges with 0 decimal places, which is
# why the lowest bin is labelled (-1.0, 20.0] although the smallest age is 0.42
df_new['age_cat4a'] = pd.qcut(x=df_new['age'], q=4, precision=0)

# Number of rows that fell into each bin
df_new['age_cat4a'].value_counts()

(20.0, 28.0]    243
(-1.0, 20.0]    222
(38.0, 80.0]    214
(28.0, 38.0]    208
Name: age_cat4a, dtype: int64

In [7]:
# Preview the first 5 rows, now including the age_cat4a column
df_new.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare,age_cat4a
0,0,3,1,22.0,1,0,7.25,"(20.0, 28.0]"
1,1,1,0,38.0,1,0,71.2833,"(28.0, 38.0]"
2,1,3,0,26.0,0,0,7.925,"(20.0, 28.0]"
3,1,1,0,35.0,1,0,53.1,"(28.0, 38.0]"
4,0,3,1,35.0,0,0,8.05,"(28.0, 38.0]"


In [8]:
# Same quantile binning as before, but each bin gets a readable name via labels
# Inside parentheses a call may span several lines, so no trailing "\" is needed
df_new['age_cat4b'] = pd.qcut(
    x=df_new['age'],
    q=4,
    precision=0,
    labels=['Junior', 'Youth', 'Adult', 'Senior'],
)

# Number of rows that received each label
df_new['age_cat4b'].value_counts()

Youth     243
Junior    222
Senior    214
Adult     208
Name: age_cat4b, dtype: int64

In [9]:
# Preview the first 5 rows, now including the age_cat4b column
df_new.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare,age_cat4a,age_cat4b
0,0,3,1,22.0,1,0,7.25,"(20.0, 28.0]",Youth
1,1,1,0,38.0,1,0,71.2833,"(28.0, 38.0]",Adult
2,1,3,0,26.0,0,0,7.925,"(20.0, 28.0]",Youth
3,1,1,0,35.0,1,0,53.1,"(28.0, 38.0]",Adult
4,0,3,1,35.0,0,0,8.05,"(28.0, 38.0]",Adult


In [10]:
# Equal-width binning: pandas splits the observed age range into 4 intervals
# of the same width, so the number of rows per bin can differ a lot
df_new['age_cat4c'] = pd.cut(x=df_new['age'], bins=4)

# Number of rows that fell into each interval
df_new['age_cat4c'].value_counts()

(20.315, 40.21]    487
(0.34, 20.315]     222
(40.21, 60.105]    152
(60.105, 80.0]      26
Name: age_cat4c, dtype: int64

In [18]:
# Preview the first 5 rows, now including the age_cat4c column
df_new.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare,age_cat4a,age_cat4b,age_cat4c
0,0,3,1,22.0,1,0,7.25,"(20.0, 28.0]",Youth,"(20.315, 40.21]"
1,1,1,0,38.0,1,0,71.2833,"(28.0, 38.0]",Adult,"(20.315, 40.21]"
2,1,3,0,26.0,0,0,7.925,"(20.0, 28.0]",Youth,"(20.315, 40.21]"
3,1,1,0,35.0,1,0,53.1,"(28.0, 38.0]",Adult,"(20.315, 40.21]"
4,0,3,1,35.0,0,0,8.05,"(28.0, 38.0]",Adult,"(20.315, 40.21]"


In [19]:
# Hand-picked bin edges; the observed ages run from 0.42 to 80
# Each interval excludes its left edge and includes its right edge (pd.cut default)
cut_bins = [0, 20, 30, 50, 80]

# One label per interval between two consecutive edges
cut_labels_4 = ['Junior', 'Youth', 'Adult', 'Senior']

# Assign every age to the labelled interval it falls into
df_new['age_cat4d'] = pd.cut(x=df_new['age'], bins=cut_bins, labels=cut_labels_4)

# Number of rows that received each label
df_new['age_cat4d'].value_counts()

Youth     303
Adult     290
Junior    222
Senior     72
Name: age_cat4d, dtype: int64

In [20]:
# Display df_new with the four derived age_cat4* columns
# (long frames are abbreviated in the output with a "..." row)
df_new

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare,age_cat4a,age_cat4b,age_cat4c,age_cat4d
0,0,3,1,22.0,1,0,7.2500,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth
1,1,1,0,38.0,1,0,71.2833,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult
2,1,3,0,26.0,0,0,7.9250,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth
3,1,1,0,35.0,1,0,53.1000,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult
4,0,3,1,35.0,0,0,8.0500,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult
...,...,...,...,...,...,...,...,...,...,...,...
882,0,2,1,27.0,0,0,13.0000,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth
883,1,1,0,19.0,0,0,30.0000,"(-1.0, 20.0]",Junior,"(0.34, 20.315]",Junior
884,0,3,0,7.0,1,2,23.4500,"(-1.0, 20.0]",Junior,"(0.34, 20.315]",Junior
885,1,1,1,26.0,0,0,30.0000,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth


In [21]:
# Preview the first 5 rows with all four binned age columns
df_new.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare,age_cat4a,age_cat4b,age_cat4c,age_cat4d
0,0,3,1,22.0,1,0,7.25,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth
1,1,1,0,38.0,1,0,71.2833,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult
2,1,3,0,26.0,0,0,7.925,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth
3,1,1,0,35.0,1,0,53.1,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult
4,0,3,1,35.0,0,0,8.05,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult


In [22]:
# Preview the first 5 rows again (df_new is unchanged since the previous cell)
df_new.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibspouse,parchild,fare,age_cat4a,age_cat4b,age_cat4c,age_cat4d
0,0,3,1,22.0,1,0,7.25,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth
1,1,1,0,38.0,1,0,71.2833,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult
2,1,3,0,26.0,0,0,7.925,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth
3,1,1,0,35.0,1,0,53.1,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult
4,0,3,1,35.0,0,0,8.05,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult


In [23]:
# Use get_dummies() function to perform one hot encoding on sex column
# The sex column is replaced by one indicator column per value: sex_0 and sex_1
# dtype is pinned because the default changed from uint8 to bool in pandas 2.0;
# without it the indicator columns hold True/False instead of 0/1
df_enc = pd.get_dummies(df_new, prefix = ['sex'], columns = ['sex'], dtype = 'uint8')

# View the first 5 rows of the encoded data frame
df_enc.head()

Unnamed: 0,survived,pclass,age,sibspouse,parchild,fare,age_cat4a,age_cat4b,age_cat4c,age_cat4d,sex_0,sex_1
0,0,3,22.0,1,0,7.25,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth,0,1
1,1,1,38.0,1,0,71.2833,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult,1,0
2,1,3,26.0,0,0,7.925,"(20.0, 28.0]",Youth,"(20.315, 40.21]",Youth,1,0
3,1,1,35.0,1,0,53.1,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult,1,0
4,0,3,35.0,0,0,8.05,"(28.0, 38.0]",Adult,"(20.315, 40.21]",Adult,0,1


In [16]:
# Every column except the label is used as a model input
# Note: these come from the original df, not from the binned / encoded frames
features = df.drop(columns='survived')

# The label the classifiers have to predict
target = df['survived']

In [17]:
# train_test_split shuffles the rows and divides them into two parts
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the remaining 80% is used for
# training and cross validation
# random_state fixes the shuffle so that the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [18]:
# Gaussian Naive Bayes classifier and the k-fold cross validation helper
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Classifier with default parameters
gnb = GaussianNB()

# 10-fold cross validation on the training set only
# scoring='accuracy' reports the fraction of correct predictions per fold
scores = cross_val_score(
    estimator=gnb,
    X=x_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)

# Show the accuracy of each of the 10 folds
scores

array([0.84507042, 0.8028169 , 0.87323944, 0.84507042, 0.71830986,
       0.74647887, 0.77464789, 0.71830986, 0.8028169 , 0.81428571])

In [19]:
# Print the mean accuracy over the 10 cross-validation folds
# (scores holds the Gaussian Naive Bayes fold accuracies computed in the previous cell)
print('Accuracy (Validation) =', scores.mean())

Accuracy (Validation) = 0.7941046277665996


In [20]:
# The metrics module provides accuracy_score (also used by the later test cells)
from sklearn import metrics

# cross_val_score only fitted clones, so gnb itself is fitted here on the
# whole training set before predicting the held-out test rows
test_predict = gnb.fit(x_train, y_train).predict(x_test)

# Accuracy on the test set: the fraction of test rows classified correctly
print("Accuracy (Test): ", metrics.accuracy_score(y_true=y_test, y_pred=test_predict))

Accuracy (Test):  0.7752808988764045


In [21]:
# Bernoulli Naive Bayes models every feature as a binary (0 / 1) variable
from sklearn.naive_bayes import BernoulliNB

# binarize = 0.0 (the same value as the library default) maps every feature
# value greater than 0 to 1 and everything else to 0 before fitting
# NOTE(review): pclass and age are always > 0 in this data (see df.describe()),
# so they become constant after binarization - confirm this is intended
bnb = BernoulliNB(binarize=0.0)

# 10-fold cross validation on the training set only
scores = cross_val_score(
    estimator=bnb,
    X=x_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)

# Show the accuracy of each of the 10 folds
scores

array([0.83098592, 0.77464789, 0.88732394, 0.83098592, 0.76056338,
       0.73239437, 0.78873239, 0.71830986, 0.81690141, 0.8       ])

In [22]:
# Print the mean accuracy over the 10 cross-validation folds
# (scores holds the Bernoulli Naive Bayes fold accuracies computed in the previous cell)
print('Accuracy =', scores.mean())

Accuracy = 0.7940845070422535


In [23]:
# Fit the Bernoulli classifier on the whole training set
# (cross_val_score only fitted clones of it)
bnb.fit(X=x_train, y=y_train)

# Predicted labels for the held-out test rows
test_predict = bnb.predict(X=x_test)

# Accuracy on the test set: the fraction of test rows classified correctly
print("Accuracy (Test): ", metrics.accuracy_score(y_true=y_test, y_pred=test_predict))

Accuracy (Test):  0.7528089887640449


In [24]:
# Multinomial Naive Bayes; it needs non-negative feature values, which holds
# for this data (every column minimum in df.describe() is >= 0)
from sklearn.naive_bayes import MultinomialNB

# Classifier with default parameters
mnb = MultinomialNB()

# 10-fold cross validation on the training set only
scores = cross_val_score(
    estimator=mnb,
    X=x_train,
    y=y_train,
    scoring='accuracy',
    cv=10,
)

# Show the accuracy of each of the 10 folds
scores

array([0.73239437, 0.71830986, 0.74647887, 0.77464789, 0.57746479,
       0.64788732, 0.5915493 , 0.63380282, 0.81690141, 0.64285714])

In [25]:
# Print the mean accuracy over the 10 cross-validation folds
# (scores holds the Multinomial Naive Bayes fold accuracies computed in the previous cell)
print('Accuracy =', scores.mean())

Accuracy = 0.6882293762575453


In [26]:
# Fit the Multinomial classifier on the whole training set
# (cross_val_score only fitted clones of it)
mnb.fit(X=x_train, y=y_train)

# Predicted labels for the held-out test rows
test_predict = mnb.predict(X=x_test)

# Accuracy on the test set: the fraction of test rows classified correctly
print("Accuracy (Test): ", metrics.accuracy_score(y_true=y_test, y_pred=test_predict))

Accuracy (Test):  0.6685393258426966
