### Load Data

In [25]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [26]:
df = pd.read_csv('hw2_data.csv')

In [27]:
df.shape

(2243, 56)

### Question-1: Keep only the labels between -1 and 3

In [28]:
df = df.loc[(df['label'] >= -1) & (df['label'] <= 2)]

In [29]:
df.shape

(2211, 56)

In [30]:
df_train = df[df['fyear']==2018]
df_train_label = df_train['label']
df_train = df_train.drop('label', axis=1)
df_train.shape

(1069, 55)

In [31]:
df_valid = df[df['fyear']==2019]
df_valid_label = df_valid['label']
df_valid = df_valid.drop('label', axis=1)
df_valid.shape

(1045, 55)

In [32]:
df_test = df[df['fyear']==2020]
df_test_label = df_test['label']
df_test = df_test.drop('label', axis=1)
df_test_label.shape

(97,)

### Question-2: Compute and report the prior probabilities

In [33]:
# Initialize Gaussian Naive Bayes
gnb = GaussianNB()
# Train the classifier
gnb.fit(df_train, df_train_label)
# Make predictions on test data
df_pred = gnb.predict(df_test)
df_train_pred = gnb.predict(df_train)

# print the accuracy
print ('Training accuracy = ' + str(accuracy_score(df_train_pred, df_train_label)))
print ('Test accuracy = ' + str(accuracy_score(df_pred, df_test_label)))

Training accuracy = 0.30121608980355474
Test accuracy = 0.30927835051546393


In [34]:
print (gnb.class_prior_)

[0.07015903 0.82039289 0.0888681  0.02057998]


### Question-3

In [35]:
from scipy.stats import norm
import matplotlib.pyplot as plt

In [36]:
for i in [-1, 0, 1, 2]:
    ret = df_train['ret']
    index = list(df_train_label[df_train_label==i].index)
    ret = ret[index]
    ret_mean = ret.mean()
    ret_std = ret.std()
    x = 0.1
    density = 1/((2*np.pi)**0.5*ret_std)*np.exp(-((x-ret_mean)**2)/(2*ret_std**2))
    print(density)

2.634437299590731
2.658212280980677
2.898867165694418
2.12368662511862


### Question-4: Guassian naive bayes

In [37]:
df_full = pd.concat([df_train, df_valid])
df_full_label = pd.concat([df_train_label, df_valid_label])

In [38]:
# Initialize Gaussian Naive Bayes
gnb = GaussianNB()
# Train the classifier
gnb.fit(df_full, df_full_label)
# Make predictions on test data
df_pred = gnb.predict(df_test)
df_full_pred = gnb.predict(df_full)

# print the accuracy
print ('full accuracy = ' + str(accuracy_score(df_full_pred, df_full_label)))
print ('Test accuracy = ' + str(accuracy_score(df_pred, df_test_label)))

full accuracy = 0.24881740775780511
Test accuracy = 0.28865979381443296


### Question-5: Confusion matrix

In [39]:
from sklearn import metrics

In [40]:
print('The confusion matrix is: \n')
print(metrics.confusion_matrix(df_test_label, df_pred))

The confusion matrix is: 

[[ 1  0  0  7]
 [ 1 22  2 52]
 [ 0  0  1  7]
 [ 0  0  0  4]]


### Question-6

In [41]:
from sklearn.mixture import GaussianMixture

In [45]:
# Model
for i in ['tied', 'full', 'diag', 'spherical']:
    clf = GaussianMixture(n_components=3, covariance_type=i, init_params='kmeans', random_state=34)  
    clf.means_init = np.array([df_train[df_train_label == i].mean(axis=0) for i in range(3)]) # Maximum Likelihood Estimate
    clf.fit(df_train, df_train_label)
    pred = clf.predict(df_valid)
    print ('Validation accuracy for covariance type '+ i + ' = ' + str(accuracy_score(df_valid_label, pred)))

Validation accuracy for covariance type tied = 0.7990430622009569
Validation accuracy for covariance type full = 0.6794258373205742
Validation accuracy for covariance type diag = 0.8143540669856459
Validation accuracy for covariance type spherical = 0.14354066985645933


In [44]:
# Best for tied

clf = GaussianMixture(n_components=3, covariance_type='tied')
clf.means_init = np.array([df_full[df_full_label == i].mean(axis=0)
                                for i in range(3)])
clf.fit(df_full, df_full_label)
pred = clf.predict(df_test)
print ('Test accuracy = ' + str(accuracy_score(df_test_label, pred)))

Test accuracy = 0.6391752577319587


### Question-7