1. Create a classification problem with 3 classes, 15 features and 5000 rows
2. Take the last 1000 rows to be the test set
3. Fit Gaussian Naive Bayes on this problem and report the test accuracy
4. Calculate class prior probabilities for each class in training data (first 4k rows)
5. Calculate the predicted probability of each class for every sample in the test set

References:
    http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
    http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB

In [22]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [23]:
# Create a classification problem with 3 classes, 15 features, 5000 rows, and n_informative=3.
# random_state is pinned so the generated data, and every accuracy/probability derived
# from it, is identical on each Restart & Run All.
X, y = make_classification(
    n_samples=5000,
    n_features=15,
    n_classes=3,
    n_informative=3,
    random_state=42,
)


In [24]:
# Split by position: the first 4000 rows form the training set,
# the remaining (last 1000) rows form the test set
X_training, X_test = X[:4000], X[4000:]
y_training, y_test = y[:4000], y[4000:]

In [25]:
# Scratch check of slice semantics: a[1:] keeps everything from index 1 onward
a = [1, 2]
a[1:]

[2]

In [26]:
# Sweep var_smoothing over 13 log-spaced values from 1e0 down to 1e-12.
# For each value, fit Gaussian Naive Bayes on the training set and print
# the accuracy on both the training set and the test set.
for vs in np.logspace(0, -12, num=13):
    # Build the classifier for this smoothing value and train it
    gnb = GaussianNB(var_smoothing=vs).fit(X_training, y_training)

    # Predicted labels for each split
    y_train_pred = gnb.predict(X_training)
    y_pred = gnb.predict(X_test)

    # Accuracy is the fraction of predictions that match the true labels
    print(f'vs = {vs!s}')
    print(f'Training accuracy = {np.mean(y_train_pred == y_training)!s}')
    print(f'Test accuracy = {np.mean(y_pred == y_test)!s}')



vs = 1.0
Training accuracy = 0.60625
Test accuracy = 0.616
vs = 0.1
Training accuracy = 0.6765
Test accuracy = 0.665
vs = 0.01
Training accuracy = 0.67975
Test accuracy = 0.674
vs = 0.001
Training accuracy = 0.6795
Test accuracy = 0.673
vs = 0.0001
Training accuracy = 0.6795
Test accuracy = 0.674
vs = 1e-05
Training accuracy = 0.6795
Test accuracy = 0.674
vs = 1e-06
Training accuracy = 0.6795
Test accuracy = 0.674
vs = 1e-07
Training accuracy = 0.6795
Test accuracy = 0.674
vs = 1e-08
Training accuracy = 0.6795
Test accuracy = 0.674
vs = 1e-09
Training accuracy = 0.6795
Test accuracy = 0.674
vs = 1e-10
Training accuracy = 0.6795
Test accuracy = 0.674
vs = 1e-11
Training accuracy = 0.6795
Test accuracy = 0.674
vs = 1e-12
Training accuracy = 0.6795
Test accuracy = 0.674


In [27]:
# Class prior probabilities, one per class, as stored on the fitted model.
# gnb was fitted on the training data (first 4k rows); it is the model left
# over from the last iteration of the var_smoothing loop.
class_priors = gnb.class_prior_
print(class_priors)

[0.33125 0.3345  0.33425]


In [28]:
# Display the test-set features as a DataFrame (one column per feature, 0-14)
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-2.185400,-2.268125,1.471058,0.852025,-0.585893,-0.570415,0.953545,-0.061085,0.752906,-0.661170,-0.648557,0.032811,-2.421047,0.680146,-0.425356
1,3.978667,1.441801,-1.867044,-2.293248,-0.377531,-0.355928,-0.551592,-0.157098,-0.326736,0.335674,2.056070,-0.243245,0.720400,-1.954706,2.634074
2,-0.001583,0.823914,1.698318,-0.044165,-1.773439,1.462553,-0.535248,-0.585478,0.679717,-0.707518,3.220894,0.074833,-0.438772,0.532623,3.886967
3,-2.039223,-1.175897,1.039442,0.784002,0.089170,-0.563596,-0.004177,-0.118483,0.667508,-0.396132,-1.255892,0.013602,0.670755,0.728953,-1.160035
4,-0.574743,-0.362768,-0.490729,2.632225,-1.084361,0.926670,0.672728,-0.911591,1.151753,-1.454650,0.361694,0.874635,-1.501767,-0.900712,-2.128366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-1.331038,-0.467974,0.509831,2.473545,-1.001153,0.188123,-0.450882,1.323311,0.622852,0.245232,0.664046,-0.633442,2.217506,-1.000526,-1.148726
996,-0.457591,-0.062891,-0.321511,1.195705,-0.347207,-0.635124,0.125659,-1.384575,-0.052248,0.517768,-0.408427,2.208230,0.784644,-1.969376,-1.533620
997,2.089806,0.435431,0.025017,-0.746293,-0.603733,0.769773,0.015053,0.270783,-0.316314,-0.312315,3.435496,-0.670376,1.192361,-0.280272,3.685595
998,-1.594547,-1.936210,1.243174,0.021527,1.121703,-0.049130,0.198790,0.551870,-1.571210,0.565254,-0.699807,-2.257920,-1.520595,-0.682802,0.080248


In [29]:
# Predicted class probabilities for the test set:
# one row per test sample, one column per class
test_probabilities = gnb.predict_proba(X_test)
print(test_probabilities)

[[9.65630463e-03 2.73831898e-03 9.87605376e-01]
 [4.35946293e-01 5.64051716e-01 1.99082485e-06]
 [9.87848052e-01 1.19230118e-02 2.28935766e-04]
 ...
 [8.33588315e-01 1.66408035e-01 3.64950979e-06]
 [7.05184952e-02 7.61012723e-03 9.21871378e-01]
 [3.37408807e-01 5.64638929e-01 9.79522640e-02]]
