# Outline:
- Load Wine data
- Implement Gaussian Naive Bayes
- Introduce GridSearchCV on Random Forest
- Introduce make_classification function 

In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# !gdown --id 1WA0I2LdXT_v0GqyYQ6b92ad7cHm_fmUT

# Read the wine CSV, then split the 'class' column off as the target.
# pop() hands back the column and removes it from the feature frame in one step.
data = pd.read_csv('wine_original.csv')
labels = data.pop('class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=10,
)

In [2]:
# Sanity check on the training split: dimensions first, then the first five rows.
print(X_train.shape)
X_train.head(n=5)

(142, 13)


Unnamed: 0,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315,Proline
95,12.47,1.52,2.2,19.0,162,2.5,2.27,0.32,3.28,2.6,1.16,2.63,937
91,12.0,1.51,2.42,22.0,86,1.45,1.25,0.5,1.63,3.6,1.05,2.65,450
24,13.5,1.81,2.61,20.0,96,2.53,2.61,0.28,1.66,3.52,1.12,3.82,845
109,11.61,1.35,2.7,20.0,94,2.74,2.92,0.29,2.49,2.65,0.96,3.26,680
121,11.56,2.05,3.23,28.5,119,3.18,5.08,0.47,1.87,6.0,0.93,3.69,465


[NB doc](https://scikit-learn.org/stable/modules/naive_bayes.html)

GaussianNB doc:
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB

**priors:** array-like of shape (n_classes,) \
Prior probabilities of the classes. If specified the priors are not adjusted according to the data.

**var_smoothing:** float, default=1e-9 \
Portion of the largest variance of all features that is added to variances for calculation stability.



In [3]:
# Candidate var_smoothing values: 13 log-spaced points from 1e0 down to 1e-12.
np.logspace(start=0, stop=-12, num=13)

array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06, 1.e-07,
       1.e-08, 1.e-09, 1.e-10, 1.e-11, 1.e-12])

In [4]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Sweep var_smoothing over 13 log-spaced values (1e0 down to 1e-12) and report
# the train/test accuracy obtained with each setting.
for vs in np.logspace(0, -12, num=13):

    # Build and fit a Gaussian Naive Bayes model with this smoothing value
    gnb = GaussianNB(var_smoothing=vs)
    gnb.fit(X_train, y_train)

    # Predict on both splits
    y_train_pred = gnb.predict(X_train)
    y_pred = gnb.predict(X_test)

    # Accuracy = number of matching predictions / number of samples
    train_accuracy = (y_train_pred == y_train).sum() / len(y_train)
    test_accuracy = (y_pred == y_test).sum() / len(y_test)

    print(f'vs = {vs!s}')
    print(f'Training accuracy = {train_accuracy!s}')
    print(f'Test accuracy = {test_accuracy!s}')

vs = 1.0
Training accuracy = 0.6690140845070423
Test accuracy = 0.6944444444444444
vs = 0.1
Training accuracy = 0.7183098591549296
Test accuracy = 0.7222222222222222
vs = 0.01
Training accuracy = 0.7394366197183099
Test accuracy = 0.7222222222222222
vs = 0.001
Training accuracy = 0.7676056338028169
Test accuracy = 0.7222222222222222
vs = 0.0001
Training accuracy = 0.9084507042253521
Test accuracy = 0.9166666666666666
vs = 1e-05
Training accuracy = 0.9788732394366197
Test accuracy = 0.9444444444444444
vs = 1e-06
Training accuracy = 0.9929577464788732
Test accuracy = 0.9166666666666666
vs = 1e-07
Training accuracy = 0.9929577464788732
Test accuracy = 0.8888888888888888
vs = 1e-08
Training accuracy = 0.9859154929577465
Test accuracy = 0.8888888888888888
vs = 1e-09
Training accuracy = 0.9859154929577465
Test accuracy = 0.8888888888888888
vs = 1e-10
Training accuracy = 0.9859154929577465
Test accuracy = 0.8888888888888888
vs = 1e-11
Training accuracy = 0.9859154929577465
Test accuracy = 0.8

In [5]:
# Class priors stored on `gnb`, i.e. whichever model the preceding loop fitted last.
print(gnb.class_prior_)

[0.34507042 0.37323944 0.28169014]


In [6]:
# Predicted class probabilities on the test set: one row per sample, one column per class.
print(gnb.predict_proba(X=X_test))

[[7.77879490e-11 1.00000000e+00 1.54786654e-18]
 [1.03627208e-11 1.00000000e+00 3.03140716e-10]
 [9.99999981e-01 1.86393579e-08 1.07425384e-34]
 [2.23035951e-11 1.00000000e+00 1.77867256e-14]
 [1.00000000e+00 1.94369316e-11 8.97437777e-30]
 [2.01384635e-06 9.99997986e-01 6.33633746e-21]
 [2.36406733e-08 9.99999976e-01 3.35476062e-14]
 [9.99998312e-01 1.68824501e-06 5.08640588e-28]
 [3.09275440e-20 2.32727637e-22 1.00000000e+00]
 [1.00000000e+00 9.09221468e-16 1.84294545e-44]
 [9.99997694e-01 2.30598977e-06 6.41812873e-26]
 [5.55891208e-01 4.44108792e-01 6.59048277e-29]
 [1.00000000e+00 2.50153268e-10 1.54910327e-25]
 [9.99010158e-11 1.68351895e-02 9.83164810e-01]
 [4.86770057e-07 9.99999513e-01 7.38431551e-15]
 [1.55383802e-16 9.99999988e-01 1.19104314e-08]
 [1.26871825e-14 7.37510636e-03 9.92624894e-01]
 [3.92807753e-06 9.99996072e-01 1.07125468e-11]
 [4.62853864e-09 9.99999995e-01 1.05316557e-21]
 [4.07432297e-19 4.08073169e-08 9.99999959e-01]
 [9.99994793e-01 5.20690870e-06 3.511708

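# GridSearchCV and accuracy_score using Random Forest

GridSearchCV doc:
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

RandomForestClassifier doc:
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

MultinomialNB doc (for reference; the cell below tunes a random forest, not MultinomialNB):
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB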


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Hyperparameter grid explored exhaustively by GridSearchCV below.
para_grid = {
    'n_estimators': [20, 50, 100, 200],  # Number of trees in random forest
    # 'auto' was dropped from this list: for classifiers it was an alias of
    # 'sqrt' (so it only duplicated work), and newer scikit-learn releases
    # no longer accept it.
    'max_features': ['sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [10, 20, 30, None],  # Maximum number of levels in tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
}

# Seed the forest so the selected parameters and reported scores are repeatable.
rf = RandomForestClassifier(random_state=10)

# 3-fold cross-validated search over every combination in para_grid.
clf = GridSearchCV(rf, para_grid, cv=3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# optimal parameters
print(clf.best_params_)

# accuracy_score takes (y_true, y_pred) in that order.
accuracy = accuracy_score(y_test, y_pred)
train_acc = accuracy_score(y_train, clf.predict(X_train))
print('Test accuracy = ' + str(accuracy))
print('Train accuracy = ' + str(train_acc))

{'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 20}
Test accuracy = 0.8611111111111112
Train accuracy = 1.0


# make_classification

In [8]:
# Generate a random n-class classification problem.

from sklearn.datasets import make_classification

# Synthetic 4-class problem: 2000 samples, 10 features, 4 of them informative.
# random_state pins the generated data so re-running the notebook reproduces it.
X, y = make_classification(
    n_samples=2000,
    n_features=10,
    n_classes=4,
    n_informative=4,
    random_state=10,
)

X

array([[ 0.63177194,  0.94751505, -0.36605781, ...,  0.78424856,
         0.48017657,  0.6801706 ],
       [-2.20334734,  1.06654479,  1.06288988, ...,  0.6682359 ,
        -0.11381227, -1.39980569],
       [ 0.60938379, -0.25850069, -2.07940208, ...,  0.41355673,
        -0.90796613, -1.6013794 ],
       ...,
       [-0.11410629,  1.54293349, -0.20800973, ..., -1.52895471,
        -0.30078988, -0.46185686],
       [-1.88533324, -1.64922936,  0.23708023, ...,  1.7018218 ,
         3.20017762,  0.49517397],
       [ 0.01559523,  0.45548936, -1.95664826, ...,  0.68576515,
        -0.4411947 , -1.8935202 ]])