In [45]:
# Import required libraries

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [47]:
# Import the Dataset
dataset = pd.read_csv("iris.csv")

In [48]:
# View the number of rows and columns in our dataset
dataset.shape

(150, 5)

In [49]:
# Inspect the first five records of the dataset
dataset.head()

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [50]:
# Divide the data into attributes and labels
x = dataset.drop('Class', axis=1)
y = dataset['Class']

In [51]:
# Here the X variable contains all the columns from the dataset, except the "Class" column. 
# The y variable contains the values from the "Class" column. 
# The X variable is the attribute set and y variable contains corresponding labels.

In [52]:
# Divide the data into training and test sets
from sklearn.model_selection import train_test_split
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.40)

In [53]:
# The model_selection library of Scikit-Learn contains train_test_split method.
# We can use it to randomly split the data into training and testing sets.
# The test_size parameter specifies the ratio of the test set.
# We split up 20% of the data in to the test set and 80% for training.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [54]:
# Now we can train the naive bayes algorithm on this data and make predictions. 
# Scikit-Learn contains the sklearn.naive_bayes library, 
# which contains built-in classes/methods for various naive bayes algorithms. 

In [55]:
# Import the GaussianNB class from the sklearn.naive_bayes library. 

In [56]:
from sklearn.naive_bayes import GaussianNB
nv = GaussianNB() # create a classifier
nv.fit(x_train,y_train) # fitting the data

GaussianNB(priors=None, var_smoothing=1e-09)

In [57]:
# For more information refer
# https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

In [58]:
# The final step is to make predictions on our test data.

In [59]:
y_pred = nv.predict(x_test) # store the prediction data

In [60]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 1.0


In [61]:
# Download the wine.csv and create a Jupyter Notebook “NaiveBayesWine.ipynb”. Do the following

#     a) Clean the dataset if necessary

#     b) Find the accuracy of GaussianNB clasiifier for test data.
#          i) 30% from the total dataset
#         ii) 40% from the total dataset
#         iv) 50% from the total dataset

#     c) Submit the “NaiveBayesWine.ipynb”

In [62]:
wine_data = pd.read_csv("wine.csv")
wine_data.shape

(178, 14)

In [63]:
wine_data.head

<bound method NDFrame.head of      Wine  Alcohol  Malic.acid   Ash   Acl   Mg  Phenols  Flavanoids  \
0       1    14.23        1.71  2.43  15.6  127     2.80        3.06   
1       1    13.20        1.78  2.14  11.2  100     2.65        2.76   
2       1    13.16        2.36  2.67  18.6  101     2.80        3.24   
3       1    14.37        1.95  2.50  16.8  113     3.85        3.49   
4       1    13.24        2.59  2.87  21.0  118     2.80        2.69   
5       1    14.20        1.76  2.45  15.2  112     3.27        3.39   
6       1    14.39        1.87  2.45  14.6   96     2.50        2.52   
7       1    14.06        2.15  2.61  17.6  121     2.60        2.51   
8       1    14.83        1.64  2.17  14.0   97     2.80        2.98   
9       1    13.86        1.35  2.27  16.0   98     2.98        3.15   
10      1    14.10        2.16  2.30  18.0  105     2.95        3.32   
11      1    14.12        1.48  2.32  16.8   95     2.20        2.43   
12      1    13.75        1.73  2.

In [64]:
wine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
Wine                    178 non-null int64
Alcohol                 178 non-null float64
Malic.acid              178 non-null float64
Ash                     178 non-null float64
Acl                     178 non-null float64
Mg                      178 non-null int64
Phenols                 178 non-null float64
Flavanoids              178 non-null float64
Nonflavanoid.phenols    178 non-null float64
Proanth                 178 non-null float64
Color.int               178 non-null float64
Hue                     178 non-null float64
OD                      178 non-null float64
Proline                 178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.5 KB


In [65]:
winex = wine_data.drop('Wine', axis=1)
winey = wine_data['Wine']


In [66]:
for s in np.arange(0.3,0.51,0.1):    
    winex_train, winex_test, winey_train, winey_test = train_test_split(winex, winey, test_size=s)
    winenv = GaussianNB() # create a classifier
    winenv.fit(winex_train,winey_train) 
    winey_pred = winenv.predict(winex_test)
    print("Accuracy(test_size={}):".format(s),metrics.accuracy_score(winey_test, winey_pred))

Accuracy(test_size=0.3): 0.9629629629629629
Accuracy(test_size=0.4): 1.0
Accuracy(test_size=0.5): 0.9775280898876404
