In [None]:

#  This NoteBook contains code from the Ch 05 folder ..
#



#  Step 00: Setup

In [None]:

%load_ext autoreload
%autoreload 2

%xmode Minimal

#  Setting display options 

import pandas as pd
   #
pd.set_option("display.width", 480)

#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate


   ###

    
#  How to use tabulate-
#
l_result = [{ "col1": 20, "col2": 30}]
   #
#  print(tabulate(l_result, headers='keys', tablefmt='psql', showindex=False))

print("--")


#  Step 01: Iris Data load, encode

In [None]:

#  Read the Iris data into a Pandas Dataframe
#
#     Features
#     1. sepal length in cm
#     2. sepal width in cm
#     3. petal length in cm
#     4. petal width in cm
#     5. class: 
#        Iris-setosa
#        Iris-versicolour
#        Iris-virginica
#
#  To convert class into a numeric, we use sklearn.preprocessing.LabelEncoder
#  See,
#     https://www.turing.com/kb/convert-categorical-data-in-pandas-and-scikit-learn
#

from sklearn.preprocessing import LabelEncoder

my_le = LabelEncoder()
   #
l_folder = "20_Data"
l_file   = "11_iris.data.txt"


pd_iris  = pd.read_csv((l_folder + "/" + l_file), header = 0, sep = ",",
   names = ["sl", "sw", "pl", "pw", "class"],
   dtype = {"sl": "float", "sw": "float", "pl": "float", "pw": "float", "class": "string"} )
      #
pd_iris["class_encoded"]  =  my_le.fit_transform(pd_iris["class"])
   #
pd_iris = pd_iris.drop(["class"], axis = 1)
    
    
#  Pandas.Dataframe.sample() returns a randomized count of rows
#
print(tabulate(pd_iris.sample(5), headers='keys', tablefmt='psql', showindex=False))
print("Number of rows: %d" % (len(pd_iris)))

print("--")


In [125]:

#  Split data into training and test.
#  Convert the data into numpy arrays, since the ml libraries we use later expect that.
#

import numpy as np
from sklearn.model_selection import train_test_split

np_iris = {}
   #
np_iris["train"], np_iris["test"] = train_test_split(pd_iris.to_numpy(),              #  random_state calls to shuffle the data,
   test_size = 0.20, random_state = 40)                                               #    which had arrived sorted
                                                                                      #  10% yields way too high of an accuracy
                                                                                      #    far below
print("Number of total rows: %d   Training rows: %d   Test rows: %d" %
  (len(pd_iris), len(np_iris["train"]), len(np_iris["test"])) )

print()
print("Train data:")
print("%s" % (np_iris["train"][0:5]))
print()
print("Test  data:")
print("%s" % (np_iris["test" ][0:5]))
print()
   #
print("--")


Number of total rows: 149   Training rows: 119   Test rows: 30

Train data:
[[6.1 2.9 4.7 1.4 1. ]
 [4.8 3.4 1.9 0.2 0. ]
 [5.2 3.5 1.5 0.2 0. ]
 [5.  3.3 1.4 0.2 0. ]
 [4.6 3.1 1.5 0.2 0. ]]

Test  data:
[[5.1 3.4 1.5 0.2 0. ]
 [5.8 2.7 4.1 1.  1. ]
 [6.5 3.  5.5 1.8 2. ]
 [7.7 2.6 6.9 2.3 2. ]
 [5.5 2.6 4.4 1.2 1. ]]

--


#  Step 02: Iris Data train, test .. NearestCentroid

In [126]:

from sklearn.neighbors import NearestCentroid

#  Our numpy array has 5 columns, with the last column being the class.
#  To review numpy array slicing,
#
#     To get the first 4 columns use,
#        np_iris["train"][:, :4]
#     To get the last column use,
#        np_iris["train"][:, -1]
#

my_model = NearestCentroid()


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")
    

Actual    labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 1. 1. 2. 2. 1. 0.
 1. 0. 0. 2. 0. 1.]
Predicted labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 2. 1. 2. 1. 1. 0.
 1. 0. 0. 2. 0. 1.]

Accuracy: 93.3333 %
--


#  Step 03: Iris Data train, test .. kNN

In [127]:

from sklearn.neighbors import KNeighborsClassifier

#  Our numpy array has 5 columns, with the last column being the class.
#  To review numpy array slicing,
#
#     To get the first 4 columns use,
#        np_iris["train"][:, :4]
#     To get the last column use,
#        np_iris["train"][:, -1]
#

#  The only line that differs between this cell and the one above, is
#  the first line
#

my_model = KNeighborsClassifier(n_neighbors = 3)


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


Actual    labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 1. 1. 2. 2. 1. 0.
 1. 0. 0. 2. 0. 1.]
Predicted labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 1. 1. 2. 1. 1. 0.
 1. 0. 0. 2. 0. 1.]

Accuracy: 96.6667 %
--


#  Step 04: Iris Data train, test .. Naive Bayes, Gaussian

In [128]:

from sklearn.naive_bayes import GaussianNB

#  Naive Bayes, Gaussian
#
#     Gaussian does better than the Multinomial below because,
#        Gaussian expects continuous values
#        Multinomial expects discreet values
#
#     And our values are continuous
#

my_model = GaussianNB()


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


Actual    labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 1. 1. 2. 2. 1. 0.
 1. 0. 0. 2. 0. 1.]
Predicted labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 2. 1. 2. 1. 1. 0.
 1. 0. 0. 2. 0. 1.]

Accuracy: 93.3333 %
--


#  Step 05: Iris Data train, test .. Naive Bayes, Multinomial

In [129]:

from sklearn.naive_bayes import MultinomialNB

#  Naive Bayes, Multinomial
#

my_model = MultinomialNB()


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


Actual    labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 1. 1. 2. 2. 1. 0.
 1. 0. 0. 2. 0. 1.]
Predicted labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 2. 1. 2. 2. 1. 0.
 1. 0. 0. 2. 0. 1.]

Accuracy: 96.6667 %
--


#  Step 06: Iris Data train, test .. Decision Tree

In [123]:

from sklearn.tree import DecisionTreeClassifier

#  Decision Tree
#

my_model = DecisionTreeClassifier()


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


Actual    labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1.]
Predicted labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1.]

Accuracy: 100.0000 %
--


#  Step 07: Iris Data train, test .. Random Forest

In [132]:

from sklearn.ensemble import RandomForestClassifier

#  Random Forest
#

my_model = RandomForestClassifier(n_estimators = 4)


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


Actual    labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 1. 1. 2. 2. 1. 0.
 1. 0. 0. 2. 0. 1.]
Predicted labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 1. 1. 2. 2. 2. 2. 1. 2. 1. 1. 0.
 1. 0. 0. 2. 0. 1.]

Accuracy: 90.0000 %
--


#  Step 08: Iris Data train, test .. Support Vector Machine (SVM)

In [133]:

from sklearn.svm import SVC

#  Support Vector Machine
#

my_model = SVC(kernel = "linear", C = 1.0)


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


Actual    labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 1. 1. 2. 2. 1. 0.
 1. 0. 0. 2. 0. 1.]
Predicted labels from test......... [0. 1. 2. 2. 1. 2. 1. 1. 1. 0. 1. 0. 0. 2. 1. 2. 2. 2. 1. 1. 2. 2. 1. 0.
 1. 0. 0. 2. 0. 1.]

Accuracy: 100.0000 %
--
