In [None]:

#  This NoteBook contains code to run classic ML routines against a 
#  number of familiar data swets ..
#


#  Step 00: Setup

In [169]:

#  Largely code to control how print statements and related work
#

%load_ext autoreload
%autoreload 2

%xmode Minimal

#  Setting display options 
#
import pandas as pd
pd.set_option("display.width", 640)
   #
import numpy as np
np.set_printoptions(edgeitems = 30, linewidth = 100000, 
   formatter = dict(float = lambda x: "%.3g" % x))


#  Sets horizontal scroll for wide outputs
#
from IPython.display import display, HTML
display(HTML(""))

from tabulate import tabulate

   ###

#  How to use tabulate-
#
#  l_result = [{ "col1": 20, "col2": 30}]
#  #
#  print(tabulate(l_result, headers='keys', tablefmt='psql', showindex=False))

print("--")



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Exception reporting mode: Minimal


--


In [166]:

#  Helper functions for what we want to do below-
#

#  We use this object to store the history of results; display only
#
class MyHistory:
  
   __my_history = []
    
   def append(self, i_str):
      self.__my_history.append(i_str)
        
   def clear(self):
      self.__my_history = []
        
   def __str__(self):
      return(str(self.__my_history))
   

l_history = MyHistory() 


#  The sklearn ML routines follow a very consistent pattern. As such, we
#  put these in a function, reduce redundant code below-
#

def do_model(i_routine, i_train_data, i_train_labels, i_test_data, i_test_labels, i_name_of_test):

   #  Train whatever model
   #
   i_routine.fit(i_train_data, i_train_labels)
   
   #  Predict on the test data
   #
   l_predicted_labels = i_routine.predict(i_test_data)
   
   #  Output results
   #
   print("   Actual    labels from test......... %s" % (i_test_labels     ) )
   print("   Predicted labels from test......... %s" % (l_predicted_labels) )
      #
   print("      ###")
   print("   Accuracy: %0.4f %%" % (i_routine.score(i_test_data, i_test_labels) * 100) )
      #
   print()

print("--")





--


#  Step 01: Iris Data load, encode

In [167]:

#  Read the Iris data into a Pandas Dataframe
#
#     Features
#     1. sepal length in cm
#     2. sepal width in cm
#     3. petal length in cm
#     4. petal width in cm
#     5. class: 
#        Iris-setosa
#        Iris-versicolour
#        Iris-virginica
#
#  To convert class into a numeric, we use sklearn.preprocessing.LabelEncoder
#  See,
#     https://www.turing.com/kb/convert-categorical-data-in-pandas-and-scikit-learn
#

from sklearn.preprocessing import LabelEncoder

my_le = LabelEncoder()
   #
l_folder = "20_Data"
l_file   = "11_iris.data.txt"


pd_iris  = pd.read_csv((l_folder + "/" + l_file), header = 0, sep = ",",
   names = ["sl", "sw", "pl", "pw", "class"],
   dtype = {"sl": "float", "sw": "float", "pl": "float", "pw": "float", "class": "string"} )
      #
pd_iris["class_encoded"]  =  my_le.fit_transform(pd_iris["class"])
   #
pd_iris = pd_iris.drop(["class"], axis = 1)
    
    
#  Pandas.Dataframe.sample() returns a randomized set of rows, versus
#  say head(), which always returns the first n ..
#
print(tabulate(pd_iris.sample(5), headers='keys', tablefmt='psql', showindex=False))
print("Number of rows: %d" % (len(pd_iris)))

print("--")


+------+------+------+------+-----------------+
|   sl |   sw |   pl |   pw |   class_encoded |
|------+------+------+------+-----------------|
|  4.8 |  3   |  1.4 |  0.3 |               0 |
|  4.9 |  3.1 |  1.5 |  0.1 |               0 |
|  6.4 |  2.8 |  5.6 |  2.2 |               2 |
|  6.7 |  3.1 |  4.7 |  1.5 |               1 |
|  5.6 |  2.7 |  4.2 |  1.3 |               1 |
+------+------+------+------+-----------------+
Number of rows: 149
--


In [170]:

#  Split data into training and test.
#  Convert the data into numpy arrays, since the ml libraries we use later expect that.
#

import numpy as np
from sklearn.model_selection import train_test_split

np_iris = {}
   #
np_iris["train"], np_iris["test"] = train_test_split(pd_iris.to_numpy(),              #  random_state calls to shuffle the data,
   test_size = 0.20, random_state = 40)                                               #    which had arrived sorted
                                                                                      #  10% yields way too high of an accuracy
                                                                                      #    far below
print("Number of total rows: %d   Training rows: %d   Test rows: %d" %
  (len(pd_iris), len(np_iris["train"]), len(np_iris["test"])) )

print()
print("Train data:")
print("%s" % (np_iris["train"][0:5]))
print()
print("Test  data:")
print("%s" % (np_iris["test" ][0:5]))
print()
   #
print("--")


Number of total rows: 149   Training rows: 119   Test rows: 30

Train data:
[[6.1 2.9 4.7 1.4 1]
 [4.8 3.4 1.9 0.2 0]
 [5.2 3.5 1.5 0.2 0]
 [5 3.3 1.4 0.2 0]
 [4.6 3.1 1.5 0.2 0]]

Test  data:
[[5.1 3.4 1.5 0.2 0]
 [5.8 2.7 4.1 1 1]
 [6.5 3 5.5 1.8 2]
 [7.7 2.6 6.9 2.3 2]
 [5.5 2.6 4.4 1.2 1]]

--


#  Step 02: Iris Data train, test .. NearestCentroid

In [171]:

from sklearn.neighbors import NearestCentroid

#  Our numpy array has 5 columns, with the last column being the class.
#  To review numpy array slicing,
#
#     To get the first 4 columns use,
#        np_iris["train"][:, :4]
#     To get the last column use,
#        np_iris["train"][:, -1]
#

do_model( NearestCentroid(), np_iris["train"][:, :4], np_iris["train"][:, -1], np_iris["test"][:, :4], np_iris["test"][:, -1], "Iris: Centroid" ) 
    
print("--")
    

   Actual    labels from test......... [0 1 2 2 1 2 1 1 1 0 1 0 0 2 1 2 2 2 1 1 2 2 1 0 1 0 0 2 0 1]
   Predicted labels from test......... [0 1 2 2 1 2 1 1 1 0 1 0 0 2 1 2 2 2 2 1 2 1 1 0 1 0 0 2 0 1]
      ###
   Accuracy: 93.3333 %



#  Step 03: Iris Data train, test .. kNN

In [None]:

from sklearn.neighbors import KNeighborsClassifier

#  Our numpy array has 5 columns, with the last column being the class.
#  To review numpy array slicing,
#
#     To get the first 4 columns use,
#        np_iris["train"][:, :4]
#     To get the last column use,
#        np_iris["train"][:, -1]
#

#  The only line that differs between this cell and the one above, is
#  the first line
#

my_model = KNeighborsClassifier(n_neighbors = 3)


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


#  Step 04: Iris Data train, test .. Naive Bayes, Gaussian

In [None]:

from sklearn.naive_bayes import GaussianNB

#  Naive Bayes, Gaussian
#
#     Gaussian does better than the Multinomial below because,
#        Gaussian expects continuous values
#        Multinomial expects discreet values
#
#     And our values are continuous
#

my_model = GaussianNB()


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


#  Step 05: Iris Data train, test .. Naive Bayes, Multinomial

In [None]:

from sklearn.naive_bayes import MultinomialNB

#  Naive Bayes, Multinomial
#

my_model = MultinomialNB()


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


#  Step 06: Iris Data train, test .. Decision Tree

In [None]:

from sklearn.tree import DecisionTreeClassifier

#  Decision Tree
#

my_model = DecisionTreeClassifier()


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


#  Step 07: Iris Data train, test .. Random Forest

In [None]:

from sklearn.ensemble import RandomForestClassifier

#  Random Forest
#

#  n_estimators, number of random trees created and trained
#
my_model = RandomForestClassifier(n_estimators = 5)


#  train the model
#
my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])

#  predict on the test data
#
l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])

print("Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
print("Predicted labels from test......... %s" % (l_predicted_labels)      )
   #
print()
print("Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )

print("--")


#  Step 08: Iris Data train, test .. Support Vector Machine (SVM)

In [None]:

from sklearn.svm import SVC

#  Support Vector Machine
#
#  We run this one with a number of configurations ..
#

def do_model(arg1):

   #  train the model
   #
   my_model.fit(np_iris["train"][:, :4], np_iris["train"][:, -1])
   
   #  predict on the test data
   #
   l_predicted_labels = my_model.predict( np_iris["test" ][:, :4])
   
   print("   Actual    labels from test......... %s" % (np_iris["test" ][:, -1]) )
   print("   Predicted labels from test......... %s" % (l_predicted_labels)      )
      #
   print("      ###")
   print("   Accuracy: %0.4f %%" % (my_model.score(np_iris["test" ][:, :4], np_iris["test" ][:, -1]) * 100) )
      #
   print()
    
    
       ############################
        

#  C      ==  margin constant
#  gamma  ==  used by the Gaussian kernel
#

my_model = SVC(kernel = "linear", C = 1.0)
   #
print("Linear...")
do_model(my_model)

my_model = SVC(kernel = "rbf", C = 1.0, gamma = 0.25 )
   #
print("RBF......")
do_model(my_model)

my_model = SVC(kernel = "rbf", C = 1.0, gamma = 0.001)
   #
print("RBF 2....")
do_model(my_model)

print("--")


In [None]:

####################################################################
####################################################################
####################################################################

####################################################################
####################################################################
####################################################################


#  Step 09:  Breast Cancer Data load, encode, normalize

In [None]:

#  Read the Breast Cancer data into a Pandas Dataframe
#
#     Features
#     1)     ID number
#     2)     Diagnosis (M = malignant, B = benign)
#     3-32)
#       Ten real-valued features are computed for each cell nucleus:
#     
#     	a) radius (mean of distances from center to points on the perimeter)
#     	b) texture (standard deviation of gray-scale values)
#     	c) perimeter
#     	d) area
#     	e) smoothness (local variation in radius lengths)
#     	f) compactness (perimeter^2 / area - 1.0)
#     	g) concavity (severity of concave portions of the contour)
#     	h) concave points (number of concave portions of the contour)
#     	i) symmetry 
#     	j) fractal dimension ("coastline approximation" - 1)
#

from sklearn.preprocessing import LabelEncoder

my_le = LabelEncoder()
   #
l_folder = "20_Data"
l_file   = "11_iris.data.txt"


pd_iris  = pd.read_csv((l_folder + "/" + l_file), header = 0, sep = ",",
   names = ["sl", "sw", "pl", "pw", "class"],
   dtype = {"sl": "float", "sw": "float", "pl": "float", "pw": "float", "class": "string"} )
      #
pd_iris["class_encoded"]  =  my_le.fit_transform(pd_iris["class"])
   #
pd_iris = pd_iris.drop(["class"], axis = 1)
    
    
#  Pandas.Dataframe.sample() returns a randomized count of rows
#
print(tabulate(pd_iris.sample(5), headers='keys', tablefmt='psql', showindex=False))
print("Number of rows: %d" % (len(pd_iris)))

print("--")


In [None]:

#  Split data into training and test.
#  Convert the data into numpy arrays, since the ml libraries we use later expect that.
#

import numpy as np
from sklearn.model_selection import train_test_split

np_iris = {}
   #
np_iris["train"], np_iris["test"] = train_test_split(pd_iris.to_numpy(),              #  random_state calls to shuffle the data,
   test_size = 0.20, random_state = 40)                                               #    which had arrived sorted
                                                                                      #  10% yields way too high of an accuracy
                                                                                      #    far below
print("Number of total rows: %d   Training rows: %d   Test rows: %d" %
  (len(pd_iris), len(np_iris["train"]), len(np_iris["test"])) )

print()
print("Train data:")
print("%s" % (np_iris["train"][0:5]))
print()
print("Test  data:")
print("%s" % (np_iris["test" ][0:5]))
print()
   #
print("--")


In [None]:


1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)

Ten real-valued features are computed for each cell nucleus:

	a) radius (mean of distances from center to points on the perimeter)
	b) texture (standard deviation of gray-scale values)
	c) perimeter
	d) area
	e) smoothness (local variation in radius lengths)
	f) compactness (perimeter^2 / area - 1.0)
	g) concavity (severity of concave portions of the contour)
	h) concave points (number of concave portions of the contour)
	i) symmetry 
	j) fractal dimension ("coastline approximation" - 1)
    