In [0]:
import matplotlib.pyplot as plt

import tensorflow as tf
import pandas as pd 

# Training data collected

In [0]:
train_dataset_url = "http://download.tensorflow.org/data/iris_training.csv"
test_dataset_url = "http://download.tensorflow.org/data/iris_test.csv"

CSV_COLUMN_NAMES = ['SepalLength', 'SepalWidth',
                    'PetalLength', 'PetalWidth', 'Species']

#Parse the dataset from a CSV-formatted text file 

In [0]:
def load_data(label_name = 'Species'):
  
  #create a local copy of the training set
  train_path = tf.keras.utils.get_file(fname = train_dataset_url.split('/')[-1],
                                      origin = train_dataset_url)
  # parse the dataset
  train = pd.read_csv(filepath_or_buffer = train_path,
                      names = CSV_COLUMN_NAMES,
                      header = 0
                     )
  #train a holds a pandas DataFrame, table-like data structure
  train_in, train_out = train, train.pop(label_name)
  
  #do the same for the test dataset
  test_path = tf.keras.utils.get_file(fname = test_dataset_url.split('/')[-1],
                                     origin = test_dataset_url)
  test = pd.read_csv(filepath_or_buffer = test_path,
                     names = CSV_COLUMN_NAMES,
                     header = 0
                    )
  
  test_in, test_out = test, test.pop(label_name)
  
  return (train_in, train_out), (test_in, test_out)

In [5]:
(train_in, train_out), (test_in, test_out) = load_data()

Downloading data from http://download.tensorflow.org/data/iris_training.csv
Downloading data from http://download.tensorflow.org/data/iris_test.csv


In [6]:
train_out

0      2
1      1
2      2
3      0
4      0
5      0
6      0
7      2
8      1
9      0
10     1
11     1
12     0
13     0
14     2
15     1
16     2
17     2
18     2
19     0
20     2
21     2
22     0
23     2
24     2
25     0
26     1
27     2
28     1
29     1
      ..
90     2
91     1
92     0
93     0
94     2
95     0
96     0
97     2
98     1
99     0
100    0
101    1
102    0
103    1
104    0
105    0
106    0
107    0
108    1
109    0
110    2
111    1
112    0
113    2
114    0
115    1
116    1
117    0
118    0
119    1
Name: Species, Length: 120, dtype: int64

# Selecting a model
We will be selecting neural network to estimate out classifier

In [0]:
def train_model(clf, X_train, X_test, Y_train, Y_test):
    
    """
    Train a classifier, evaluate, and plot
    
    Parameters
    ----------
    
    clf  :  a sklearn classifier object (e.g., sklearn.neighbors)
    
    X_train : pandas dataframe containing training input
                        
    X_test :  pandas dataframe containing test input
    
    Y_train : pandas dataframe containing training output
    
    Y_test : pandas dataframe containing test output
                                   
                  
    Returns
    -------
    acc :   dictionary
                'acc_train' : training accuracy
                'acc_test'  : testing accuracy
    
    """
    
    
    # Train (fit) the classifier to training data    
    clf = clf.fit(X_train, Y_train)

    # Apply the classifier to train and test data
    Y_train_predict = clf.predict(X_train)
    Y_test_predict = clf.predict(X_test)
    
    # Calculate train and test accuracy
    acc_train = clf.score(X_train, Y_train)
    acc_test = clf.score(X_test,Y_test)
            
        
    return {'acc_train':acc_train, 'acc_test':acc_test}

#Naive Bayes

In [8]:
from sklearn import naive_bayes
    
# Define the classifier
priors=None
clf = naive_bayes.GaussianNB(priors=priors)


# Train the classifier and plot
acc = train_model(clf, train_in, test_in, train_out, test_out)
acc


{'acc_test': 0.9666666666666667, 'acc_train': 0.95}

In [9]:
clf.class_prior_[0]

0.35

#K_nearest Neighbors

In [10]:
from sklearn import neighbors
    
# Define the classifier
n_neighbors = 5
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)

# Train the classifier and plot
acc = train_model(clf, train_in, test_in, train_out, test_out)

# Print accuracy values
print('Training Accuracy: {:0.3f}'.format(acc['acc_train']))
print('Testing Accuracy:  {:0.3f}'.format(acc['acc_test']))

Training Accuracy: 0.975
Testing Accuracy:  0.967


##Finding the best number of neighbors

In [11]:
high_acc_n = -1
high_acc = -1.0
for n in range(1,21):
    
    # Student code goes here
    clf = neighbors.KNeighborsClassifier(n_neighbors=n)
    curr_acc = train_model(clf, train_in, test_in, train_out, test_out)
    
    if curr_acc['acc_test']>high_acc:
        high_acc = curr_acc['acc_test']
        high_acc_n = n
        
    print("n", n, "accuracy", curr_acc['acc_test'])
print("n", high_acc_n, "highest test accuracy", high_acc)

n 1 accuracy 0.9333333333333333
n 2 accuracy 0.9333333333333333
n 3 accuracy 0.9666666666666667
n 4 accuracy 0.9666666666666667
n 5 accuracy 0.9666666666666667
n 6 accuracy 0.9666666666666667
n 7 accuracy 0.9666666666666667
n 8 accuracy 1.0
n 9 accuracy 1.0
n 10 accuracy 1.0
n 11 accuracy 1.0
n 12 accuracy 1.0
n 13 accuracy 1.0
n 14 accuracy 1.0
n 15 accuracy 1.0
n 16 accuracy 1.0
n 17 accuracy 1.0
n 18 accuracy 1.0
n 19 accuracy 1.0
n 20 accuracy 1.0
n 8 highest test accuracy 1.0


#Decision Trees

In [12]:
from sklearn import tree
    
# Define the classifier
max_depth = 5
clf = tree.DecisionTreeClassifier(max_depth = max_depth, random_state=9)

# Train the classifier and plot
acc = train_model(clf, train_in, test_in, train_out, test_out)

# Print accuracy values
print('Training Accuracy: {:0.3f}'.format(acc['acc_train']))
print('Testing Accuracy:  {:0.3f}'.format(acc['acc_test']))

Training Accuracy: 0.992
Testing Accuracy:  0.967


##Finding the best depth

In [14]:
best_depth = -1
highest_acc = -1.0
for d in range(5, 40, 5):
  # Define the classifier
  #max_depth = 5
  clf = tree.DecisionTreeClassifier(max_depth = d)
  
  # Train the classifier and plot
  acc = train_model(clf, train_in, test_in, train_out, test_out)
  
  if(acc['acc_test']> highest_acc):
    best_depth = d
    highest_acc = acc['acc_test']
  
  print("Depth:", d, "Accuracy of Test Set:", acc['acc_test'])

print("Best Depth:", best_depth, "Accuracy:", highest_acc)

Depth: 5 Accuracy of Test Set: 0.9666666666666667
Depth: 10 Accuracy of Test Set: 0.9666666666666667
Depth: 15 Accuracy of Test Set: 1.0
Depth: 20 Accuracy of Test Set: 0.9666666666666667
Depth: 25 Accuracy of Test Set: 1.0
Depth: 30 Accuracy of Test Set: 1.0
Depth: 35 Accuracy of Test Set: 1.0
Best Depth: 15 Accuracy: 1.0


#Support Vector Machines

In [16]:
from sklearn import svm
    
# Define the classifier
kernel = 'rbf'
C = 1.0 #error coefficient
gamma = 3.0 
clf = svm.SVC(kernel=kernel, C=C, gamma=gamma, random_state=999)

# Train the classifier and plot
acc = train_model(clf, train_in, test_in, train_out, test_out)

# Print accuracy values
print('Training Accuracy: {:0.3f}'.format(acc['acc_train']))
print('Testing Accuracy:  {:0.3f}'.format(acc['acc_test']))

Training Accuracy: 0.983
Testing Accuracy:  0.967
