In [None]:
import numpy as np
import pandas as pd
import sklearn.linear_model as sklearn_lm
import matplotlib.pyplot as mp_plt
import sklearn.multiclass as sklearn_mc
from matplotlib.colors import ListedColormap
%matplotlib inline

### Load data

In [None]:
# load data: four measurements per flower, class label in the last column
data_names = ['sepal length','sepal width','petal length', 'petal width', 'y']
data = pd.read_csv('iris_dataset.txt', delimiter=',', names=data_names)
data = data.values

# define constants
NUM_SAMPLES = data.shape[0]
dr = 0.8  # fraction of the samples that goes into the training set
NUM_TRAIN = int(NUM_SAMPLES * dr)
NUM_TEST = NUM_SAMPLES - NUM_TRAIN
NUM_CLASSES = 3
NUM_ITER = 1
SEED = 0  # fixed seed so the train/test split is identical on every run
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# preprocess data: draw the training rows without replacement from a dedicated,
# seeded generator (the global numpy RNG is left untouched); all remaining rows
# form the test set
split_rng = np.random.RandomState(SEED)
train_indexes = split_rng.choice(NUM_SAMPLES, NUM_TRAIN, replace=False)
test_indexes = np.setdiff1d(np.arange(0,NUM_SAMPLES), train_indexes, assume_unique=True)
train_data = data[train_indexes]
test_data = data[test_indexes]

# last column is the label, everything before it is a feature
X_train = train_data[:,:-1]
y_train = train_data[:,-1]
X_test = test_data[:,:-1]
y_test = test_data[:,-1]

# training class labels: label_idx[c] holds the row positions in X_train whose
# label equals c+1. np.flatnonzero always returns an integer index array, so an
# absent class gives a usable empty index instead of an empty float array.
label_idx = [np.flatnonzero(y_train == c + 1) for c in range(NUM_CLASSES)]
  

### Plot training and test set

In [None]:
# plot data: one scatter panel per pair of features, training samples grouped by class
data_plots = [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]]
data_plots_name = [['SL','SW'],['SL','PL'],['SL','PW'],['SW','PL'],['SW','PW'],['PL','PW']]
fig = mp_plt.figure(figsize=(24,16))
for p in range(6):
  mp_plt.subplot(2,3,p+1)
  for i in range(NUM_CLASSES):
    # no cmap argument: without numeric `c` values matplotlib ignores a colormap
    # (and warns about it), so every class just takes the next default colour
    mp_plt.scatter(X_train[label_idx[i], data_plots[p][0]],
                   X_train[label_idx[i], data_plots[p][1]],
                   label='Class: '+str(i+1))
  # axis labels belong to the panel, so they are set once per panel
  mp_plt.xlabel(data_plots_name[p][0])
  mp_plt.ylabel(data_plots_name[p][1])
  mp_plt.legend()

# NOTE(review): a later cell of this notebook saves to the same 'figures/5.pdf'
# path, so whichever cell runs last determines the file content.
mp_plt.savefig('figures/5.pdf')

In [None]:
# class labels for testing: label_idx_test[c] holds the row positions in X_test
# whose label equals c+1 (always an integer index array, even if a class is absent)
label_idx_test = [np.flatnonzero(y_test == c + 1) for c in range(NUM_CLASSES)]

# plot data: one scatter panel per pair of features, test samples grouped by class
data_color = ['magenta', 'orange', 'cyan']
data_plots = [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]]
data_plots_name = [['SL','SW'],['SL','PL'],['SL','PW'],['SW','PL'],['SW','PW'],['PL','PW']]
fig = mp_plt.figure(figsize=(24,16))
for p in range(6):
  mp_plt.subplot(2,3,p+1)
  for i in range(NUM_CLASSES):
    # colour each class from data_color; a bare cmap without numeric `c` values
    # is ignored by matplotlib, which left the palette above unused
    mp_plt.scatter(X_test[label_idx_test[i], data_plots[p][0]],
                   X_test[label_idx_test[i], data_plots[p][1]],
                   c=data_color[i], label='Class: '+str(i+1))
  # axis labels belong to the panel, so they are set once per panel
  mp_plt.xlabel(data_plots_name[p][0])
  mp_plt.ylabel(data_plots_name[p][1])
  mp_plt.legend()

mp_plt.savefig('figures/6.pdf')

In [None]:
# Training-set scatter matrix: six panels, one for every pair of the four
# features, with a fixed colour per class.
data_color = ['magenta', 'orange', 'cyan']
data_plots = [[0,1],[0,2],[0,3],[1,2],[1,3],[2,3]]
data_plots_name = [['SL','SW'],['SL','PL'],['SL','PW'],['SW','PL'],['SW','PW'],['PL','PW']]
fig = mp_plt.figure(figsize=(24,16))
for p in range(6):
  # feature columns and axis captions of the current panel
  x_col, y_col = data_plots[p]
  x_name, y_name = data_plots_name[p]
  mp_plt.subplot(2, 3, p + 1)
  for i in range(NUM_CLASSES):
    rows = label_idx[i]  # training rows that belong to class i+1
    mp_plt.scatter(X_train[rows, x_col],
                   X_train[rows, y_col],
                   c=data_color[i],
                   label='Class: ' + str(i + 1))
    mp_plt.xlabel(x_name)
    mp_plt.ylabel(y_name)
  mp_plt.legend()

# write the finished figure to disk
mp_plt.savefig('figures/5.pdf')

### Multi-class Classification

In [None]:
def make_meshplot_multi(X, label_idx, clf, kwargs=None):
  """Plot a classifier's decision regions over two features and overlay the samples.

  Args:
    X: 2-D array; column 0 goes on the x axis and column 1 on the y axis.
    label_idx: sequence with one entry per class; entry i lists the row
      positions in X that are drawn as 'Class: i+1'. Entries may be empty.
    clf: fitted classifier whose predict() accepts two-column input.
      Assumes its predictions are the numeric labels 1..3 -- TODO confirm.
    kwargs: optional dict with the keys 'x', 'y' and 'title', used as axis
      labels and figure title.
  """
  h = 0.2 # meshgrid step
  NUM_CLASSES = 3
  cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
  cmap_color = ['#FF0000', '#00FF00', '#0000FF']

  # make meshgrid and decision regions (one unit of margin around the data)
  x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
  x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
  xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                         np.arange(x2_min, x2_max, h))
  dbmesh_pred = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])

  # reshape and stack into color plot
  dbmesh_pred = dbmesh_pred.reshape(xx1.shape)
  # open exactly one figure; an additional bare figure() call would only leave
  # an empty canvas behind
  mp_plt.figure(figsize=(8,8))
  # pin the colour range to the labels 1..NUM_CLASSES so that a region keeps its
  # class colour even when one of the classes never appears on the grid
  mp_plt.pcolormesh(xx1, xx2, dbmesh_pred, cmap=cmap_light, vmin=1, vmax=NUM_CLASSES)

  # plot the samples of every class in that class's bold colour
  for i in range(NUM_CLASSES):
    # force an integer index: an empty list converts to a float array by
    # default, which numpy refuses to use for indexing
    rows = np.asarray(label_idx[i], dtype=np.intp)
    # a literal colour is given, so no cmap is passed (it would be ignored)
    mp_plt.scatter(X[rows, 0],
                   X[rows, 1],
                   c=cmap_color[i], edgecolor='k',
                   label='Class: '+str(i+1))
  mp_plt.xlim(xx1.min(), xx1.max())
  mp_plt.ylim(xx2.min(), xx2.max())
  if(kwargs):
    mp_plt.xlabel(kwargs['x'])
    mp_plt.ylabel(kwargs['y'])
    mp_plt.title(kwargs['title'])
  mp_plt.legend()
  

In [None]:
def get_accuracy(y_test, y_pred):
  """Return the fraction of predictions that equal the true labels.

  Args:
    y_test: true labels (array-like).
    y_pred: predicted labels (array-like) with the same shape as y_test.

  Returns:
    np.float64 between 0 and 1 (nan for empty input).

  Raises:
    ValueError: if the two inputs differ in shape; comparing them anyway would
      broadcast silently and produce a meaningless score.
  """
  y_true = np.asarray(y_test)
  y_hat = np.asarray(y_pred)
  if y_true.shape != y_hat.shape:
    raise ValueError('y_test and y_pred must have the same shape, got %s and %s'
                     % (y_true.shape, y_hat.shape))
  # count the matches in one vectorised call instead of Python's element-wise sum()
  return np.float64(np.count_nonzero(y_true == y_hat)) / np.float64(y_true.size)

def linear_classifier(X_train, y_train, X_test, y_test, normalise_data, NUM_ITER):
  """Fit a logistic-regression classifier NUM_ITER times and score it on the test set.

  Args:
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.
    normalise_data: if truthy, standardise both sets with the mean and standard
      deviation of the training features.
    NUM_ITER: number of fit/evaluate repetitions. No resampling happens between
      repetitions; every one uses the same training and test data.

  Returns:
    Tuple (acc, clf, y_pred, y_test): acc is a (NUM_ITER, 1) array of test
    accuracies, clf and y_pred come from the last repetition (both None when
    NUM_ITER is 0), and y_test is passed through unchanged.
  """
  if(normalise_data):
    X_mu = np.mean(X_train, axis=0)
    X_sd = np.sqrt(np.var(X_train, axis=0))
    # a constant feature has zero spread; divide by 1 so it becomes 0 instead of nan/inf
    X_sd = np.where(X_sd == 0, 1.0, X_sd)
    X_train = (X_train - X_mu) / X_sd
    X_test = (X_test - X_mu) / X_sd

  acc = np.empty((NUM_ITER, 1))
  clf, y_pred = None, None  # keeps the return well-defined when NUM_ITER is 0
  for i in range(NUM_ITER):
    # alternatives that were tried in place of the logistic regression:
#     clf = sklearn_lm.SGDClassifier(loss='hinge', eta0=1, learning_rate='constant', penalty='none')
#     clf = sklearn_lm.SGDClassifier(loss='squared_loss', penalty='l1')
    clf = sklearn_lm.LogisticRegression(penalty='l2')
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    acc[i] = get_accuracy(y_test, y_pred)
#     y_score = clf.predict_proba(X_test)

  return acc, clf, y_pred, y_test

# Evaluate the multi-class model on the raw (un-normalised) features and report
# the mean and the variance of the accuracy over the repetitions.
normalise_data = False
acc, clf, y_pred, y_test = linear_classifier(
    X_train, y_train, X_test, y_test,
    normalise_data=normalise_data, NUM_ITER=NUM_ITER)
acc_mean = np.mean(acc)
acc_var = np.var(acc)
print(acc_mean, acc_var)

In [None]:
# NOTE(review): neither make_roc nor y_score is assigned anywhere in the visible
# part of this notebook, so on a fresh kernel this cell would raise NameError
# unless both are provided elsewhere -- confirm before relying on it.
make_roc(y_test, y_score)


### One vs. Rest Classification

In [None]:
def make_meshplot_ovr(X, label_idx, clf, kwargs=None):
  """Plot the decision regions of a binary classifier over two features.

  Args:
    X: 2-D array; column 0 goes on the x axis and column 1 on the y axis.
    label_idx: two-entry sequence; entry i lists the row positions in X that
      are drawn as 'Class: i'. Entries may be empty.
    clf: fitted binary classifier whose predict() accepts two-column input.
      Assumes its predictions are the labels 0 and 1 -- TODO confirm.
    kwargs: optional dict with the keys 'x', 'y' and 'title', used as axis
      labels and figure title.
  """
  h = 0.2 # meshgrid step
  NUM_CLASSES = 2
  cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA'])
  cmap_color = ['#FF0000', '#00FF00']

  # make meshgrid and decision regions (one unit of margin around the data)
  x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
  x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
  xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                         np.arange(x2_min, x2_max, h))
  dbmesh_pred = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])

  # reshape and stack into color plot
  dbmesh_pred = dbmesh_pred.reshape(xx1.shape)
  # open exactly one figure; an additional bare figure() call would only leave
  # an empty canvas behind
  mp_plt.figure(figsize=(8,8))
  # pin the colour range to the labels 0..1 so that a grid containing a single
  # predicted label is still painted in that label's colour
  mp_plt.pcolormesh(xx1, xx2, dbmesh_pred, cmap=cmap_light, vmin=0, vmax=NUM_CLASSES - 1)

  # plot the samples of both groups in their bold colour
  for i in range(NUM_CLASSES):
    # force an integer index: an empty list converts to a float array by
    # default, which numpy refuses to use for indexing
    rows = np.asarray(label_idx[i], dtype=np.intp)
    # a literal colour is given, so no cmap is passed (it would be ignored)
    mp_plt.scatter(X[rows, 0],
                   X[rows, 1],
                   c=cmap_color[i], edgecolor='k',
                   label='Class: '+str(i))
  mp_plt.xlim(xx1.min(), xx1.max())
  mp_plt.ylim(xx2.min(), xx2.max())
  if(kwargs):
    mp_plt.xlabel(kwargs['x'])
    mp_plt.ylabel(kwargs['y'])
    mp_plt.title(kwargs['title'])
  # the legend does not depend on kwargs, so it is drawn unconditionally
  mp_plt.legend()
  

In [None]:
def get_accuracy(y_test, y_pred):
  """Return the fraction of predictions that equal the true labels.

  Args:
    y_test: true labels (array-like).
    y_pred: predicted labels (array-like) with the same shape as y_test.

  Returns:
    np.float64 between 0 and 1 (nan for empty input).

  Raises:
    ValueError: if the two inputs differ in shape; comparing them anyway would
      broadcast silently and produce a meaningless score.
  """
  y_true = np.asarray(y_test)
  y_hat = np.asarray(y_pred)
  if y_true.shape != y_hat.shape:
    raise ValueError('y_test and y_pred must have the same shape, got %s and %s'
                     % (y_true.shape, y_hat.shape))
  # count the matches in one vectorised call instead of Python's element-wise sum()
  return np.float64(np.count_nonzero(y_true == y_hat)) / np.float64(y_true.size)

def linear_classifier(X_train, y_train, X_test, y_test, normalise_data, NUM_CLASSES, NUM_ITER):
  """Fit a one-vs-rest logistic-regression classifier and score it on the test set.

  The model is wrapped in OneVsRestClassifier and trained on feature columns 2
  onward, so the returned clf exposes one binary estimator per class in
  clf.estimators_ and each of them accepts two-column input.

  Args:
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.
    normalise_data: if truthy, standardise both sets with the mean and standard
      deviation of the training features. Data passed to the returned clf
      afterwards must be scaled the same way.
    NUM_CLASSES: number of classes; labels are taken to be 1..NUM_CLASSES when
      the predicted-class index lists are built.
    NUM_ITER: number of fit/evaluate repetitions on the same data.

  Returns:
    Tuple (acc, clf, label_idx_pred): acc is a (NUM_ITER, 1) array of test
    accuracies; clf is the classifier of the last repetition (None when
    NUM_ITER is 0); label_idx_pred[c] holds the test-row positions predicted as
    class c+1 in the last repetition (empty list when NUM_ITER is 0).
  """
  if(normalise_data):
    X_mu = np.mean(X_train, axis=0)
    X_sd = np.sqrt(np.var(X_train, axis=0))
    # a constant feature has zero spread; divide by 1 so it becomes 0 instead of nan/inf
    X_sd = np.where(X_sd == 0, 1.0, X_sd)
    X_train = (X_train - X_mu) / X_sd
    X_test = (X_test - X_mu) / X_sd

  acc = np.empty((NUM_ITER, 1))
  clf, label_idx_pred = None, []  # keeps the return well-defined when NUM_ITER is 0
  for it in range(NUM_ITER):
    # explicit one-vs-rest wrapper: a bare LogisticRegression has no estimators_
    clf = sklearn_mc.OneVsRestClassifier(sklearn_lm.LogisticRegression(penalty='l2'))
    # alternatives that were tried here:
#     clf = sklearn_lm.SGDClassifier(loss='hinge', eta0=1, learning_rate='constant', penalty='none')
#     clf = sklearn_mc.OneVsRestClassifier(sklearn_lm.SGDClassifier(loss='squared_loss', penalty='none'))
    clf.fit(X_train[:,2:], y_train)

    y_pred = clf.predict(X_test[:,2:])

    # label indices for predicted classes; flatnonzero yields an integer index
    # array even for a class that is never predicted
    label_idx_pred = [np.flatnonzero(y_pred == c + 1) for c in range(NUM_CLASSES)]

    acc[it] = get_accuracy(y_test, y_pred)

  return acc, clf, label_idx_pred

# Run the classifier of this section on the raw (un-normalised) features, then
# print the mean and the variance of the accuracy over the repetitions.
normalise_data = False
acc, clf, label_idx_pred = linear_classifier(
    X_train, y_train, X_test, y_test,
    normalise_data=normalise_data,
    NUM_CLASSES=NUM_CLASSES,
    NUM_ITER=NUM_ITER)
acc_mean = np.mean(acc)
acc_var = np.var(acc)
print(acc_mean, acc_var)

In [None]:
# Draw the decision region of every per-class binary estimator, once with the
# actual and once with the predicted membership of the test samples.
# NOTE(review): this needs clf to expose estimators_ and classes_ (as a fitted
# scikit-learn OneVsRestClassifier does) with estimators that accept two-column
# input -- confirm that the clf produced above satisfies this.
for i in range(NUM_CLASSES):
  curr_clf = clf.estimators_[i]
  y_pred = curr_clf.predict(X_test[:,2:])

  # actual membership: 1 for the class this estimator detects, 0 for the rest.
  # It is derived from the real test labels because the random split does not
  # guarantee a fixed number of samples per class.
  y_act = (y_test == clf.classes_[i]).astype(np.int32)

  # row positions of the "rest" (0) and "this class" (1) samples
  label_idx_act = [np.flatnonzero(y_act == c) for c in range(2)]
  label_idx_pred = [np.flatnonzero(y_pred == c) for c in range(2)]

  make_meshplot_ovr(X_test[:,2:], label_idx_act, curr_clf,
                    {'x':'PL', 'y':'PW', 'title':'Decision Boundary with Actual Classes'})
  make_meshplot_ovr(X_test[:,2:], label_idx_pred, curr_clf,
                    {'x':'PL', 'y':'PW', 'title':'Decision Boundary with Predicted Classes'})

### One vs. One Classification

In [None]:
def get_accuracy(y_test, y_pred):
  """Return the fraction of predictions that equal the true labels.

  Args:
    y_test: true labels (array-like).
    y_pred: predicted labels (array-like) with the same shape as y_test.

  Returns:
    np.float64 between 0 and 1 (nan for empty input).

  Raises:
    ValueError: if the two inputs differ in shape; comparing them anyway would
      broadcast silently and produce a meaningless score.
  """
  y_true = np.asarray(y_test)
  y_hat = np.asarray(y_pred)
  if y_true.shape != y_hat.shape:
    raise ValueError('y_test and y_pred must have the same shape, got %s and %s'
                     % (y_true.shape, y_hat.shape))
  # count the matches in one vectorised call instead of Python's element-wise sum()
  return np.float64(np.count_nonzero(y_true == y_hat)) / np.float64(y_true.size)

def linear_classifier(X_train, y_train, X_test, y_test, normalise_data, NUM_ITER):
  """Train a one-vs-one SGD classifier on feature columns 2 onward and score it.

  Args:
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.
    normalise_data: if truthy, both sets are shifted by the training mean and
      divided by the training standard deviation before fitting.
    NUM_ITER: number of fit/evaluate repetitions on the same data.

  Returns:
    Tuple (acc, clf): acc is a (NUM_ITER, 1) array holding the test accuracy of
    every repetition; clf is the classifier fitted in the last repetition.
  """
  if normalise_data:
    train_mean = np.mean(X_train, axis=0)
    train_std = np.sqrt(np.var(X_train, axis=0))
    X_train = (X_train - train_mean) / train_std
    X_test = (X_test - train_mean) / train_std

  acc = np.empty((NUM_ITER, 1))
  for run in range(NUM_ITER):
    # NOTE(review): the accepted spellings of loss='log' and penalty='none'
    # depend on the installed scikit-learn release -- confirm before upgrading.
    base_estimator = sklearn_lm.SGDClassifier(loss='log', penalty='none')
    clf = sklearn_mc.OneVsOneClassifier(base_estimator)
    clf.fit(X_train[:, 2:], y_train)

    # score this repetition on the same two feature columns of the test set
    y_pred = clf.predict(X_test[:, 2:])
    acc[run] = get_accuracy(y_test, y_pred)

  return acc, clf

# Single repetition of the one-vs-one model on the raw (un-normalised) features;
# the printed values are the mean and the variance of the accuracy.
normalise_data = False
acc, clf = linear_classifier(
    X_train, y_train, X_test, y_test,
    normalise_data=normalise_data, NUM_ITER=1)
acc_mean = np.mean(acc)
acc_var = np.var(acc)
print(acc_mean, acc_var)

In [None]:
# A one-vs-one classifier holds one binary estimator per unordered pair of
# classes, not one per class. The pairs are enumerated as (classes_[a],
# classes_[b]) with a < b, and each estimator is taken to output 0 for the first
# class of its pair and 1 for the second.
# NOTE(review): pair order and 0/1 encoding follow scikit-learn's
# OneVsOneClassifier -- confirm against the installed release.
class_pairs = [(first, second)
               for k, first in enumerate(clf.classes_)
               for second in clf.classes_[k + 1:]]

for curr_clf, (class_a, class_b) in zip(clf.estimators_, class_pairs):
  y_pred = curr_clf.predict(X_test[:,2:])

  # actual membership comes from the real test labels: entry 0 lists the samples
  # of the pair's first class, entry 1 those of its second class. Samples of any
  # other class have no ground truth for this estimator and are not drawn.
  label_idx_act = [np.flatnonzero(y_test == class_a),
                   np.flatnonzero(y_test == class_b)]

  # predicted membership of every test sample according to this estimator
  label_idx_pred = [np.flatnonzero(y_pred == c) for c in range(2)]

  make_meshplot_ovr(X_test[:,2:], label_idx_act, curr_clf,
                    {'x':'PL', 'y':'PW', 'title':'Decision Boundary with Actual Classes'})
  make_meshplot_ovr(X_test[:,2:], label_idx_pred, curr_clf,
                    {'x':'PL', 'y':'PW', 'title':'Decision Boundary with Predicted Classes'})

### Decision Boundaries for Multi-class Classification Using Two Features

In [None]:
def make_meshplot(X, label_idx, clf, kwargs=None):
  """Plot a classifier's decision regions over two features and overlay the samples.

  Args:
    X: 2-D array; column 0 goes on the x axis and column 1 on the y axis.
    label_idx: sequence with one entry per class; entry i lists the row
      positions in X that are drawn as 'Class: i+1'. Entries may be empty.
    clf: fitted classifier whose predict() accepts two-column input.
      Assumes its predictions are the numeric labels 1..3 -- TODO confirm.
    kwargs: optional dict with the keys 'x', 'y' and 'title', used as axis
      labels and figure title.
  """
  h = 0.2 # meshgrid step
  NUM_CLASSES = 3
  cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
  cmap_color = ['#FF0000', '#00FF00', '#0000FF']

  # make meshgrid and decision regions (one unit of margin around the data)
  x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
  x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
  xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                         np.arange(x2_min, x2_max, h))
  dbmesh_pred = clf.predict(np.c_[xx1.ravel(), xx2.ravel()])

  # reshape and stack into color plot
  dbmesh_pred = dbmesh_pred.reshape(xx1.shape)
  # open exactly one figure; an additional bare figure() call would only leave
  # an empty canvas behind
  mp_plt.figure(figsize=(8,8))
  # pin the colour range to the labels 1..NUM_CLASSES so that a region keeps its
  # class colour even when one of the classes never appears on the grid
  mp_plt.pcolormesh(xx1, xx2, dbmesh_pred, cmap=cmap_light, vmin=1, vmax=NUM_CLASSES)

  # plot the samples of every class in that class's bold colour
  for i in range(NUM_CLASSES):
    # force an integer index: an empty list converts to a float array by
    # default, which numpy refuses to use for indexing
    rows = np.asarray(label_idx[i], dtype=np.intp)
    # a literal colour is given, so no cmap is passed (it would be ignored)
    mp_plt.scatter(X[rows, 0],
                   X[rows, 1],
                   c=cmap_color[i], edgecolor='k',
                   label='Class: '+str(i+1))
  mp_plt.xlim(xx1.min(), xx1.max())
  mp_plt.ylim(xx2.min(), xx2.max())
  if(kwargs):
    mp_plt.xlabel(kwargs['x'])
    mp_plt.ylabel(kwargs['y'])
    mp_plt.title(kwargs['title'])
  mp_plt.legend()
  

In [None]:
def get_accuracy(y_test, y_pred):
  """Return the fraction of predictions that equal the true labels.

  Args:
    y_test: true labels (array-like).
    y_pred: predicted labels (array-like) with the same shape as y_test.

  Returns:
    np.float64 between 0 and 1 (nan for empty input).

  Raises:
    ValueError: if the two inputs differ in shape; comparing them anyway would
      broadcast silently and produce a meaningless score.
  """
  y_true = np.asarray(y_test)
  y_hat = np.asarray(y_pred)
  if y_true.shape != y_hat.shape:
    raise ValueError('y_test and y_pred must have the same shape, got %s and %s'
                     % (y_true.shape, y_hat.shape))
  # count the matches in one vectorised call instead of Python's element-wise sum()
  return np.float64(np.count_nonzero(y_true == y_hat)) / np.float64(y_true.size)

def linear_classifier(X_train, y_train, X_test, y_test, label_idx, normalise_data, NUM_CLASSES, NUM_ITER):
  """Fit an SGD logistic classifier on feature columns 2 onward, plot and score it.

  Every repetition draws two decision-region figures over the test samples: one
  grouped by the actual classes (label_idx) and one grouped by the predictions.

  Args:
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.
    label_idx: per-class row positions of the test samples (entry c belongs to
      class c+1); forwarded to make_meshplot for the "actual classes" figure.
    normalise_data: if truthy, standardise both sets with the mean and standard
      deviation of the training features.
    NUM_CLASSES: number of classes; labels are taken to be 1..NUM_CLASSES.
    NUM_ITER: number of fit/plot/evaluate repetitions on the same data.

  Returns:
    (NUM_ITER, 1) array with the test accuracy of every repetition.
  """
  if(normalise_data):
    X_mu = np.mean(X_train, axis=0)
    X_sd = np.sqrt(np.var(X_train, axis=0))
    # a constant feature has zero spread; divide by 1 so it becomes 0 instead of nan/inf
    X_sd = np.where(X_sd == 0, 1.0, X_sd)
    X_train = (X_train - X_mu) / X_sd
    X_test = (X_test - X_mu) / X_sd

  acc = np.empty((NUM_ITER, 1))
  for rep in range(NUM_ITER):
    clf = sklearn_lm.SGDClassifier(loss='log', penalty='none')
    clf.fit(X_train[:,2:], y_train)

    y_pred = clf.predict(X_test[:,2:])

    # label indices for predicted classes; flatnonzero returns an integer index
    # array even for a class that is never predicted, so the plot below cannot
    # fail on an empty float index
    label_idx_pred = [np.flatnonzero(y_pred == c + 1) for c in range(NUM_CLASSES)]

    make_meshplot(X_test[:,2:], label_idx, clf, {'x':'PL', 'y':'PW', 'title':'Decision Boundary with Actual Classes'})
    make_meshplot(X_test[:,2:], label_idx_pred, clf, {'x':'PL', 'y':'PW', 'title':'Decision Boundary with Predicted Classes'})

    acc[rep] = get_accuracy(y_test, y_pred)

  return acc

# Two-feature model on standardised data: label_idx_test supplies the actual
# class membership for the decision-boundary figures. The printed values are the
# mean and the variance of the accuracy over the repetitions.
normalise_data = True
acc = linear_classifier(
    X_train, y_train, X_test, y_test, label_idx_test,
    normalise_data=normalise_data,
    NUM_CLASSES=NUM_CLASSES,
    NUM_ITER=NUM_ITER)
acc_mean = np.mean(acc)
acc_var = np.var(acc)
print(acc_mean, acc_var)