In [1]:
def train_test(clf, data):
        accu_train = []
        accu_test = []
        loss_train = []
        loss_test = []    
        t_last = []  
        train_X, train_Y, test_X, test_Y = data['train_X'], data['train_Y'], data['test_X'], data['test_Y']
        
        for j in range(0, repeat_times):
            t_begin = time.time()

            clf.fit(train_X, train_Y)           

            predict_train_Y = clf.predict(train_X)
            predict_test_Y = clf.predict(test_X)        
            t_last.append(time.time() - t_begin)

            posterior_train_Y = clf.predict_proba(train_X)
            posterior_test_Y = clf.predict_proba(test_X)

            accu_train.append(accuracy_score(predict_train_Y, train_Y))
            accu_test.append(accuracy_score(predict_test_Y, test_Y))

            loss_train.append(log_loss(train_Y, posterior_train_Y, normalize=True))
            loss_test.append(log_loss(test_Y, posterior_test_Y, normalize=True))

        cnf_matrix_train = confusion_matrix(predict_train_Y, data['train_Y'])
        print('[INFO] confusion matrix', cnf_matrix_train)
       
        cnf_matrix_test = confusion_matrix(predict_test_Y, data['test_Y'])         
        print('[INFO] confusion matrix', cnf_matrix_test)      

        accu_train = round(np.mean(accu_train), 4)
        accu_test = round(np.mean(accu_test), 4)
        loss_train = round(np.mean(loss_train), 4)
        loss_test = round(np.mean(loss_test), 4)
        t_last = round(np.mean(t_last)*1000, 4)
        
        ''' report train and test error '''
        print('Average training data accuracy:', accu_train)
        print('Average testing data accuracy:', accu_test)

        ''' report train and test log loss'''
        print('Average training data log loss:', loss_train)
        print('Average testing data log loss:', loss_test) 
        print('Average Time ms', t_last)
        
        return accu_train, accu_test, loss_train, loss_test, t_last, cnf_matrix_train, cnf_matrix_test

In [2]:
import numpy as np
import matplotlib.pyplot as plt

classes = np.array([0, 1])

title = ['breast-cancer', 'diabetes', 'digit', 'iris', 'wine']
plot_args = [{'c': 'red', 'linestyle': '-'},
             {'c': 'blue', 'linestyle': '-'}]
mode = ['Training data set', 'Testing data set']


def plot_performance(loss, accu_train, accu_test, i):
    
#     ax = plt.subplot(2, 3, i+1)
#     epoh = np.linspace(1,len(loss),len(loss))
#     ax.plot(epoh, loss, 'm-')
            
#     plt.title('Performance over time on '+title[i])
#     plt.ylabel('Log Loss') 
#     plt.xlim(-1,np.max(epoh))
#     plt.xlabel('Training Epochs')
#     plt.rc('xtick', labelsize=10)
#     plt.rc('ytick', labelsize=10)
    
    
    ax = plt.subplot(2, 3, i+1)
    epoh = np.linspace(1,len(accu_train),len(accu_train))
    ax.plot(epoh, accu_train, 'r-')
    ax.plot(epoh, accu_test, 'b-')
            
    plt.title('Performance over time on '+title[i])
    plt.ylabel('Classification Accuracy') 
    plt.xlim(-1,np.max(epoh))
    plt.xlabel('Training Epochs')
    plt.rc('xtick', labelsize=10)
    plt.rc('ytick', labelsize=10)
    fig.legend(ax.get_lines(), mode, ncol=2, loc="upper center")                 
                       
    
def partial_train_test(clf, data):
        loss = []
        accu_train = []
        accu_test = []
        t_last = []             
        train_X, train_Y, test_X, test_Y = data['train_X'], data['train_Y'], data['test_X'], data['test_Y']
#         mini_batch_num = batch_num
        
        for i in range(1, iters):        
            for X, Y in zip(np.array_split(train_X, batch_num), np.array_split(train_Y, batch_num)):
                t_begin = time.time()
    #             clf.fit(X, Y)
                clf.partial_fit(X, Y, classes=classes)           
            
                # loss 
                loss.append(log_loss(train_Y, clf.predict(train_X), normalize=True))            

                # accuracy on train & test data
                accu_train.append(clf.score(train_X, train_Y))
                accu_test.append(clf.score(test_X, test_Y))

                # time required for learning and testing
                t_last.append(time.time() - t_begin)            
                        
#             iterations.append(clf.n_iter_)
        print(clf.score(train_X, train_Y)*100)
        print(log_loss(train_Y, clf.predict(train_X), normalize=True))
        print(clf.score(test_X, test_Y)*100)
        print(log_loss(test_Y, clf.predict(test_X), normalize=True))
        print(np.sum(t_last)*1000, 'ms')
           
        return loss, accu_train, accu_test, t_last

In [3]:
import numpy as np
import csv
def output_csv(result):
    fieldnames = result[0,:]
    with open('experiment_result/log_reg.csv', 'w') as csvfile:   
        resultwriter = csv.DictWriter(csvfile, fieldnames=fieldnames)
        resultwriter.writeheader()
        for i in range(1, result.shape[0]):
            dict = {}
            for j in range(result.shape[1]):
                 dict[result[0,j]] = result[i,j]
            resultwriter.writerow(dict)


In [4]:
import itertools
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


In [None]:
# Module 1: setting the parameters
m_i = 1000
c = 0.001
p = "l2"
eta = 0.01
learning_rate='constant'
repeat_times = 100
file_count = 5
class_names = ['0', '1']
batch_num = 20
iters = 30

In [None]:
# Module 2: this module could help you to train and test a classifier

import time

import numpy as np
import matplotlib.pyplot as plot

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

# Create a linear classifier
clf = SGDClassifier(loss="log", penalty=p, max_iter=m_i, tol=c, 
                    learning_rate=learning_rate, eta0=eta, verbose=False)
clfs = []

input_data_filename = ['breast-cancer', 'diabetes', 
                       'digit', 'iris', 
                       'wine']
result = np.array([['Dataset', '$m\_i$', '$c$', '$\eta$', '$p$', '$a_{train}(\%)$', 
                    '$a_{test}(\%)$', '$l_{train}$', '$l_{test}$', '$time(ms)$']])

fig, axes=plt.subplots(2, 5, figsize=(15, 10))

for i in range(0, file_count):
    print('Reading data from: ', input_data_filename[i])
    data = np.load('datasets/' + input_data_filename[i] + '.npz')

    accu_train, accu_test, loss_train, loss_test, t_last, cnf_matrix_train, cnf_matrix_test = train_test(clf, data)
    
    ''' report parameters '''
    print('m_i', m_i, 'c', c, 'learning rate', eta, 'p', p, 't', repeat_times, 
          'a_train', accu_train*100, 'a_test', accu_test*100, 'loss_train', loss_train, 'loss_test', loss_test, 
          'time=train+test', t_last, 'ms')
    
    plt.subplot(2,5,i+1)
    plot_confusion_matrix(cnf_matrix_train, classes=class_names, normalize=False, title=input_data_filename[i]+'_train')
    plt.subplot(2,5,5+i+1)
    plot_confusion_matrix(cnf_matrix_test, classes=class_names, normalize=False, title=input_data_filename[i]+'_test')
    
#     print('auc', compute_auc(data['train_Y'], posterior_train_Y[:,1]))
    
    newrow = np.array([[input_data_filename[i], m_i, c, eta, p, accu_train*100, accu_test*100, loss_train, loss_test, t_last]])
    result = np.append(result, newrow, axis=0)
    
output_csv(result)
plt.savefig('log_confusion_matrix.eps', dpi=600)


Reading data from:  breast-cancer
[INFO] confusion matrix [[178   7]
 [ 13 349]]
[INFO] confusion matrix [[45  2]
 [ 3 86]]
Average training data accuracy: 0.9657
Average testing data accuracy: 0.9632
Average training data log loss: 0.0916
Average testing data log loss: 0.078
Average Time ms 1.1655
m_i 1000 c 0.001 learning rate 0.01 p l2 t 100 a_train 96.57 a_test 96.32 loss_train 0.0916 loss_test 0.078 time=train+test 1.1655 ms
Confusion matrix, without normalization
[[178   7]
 [ 13 349]]
Confusion matrix, without normalization
[[45  2]
 [ 3 86]]
Reading data from:  diabetes
[INFO] confusion matrix [[117  47]
 [ 97 354]]
[INFO] confusion matrix [[34 13]
 [20 86]]
Average training data accuracy: 0.7652
Average testing data accuracy: 0.7822
Average training data log loss: 0.4821
Average testing data log loss: 0.5043
Average Time ms 1.2469
m_i 1000 c 0.001 learning rate 0.01 p l2 t 100 a_train 76.52 a_test 78.22 loss_train 0.4821 loss_test 0.5043 time=train+test 1.2469 ms
Confusion mat

In [None]:
# Module 3: measuring model's performance overtimes

import time

import numpy as np
import matplotlib.pyplot as plot

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

# Create a linear classifier
input_data_filename = ['breast-cancer', 'diabetes', 
                       'digit', 'iris', 
                       'wine']

fig, axes=plt.subplots(2, 3, figsize=(15, 10))

for i in range(0, file_count):
    print('Reading data from: ', input_data_filename[i])
    data = np.load('datasets/' + input_data_filename[i] + '.npz')

    clf = SGDClassifier(loss="log", penalty=p, max_iter=m_i, tol=c, power_t=0.5, 
                        learning_rate=learning_rate, eta0=eta, verbose=False)    
    
    loss, accu_train, accu_test, t_last = partial_train_test(clf, data)
    
    plot_performance(loss, accu_train, accu_test, i)
    
plt.savefig('performance_accuracy.eps', dpi=600)
plt.show()