In [1]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from pprint import pprint

%load_ext autoreload
%autoreload 1

# Experiment pipeline

In [41]:
from gradientdescent import max_iters

import adaline
import naivebayes

experiments = [
    {
        'dataset': 'trec2007',
        'dataset_filename': 'trec2007-1607061515',
        'feature_extraction_parameters': {
        },
        'labels': {
            'ham_label': -1,
            'spam_label': 1,
        },
        'attack': None,
        'attack_parameters': {
        },
        'classifier': adaline,
        'training_parameters': {
            'learning_rate': 0.15,
            'initial_weights': None,
            'termination_condition': max_iters(50),
            'verbose': False,
        },
        'testing_parameters': {
        },
    },
    
    {
        'dataset': 'trec2007',
        'dataset_filename': 'trec2007-1607061515',
        'feature_extraction_parameters': {
        },
        'labels': {
            'ham_label': -1,
            'spam_label': 1,
        },
        'attack': None,
        'attack_parameters': {
        },
        'classifier': naivebayes,
        'training_parameters': {
        },
        'testing_parameters': {
        },
    },
]

def process_experiment_declaration(experiment):
    
    ham_label = experiment['labels']['ham_label']
    experiment['training_parameters']['ham_label'] = ham_label
    experiment['testing_parameters' ]['ham_label'] = ham_label
    
    if not experiment['attack']:
        def no_attack(x, **kwargs):
            return x
        experiment['attack'] = no_attack
    return experiment

experiments = list(map(process_experiment_declaration, experiments))

pprint(list(experiments), width=1)

[{'attack': <function process_experiment_declaration.<locals>.no_attack at 0x7f50ca282598>,
  'attack_parameters': {},
  'classifier': <module 'adaline' from '../adaline.py'>,
  'dataset': 'trec2007',
  'dataset_filename': 'trec2007-1607061515',
  'feature_extraction_parameters': {},
  'labels': {'ham_label': -1,
             'spam_label': 1},
  'testing_parameters': {'ham_label': -1},
  'training_parameters': {'ham_label': -1,
                          'initial_weights': None,
                          'learning_rate': 0.15,
                          'termination_condition': <function max_iters.<locals>.<lambda> at 0x7f50ca282378>,
                          'verbose': False}},
 {'attack': <function process_experiment_declaration.<locals>.no_attack at 0x7f50ca2820d0>,
  'attack_parameters': {},
  'classifier': <module 'naivebayes' from '../naivebayes.py'>,
  'dataset': 'trec2007',
  'dataset_filename': 'trec2007-1607061515',
  'feature_extraction_parameters': {},
  'labels': {'ham_labe

## choose experiment

In [42]:
experiment = experiments[1]
experiment

{'attack': <function __main__.process_experiment_declaration.<locals>.no_attack>,
 'attack_parameters': {},
 'classifier': <module 'naivebayes' from '../naivebayes.py'>,
 'dataset': 'trec2007',
 'dataset_filename': 'trec2007-1607061515',
 'feature_extraction_parameters': {},
 'labels': {'ham_label': -1, 'spam_label': 1},
 'testing_parameters': {'ham_label': -1},
 'training_parameters': {'ham_label': -1}}

## load data

In [31]:
import pickle

with open('../../datasets/processed/%s-features.dat' % experiment['dataset_filename'], 'rb') as infile:
    X = pickle.load(infile)

with open('../../datasets/processed/%s-labels.dat' % experiment['dataset_filename'], 'rb') as infile:
    Y = pickle.load(infile)

N, D = X.shape

print('X is a %s \t %s' % (X.shape, type(X)))
print('Y is a %s \t %s' % (Y.shape, type(Y)))


X is a (75419, 10000) 	 <class 'numpy.ndarray'>
Y is a (75419, 1) 	 <class 'numpy.matrixlib.defmatrix.matrix'>


## split into test and training sets

In [32]:
add_bias = lambda x: np.insert(x, 0, values=1, axis=1) # add bias term
convert_labels = lambda y: y*2 - 1

In [34]:
permutated_indices = np.random.permutation(N)
X = X[permutated_indices]
Y = Y[permutated_indices]

N_train = int(np.round(N * 0.5))
X_train = X[:N_train]
Y_train = Y[:N_train]
X_test = X[N_train:]
Y_test = Y[N_train:]

if experiment['classifier'] != 'naive bayes':
    X_train, X_test = map(add_bias, [X_train, X_test])
if experiment['labels']['ham_label'] == -1:
    Y_train, Y_test = map(convert_labels, [Y_train, Y_test])

print(X_train.shape)
print(Y_train.shape)
print()
print(X_test.shape)
print(Y_test.shape)

(37710, 10001)
(37710, 1)

(37709, 10001)
(37709, 1)


## apply attack

In [35]:
X_train = experiment['attack'](X_train, **experiment['attack_parameters'])
X_test  = experiment['attack'](X_test,  **experiment['attack_parameters'])

## training

In [43]:
model_parameters = experiment['classifier'].train(features=X_train, labels=Y_train, **experiment['training_parameters'])
O_train = experiment['classifier'].test(parameters=model_parameters, features=X_train, **experiment['testing_parameters'])

## testing

In [44]:
O_test = experiment['classifier'].test(parameters=model_parameters, features=X_test, **experiment['testing_parameters'])

## performance

In [45]:
import performance

## error
error_train = performance.get_error(Y_train, O_train)
error_test  = performance.get_error(Y_test,  O_test)

## False Positive Rate (=fall-out) ~also called false alarm rate
FPR = performance.get_FPR(Y_test, O_test, **experiment['labels'])

## False Negative Rate ~miss rate
FNR = performance.get_FNR(Y_test, O_test, **experiment['labels'])

In [46]:
print('error training set:\t%.3f' % error_train)
print('error testing  set:\t%.3f' % error_test )
print('false positive rate:\t%.3f' % FPR)
print('false negative rate:\t%.3f' % FNR)

error training set:	0.041
error testing  set:	0.042
false positive rate:	0.031
false negative rate:	0.048
