# Naive Bayes

In [1]:
import os
import os.path
import numpy as np
import time
from tqdm import tqdm

# The dataset archives are read from ./data further down (os.listdir), so it
# must be a directory.  An explicit raise is used instead of `assert` because
# asserts are stripped when Python runs with -O and carry no message here.
if not os.path.isdir('data'):
    raise FileNotFoundError("expected a 'data' directory in the current working directory")

In [2]:
def load_dataset(data_source_filename):
    """Load every entry of an ``.npz`` archive into a plain dict.

    Each stored array is converted with ``ndarray.tolist()``: n-d arrays
    become (nested) Python lists and 0-d object arrays are unwrapped to the
    object they hold.  The file name and the elapsed loading time are printed
    to stdout.

    Parameters
    ----------
    data_source_filename : str
        Path of the ``.npz`` file to read.

    Returns
    -------
    dict
        Maps each key of the archive to its ``tolist()``-converted content.
    """
    # basename() honours the platform's path separator, which matters because
    # the paths are assembled with os.path.join() rather than a literal "/".
    print("Loading %s... " % os.path.basename(data_source_filename), end="")
    t = time.time()
    # NOTE(security): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code -- only load archives from a trusted source.
    with np.load(data_source_filename, allow_pickle=True) as source_file:
        dataset = {key: source_file[key].tolist() for key in source_file.files}
    print("done (%.1fs)" % (time.time()-t), flush=True)
    return dataset

# Collect the path of every regular file in 'data' with the '.npz' extension.
# endswith('.npz') requires the dot, so a name that merely ends in the letters
# "npz" is no longer picked up; the listing order of os.listdir() is kept.
data_source_filenames = [
    path
    for path in (os.path.join('data', fn) for fn in os.listdir('data'))
    if path.endswith('.npz') and os.path.isfile(path)
]
data_source_filenames

['data/StClare_facs_danish.npz',
 'data/SDHK_Latin.npz',
 'data/StClare_facs_latin.npz',
 'data/StClare_dipl_danish.npz',
 'data/StClare_dipl_latin.npz',
 'data/velux_facs_danish.npz',
 'data/SDHK_Swedish.npz',
 'data/Colonia.npz',
 'data/SemEval2015.npz',
 'data/velux_dipl_latin.npz',
 'data/velux_facs_latin.npz',
 'data/velux_dipl_danish.npz']

In [3]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB

# Width of the bins the date labels are floored into before classification
# (expressed in the same unit as the 'date' values of the datasets).
bin_width = 25

# Per-file results of each estimator, keyed by the data source filename.
results_multinomial, results_gaussian = {}, {}

## Multinomial NB

In [9]:
# Only files without a stored result are processed, so re-running the cell
# resumes where it stopped (clear results_multinomial to redo everything).
# A plain set difference is used: a symmetric difference would also yield
# result keys that are no longer among the data files.
for data_source_filename in set(data_source_filenames) - set(results_multinomial):
    dataset = load_dataset(data_source_filename)
    # A date given as a tuple is collapsed to its mean.  The builtin `int`
    # replaces the `np.int` alias, which was removed in NumPy 1.24; the cast
    # truncates fractional means.
    y = np.asarray([np.mean(item['date']) if isinstance(item['date'], tuple) else item['date']
                    for item in dataset['data']], dtype=int)
    # Floor every date to the lower edge of its bin; the bins are the classes.
    y = bin_width*(y//bin_width)
    print(" Classes in data: %s" % np.unique(y))

    folds = dataset['folds']
    test_set = folds['test']
    # The model is fitted on the training and validation folds together.
    training_validation_set = list(folds['train']) + list(folds['val'])

    file_results = dict()
    print(" Running estimations", flush=True)
    for feature_set, X in dataset['feature_sets'].items():
        estimator = MultinomialNB()
        estimator.fit(X[training_validation_set, :], y[training_validation_set])
        y_pred = estimator.predict(X[test_set, :])
        y_test = y[test_set]
        file_results[feature_set] = {'accuracy': np.sum(y_test==y_pred)/len(test_set),
                                     'y_pred': y_pred, 'y_test': y_test}
    # Register the file only once all feature sets are done, so that an
    # interrupted run does not mark it as finished with partial results.
    results_multinomial[data_source_filename] = file_results

    print(" Results multinomial")
    for feature_set, result in file_results.items():
        print("  %s, %.1f%%" % (feature_set, 100*result['accuracy']))
        print("   unique predicted classes: %s" % np.unique(result['y_pred']))

In [10]:
# Store the multinomial results inside each source archive, under the key
# 'multinomial_naive_bayes', next to the data they were computed from.
for data_source_filename in results_multinomial:
    dataset = load_dataset(data_source_filename)
    print(" Saving %s... " % os.path.basename(data_source_filename), end="")
    t = time.time()
    # Drop a result stored by an earlier run; otherwise the same keyword
    # would be passed twice to savez_compressed below.
    dataset.pop('multinomial_naive_bayes', None)
    # Write to a temporary file first and swap it in afterwards, so that an
    # interrupted save cannot corrupt the only copy of the dataset.  The
    # '.npz' suffix keeps numpy from appending one to the temporary name.
    tmp_filename = data_source_filename + '.tmp.npz'
    try:
        np.savez_compressed(tmp_filename, **dataset,
                            multinomial_naive_bayes=results_multinomial[data_source_filename])
        os.replace(tmp_filename, data_source_filename)
    finally:
        # Only present if the save failed before the swap.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
    print("done (%.1fs)" % (time.time()-t), flush=True)

Loading StClare_facs_latin.npz... done (0.1s)
 Saving StClare_facs_latin.npz... done (0.4s)
Loading SDHK_Swedish.npz... done (0.8s)
 Saving SDHK_Swedish.npz... done (3.3s)
Loading Colonia.npz... done (7.4s)
 Saving Colonia.npz... done (19.9s)
Loading StClare_facs_danish.npz... done (0.0s)
 Saving StClare_facs_danish.npz... done (0.2s)
Loading SemEval2015.npz... done (0.3s)
 Saving SemEval2015.npz... done (1.4s)
Loading velux_dipl_latin.npz... done (0.1s)
 Saving velux_dipl_latin.npz... done (0.4s)
Loading StClare_dipl_latin.npz... done (0.1s)
 Saving StClare_dipl_latin.npz... done (0.3s)
Loading velux_facs_latin.npz... done (0.1s)
 Saving velux_facs_latin.npz... done (0.4s)
Loading StClare_dipl_danish.npz... done (0.0s)
 Saving StClare_dipl_danish.npz... done (0.1s)
Loading velux_dipl_danish.npz... done (0.0s)
 Saving velux_dipl_danish.npz... done (0.1s)
Loading velux_facs_danish.npz... done (0.0s)
 Saving velux_facs_danish.npz... done (0.1s)
Loading SDHK_Latin.npz... done (1.7s)
 Savi

## Gaussian NB

In [8]:
# Only files without a stored result are processed, so re-running the cell
# resumes where it stopped (clear results_gaussian to redo everything).
# A plain set difference is used: a symmetric difference would also yield
# result keys that are no longer among the data files.
for data_source_filename in set(data_source_filenames) - set(results_gaussian):
    dataset = load_dataset(data_source_filename)
    # A date given as a tuple is collapsed to its mean.  The builtin `int`
    # replaces the `np.int` alias, which was removed in NumPy 1.24; the cast
    # truncates fractional means.
    y = np.asarray([np.mean(item['date']) if isinstance(item['date'], tuple) else item['date']
                    for item in dataset['data']], dtype=int)
    # Floor every date to the lower edge of its bin; the bins are the classes.
    y = bin_width*(y//bin_width)
    print(" Classes in data: %s" % np.unique(y))

    folds = dataset['folds']
    test_set = folds['test']
    # The model is fitted on the training and validation folds together.
    training_validation_set = list(folds['train']) + list(folds['val'])

    file_results = dict()
    print(" Running estimations", flush=True)
    for feature_set, X in dataset['feature_sets'].items():
        X_train = X[training_validation_set, :]
        # GaussianNB needs dense input: estimate the size of the training
        # matrix at 8 bytes per cell and skip feature sets of 10 GB or more.
        if np.prod(X_train.shape)*8 >= 10e9:
            print("  skipping %s: dense training matrix would exceed 10 GB" % feature_set)
            continue
        estimator = GaussianNB()
        # toarray() returns a plain ndarray; todense() returns np.matrix,
        # which current scikit-learn versions refuse as input.
        # NOTE(review): assumes X is a scipy.sparse matrix -- confirm.
        estimator.fit(X_train.toarray(), y[training_validation_set])
        y_pred = estimator.predict(X[test_set, :].toarray())
        y_test = y[test_set]
        file_results[feature_set] = {'accuracy': np.sum(y_test==y_pred)/len(test_set),
                                     'y_pred': y_pred, 'y_test': y_test}
    # Register the file only once all feature sets are done, so that an
    # interrupted run does not mark it as finished with partial results.
    results_gaussian[data_source_filename] = file_results

    print(" Results gaussian")
    for feature_set, result in file_results.items():
        print("  %s, %.1f%%" % (feature_set, 100*result['accuracy']))
        print("   unique predicted classes: %s" % np.unique(result['y_pred']))

In [11]:
# Store the gaussian results inside each source archive, under the key
# 'gaussian_naive_bayes', next to the data they were computed from.
for data_source_filename in results_gaussian:
    dataset = load_dataset(data_source_filename)
    print(" Saving %s... " % os.path.basename(data_source_filename), end="")
    t = time.time()
    # Drop a result stored by an earlier run; otherwise the same keyword
    # would be passed twice to savez_compressed below.
    dataset.pop('gaussian_naive_bayes', None)
    # Write to a temporary file first and swap it in afterwards, so that an
    # interrupted save cannot corrupt the only copy of the dataset.  The
    # '.npz' suffix keeps numpy from appending one to the temporary name.
    tmp_filename = data_source_filename + '.tmp.npz'
    try:
        np.savez_compressed(tmp_filename, **dataset,
                            gaussian_naive_bayes=results_gaussian[data_source_filename])
        os.replace(tmp_filename, data_source_filename)
    finally:
        # Only present if the save failed before the swap.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
    print("done (%.1fs)" % (time.time()-t), flush=True)

Loading StClare_facs_latin.npz... done (0.1s)
 Saving StClare_facs_latin.npz... done (0.4s)
Loading SDHK_Swedish.npz... done (0.8s)
 Saving SDHK_Swedish.npz... done (3.2s)
Loading Colonia.npz... done (7.7s)
 Saving Colonia.npz... done (19.6s)
Loading StClare_facs_danish.npz... done (0.0s)
 Saving StClare_facs_danish.npz... done (0.1s)
Loading SemEval2015.npz... done (0.3s)
 Saving SemEval2015.npz... done (1.3s)
Loading velux_dipl_latin.npz... done (0.1s)
 Saving velux_dipl_latin.npz... done (0.4s)
Loading StClare_dipl_latin.npz... done (0.1s)
 Saving StClare_dipl_latin.npz... done (0.3s)
Loading velux_facs_latin.npz... done (0.1s)
 Saving velux_facs_latin.npz... done (0.4s)
Loading StClare_dipl_danish.npz... done (0.0s)
 Saving StClare_dipl_danish.npz... done (0.1s)
Loading velux_dipl_danish.npz... done (0.0s)
 Saving velux_dipl_danish.npz... done (0.1s)
Loading velux_facs_danish.npz... done (0.0s)
 Saving velux_facs_danish.npz... done (0.1s)
Loading SDHK_Latin.npz... done (1.7s)
 Savi