# SVM-RBF

In [1]:
import os
import os.path
import numpy as np
import time
from tqdm import tqdm

assert os.path.exists('data')

In [2]:
def load_dataset(data_source_filename):
  print("Loading %s... " % data_source_filename.split("/")[-1], end="")
  t = time.time()
  dataset = dict()
  with np.load(data_source_filename, allow_pickle=True) as source_file:
    for key in source_file.keys():
      # print(key)
      dataset[key] = source_file[key].tolist()
  print("done (%.1fs)" % (time.time()-t), flush=True)
  return dataset

data_source_filenames = [os.path.join('data', fn) for fn in os.listdir('data')
                            if os.path.isfile(os.path.join('data', fn)) and fn[-3:]=='npz']
data_source_filenames

['data/StClare_facs_danish.npz',
 'data/SDHK_Latin.npz',
 'data/StClare_facs_latin.npz',
 'data/StClare_dipl_danish.npz',
 'data/StClare_dipl_latin.npz',
 'data/velux_facs_danish.npz',
 'data/SDHK_Swedish.npz',
 'data/Colonia.npz',
 'data/SemEval2015.npz',
 'data/velux_dipl_latin.npz',
 'data/velux_facs_latin.npz',
 'data/velux_dipl_danish.npz']

In [3]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import ParameterGrid

bin_width = 25
n_features_max = 1000
n_samples_limit = 3000

results = dict()

In [6]:
for data_source_filename in set(data_source_filenames).difference(set(results.keys())):
#for data_source_filename in data_source_filenames:
    dataset = load_dataset(data_source_filename)
    y = np.asarray([np.mean(item['date']) if type(item['date']) is tuple else item['date'] for item in dataset['data']], np.int)
    y = bin_width*(y//bin_width)
    print(" Classes in data: %s" % np.unique(y))
    
    training_set = dataset['folds']['train']
    validation_set = dataset['folds']['val']
    test_set = dataset['folds']['test']
    training_validation_set = list()
    training_validation_set.extend(training_set)
    training_validation_set.extend(validation_set)

    # Subsample
    if len(training_validation_set) > n_samples_limit:
        print(" Training data too large (%i), subsampling to %i" % (len(training_validation_set), n_samples_limit))
        training_validation_set = np.asarray(training_validation_set)[np.linspace(0, len(training_validation_set), n_samples_limit, endpoint=False, dtype=np.int)]

    results[data_source_filename] = dict()
    print(" Running estimations", flush=True)
    for feature_set in list(dataset['feature_sets'].keys()):
        X = dataset['feature_sets'][feature_set]
        print("  Loaded feature set %s" % feature_set)
        n_samples, n_features = X.shape
        y_pred = np.zeros(y.shape)
        # Chi^2 feature selection
        if n_features > n_features_max:
            print("   Reducing feature space")
            given_data = list()
            given_data.extend(training_set)
            given_data.extend(validation_set)
            feature_selector = SelectKBest(chi2, k=n_features_max).fit(X[given_data], y[given_data])
            X = feature_selector.transform(X)
            n_features = X.shape[1]
        # Zero mean
        print("   Rescaling")
        X = X.tocsc(copy=True).astype(np.float)
        for i in range(X.shape[1]):
            if X.indptr[i] < X.indptr[i+1]:
                X.data[X.indptr[i]:X.indptr[i+1]] -= np.mean(X.data[X.indptr[i]:X.indptr[i+1]])
        X = StandardScaler(with_mean=False).fit_transform(X)
        X = X.tocsr(copy=True)
        # Set up parameter grid
        param_grid = {'C': np.logspace(-12, 5, 10), 'gamma': np.logspace(-10, 0, 10)}
        param_list = list(ParameterGrid(param_grid))       
        # Evaluate hyper-parameters
        print(" Making the estimator", flush=True)
        validation_accuracy = list()
        for params in tqdm(param_list, desc="  Searching for hyper-parameters"):
            estimator = SVC(class_weight='balanced', **params)
            estimator.fit(X[training_set, :], y[training_set])
            validation_accuracy.append(accuracy_score(y[validation_set], estimator.predict(X[validation_set, :]).ravel()))
        # Pick best hyperparameters and run with training and validation data
        print("  Best hyper-parameters:", param_list[np.argmax(validation_accuracy)])
        estimator = SVC(class_weight='balanced', **param_list[np.argmax(validation_accuracy)])
        estimator.fit(X[training_validation_set, :], y[training_validation_set])
        # Predict
        y_pred = estimator.predict(X[test_set, :])
        results[data_source_filename][feature_set] = {'accuracy': accuracy_score(y[test_set], y_pred), 
                                                      'y_pred': y_pred, 'y_true': y[test_set]}

    print()
    print(" Results")
    for feature_set in results[data_source_filename].keys():
        print("  %s, %.1f%%" % (feature_set, 100*results[data_source_filename][feature_set]['accuracy']))
        print("  unique predicted classes: %s" % np.unique(results[data_source_filename][feature_set]['y_pred']))

    def class_counter(data):
        count = dict()
        for e in data:
            if e in count:
                count[e] += 1
            else:
                count[e] = 1
        return count

    count = class_counter(y)
    print(" Controls")
    print("  MLE choice: %.1f%%" % (100*np.max(list(count.values()))/len(y)))
    print("  Uniform: %.1f%%" % (100*np.mean([np.sum(np.random.choice(np.unique(y), size=len(y), replace=True)==y)/len(y) for i in range(1000)])))
    print("  Weighted: %.1f%%" % (100*np.mean([np.sum(np.random.choice(y, size=len(y), replace=False)==y)/len(y) for i in range(1000)])))

In [5]:
for data_source_filename in results.keys():
    dataset = load_dataset(data_source_filename)
    print(" Saving %s... " % data_source_filename.split("/")[-1], end="")
    t = time.time()
    if 'svc' in dataset.keys():
        dataset.pop('svc')
    np.savez_compressed(data_source_filename, **dataset, svc=results[data_source_filename])
    print("done (%.1fs)" % (time.time()-t), flush=True)

Loading StClare_dipl_latin.npz... done (0.1s)
 Saving StClare_dipl_latin.npz... done (0.3s)
Loading SemEval2015.npz... done (0.2s)
 Saving SemEval2015.npz... done (1.1s)
Loading SDHK_Latin.npz... done (1.5s)
 Saving SDHK_Latin.npz... done (6.7s)
Loading SDHK_Swedish.npz... done (0.5s)
 Saving SDHK_Swedish.npz... done (2.7s)
Loading StClare_facs_danish.npz... done (0.0s)
 Saving StClare_facs_danish.npz... done (0.1s)
Loading Colonia.npz... done (5.4s)
 Saving Colonia.npz... done (16.3s)
Loading velux_dipl_latin.npz... done (0.1s)
 Saving velux_dipl_latin.npz... done (0.3s)
Loading velux_dipl_danish.npz... done (0.0s)
 Saving velux_dipl_danish.npz... done (0.1s)
Loading StClare_dipl_danish.npz... done (0.0s)
 Saving StClare_dipl_danish.npz... done (0.1s)
Loading velux_facs_danish.npz... done (0.0s)
 Saving velux_facs_danish.npz... done (0.1s)
Loading velux_facs_latin.npz... done (0.1s)
 Saving velux_facs_latin.npz... done (0.4s)
Loading StClare_facs_latin.npz... done (0.1s)
 Saving StCla