# SVM

In [1]:
import os
import os.path
import numpy as np
import time
from tqdm import tqdm

assert os.path.exists('data')

In [2]:
def load_dataset(data_source_filename):
  print("Loading %s... " % data_source_filename.split("/")[-1], end="")
  t = time.time()
  dataset = dict()
  with np.load(data_source_filename, allow_pickle=True) as source_file:
    for key in source_file.keys():
      # print(key)
      dataset[key] = source_file[key].tolist()
  print("done (%.1fs)" % (time.time()-t), flush=True)
  return dataset

data_source_filenames = [os.path.join('data', fn) for fn in os.listdir('data')
                            if os.path.isfile(os.path.join('data', fn)) and fn[-3:]=='npz']
data_source_filenames

['data/StClare_facs_danish.npz',
 'data/SDHK_Latin.npz',
 'data/StClare_facs_latin.npz',
 'data/StClare_dipl_danish.npz',
 'data/StClare_dipl_latin.npz',
 'data/velux_facs_danish.npz',
 'data/SDHK_Swedish.npz',
 'data/Colonia.npz',
 'data/SemEval2015.npz',
 'data/velux_dipl_latin.npz',
 'data/velux_facs_latin.npz',
 'data/velux_dipl_danish.npz']

In [3]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

bin_width = 25
n_features_max = 1000

results = dict()

In [4]:
for data_source_filename in set(data_source_filenames).symmetric_difference(set(results.keys())):
#for data_source_filename in data_source_filenames:
    dataset = load_dataset(data_source_filename)
    y = np.asarray([np.mean(item['date']) if type(item['date']) is tuple else item['date'] for item in dataset['data']], np.int)
    y = bin_width*(y//bin_width)
    print(" Classes in data: %s" % np.unique(y))
    
    training_set = dataset['folds']['train']
    validation_set = dataset['folds']['val']
    test_set = dataset['folds']['test']
    training_validation_set = list()
    training_validation_set.extend(training_set)
    training_validation_set.extend(validation_set)
    
    results[data_source_filename] = dict()
    print(" Running estimations", flush=True)
    for feature_set in list(dataset['feature_sets'].keys()):
        X = dataset['feature_sets'][feature_set]
        print("  Loaded feature set %s" % feature_set)
        n_samples, n_features = X.shape
        y_pred = np.zeros(y.shape)
        
        # Chi^2 feature selection
        if n_features > n_features_max:
            print("   Reducing feature space")
            feature_selector = SelectKBest(chi2, k=n_features_max).fit(X[training_validation_set], y[training_validation_set])
            X = feature_selector.transform(X)
            n_features = X.shape[1]
        # Zero mean
        X = X.tocsc(copy=True).astype(np.float)
        for i in range(X.shape[1]):
            if X.indptr[i] < X.indptr[i+1]:
                X.data[X.indptr[i]:X.indptr[i+1]] -= np.mean(X.data[X.indptr[i]:X.indptr[i+1]])
        X = StandardScaler(with_mean=False).fit_transform(X)
        X = X.tocsr(copy=True)

        C = np.logspace(-10, 5, 20)
        validation_accuracy = list()
        print(" Making the estimator", flush=True)
        for c in tqdm(C, desc="  Searching for regularisation parameter C"):
            estimator = LinearSVC(dual=(n_samples<=n_features), class_weight='balanced', C=c)
            estimator.fit(X[training_set, :], y[training_set])
            validation_accuracy.append(accuracy_score(y[validation_set], estimator.predict(X[validation_set, :]).ravel()))
        estimator = LinearSVC(dual=(n_samples<=n_features), 
                              class_weight='balanced', C=C[np.argmax(validation_accuracy)])
        print("  Best C:", C[np.argmax(validation_accuracy)])
        estimator.fit(X[training_validation_set, :], y[training_validation_set])
        y_pred = estimator.predict(X[test_set, :])
        results[data_source_filename][feature_set] = {'accuracy': accuracy_score(y[test_set], y_pred), 
                                                      'y_pred': y_pred, 'y_true': y[test_set]}

    print()
    print(" Results")
    for feature_set in results[data_source_filename].keys():
        print("  %s, %.1f%%" % (feature_set, 100*results[data_source_filename][feature_set]['accuracy']))
        print("  unique predicted classes: %s" % np.unique(results[data_source_filename][feature_set]['y_pred']))

    def class_counter(data):
        count = dict()
        for e in data:
            if e in count:
                count[e] += 1
            else:
                count[e] = 1
        return count

    count = class_counter(y)
    print(" Controls")
    print("  MLE choice: %.1f%%" % (100*np.max(list(count.values()))/len(y)))
    print("  Uniform: %.1f%%" % (100*np.mean([np.sum(np.random.choice(np.unique(y), size=len(y), replace=True)==y)/len(y) for i in range(1000)])))
    print("  Weighted: %.1f%%" % (100*np.mean([np.sum(np.random.choice(y, size=len(y), replace=False)==y)/len(y) for i in range(1000)])))

Loading velux_dipl_latin.npz... done (0.1s)
 Classes in data: [1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500 1525]
 Running estimations
  Loaded feature set bow_words
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 29.11it/s] 

  Best C: 6.158482110660255e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 40.48it/s]

  Best C: 0.007847599703514624
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:02<00:00,  7.21it/s]

  Best C: 0.00020691380811147902
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 29.00it/s]


  Best C: 0.00020691380811147902
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 33.41it/s] 

  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 84.04it/s] 


  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 28.26it/s]


  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 28.45it/s]

  Best C: 6.158482110660255e-10

 Results
  bow_words, 19.7%
  unique predicted classes: [1300]
  tfidf_words, 22.5%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  character_ngram_1, 33.8%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500]
  character_ngram_2, 49.3%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1450 1500]
  character_ngram_3, 49.3%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1425 1500]
  word_ngram_1, 39.4%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  word_ngram_2, 29.6%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  word_ngram_3, 21.1%
  unique predicted classes: [1250 1275 1300 1325 1425 1475 1500]
 Controls
  MLE choice: 19.3%
  Uniform: 8.3%
  Weighted: 14.6%
Loading StClare_dipl_danish.npz... done (0.0s)
 Classes in data: [1400 1425 1450 1475 1500 1525 1550 1575]
 Running estimations
  Loaded feature set bow_words
   Reducing featur


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 86.92it/s] 

  Best C: 1e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 690.50it/s]

  Best C: 0.00020691380811147902
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 73.11it/s]

  Best C: 1e-10
  Loaded feature set character_ngram_2
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 232.77it/s]

  Best C: 3.359818286283781e-05
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 324.22it/s]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 669.11it/s]

  Best C: 0.04832930238571752
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 284.51it/s]


  Best C: 1e-10
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 78.95it/s] 

  Best C: 0.2976351441631313

 Results
  bow_words, 15.8%
  unique predicted classes: [1450]
  tfidf_words, 10.5%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  character_ngram_1, 26.3%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  character_ngram_2, 52.6%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  character_ngram_3, 42.1%
  unique predicted classes: [1425 1450 1475 1500 1525 1550]
  word_ngram_1, 5.3%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1575]
  word_ngram_2, 10.5%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  word_ngram_3, 10.5%
  unique predicted classes: [1400 1450 1475]
 Controls
  MLE choice: 17.3%
  Uniform: 12.4%
  Weighted: 14.0%
Loading velux_dipl_danish.npz... done (0.0s)
 Classes in data: [1400 1425 1450 1475 1500 1525 1550 1575]
 Running estimations
  Loaded feature set bow_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 87.59it/s] 

  Best C: 1e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 737.22it/s]

  Best C: 0.00020691380811147902
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 56.56it/s] 

  Best C: 0.0012742749857031347
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 174.68it/s]

  Best C: 0.00020691380811147902
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 335.45it/s]

  Best C: 1e-10
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 620.56it/s]


  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 646.05it/s]

  Best C: 0.2976351441631313
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 130.53it/s]


  Best C: 1e-10

 Results
  bow_words, 15.8%
  unique predicted classes: [1450]
  tfidf_words, 5.3%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550]
  character_ngram_1, 21.1%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  character_ngram_2, 47.4%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  character_ngram_3, 36.8%
  unique predicted classes: [1425 1450 1475 1500 1525 1550 1575]
  word_ngram_1, 10.5%
  unique predicted classes: [1400 1425 1450 1475 1500 1525]
  word_ngram_2, 21.1%
  unique predicted classes: [1425 1450 1475 1500 1525]
  word_ngram_3, 21.1%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550]
 Controls
  MLE choice: 17.3%
  Uniform: 12.5%
  Weighted: 13.8%
Loading velux_facs_danish.npz... done (0.0s)
 Classes in data: [1400 1425 1450 1475 1500 1525 1550 1575]
 Running estimations
  Loaded feature set bow_words
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 86.10it/s] 

  Best C: 1e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 705.70it/s]

  Best C: 0.007847599703514624
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 518.20it/s]

  Best C: 1e-10
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 296.54it/s]

  Best C: 1e-10
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 298.88it/s]


  Best C: 0.007847599703514624
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 496.63it/s]


  Best C: 1e-10
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 145.44it/s]


  Best C: 0.007847599703514624
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 104.73it/s]

  Best C: 0.0012742749857031347

 Results
  bow_words, 15.8%
  unique predicted classes: [1425]
  tfidf_words, 5.3%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1575]
  character_ngram_1, 42.1%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  character_ngram_2, 31.6%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550]
  character_ngram_3, 21.1%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550]
  word_ngram_1, 5.3%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  word_ngram_2, 10.5%
  unique predicted classes: [1400 1425 1450 1475 1525]
  word_ngram_3, 21.1%
  unique predicted classes: [1425 1450 1475]
 Controls
  MLE choice: 17.3%





  Uniform: 12.6%
  Weighted: 14.2%
Loading StClare_dipl_latin.npz... done (0.1s)
 Classes in data: [1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500 1525]
 Running estimations
  Loaded feature set bow_words
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 36.38it/s] 

  Best C: 2.3357214690901214e-08
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 49.10it/s] 

  Best C: 11.288378916846883
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:02<00:00,  9.32it/s]

  Best C: 0.0012742749857031347
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 33.36it/s] 

  Best C: 0.0012742749857031347
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 27.21it/s]

  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 75.26it/s] 


  Best C: 0.007847599703514624
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 35.11it/s]


  Best C: 0.2976351441631313
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 33.99it/s] 


  Best C: 100000.0

 Results
  bow_words, 19.7%
  unique predicted classes: [1300]
  tfidf_words, 32.4%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  character_ngram_1, 36.6%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500]
  character_ngram_2, 47.9%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1450 1475 1500 1525]
  character_ngram_3, 40.8%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1450 1475 1500]
  word_ngram_1, 31.0%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  word_ngram_2, 19.7%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  word_ngram_3, 18.3%
  unique predicted classes: [1250 1275 1300 1325 1375 1400]
 Controls
  MLE choice: 19.3%
  Uniform: 8.4%
  Weighted: 14.5%
Loading SDHK_Latin.npz... done (2.1s)
 Classes in data: [1100 1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425
 1450 1475 1500]
 Running estimations
  Loaded featur

  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 50.23it/s]


  Best C: 1e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [01:05<00:00,  3.28s/it]

  Best C: 3.359818286283781e-05
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:15<00:00,  1.32it/s]


  Best C: 3.359818286283781e-05
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [07:58<00:00, 23.91s/it]


  Best C: 0.0012742749857031347
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [08:51<00:00, 26.59s/it]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [01:26<00:00,  4.34s/it]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:55<00:00,  2.78s/it]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:35<00:00,  1.79s/it]


  Best C: 0.0012742749857031347

 Results
  bow_words, 26.3%
  unique predicted classes: [1350]
  tfidf_words, 34.6%
  unique predicted classes: [1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425 1450 1475
 1500]
  character_ngram_1, 39.0%
  unique predicted classes: [1100 1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425
 1450 1475 1500]
  character_ngram_2, 53.6%
  unique predicted classes: [1100 1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425
 1450 1475 1500]
  character_ngram_3, 54.6%
  unique predicted classes: [1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425 1450
 1475 1500]
  word_ngram_1, 44.0%
  unique predicted classes: [1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425 1450 1475
 1500]
  word_ngram_2, 34.7%
  unique predicted classes: [1125 1150 1175 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425 1450
 1475 1500]
  word_ngram_3, 34.3%
  unique predicted classes: [1150 1175 1200 1225 1250 1275 1300 1325 1

  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 35.69it/s]

  Best C: 8.858667904100833e-07
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 41.02it/s]


  Best C: 428.13323987193957
  Loaded feature set character_ngram_1
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:02<00:00,  7.62it/s]

  Best C: 0.0012742749857031347
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 49.55it/s]


  Best C: 0.0012742749857031347
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 75.79it/s] 


  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 33.57it/s] 


  Best C: 0.007847599703514624
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 38.20it/s] 


  Best C: 11.288378916846883
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 30.02it/s]

  Best C: 0.04832930238571752

 Results
  bow_words, 19.7%
  unique predicted classes: [1300]
  tfidf_words, 22.5%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  character_ngram_1, 47.9%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1425 1475]
  character_ngram_2, 47.9%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1425 1450 1500 1525]
  character_ngram_3, 39.4%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1425]
  word_ngram_1, 32.4%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  word_ngram_2, 18.3%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  word_ngram_3, 22.5%
  unique predicted classes: [1275 1300 1325 1350 1425]
 Controls
  MLE choice: 19.3%
  Uniform: 8.3%
  Weighted: 14.5%
Loading StClare_facs_danish.npz... done (0.0s)
 Classes in data: [1400 1425 1450 1475 1500 1525 1550 1575]
 Running estimations
  Loaded feature set bow_words
   Reducing feature space
 Making t


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 92.20it/s] 

  Best C: 1e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 737.56it/s]


  Best C: 0.007847599703514624
  Loaded feature set character_ngram_1
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 534.89it/s]

  Best C: 0.2976351441631313
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 311.27it/s]

  Best C: 3.359818286283781e-05
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 402.66it/s]


  Best C: 1e-10
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 358.54it/s]


  Best C: 1e-10
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 124.36it/s]

  Best C: 0.007847599703514624
  Loaded feature set word_ngram_3
   Reducing feature space





 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 86.47it/s] 


  Best C: 0.0012742749857031347

 Results
  bow_words, 15.8%
  unique predicted classes: [1425]
  tfidf_words, 5.3%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1575]
  character_ngram_1, 42.1%
  unique predicted classes: [1400 1450 1475 1500 1525 1550]
  character_ngram_2, 31.6%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  character_ngram_3, 10.5%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1575]
  word_ngram_1, 5.3%
  unique predicted classes: [1400 1425 1450 1475 1500 1525 1550 1575]
  word_ngram_2, 10.5%
  unique predicted classes: [1400 1425 1450 1475 1525]
  word_ngram_3, 21.1%
  unique predicted classes: [1425 1450 1475]
 Controls
  MLE choice: 17.3%
  Uniform: 12.3%
  Weighted: 14.1%
Loading velux_facs_latin.npz... done (0.1s)
 Classes in data: [1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500 1525]
 Running estimations
  Loaded feature set bow_words
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 38.43it/s]

  Best C: 6.158482110660255e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 42.28it/s] 

  Best C: 0.2976351441631313
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:02<00:00,  8.03it/s] 

  Best C: 0.0012742749857031347
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 40.18it/s]


  Best C: 0.007847599703514624
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 72.55it/s] 


  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 39.22it/s] 


  Best C: 0.007847599703514624
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 32.10it/s]


  Best C: 11.288378916846883
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 34.94it/s]


  Best C: 0.04832930238571752

 Results
  bow_words, 19.7%
  unique predicted classes: [1300]
  tfidf_words, 23.9%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  character_ngram_1, 50.7%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1425 1475]
  character_ngram_2, 50.7%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1450 1500 1525]
  character_ngram_3, 42.3%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400 1425]
  word_ngram_1, 31.0%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  word_ngram_2, 16.9%
  unique predicted classes: [1250 1275 1300 1325 1350 1375 1400]
  word_ngram_3, 22.5%
  unique predicted classes: [1275 1300 1325 1350 1425]
 Controls
  MLE choice: 19.3%
  Uniform: 8.4%
  Weighted: 14.7%
Loading Colonia.npz... done (6.5s)
 Classes in data: [1500 1525 1550 1575 1600 1625 1650 1675 1700 1725 1750 1775 1800 1825
 1850 1875 1900 1925]
 Running estimations
  Loaded feature set bow_words
 

  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 67.88it/s] 


  Best C: 1e-10
  Loaded feature set bow_pos
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 947.06it/s]

  Best C: 1e-10
  Loaded feature set bow_words_pos
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 66.61it/s] 


  Best C: 1e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 42.92it/s]

  Best C: 0.0012742749857031347
  Loaded feature set tfidf_pos
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 135.94it/s]


  Best C: 0.2976351441631313
  Loaded feature set tfidf_words_pos
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 52.17it/s]

  Best C: 1e-10
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 138.18it/s]

  Best C: 0.00020691380811147902
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:01<00:00, 11.68it/s] 


  Best C: 0.007847599703514624
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:01<00:00, 10.44it/s]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:01<00:00, 19.84it/s]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 43.46it/s]


  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 142.54it/s]

  Best C: 1e-10

 Results
  bow_words, 26.3%
  unique predicted classes: [1875]
  bow_pos, 26.3%
  unique predicted classes: [1875]
  bow_words_pos, 26.3%
  unique predicted classes: [1875]
  tfidf_words, 47.4%
  unique predicted classes: [1500 1525 1600 1625 1650 1675 1725 1775 1850 1875 1925]
  tfidf_pos, 42.1%
  unique predicted classes: [1500 1525 1550 1575 1600 1625 1650 1675 1725 1825 1850 1875 1900]
  tfidf_words_pos, 42.1%
  unique predicted classes: [1525 1575 1625 1650 1675 1725 1875]
  character_ngram_1, 31.6%
  unique predicted classes: [1500 1525 1550 1625 1675 1725 1750 1875 1900]
  character_ngram_2, 47.4%
  unique predicted classes: [1525 1600 1625 1650 1675 1725 1825 1850 1875 1925]
  character_ngram_3, 42.1%
  unique predicted classes: [1525 1550 1625 1650 1675 1725 1775 1875]
  word_ngram_1, 42.1%
  unique predicted classes: [1500 1525 1625 1650 1675 1725 1775 1875]
  word_ngram_2, 36.8%
  unique predicted classes: [1500 1525 1625 1675 1725 1775 1875 1900]
  word_ngr




done (0.6s)
 Classes in data: [1150 1200 1225 1250 1275 1300 1325 1350 1375 1400 1425 1450 1475 1500]
 Running estimations
  Loaded feature set bow_words
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 260.83it/s]

  Best C: 1e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:10<00:00,  1.98it/s]

  Best C: 0.00020691380811147902
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:14<00:00,  1.34it/s]


  Best C: 0.2976351441631313
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [02:16<00:00,  6.82s/it]


  Best C: 0.0012742749857031347
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [01:06<00:00,  3.31s/it]


  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:04<00:00,  4.41it/s]


  Best C: 3.359818286283781e-05
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:02<00:00,  9.21it/s] 


  Best C: 0.00020691380811147902

 Results
  bow_words, 69.0%
  unique predicted classes: [1400]
  tfidf_words, 66.9%
  unique predicted classes: [1275 1325 1350 1375 1400]
  character_ngram_1, 82.5%
  unique predicted classes: [1150 1200 1275 1300 1325 1350 1375 1400 1425 1450 1500]
  character_ngram_2, 90.3%
  unique predicted classes: [1200 1225 1275 1300 1325 1350 1375 1400 1475]
  character_ngram_3, 89.0%
  unique predicted classes: [1275 1300 1325 1350 1375 1400]
  word_ngram_1, 79.3%
  unique predicted classes: [1275 1325 1350 1375 1400 1425]
  word_ngram_2, 69.9%
  unique predicted classes: [1300 1350 1375 1400 1475]
  word_ngram_3, 69.4%
  unique predicted classes: [1375 1400]
 Controls
  MLE choice: 69.2%
  Uniform: 7.1%
  Weighted: 54.3%
Loading SemEval2015.npz... done (0.2s)
 Classes in data: [1700 1725 1775 1800 1825 1850 1875 1900 1925 1950 1975 2000]
 Running estimations
  Loaded feature set bow_words
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:00<00:00, 176.59it/s]

  Best C: 1e-10
  Loaded feature set tfidf_words
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:14<00:00,  1.42it/s]

  Best C: 5.455594781168514e-06
  Loaded feature set character_ngram_1
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:36<00:00,  1.85s/it]

  Best C: 0.00020691380811147902
  Loaded feature set character_ngram_2
   Reducing feature space
 Making the estimator



  Searching for regularisation parameter C: 100%|██████████| 20/20 [01:37<00:00,  4.86s/it]


  Best C: 3.359818286283781e-05
  Loaded feature set character_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [01:04<00:00,  3.21s/it]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_1
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s]


  Best C: 0.00020691380811147902
  Loaded feature set word_ngram_2
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:16<00:00,  1.25it/s]


  Best C: 0.0012742749857031347
  Loaded feature set word_ngram_3
   Reducing feature space
 Making the estimator


  Searching for regularisation parameter C: 100%|██████████| 20/20 [00:05<00:00,  3.54it/s]


  Best C: 69.5192796177562

 Results
  bow_words, 25.5%
  unique predicted classes: [1700]
  tfidf_words, 29.7%
  unique predicted classes: [1700 1725 1775 1800 1825 1850 1875 1900 1925 1950 1975 2000]
  character_ngram_1, 31.0%
  unique predicted classes: [1700 1725 1775 1800 1825 1850 1875 1900 1925 1950 1975 2000]
  character_ngram_2, 32.9%
  unique predicted classes: [1700 1725 1775 1800 1825 1850 1875 1900 1925 1950 1975 2000]
  character_ngram_3, 35.5%
  unique predicted classes: [1700 1725 1775 1800 1825 1850 1875 1900 1925 1950 1975 2000]
  word_ngram_1, 32.9%
  unique predicted classes: [1700 1725 1775 1800 1825 1850 1875 1900 1925 1950 1975 2000]
  word_ngram_2, 27.2%
  unique predicted classes: [1700 1725 1775 1800 1825 1850 1875 1900 1925 1950 1975 2000]
  word_ngram_3, 27.4%
  unique predicted classes: [1700 1725 1800 1825 1850 1875 1900 1925 1950 1975 2000]
 Controls
  MLE choice: 25.5%
  Uniform: 8.3%
  Weighted: 11.9%


In [5]:
for data_source_filename in results.keys():
    dataset = load_dataset(data_source_filename)
    print(" Saving %s... " % data_source_filename.split("/")[-1], end="")
    t = time.time()
    if 'linearsvc' in dataset.keys():
        dataset.pop('linearsvc')
    np.savez_compressed(data_source_filename, **dataset, linearsvc=results[data_source_filename])
    print("done (%.1fs)" % (time.time()-t), flush=True)

Loading velux_dipl_latin.npz... done (0.2s)
Saving velux_dipl_latin.npz... done (0.4s)
Loading StClare_dipl_danish.npz... done (0.0s)
Saving StClare_dipl_danish.npz... done (0.1s)
Loading velux_dipl_danish.npz... done (0.0s)
Saving velux_dipl_danish.npz... done (0.1s)
Loading velux_facs_danish.npz... done (0.0s)
Saving velux_facs_danish.npz... done (0.1s)
Loading StClare_dipl_latin.npz... done (0.1s)
Saving StClare_dipl_latin.npz... done (0.3s)
Loading SDHK_Latin.npz... done (1.8s)
Saving SDHK_Latin.npz... done (8.1s)
Loading StClare_facs_latin.npz... done (0.1s)
Saving StClare_facs_latin.npz... done (0.4s)
Loading StClare_facs_danish.npz... done (0.0s)
Saving StClare_facs_danish.npz... done (0.1s)
Loading velux_facs_latin.npz... done (0.1s)
Saving velux_facs_latin.npz... done (0.4s)
Loading Colonia.npz... done (7.4s)
Saving Colonia.npz... done (20.0s)
Loading SDHK_Swedish.npz... done (0.7s)
Saving SDHK_Swedish.npz... done (3.3s)
Loading SemEval2015.npz... done (0.2s)
Saving SemEval201