# Setting up the work environment

In [None]:
# Mount Google Drive inside the Colab runtime and make the project folder the
# working directory, so the relative file names used later resolve against it.
import os

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
PROJECT_DIR = '/content/drive/My Drive/scripts-hu/issues/superlearner-files'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(PROJECT_DIR)

Mounted at /content/drive/


Installing mlens

In [None]:
pip install mlens

Collecting mlens
  Downloading mlens-0.2.3-py2.py3-none-any.whl (227 kB)
[?25l[K     |█▍                              | 10 kB 29.9 MB/s eta 0:00:01[K     |██▉                             | 20 kB 38.3 MB/s eta 0:00:01[K     |████▎                           | 30 kB 25.3 MB/s eta 0:00:01[K     |█████▊                          | 40 kB 18.4 MB/s eta 0:00:01[K     |███████▏                        | 51 kB 18.0 MB/s eta 0:00:01[K     |████████▋                       | 61 kB 13.6 MB/s eta 0:00:01[K     |██████████                      | 71 kB 13.9 MB/s eta 0:00:01[K     |███████████▌                    | 81 kB 15.5 MB/s eta 0:00:01[K     |█████████████                   | 92 kB 14.6 MB/s eta 0:00:01[K     |██████████████▍                 | 102 kB 13.4 MB/s eta 0:00:01[K     |███████████████▉                | 112 kB 13.4 MB/s eta 0:00:01[K     |█████████████████▎              | 122 kB 13.4 MB/s eta 0:00:01[K     |██████████████████▊             | 133 kB 13.4 MB/s eta 0

Loading packages

In [None]:
import pandas as pd
import numpy as np
import pickle as pk
from sklearn.metrics import accuracy_score # Load sklearn tools
from mlens.ensemble import SuperLearner # Load SuperLearner
import timeit
from sklearn.metrics import f1_score, accuracy_score

[MLENS] backend: threading


Loading classifiers

In [None]:
from sklearn.naive_bayes import MultinomialNB # 1
# 2, 3 and 4 (logistic regression L2/L1 penalty and elastic net)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # 5
from sklearn.ensemble import RandomForestClassifier # 6

## Setting up the Superlearner

In [None]:
# Shared settings: the number of folds given to the SuperLearner and the
# convergence tolerance passed as `tol` to the learners defined below
super_folds = 5
super_tol = 0.005

# Create a list of base-models
def get_models():
    """Build the unfitted base learners used in the first ensemble layer.

    Returns:
        list: Six scikit-learn classifiers, in this order: multinomial naive
        Bayes, L2-penalised logistic regression (liblinear), L1-penalised
        logistic regression (liblinear), elastic-net logistic regression
        (saga), an SVC with probability estimates enabled, and a random
        forest with default settings.
    """
    seed = 1621447882

    # The previous code passed `np.random.seed(seed)` as `random_state`.
    # That call returns None, so the elastic-net model never received the
    # seed; it only reseeded NumPy's global generator as a side effect.
    # Keep that side effect (estimators created without a random_state draw
    # from the global generator) and hand the integer seed over explicitly.
    np.random.seed(seed)

    return [
        MultinomialNB(),
        LogisticRegression(solver = 'liblinear', max_iter = 1000,
                           tol = super_tol, penalty = "l2"),
        LogisticRegression(solver = 'liblinear', max_iter = 1000,
                           tol = super_tol, penalty = "l1"),
        LogisticRegression(solver = 'saga', max_iter = 1000,
                           penalty = 'elasticnet', l1_ratio = .5,
                           multi_class = 'multinomial',
                           random_state = seed, tol = super_tol),
        SVC(probability = True, tol = super_tol),
        RandomForestClassifier(),
    ]

# Create the superlearner
def get_super_learner(X):
    """Assemble an unfitted two-layer SuperLearner ensemble.

    Args:
        X: Training feature matrix. Only its length (number of rows) is
            used, as the ensemble's `sample_size`.

    Returns:
        SuperLearner: Ensemble whose first layer holds the models from
        `get_models()` (contributing class probabilities, `proba = True`)
        and whose meta learner is an lbfgs logistic regression that outputs
        class labels (`proba = False`).
    """
    # Two fixes compared with the previous version:
    # * `sample_size` is taken from the argument instead of the notebook
    #   global `train_val`, so the function no longer depends on hidden state.
    # * `random_state` receives the integer seed. It used to receive the
    #   return value of `np.random.seed(3027)`, which is always None.
    ensemble = SuperLearner(scorer = None, folds = super_folds, shuffle = True,
                            random_state = 3027, sample_size = len(X),
                            n_jobs = 1, verbose = True)

    # First layer: base models
    ensemble.add(get_models(), proba = True)

    # Second layer: the meta model
    meta_model = LogisticRegression(solver = 'lbfgs', max_iter = 1000,
                                    tol = super_tol)
    ensemble.add_meta(meta_model, proba = False)
    return ensemble
 

# Estimating the SuperLearner with an alternative document-feature matrix (dfm)
(bigrams and <20% document frequency)

Load press release data (training and test)

The first cross-validation fold (`cv_sample == 1`) is held out as the test set; the remaining folds form the training set.

In [None]:
# Read the full data set; `cv_sample` holds each row's fold number
df = pd.read_csv("dfmat_alt.csv")

# Rows of fold 1 become the test set, all remaining rows the training set
train_rows = df[df.cv_sample != 1]
test_rows = df[df.cv_sample == 1]

# Targets come from the `label` column
train_val = train_rows['label'].values
test_val = test_rows['label'].values

# Features are every column from position 2 onwards, as plain arrays
# (presumably the first two columns are the label and the fold id - TODO confirm)
train = train_rows.iloc[: , 2:].values
test = test_rows.iloc[: , 2:].values

for split in (train, test):
    print(split.shape)

(2098, 24317)
(514, 24317)


Run the model

## Five-fold cross-validation

In [None]:
# Prepare five-fold cross-validation

df = pd.read_csv("dfmat_alt.csv")

# Restore the results of folds that were already processed, if backup files
# exist; otherwise start with empty containers.
if os.path.isfile("super-pred.csv"):
  # `ndmin` keeps the array shapes stable when a backup holds a single
  # row/value. Without it np.loadtxt returns a 0-d array for a one-value
  # file and list(...) raises "TypeError: iteration over a 0-d array".
  super_pred = pd.DataFrame(np.loadtxt("super-pred.csv", delimiter = ",", ndmin = 2))
  super_pred.columns = ['prediction', 'issue_r1', 'cv_sample']
  results = list(np.loadtxt("cv-accuracy.txt", delimiter = ",", ndmin = 1))
  times = list(np.loadtxt("cv-time.txt", delimiter = ",", ndmin = 1))
  print(f'Loaded backup from file. cv_sample:{set(super_pred.cv_sample)}')

else:
  super_pred = pd.DataFrame()
  results = []
  times = []


for i in [1, 2, 3, 4, 5]:
  print(i)

  # Skip folds whose predictions were restored from the backup
  if len(super_pred) > 0 and i in set(super_pred.cv_sample):
    continue

  # Fold i is the test set, all other folds are the training set
  train = df[df.cv_sample != i]
  train_val = train['label'].values
  train = train.iloc[: , 2:]

  test = df[df.cv_sample == i]
  test_val = test['label'].values
  test = test.iloc[: , 2:]

  train = train.values
  test = test.values
  print(train.shape)
  print(test.shape)

  # Only building and fitting the ensemble is timed, not the prediction
  start = timeit.default_timer()

  # Create the super learner
  ensemble = get_super_learner(train)

  # Fit the super learner
  ensemble.fit(train, train_val)

  # Save time
  stop = timeit.default_timer()
  times.append((stop - start))

  y_pred = ensemble.predict(test)

  # Add the accuracy to the list
  results.append(accuracy_score(test_val, y_pred))

  this_super_pred = pd.DataFrame({'prediction': y_pred,
                                 'issue_r1': test_val,
                                 'cv_sample': i},
                                index = list(range(0, len(test_val))))

  # Summarize base learners
  print(ensemble.data)

  print(this_super_pred)

  # DataFrame.append was removed in pandas 2.0, so use pd.concat. The empty
  # placeholder frame is not passed to concat; the first fold's frame is
  # taken as-is.
  if super_pred.empty:
    super_pred = this_super_pred
  else:
    super_pred = pd.concat([super_pred, this_super_pred])

  # Write the backup after every fold so an interrupted run can resume
  np.savetxt("cv-accuracy.txt", results, delimiter = ",")
  np.savetxt("cv-time.txt", times, delimiter = ",")
  np.savetxt("super-pred.csv", super_pred, delimiter = ",")


Loaded backup from file. cv_sample:{1.0, 2.0, 3.0, 4.0}
1
2
3
4
5
(2095, 24317)
(517, 24317)

Fitting 2 layers
Fit complete                        | 02:22:48

Predicting 2 layers
Predict complete                    | 00:00:29
                                     ft-m   ft-s   pt-m  pt-s
layer-1  logisticregression-1        1.13   0.03   0.07  0.01
layer-1  logisticregression-2        0.63   0.02   0.04  0.00
layer-1  logisticregression-3      530.07  26.80   0.04  0.00
layer-1  multinomialnb               2.50   0.13   0.04  0.00
layer-1  randomforestclassifier     12.84   0.23   0.08  0.00
layer-1  svc                       766.77   5.67  18.45  0.07

     prediction  issue_r1  cv_sample
0           6.0         6          5
1           1.0         1          5
2          12.0         2          5
3         191.0         2          5
4           7.0        10          5
..          ...       ...        ...
512         4.0         4          5
513        12.0        12          5
514   

In [None]:
# Mean of the per-fold accuracies collected in `results`
mean_accuracy = sum(results) / len(results)
print(f'Average accuracy: {mean_accuracy}')

Average accuracy: 0.685059276826006
