In [41]:
import argparse
from collections import defaultdict, namedtuple
from io import open
import math
import os
from random import shuffle, uniform
from datetime import datetime
from future.utils import iterkeys, iteritems

from future.builtins import range
from future.utils import iteritems

In [42]:
!pip install wandb -q

In [43]:
import wandb
wandb.login(key="4a6e96eb645ce23f4ada4b7f5106dcbaed287c63")



True

In [None]:
run = wandb.init(
    name = "BKT baseline 2", ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    # run_id = ### Insert specific run id here if you want to resume a previous run
    # resume = "must" ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "BaseLine Ablations" ### Project should be created in your wandb account 
    #config = config ### Wandb Config for your run
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Batch Accuracy,▁▆▄▆▅▆▆▅▆▆▅▄▆▄█▆▆▆▇▅▆▆▅█▇▇▆▇▆▇▆█▇▆▅▆▇▆▆█

0,1
Batch Accuracy,85.0


In [None]:
# Sigma is the L2 prior variance, regularizing the baseline model. Smaller sigma means more regularization.
_DEFAULT_SIGMA = 20.0

# Eta is the learning rate/step size for SGD. Larger means larger step size.
_DEFAULT_ETA = 0.1

In [None]:

def load_data(filename):
    """
    This method loads and returns the data in filename. If the data is labelled training data, it returns labels too.

    Parameters:
        filename: the location of the training or test data you want to load.

    Returns:
        data: a list of InstanceData objects from that data type and track.
        labels (optional): if you specified training data, a dict of instance_id:label pairs.
    """

    # 'data' stores a list of 'InstanceData's as values.
    data = []

    # If this is training data, then 'labels' is a dict that contains instance_ids as keys and labels as values.
    training = False
    if filename.find('train') != -1:
        training = True

    if training:
        labels = dict()

    num_exercises = 0
    print('Loading instances...')
    instance_properties = dict()

    with open(filename, 'rt') as f:
        for line in f:
            line = line.strip()

            # If there's nothing in the line, then we're done with the exercise. Print if needed, otherwise continue
            if len(line) == 0:
                num_exercises += 1
                if num_exercises % 100000 == 0:
                    print('Loaded ' + str(len(data)) + ' instances across ' + str(num_exercises) + ' exercises...')
                instance_properties = dict()

            # If the line starts with #, then we're beginning a new exercise
            elif line[0] == '#':
                if 'prompt' in line:
                    instance_properties['prompt'] = line.split(':')[1]
                else:
                    list_of_exercise_parameters = line[2:].split()
                    for exercise_parameter in list_of_exercise_parameters:
                        [key, value] = exercise_parameter.split(':')
                        if key == 'countries':
                            value = value.split('|')
                        elif key == 'days':
                            value = float(value)
                        elif key == 'time':
                            if value == 'null':
                                value = None
                            else:
                                assert '.' not in value
                                value = int(value)
                        instance_properties[key] = value

            # Otherwise we're parsing a new Instance for the current exercise
            else:
                line = line.split()
                if training:
                    assert len(line) == 7
                else:
                    assert len(line) == 6
                assert len(line[0]) == 12

                instance_properties['instance_id'] = line[0]

                instance_properties['token'] = line[1]
                instance_properties['part_of_speech'] = line[2]

                instance_properties['morphological_features'] = dict()
                for l in line[3].split('|'):
                    [key, value] = l.split('=')
                    if key == 'Person':
                        value = int(value)
                    instance_properties['morphological_features'][key] = value

                instance_properties['dependency_label'] = line[4]
                instance_properties['dependency_edge_head'] = int(line[5])
                if training:
                    label = float(line[6])
                    labels[instance_properties['instance_id']] = label
                data.append(InstanceData(instance_properties=instance_properties))

        print('Done loading ' + str(len(data)) + ' instances across ' + str(num_exercises) +
              ' exercises.\n')

    if training:
        return data, labels
    else:
        return data

In [None]:
class InstanceData(object):
    """
    A bare-bones class to store the included properties of each instance. This is meant to act as easy access to the
    data, and provides a launching point for deriving your own features from the data.
    """
    def __init__(self, instance_properties):

        # Parameters specific to this instance
        self.instance_id = instance_properties['instance_id']
        self.token = instance_properties['token']
        self.part_of_speech = instance_properties['part_of_speech']
        self.morphological_features = instance_properties['morphological_features']
        self.dependency_label = instance_properties['dependency_label']
        self.dependency_edge_head = instance_properties['dependency_edge_head']

        # Derived parameters specific to this instance
        self.exercise_index = int(self.instance_id[8:10])
        self.token_index = int(self.instance_id[10:12])

        # Derived parameters specific to this exercise
        self.exercise_id = self.instance_id[:10]

        # Parameters shared across the whole session
        self.user = instance_properties['user']
        self.countries = instance_properties['countries']
        self.days = instance_properties['days']
        self.client = instance_properties['client']
        self.session = instance_properties['session']
        self.format = instance_properties['format']
        self.time = instance_properties['time']
        self.prompt = instance_properties.get('prompt', None)

        # Derived parameters shared across the whole session
        self.session_id = self.instance_id[:8]

    def to_features(self):
        """
        Prepares those features that we wish to use in the LogisticRegression example in this file. We introduce a bias,
        and take a few included features to use. Note that this dict restructures the corresponding features of the
        input dictionary, 'instance_properties'.

        Returns:
            to_return: a representation of the features we'll use for logistic regression in a dict. A key/feature is a
                key/value pair of the original 'instance_properties' dict, and we encode this feature as 1.0 for 'hot'.
        """
        to_return = dict()

        to_return['bias'] = 1.0
        to_return['user:' + self.user] = 1.0
        to_return['format:' + self.format] = 1.0
        to_return['token:' + self.token.lower()] = 1.0

        to_return['part_of_speech:' + self.part_of_speech] = 1.0
        for morphological_feature in self.morphological_features:
            to_return['morphological_feature:' + morphological_feature] = 1.0
        to_return['dependency_label:' + self.dependency_label] = 1.0
        
        time = datetime.now()
        if(time.second %10 == 0 and time.microsecond == 0):
          print(time)
          
        return to_return

In [None]:
class LogisticRegressionInstance(namedtuple('Instance', ['features', 'label', 'name'])):
    """
    A named tuple for packaging together the instance features, label, and name.
    """
    def __new__(cls, features, label, name):
        if label:
            if not isinstance(label, (int, float)):
                raise TypeError('LogisticRegressionInstance label must be a number.')
            label = float(label)
        if not isinstance(features, dict):
            raise TypeError('LogisticRegressionInstance features must be a dict.')
        return super(LogisticRegressionInstance, cls).__new__(cls, features, label, name)


class LogisticRegression(object):
    """
    An L2-regularized logistic regression object trained using stochastic gradient descent.
    """

    def __init__(self, sigma=_DEFAULT_SIGMA, eta=_DEFAULT_ETA):
        super(LogisticRegression, self).__init__()
        self.sigma = sigma  # L2 prior variance
        self.eta = eta  # initial learning rate
        self.weights = defaultdict(lambda: uniform(-1.0, 1.0)) # weights initialize to random numbers
        self.fcounts = None # this forces smaller steps for things we've seen often before

    def predict_instance(self, instance):
        """
        This computes the logistic function of the dot product of the instance features and the weights.
        We truncate predictions at ~10^(-7) and ~1 - 10^(-7).
        """
        a = min(17., max(-17., sum([float(self.weights[k]) * instance.features[k] for k in instance.features])))
        return 1. / (1. + math.exp(-a))

    def error(self, instance):
        return instance.label - self.predict_instance(instance)

    def reset(self):
        self.fcounts = defaultdict(int)

    def training_update(self, instance):
        if self.fcounts is None:
            self.reset()
        err = self.error(instance)
        for k in instance.features:
            rate = self.eta / math.sqrt(1 + self.fcounts[k])
            # L2 regularization update
            if k != 'bias':
                self.weights[k] -= rate * self.weights[k] / self.sigma ** 2
            # error update
            self.weights[k] += rate * err * instance.features[k]
            # increment feature count for learning rate
            self.fcounts[k] += 1

    def train(self, train_set, dev_set, iterations=10):
        for it in range(iterations):
            print('Training iteration ' + str(it+1) + '/' + str(iterations) + '...')
            shuffle(train_set)
            i = 0
            for instance in train_set:
                self.training_update(instance)
                if(i % 100 == 0):
                    print(str(i) + " out of " + str(len(train_set)))
                    i +=1
            predictions = self.predict_test_set(dev_set)
            labels = load_labels("/content/en_es/en_es.slam.20190204.dev.key")

            directory = os.path.dirname("/content/out.pred" + str(it))
            if not os.path.exists(directory):
                os.makedirs(directory)

            with open("/content/out.pred" + str(it), 'wt') as f:
              for instance_id, prediction in iteritems(predictions):
                  f.write(instance_id + ' ' + str(prediction) + '\n')
                  
            predictions = load_labels("/content/out.pred" + str(it))

            actual = []
            predicted = []

            for instance_id in iterkeys(labels):
                try:
                    actual.append(labels[instance_id])
                    predicted.append(predictions[instance_id])
                except KeyError:
                    print('No prediction for instance ID ' + instance_id + '!')

            acc, avg_log_loss, auroc, F1 = evaluate_metrics(actual, predicted)
            print("acc : " + str(acc) + " avg log loss: " + str(avg_log_loss) + " auroc: " + str(auroc) + " F1: " + str(F1))
            print('Saving to WandB')
            wandb.log({'Log Loss': avg_log_loss, 'aucroc': auroc, 'F1': F1,'accuracy': acc})

        
        print('\n')

    def predict_test_set(self, test_set):
        return {instance.name: self.predict_instance(instance) for instance in test_set}


In [None]:
def load_labels(filename):
    """
    This loads labels, either the actual ones or your predictions.

    Parameters:
        filename: the filename pointing to your labels

    Returns:
        labels: a dict of instance_ids as keys and labels between 0 and 1 as values
    """
    labels = dict()

    with open(filename, 'rt') as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            else:
                line = line.split()
            instance_id = line[0]
            label = float(line[1])
            labels[instance_id] = label
    return labels


def compute_acc(actual, predicted):
    """
    Computes the accuracy of your predictions, using 0.5 as a cutoff.

    Note that these inputs are lists, not dicts; they assume that actual and predicted are in the same order.

    Parameters (here and below):
        actual: a list of the actual labels
        predicted: a list of your predicted labels
    """
    num = len(actual)
    acc = 0.
    for i in range(num):
        if round(actual[i], 0) == round(predicted[i], 0):
            acc += 1.
    acc /= num
    return acc


def compute_avg_log_loss(actual, predicted):
    """
    Computes the average log loss of your predictions.
    """
    num = len(actual)
    loss = 0.

    for i in range(num):
        p = predicted[i] if actual[i] > .5 else 1. - predicted[i]
        loss -= math.log(p)
    loss /= num
    return loss


def compute_auroc(actual, predicted):
    """
    Computes the area under the receiver-operator characteristic curve.
    This code a rewriting of code by Ben Hamner, available here:
    https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py
    """
    num = len(actual)
    temp = sorted([[predicted[i], actual[i]] for i in range(num)], reverse=True)

    sorted_predicted = [row[0] for row in temp]
    sorted_actual = [row[1] for row in temp]

    sorted_posterior = sorted(zip(sorted_predicted, range(len(sorted_predicted))))
    r = [0 for k in sorted_predicted]
    cur_val = sorted_posterior[0][0]
    last_rank = 0
    for i in range(len(sorted_posterior)):
        if cur_val != sorted_posterior[i][0]:
            cur_val = sorted_posterior[i][0]
            for j in range(last_rank, i):
                r[sorted_posterior[j][1]] = float(last_rank+1+i)/2.0
            last_rank = i
        if i==len(sorted_posterior)-1:
            for j in range(last_rank, i+1):
                r[sorted_posterior[j][1]] = float(last_rank+i+2)/2.0

    num_positive = len([0 for x in sorted_actual if x == 1])
    num_negative = num - num_positive
    sum_positive = sum([r[i] for i in range(len(r)) if sorted_actual[i] == 1])
    auroc = ((sum_positive - num_positive * (num_positive + 1) / 2.0) / (num_negative * num_positive))

    return auroc


def compute_f1(actual, predicted):
    """
    Computes the F1 score of your predictions. Note that we use 0.5 as the cutoff here.
    """
    num = len(actual)

    true_positives = 0
    false_positives = 0
    false_negatives = 0
    true_negatives = 0

    for i in range(num):
        if actual[i] >= 0.5 and predicted[i] >= 0.5:
            true_positives += 1
        elif actual[i] < 0.5 and predicted[i] >= 0.5:
            false_positives += 1
        elif actual[i] >= 0.5 and predicted[i] < 0.5:
            false_negatives += 1
        else:
            true_negatives += 1

    try:
        precision = true_positives / (true_positives + false_positives)
        recall = true_positives / (true_positives + false_negatives)
        F1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        F1 = 0.0

    return F1


def evaluate_metrics(actual, predicted):
    """
    This computes and returns a dictionary of notable evaluation metrics for your predicted labels.
    """
    acc = compute_acc(actual, predicted)
    avg_log_loss = compute_avg_log_loss(actual, predicted)
    auroc = compute_auroc(actual, predicted)
    F1 = compute_f1(actual, predicted)

    return  acc, avg_log_loss,  auroc, F1


def test_metrics():
    actual = [1, 0, 0, 1, 1, 0, 0, 1, 0, 1]
    predicted = [0.8, 0.2, 0.6, 0.3, 0.1, 0.2, 0.3, 0.9, 0.2, 0.7]
    metrics = evaluate_metrics(actual, predicted)
    metrics = {key: round(metrics[key], 3) for key in iterkeys(metrics)}
    assert metrics['accuracy'] == 0.700
    assert metrics['avglogloss'] == 0.613
    assert metrics['auroc'] == 0.740
    assert metrics['F1'] == 0.667
    print('Verified that our environment is calculating metrics correctly.')

In [None]:
training_data, training_labels = load_data("/content/en_es/en_es.slam.20190204.train")

Loading instances...
Loaded 317049 instances across 100000 exercises...
Loaded 635368 instances across 200000 exercises...
Loaded 951536 instances across 300000 exercises...
Loaded 1271940 instances across 400000 exercises...
Loaded 1591344 instances across 500000 exercises...
Loaded 1911212 instances across 600000 exercises...
Loaded 2227444 instances across 700000 exercises...
Loaded 2546704 instances across 800000 exercises...
Done loading 2622957 instances across 824012 exercises.



In [None]:
test_data = load_data("/content/en_es/en_es.slam.20190204.dev")

Loading instances...
Loaded 334439 instances across 100000 exercises...
Done loading 387374 instances across 115770 exercises.



In [95]:
training_data[0].to_features()

{'bias': 1.0,
 'user:XEinXf5+': 1.0,
 'format:reverse_translate': 1.0,
 'token:i': 1.0,
 'part_of_speech:PRON': 1.0,
 'morphological_feature:Case': 1.0,
 'morphological_feature:Number': 1.0,
 'morphological_feature:Person': 1.0,
 'morphological_feature:PronType': 1.0,
 'morphological_feature:fPOS': 1.0,
 'dependency_label:nsubj': 1.0}

In [44]:
exercices = {}
word_dict = {}
unique_word_index = 0;
for instance in training_data:
  user = instance.user
  instance_id = instance.instance_id[:-2]
  if user not in exercices:
    exercices[user] = {}
  if instance_id not in exercices[user] :
    exercices[user][instance_id] = []
  token = instance.token.lower()
  if token in word_dict:
    exercices[user][instance_id].append(word_dict[token])
  else:
    word_dict[token] = unique_word_index
    exercices[user][instance_id].append(word_dict[token])
    unique_word_index += 1

exercices_merged = {}
for user in exercices:
  exercices_merged[user] = list(exercices[user].values())

In [96]:
len(exercices_merged.keys())

2593

In [52]:
test_exercices = {}
for instance in test_data:
  user = instance.user
  instance_id = instance.instance_id[:-2]
  if user not in test_exercices:
    test_exercices[user] = {}
  if instance_id not in test_exercices[user] :
    test_exercices[user][instance_id] = []
  token = instance.token.lower()
  if token in word_dict:
    test_exercices[user][instance_id].append(word_dict[token])
  else:
    test_exercices[user][instance_id].append(-1)

test_exercices_merged = {}
for user in test_exercices:
  test_exercices_merged[user] = list(test_exercices[user].values())

In [53]:
test_exercices_merged["XEinXf5+"]

[[104, 320],
 [32, 28, 320],
 [651, 30, 717],
 [482, 327, 30, 322],
 [77, 110, 324],
 [325],
 [327, 322, 328, 30, 329],
 [847, 8, 2, 331, 336, 77, 106],
 [325, 8, 2, 331, 336, 77, 106],
 [77, 719],
 [77, 610, 8, 2, 697],
 [4, 330, 43, 333],
 [77, 334],
 [62, 339, 37],
 [28, 286],
 [28, 31],
 [77, 341],
 [279, 280, 30, 259],
 [10, 159],
 [0, 165, 166],
 [164, 115],
 [6, 306, 8, 28],
 [77, 303, 35, 304],
 [238, 8, 77, 343],
 [0, 83, 6, 344],
 [238, 8, 6, 345],
 [15, 8, 6, 485],
 [77, 591],
 [77, 487],
 [77, 501],
 [10, 8, 6, 500],
 [6, 58, 8, 2, 629],
 [10, 8, 2, 148],
 [12, 8, 349],
 [77, 350, 153],
 [56, 353],
 [35, 146, 359],
 [6, 113, 8, 358],
 [77, 112, 8, 70, 357],
 [77, 102],
 [77, 180],
 [0, 1, 56, 77, 291],
 [10, 8, 2, 833]]

In [None]:
training_instances = [LogisticRegressionInstance(features=instance_data.to_features(),
                                                  label=training_labels[instance_data.instance_id],
                                                  name=instance_data.instance_id
                                                  ) for instance_data in training_data]

In [None]:
keys_data = {}
for instance in training_instances:
  keys = instance.features.keys()
  for key in keys:
    if key in keys_data:
      keys_data[key] += instance.features[key]
    else:
      keys_data[key] = instance.features[key]

In [None]:
test_instances = [LogisticRegressionInstance(features=instance_data.to_features(),
                                                 label=None,
                                                 name=instance_data.instance_id
                                                 ) for instance_data in test_data]

In [None]:
logistic_regression_model = LogisticRegression()

In [None]:
logistic_regression_model.train(training_instances,test_instances, iterations=20)

Training iteration 1/20...
0 out of 2622957
acc : 0.8596937326717848 avg log loss: 0.3593452346640574 auroc: 0.7479930335504222 F1: 0.0963338598387231
Saving to WandB
Training iteration 2/20...
0 out of 2622957
acc : 0.8605611114839922 avg log loss: 0.35349000906632144 auroc: 0.7607264505626681 F1: 0.1182232234683383
Saving to WandB
Training iteration 3/20...
0 out of 2622957
acc : 0.8612297159850687 avg log loss: 0.35058050424851606 auroc: 0.7663706816152711 F1: 0.14218236364216638
Saving to WandB
Training iteration 4/20...
0 out of 2622957
acc : 0.8614646310800416 avg log loss: 0.3491360286557647 auroc: 0.7694004226107698 F1: 0.14864757674307924
Saving to WandB
Training iteration 5/20...
0 out of 2622957
acc : 0.8617563388353374 avg log loss: 0.34810966982513547 auroc: 0.7714736786751871 F1: 0.153970109640115
Saving to WandB
Training iteration 6/20...
0 out of 2622957
acc : 0.8619060649398256 avg log loss: 0.347310578847728 auroc: 0.7728377371244854 F1: 0.1650694552832839
Saving to W

In [None]:
predictions = logistic_regression_model.predict_test_set(test_instances)

In [None]:
with open("/content/out.pred", 'wt') as f:
    for instance_id, prediction in iteritems(predictions):
        f.write(instance_id + ' ' + str(prediction) + '\n')

In [None]:
print('\nLoading labels for exercises...')
labels = load_labels("/content/en_es/en_es.slam.20190204.dev.key")
print(labels)
print('Loading predictions for exercises...')
predictions = load_labels("/content/out.pred")

actual = []
predicted = []

for instance_id in iterkeys(labels):
    try:
        actual.append(labels[instance_id])
        predicted.append(predictions[instance_id])
    except KeyError:
        print('No prediction for instance ID ' + instance_id + '!')

metrics = evaluate_metrics(actual, predicted)
line = '\t'.join([('%s=%.3f' % (metric, value)) for (metric, value) in iteritems(metrics)])
print('Metrics:\t' + line)


Loading labels for exercises...


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Metrics:	accuracy=0.862	avglogloss=0.346	auroc=0.776	F1=0.180


## Test BKT 

In [None]:
import numpy as np

In [65]:
class BKTLearner(object):
    
    def __init__(self, state_size, slip_prob, transition_prob, guess_prob):
        self.state_size = state_size
        self.state = np.zeros(self.state_size)
        self.slip_prob = slip_prob
        self.transition_prob = transition_prob
        self.guess_prob = guess_prob
    
    def reset(self):
        self.state = np.zeros(self.state_size)
        
    def predictAnswer(self, input):
        answer = []
        for token_index in input:
            p_correct = self.state[token_index] * (1 - self.slip_prob) + (1 - self.state[token_index]) * self.guess_prob 
            value = np.random.choice(np.array([0,1]), p = np.array([1 - p_correct, p_correct]))
            answer.append(value)
        return np.array(answer)
    
    def updateKnowledgeState(self, output_correctness, tokens):
        i = 0
        for token_index in tokens:
            if output_correctness[i] == 1:
                PLt_obs = self.state[token_index]*(1 - self.slip_prob) / (self.state[token_index]*(1 - self.slip_prob) + (1 - self.state[token_index])*self.guess_prob)
            else:
                PLt_obs = self.state[token_index]*(self.slip_prob) / (self.state[token_index]*(self.slip_prob) + (1 - self.state[token_index])*(1 - self.guess_prob))
            #print(PLt_obs ,self.state[token_index],self.slip_prob,self.guess_prob, output_correctness[i], token_index)
            self.state[token_index] = PLt_obs + (1 - PLt_obs)*self.transition_prob
            i += 1 
    
    def trainOneSet(self, excercises):
        for exercise in excercises:
            answer_correctness = self.predictAnswer(exercise)
            self.updateKnowledgeState(answer_correctness, exercise)
    
    def testOneSet(self, excercises):
        answer_correctness = []
        for exercise in excercises:
            answer_correctness_ex = self.predictAnswer(exercise)
            answer_correctness.append(answer_correctness_ex)
        return np.array(answer_correctness)
        
    def computeAccuracyForTest(self, test_response):
        correct = 0;
        total = 0;
        for exercise in test_response:
            for token in exercise:
                correct += token
                total += 1
        
        if(total == 0):
          return 0

        return float(correct)/total * 100
    
    def train(self, exercices_all, train_duration, test_duration):
        i = 0;
        accuracy = 0
        batch = 0
        
        while i < len(exercices_all):
            train_batch = exercices_all[i:train_duration + i]
            self.trainOneSet(train_batch)
            i += train_duration
            test_batch = exercices_all[i:i + test_duration]
            answer_correctness = self.testOneSet(test_batch)
            i += test_duration
            accuracy = self.computeAccuracyForTest(answer_correctness) 
            print("Batch + " + str(batch) + " " + " correct: " + str(accuracy))
            wandb.log({'Batch Accuracy': accuracy})
            batch += 1

In [112]:
learner = BKTLearner(len(word_dict), 0.05, 0.1, 0)

In [92]:
len(inv_word_dict)

1967

In [99]:
inv_word_dict = {}
for key, val in word_dict.items():
  inv_word_dict[val] = key

In [100]:
inv_word_dict[684]

'rather'

In [114]:
learner.train(exercices_merged["XEinXf5+"], 10, 0)

Batch + 0  correct: 0
Batch + 1  correct: 0
Batch + 2  correct: 0
Batch + 3  correct: 0
Batch + 4  correct: 0
Batch + 5  correct: 0
Batch + 6  correct: 0
Batch + 7  correct: 0
Batch + 8  correct: 0
Batch + 9  correct: 0
Batch + 10  correct: 0
Batch + 11  correct: 0
Batch + 12  correct: 0
Batch + 13  correct: 0
Batch + 14  correct: 0
Batch + 15  correct: 0
Batch + 16  correct: 0
Batch + 17  correct: 0
Batch + 18  correct: 0
Batch + 19  correct: 0
Batch + 20  correct: 0
Batch + 21  correct: 0
Batch + 22  correct: 0
Batch + 23  correct: 0
Batch + 24  correct: 0
Batch + 25  correct: 0
Batch + 26  correct: 0
Batch + 27  correct: 0
Batch + 28  correct: 0
Batch + 29  correct: 0
Batch + 30  correct: 0
Batch + 31  correct: 0
Batch + 32  correct: 0
Batch + 33  correct: 0
Batch + 34  correct: 0
Batch + 35  correct: 0


In [102]:
test_exercices["XEinXf5+"]

{'rsAkJBG001': [104, 320],
 'rsAkJBG002': [32, 28, 320],
 'rsAkJBG003': [651, 30, 717],
 'xR1LbGwW01': [482, 327, 30, 322],
 'xR1LbGwW02': [77, 110, 324],
 'ITvkpyv+01': [325],
 'ITvkpyv+02': [327, 322, 328, 30, 329],
 'ITvkpyv+03': [847, 8, 2, 331, 336, 77, 106],
 'ITvkpyv+04': [325, 8, 2, 331, 336, 77, 106],
 '6wunbdfa01': [77, 719],
 '6wunbdfa02': [77, 610, 8, 2, 697],
 '6wunbdfa03': [4, 330, 43, 333],
 'OM/eqPd701': [77, 334],
 'OM/eqPd702': [62, 339, 37],
 'ZjRIRR9s01': [28, 286],
 'ZjRIRR9s02': [28, 31],
 'ZjRIRR9s03': [77, 341],
 'T4rpAznq01': [279, 280, 30, 259],
 'djFqFxWS01': [10, 159],
 'djFqFxWS02': [0, 165, 166],
 'djFqFxWS03': [164, 115],
 '7KfZc1xB01': [6, 306, 8, 28],
 '7KfZc1xB02': [77, 303, 35, 304],
 'BNZ3C6fs01': [238, 8, 77, 343],
 'BNZ3C6fs02': [0, 83, 6, 344],
 '4C0NvZIf01': [238, 8, 6, 345],
 '4C0NvZIf02': [15, 8, 6, 485],
 'Xywa9rJq01': [77, 591],
 'Xywa9rJq02': [77, 487],
 'Xywa9rJq03': [77, 501],
 'obIZU+GP01': [10, 8, 6, 500],
 'obIZU+GP02': [6, 58, 8, 2, 62

In [115]:
res = learner.testOneSet(test_exercices_merged["XEinXf5+"])

  return np.array(answer_correctness)


In [116]:
np.logical_not(np.concatenate(res)).astype(int)

array([1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [117]:
labels = np.array([1, 1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1])

In [109]:
labels

array([1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [122]:
compute_f1(labels,np.logical_not(np.concatenate(res)).astype(int))

0.39080459770114945