In [58]:
"""
evaluation of active learning
cross validation - compare classification on initial (core/bootstrap) set before and 
after the addition of egal queried samples to the training set.
unseen test - evaluate pre and post active learning models on unseen test data. 
author : eoghan cunningham
"""

'\nevaluation of active learning\ncross validation - compare classification on initial (core/bootstrap) set before and \nafter the addition of egal queried samples to the training set.\nunseen test - evaluate pre and post active learning models on unseen test data. \nauthor : eoghan cunningham\n'

In [59]:
import json
import os
import random

import dns
import numpy as np
import pandas as pd
import pymongo
import seaborn as sns

In [60]:
# Connect to the MongoDB cluster. The connection string (including credentials)
# is read from the MONGODB_URI environment variable so that no secret is stored
# in the notebook; export it before starting the kernel. A missing variable
# raises KeyError here rather than failing later with an opaque auth error.
# NOTE(review): the password that used to be embedded in this cell should be
# treated as exposed and rotated.
client = pymongo.MongoClient(os.environ["MONGODB_URI"])

In [61]:
# Handles on the `beta_db` database and its `comments` collection.
db = client["beta_db"]
comments = db["comments"]

In [62]:
# Initial bootstrap set: every comment document that carries a `label` field.
init_comments_list = list(comments.find({"label": {"$exists": True}}))
# Additional active-learning samples: documents whose `queried` field equals 1.
queried_comments_list = list(comments.find({"queried": 1}))

In [63]:
# Tabulate both document lists (one row per fetched document).
init_df = pd.DataFrame(data=init_comments_list)
queried_df = pd.DataFrame(data=queried_comments_list)

In [64]:
# Sanity check: (rows, columns) of the queried set, then of the bootstrap set.
print(*[frame.shape for frame in (queried_df, init_df)])

(100, 23) (304, 18)


In [65]:
# Annotators 2 and 7 are considered problematic and are excluded before aggregating.
excluded_annotators = ['annotator_2', 'annotator_7']
queried_df = queried_df.drop(columns=excluded_annotators)

# Aggregate the remaining annotator columns into one binary label per row:
# take the row mean and nudge it by 0.01 before rounding so that an exact
# 0.5 tie resolves to 1 instead of being rounded down to 0.
annotator_votes = queried_df.filter(like='annotator')
queried_df['annotation'] = annotator_votes.mean(axis=1).add(0.01).round().map(int)
queried_df['annotation'].value_counts()

0    62
1    38
Name: annotation, dtype: int64

In [66]:
# Combine the stored label with the three pilot annotations: row mean,
# nudged by 0.01 before rounding so that a 0.5 tie becomes 1.
label_votes = init_df[['label', 'pilot_1', 'pilot_2', 'pilot_3']]
init_df['label'] = label_votes.mean(axis=1).add(0.01).round()
init_df['label'].value_counts()

0.0    171
1.0    133
Name: label, dtype: int64

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from scipy import sparse

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [68]:
class LemmaTokenizer:
    """Callable tokenizer that word-tokenizes a document and lemmatizes it.

    Tokens that are not purely alphanumeric (per ``str.isalnum``) are dropped
    before lemmatization. Instances are meant to be passed as the ``tokenizer``
    argument of a scikit-learn vectorizer.
    """

    def __init__(self):
        # A single WordNet lemmatizer instance is reused for every document.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmas of the alphanumeric tokens of ``doc``, in order."""
        lemmatize = self.wnl.lemmatize
        return [lemmatize(token) for token in word_tokenize(doc) if token.isalnum()]

In [69]:
# 10-fold stratified cross-validation. `random_state` is intentionally not set:
# with the default shuffle=False it never influenced the folds, and
# scikit-learn >= 0.24 raises a ValueError when it is passed without
# shuffle=True. The resulting folds are identical to the previous setup.
kf = StratifiedKFold(n_splits = 10)
# TF-IDF over uni-, bi- and tri-grams (min_df = 5), tokenized and lemmatized
# by LemmaTokenizer.
# NOTE(review): scikit-learn emits a warning that the built-in 'english' stop
# word list may be inconsistent with the custom tokenizer - confirm whether the
# stop words should be lemmatized too.
tfidf_vect = TfidfVectorizer(ngram_range = (1,3), min_df = 5, stop_words = 'english',
                            tokenizer = LemmaTokenizer())
# Random forest with a fixed seed so repeated fits on the same data are reproducible.
clf = RandomForestClassifier(n_estimators = 500, random_state = 1)

In [70]:
# Fold counter; starts at zero and is incremented once per CV fold.
f = 0
# Per-fold scores are collected in this (initially empty) frame.
performance = pd.DataFrame()
# 51 evenly spaced candidate decision thresholds covering [0, 1].
thresholds = np.linspace(start=0, stop=1, num=51)

In [71]:
# Baseline cross-validation: train and test on the bootstrap samples only,
# recording balanced accuracy at every candidate threshold for each fold.
for train_idx, test_idx in kf.split(init_df.comment, init_df.label):
    f += 1
    fold_train, fold_test = init_df.iloc[train_idx], init_df.iloc[test_idx]
    y_train, y_test = fold_train.label, fold_test.label

    # The vectorizer is refit on the training fold only, then applied to the test fold.
    X_train_vect = tfidf_vect.fit_transform(fold_train.comment)
    X_test_vect = tfidf_vect.transform(fold_test.comment)

    clf.fit(X_train_vect.toarray(), y_train)
    # Second probability column, i.e. the larger of the two class labels.
    positive_proba = clf.predict_proba(X_test_vect.toarray())[:, 1]

    # Column `f` holds this fold's score for each threshold, in threshold order.
    performance[f] = [
        metrics.balanced_accuracy_score(y_test, (positive_proba >= cutoff).astype('int'))
        for cutoff in thresholds
    ]
performance.index = thresholds

  'stop_words.' % sorted(inconsistent))


In [72]:
# Average each threshold's score across folds and keep the top-scoring one.
mean_scores = performance.mean(axis = 1)
best = mean_scores.sort_values(ascending = False).head(1)
threshold_pre_al = best.index[0]
print("BAS : {} achieved at threshold : {}".format(best.iloc[0], threshold_pre_al))

BAS : 0.8693564605329313 achieved at threshold : 0.38


In [73]:
# Cross-validation with additional training data (AL queries): the folds are
# still drawn from the bootstrap set, but every training fold is extended with
# all queried samples. Test folds contain bootstrap samples only.
thresholds = np.linspace(0,1,51)
performance = pd.DataFrame()
f = 0
for train, test in kf.split(init_df.comment, init_df.label):

    f += 1
    fold = []
    train, test = init_df.iloc[train], init_df.iloc[test]
    # pd.concat is used because Series.append was removed in pandas 2.0.
    X_train = pd.concat([train.comment, queried_df.comment], ignore_index = True)
    y_train = pd.concat([train.label, queried_df.annotation], ignore_index = True)
    X_test, y_test = test.comment, test.label

    X_train_vect = tfidf_vect.fit_transform(X_train)
    X_test_vect = tfidf_vect.transform(X_test)

    clf.fit(X_train_vect.toarray(), y_train)
    predicted_proba = clf.predict_proba(X_test_vect.toarray())

    # Balanced accuracy of this fold at every candidate threshold.
    for threshold in thresholds:

        pred = (predicted_proba [:,1] >= threshold).astype('int')

        fold.append(metrics.balanced_accuracy_score(y_test, pred))

    performance[f] = fold
performance.index = thresholds

In [74]:
# Average each threshold's score across folds and keep the top-scoring one.
mean_scores = performance.mean(axis = 1)
best = mean_scores.sort_values(ascending = False).head(1)
threshold_post_al = best.index[0]
print("BAS : {} achieved at threshold : {}".format(best.iloc[0], threshold_post_al))

BAS : 0.8780273647920707 achieved at threshold : 0.44


In [75]:
# load unseen test set 1: every comment document that has an `unseen_test` field.
# $exists expects a boolean; the previous string 'true' only matched through
# truthiness coercion and was inconsistent with the bootstrap-set query.
test_df = pd.DataFrame(list(comments.find({'unseen_test' : {'$exists': True}})))
# aggregate annotations: row mean of the three annotator columns, nudged by
# 0.01 before rounding so that a 0.5 tie becomes 1
test_df['annotation'] = (test_df[['annotator_3','me','annotator_1']].mean(axis = 1)+0.01).round()
test_df.annotation.value_counts()

0.0    133
1.0     17
Name: annotation, dtype: int64

In [76]:
# Pre-AL model: fit on the whole bootstrap set, evaluate on unseen test set 1.
X_train, y_train = init_df.comment, init_df.label
X_test, y_test = test_df.comment, test_df.annotation

print(X_train.shape)

X_train_vect = tfidf_vect.fit_transform(X_train)
X_test_vect = tfidf_vect.transform(X_test)

clf.fit(X_train_vect.toarray(), y_train)
predicted_proba = clf.predict_proba(X_test_vect.toarray())
positive_scores = predicted_proba[:, 1]
# Binarize with the threshold chosen during cross-validation on the bootstrap set.
pred = (positive_scores >= threshold_pre_al).astype('int')

print("Pre AL performance - trained only on boostrap set: \n")
print(metrics.classification_report(y_test, pred))
print("BAS : {}".format(metrics.balanced_accuracy_score(y_test, pred)))
# Area under the ROC curve, computed from the raw scores (threshold independent).
fpr, tpr, _roc_cutoffs = metrics.roc_curve(y_test, positive_scores)
print("AUC : {}".format(metrics.auc(x = fpr, y = tpr)))

(304,)
Pre AL performance - trained only on boostrap set: 

              precision    recall  f1-score   support

         0.0       0.94      0.75      0.84       133
         1.0       0.25      0.65      0.36        17

    accuracy                           0.74       150
   macro avg       0.60      0.70      0.60       150
weighted avg       0.86      0.74      0.78       150

BAS : 0.6994692613887661
AUC : 0.7936753648827951


In [77]:
# Post-AL model: fit on the bootstrap set plus the queried samples, evaluate on
# unseen test set 1.
# pd.concat is used because Series.append was removed in pandas 2.0.
X_train = pd.concat([init_df.comment, queried_df.comment], ignore_index = True)
y_train = pd.concat([init_df.label, queried_df.annotation], ignore_index = True)

X_test = test_df.comment
y_test = test_df.annotation

print(X_train.shape)

X_train_vect = tfidf_vect.fit_transform(X_train)
X_test_vect = tfidf_vect.transform(X_test)

clf.fit(X_train_vect.toarray(), y_train)
predicted_proba = clf.predict_proba(X_test_vect.toarray())
# Binarize with the threshold chosen during cross-validation with AL samples.
pred = (predicted_proba [:,1] >= threshold_post_al).astype('int')

print("Post AL performance - trained with additional training data:\n")
print(metrics.classification_report(y_test, pred))
print("BAS : {}".format(metrics.balanced_accuracy_score(y_test, pred)))
roc = metrics.roc_curve(y_test,predicted_proba [:,1])
print("AUC : {}".format(metrics.auc(x = roc[0], y = roc[1])))

(404,)
Post AL performance - trained with additional training data:

              precision    recall  f1-score   support

         0.0       0.94      0.85      0.89       133
         1.0       0.33      0.59      0.43        17

    accuracy                           0.82       150
   macro avg       0.64      0.72      0.66       150
weighted avg       0.87      0.82      0.84       150

BAS : 0.7189296771340115
AUC : 0.8244139761167626


In [78]:
# load additional unseen test data: every comment document that has a
# `final_test` field. $exists expects a boolean; the previous string 'true'
# only matched through truthiness coercion.
final_test_df = pd.DataFrame(list(comments.find({'final_test' : {'$exists': True}})))
# aggregate annotations: row mean of the three annotator columns, nudged by
# 0.01 before rounding so that a 0.5 tie becomes 1
final_test_df['annotation'] = (final_test_df[['annotator_1','annotator_3','me']].mean(axis = 1)+0.01).round()
final_test_df.annotation.value_counts()

0.0    102
1.0     37
Name: annotation, dtype: int64

In [79]:
# evaluate using all unseen test data (final test set followed by test set 1)
# Pre-AL model: fit on the bootstrap set only.
X_train = init_df.comment
y_train = init_df.label

# pd.concat is used because Series.append was removed in pandas 2.0.
X_test = pd.concat([final_test_df.comment, test_df.comment], ignore_index = True)
y_test = pd.concat([final_test_df.annotation, test_df.annotation], ignore_index = True)

print(X_train.shape)

X_train_vect = tfidf_vect.fit_transform(X_train)
X_test_vect = tfidf_vect.transform(X_test)

clf.fit(X_train_vect.toarray(), y_train)
predicted_proba = clf.predict_proba(X_test_vect.toarray())
# Binarize with the threshold chosen during cross-validation on the bootstrap set.
pred = (predicted_proba [:,1] >= threshold_pre_al).astype('int')

print("Pre AL performance - trained only on bootstrap set:\n")
print(metrics.classification_report(y_test, pred))
print("BAS : {}".format(metrics.balanced_accuracy_score(y_test, pred)))
roc = metrics.roc_curve(y_test,predicted_proba [:,1])
print("AUC : {}".format(metrics.auc(x = roc[0], y = roc[1])))

(304,)
Pre AL performance - trained only on bootstrap set:

              precision    recall  f1-score   support

         0.0       0.93      0.69      0.80       235
         1.0       0.37      0.78      0.50        54

    accuracy                           0.71       289
   macro avg       0.65      0.74      0.65       289
weighted avg       0.83      0.71      0.74       289

BAS : 0.7356973995271867
AUC : 0.8295902285263989


In [81]:
# Post-AL model on all unseen test data: fit on the bootstrap set plus the
# queried samples, evaluate on the final test set followed by test set 1.
# pd.concat is used because Series.append was removed in pandas 2.0.
X_train = pd.concat([init_df.comment, queried_df.comment], ignore_index = True)
y_train = pd.concat([init_df.label, queried_df.annotation], ignore_index = True)

X_test = pd.concat([final_test_df.comment, test_df.comment], ignore_index = True)
y_test = pd.concat([final_test_df.annotation, test_df.annotation], ignore_index = True)
print(X_train.shape)

X_train_vect = tfidf_vect.fit_transform(X_train)
X_test_vect = tfidf_vect.transform(X_test)

clf.fit(X_train_vect.toarray(), y_train)
predicted_proba = clf.predict_proba(X_test_vect.toarray())
# Binarize with the threshold chosen during cross-validation with AL samples.
pred = (predicted_proba [:,1] >= threshold_post_al).astype('int')

print("Post AL performance - trained with additional training data:\n")
print(metrics.classification_report(y_test, pred))
print("BAS : {}".format(metrics.balanced_accuracy_score(y_test, pred)))
roc = metrics.roc_curve(y_test,predicted_proba [:,1])
print("AUC : {}".format(metrics.auc(x = roc[0], y = roc[1])))

(404,)
Post AL performance - trained with additional training data:

              precision    recall  f1-score   support

         0.0       0.92      0.80      0.86       235
         1.0       0.45      0.70      0.55        54

    accuracy                           0.79       289
   macro avg       0.69      0.75      0.70       289
weighted avg       0.83      0.79      0.80       289

BAS : 0.7539795114263199
AUC : 0.837431048069346
