### Table of Contents
* [1 Imports](#chapter1)
* [2 Experiment](#chapter2)
    * [2.1 Traditional methods](#section_2_1)
    * [2.2 XLM-R](#section_2_2)

# Imports <a class="anchor" id="chapter1"></a>
* Import necessary libraries and data

In [1]:
#libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from utils import configs
from utils import utils
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from scipy.stats import spearmanr
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from statistics import mean
from utils.experiment_models import RandomClassifier as RC
from utils.experiment_models import RepeatedBERT
import ktrain
from ktrain import text

#data
finalized_dataset = pd.read_csv('../../data/arabic_dataset.csv', index_col=0)

#config variables
RANDOM_SEED = configs.RANDOM_SEED

# Experiment <a class="anchor" id="chapter2"></a>

In [2]:
#renaming some columns to be the same name as other dataset
finalized_dataset = finalized_dataset.rename(columns={'Number of Question': 'Question_Nr', 'label': 'Labels', 'Responses':'Response'})
finalized_dataset.head(5)

Unnamed: 0,Question,Right_Answer,Grade,Number,Response,Question_Nr,Labels
0,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,3.0,[1],هي سلوك غير أخلاقي يتم عن طريق وسائل الكترونية...,1,1
1,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,5.0,[2],هي كل سلوك غير أخلاقي يتم بواسطة الاجهزة الالك...,1,1
2,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,2.625,[3],هي سلوك غير قانوني يحمل باستعمال الأجهزة الالك...,1,1
3,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,4.0,[4],هي سلوك غير قانوني تستخدم الوسائل الالكترونية ...,1,1
4,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,3.5,[5],هي كل سلوك غير أخلاقي يتم باستخدام الوسائل الا...,1,1


In [3]:
#global parameters
TEST_SIZE = 0.15
VALID_SIZE = 0.15
ITERATIONS = 10
METRICS = ['average_precision', 'spearman', 'roc_auc', 'averaged_classification_report']
AMOUNT_QUESTIONS = len(finalized_dataset.Question_Nr.unique())

## 2.1 Traditional methods  <a class="anchor" id="section_2_1"></a>

In [4]:
#FOR REPRESENTATION
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,1))#TfidfVectorizer(max_features=50000, ngram_range=(1,1))

#MODELS
#logistic regressor:
regressor = LogisticRegression(max_iter=400,random_state=RANDOM_SEED)
#random forest:
rf = RandomForestClassifier(random_state=RANDOM_SEED)
#1-nn:
one_NN = KNeighborsClassifier(n_neighbors=1)
#3-nn:
three_NN = KNeighborsClassifier(n_neighbors=3)
#random classifier:
rc = RC.RandomClassifier(random_state=RANDOM_SEED, change_state=True)

MODELS_MAP = {
    'Logistic Regressor':regressor,
    'Random Forest':rf,
    '1-NN':one_NN,
    '3-NN':three_NN,
    'Random Classifier':rc
}

In [5]:
#LVO EXPERIMENT
lvo_map = {} #create lvo map to keep track of predictions,probabilites,scores for each pass
for model_label, _ in MODELS_MAP.items():
    lvo_map[model_label] = {}

#used for concat
auc_scores = []
auprc_zero_scores = []
auprc_one_scores = []
spearmans = []
class_reports = []

#iterate all questions and train/test in a loqo fashion
for question_nr in range(0, AMOUNT_QUESTIONS+1, 1):
    test_data = finalized_dataset[finalized_dataset.Question_Nr == question_nr]
    train_data = finalized_dataset[finalized_dataset.Question_Nr != question_nr]

    x_train = train_data.Response
    y_train = train_data.Labels
    x_test = test_data.Response
    y_test = test_data.Labels

    if len(y_test.unique()) < 2:
        continue #only one class in the test data

    vectorizer.fit(x_train)
    x_train_processed = vectorizer.transform(x_train)
    x_test_processed = vectorizer.transform(x_test)

    corr_data = test_data.Grade

    for model_label, model in MODELS_MAP.items():
        model.fit(x_train_processed, y_train)
        predictions = model.predict(x_test_processed)
        predict_probas = model.predict_proba(x_test_processed)

        positive_probas = np.array(predict_probas)[:,1]
        negative_probas = np.array(predict_probas)[:,0]

        auc = roc_auc_score(y_test, positive_probas)
        auprc_one = average_precision_score(y_test, positive_probas, pos_label=1)
        auprc_zero = average_precision_score(y_test, negative_probas, pos_label=0)
        spearman_corr, _ = spearmanr(positive_probas, corr_data)
        class_report = classification_report(y_test, predictions, output_dict=True)

        auc_scores.append(auc)
        auprc_one_scores.append(auprc_one)
        auprc_zero_scores.append(auprc_zero)
        spearmans.append(spearman_corr)
        class_reports.append(class_report)

        lvo_results = lvo_map.get(model_label)
        lvo_results['true_labels'] = lvo_results.get('true_labels', []) + list(y_test)
        lvo_results['predicted_labels'] =  lvo_results.get('predicted_labels', []) + list(predictions)
        lvo_results['predicted_probas'] = lvo_results.get('predicted_probas', []) + list(predict_probas)
        lvo_results['corr_data'] = lvo_results.get('corr_data',[]) + list(corr_data)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
#PRINT RESULTS
#Micro is what is presented in the paper. Macro is an average per pass.
corr_data = finalized_dataset.Grade
for model, results in lvo_map.items():
    true_labels = results['true_labels']
    predicted_labels = results['predicted_labels']
    predicted_probas = results['predicted_probas']
    corr_data = results['corr_data']
    positive_probas = np.array(predicted_probas)[:,1]
    negative_probas = np.array(predicted_probas)[:,0]

    auc = roc_auc_score(true_labels, positive_probas)
    auprc_one = average_precision_score(true_labels, positive_probas, pos_label=1)
    auprc_zero = average_precision_score(true_labels, negative_probas, pos_label=0)
    spearman_corr, _ = spearmanr(positive_probas, corr_data)
    class_report = classification_report(true_labels, predicted_labels)

    print('Micro (concat):')
    print('Model: {}. AUC: {:0.2f}. AUPRC (Pos class: 0): {:0.2f}. AUPRC (Pos class: 1): {:0.2f}. Spearman {:0.2f}. \n'.
        format(model, auc, auprc_zero, auprc_one, spearman_corr))
    print(class_report + '\n')


    averaged_report = utils.average_report(class_reports)
    print('Macro (Average):')
    print('Model: {}. Average AUC: {:0.2f}. Average AUPRC (Pos class: 0): {:0.2f}. Average AUPRC (Pos class: 1): {:0.2f}. Average Spearman {:0.2f}. \n'.
        format(model, mean(auc_scores), mean(auprc_zero_scores), mean(auprc_one_scores), mean(spearmans)))
    print(utils.format_report(averaged_report))

    print('***************************************************************')


Micro (concat):
Model: Logistic Regressor. AUC: 0.49. AUPRC (Pos class: 0): 0.33. AUPRC (Pos class: 1): 0.66. Spearman -0.00. 

              precision    recall  f1-score   support

           0       0.28      0.13      0.18       723
           1       0.64      0.82      0.72      1361

    accuracy                           0.58      2084
   macro avg       0.46      0.48      0.45      2084
weighted avg       0.51      0.58      0.53      2084


Macro (Average):
Model: Logistic Regressor. Average AUC: 0.50. Average AUPRC (Pos class: 0): 0.39. Average AUPRC (Pos class: 1): 0.67. Average Spearman 0.02. 

0-> precision: 0.35, recall: 0.35, f1-score: 0.29, support: 15.38, 
1-> precision: 0.63, recall: 0.64, f1-score: 0.60, support: 28.96, 
accuracy-> 0.52
macro avg-> precision: 0.49, recall: 0.49, f1-score: 0.44, support: 44.34, 
weighted avg-> precision: 0.60, recall: 0.52, f1-score: 0.52, support: 44.34, 

***************************************************************
Micro (conca

## 2.2 XLM-R <a class="anchor" id="section_2_2"></a>

In [None]:
#xlm-r parameters
MODEL_NAME = "xlm-roberta-base"
BATCH_SIZE = 6
MAX_LEN = 200
CLASS_NAMES = [0,1]
EPOCHS = 15
LR = 1e-5
#xlm-r
t = text.Transformer(MODEL_NAME, maxlen=200, class_names=CLASS_NAMES)
#Create RepeatedBert object for experiments. (See RepeatedBERT.py in utils>experiment_models>RepeatedBERT)
repeated_bert = RepeatedBERT.RepeatedBERT(transformer=t,metrics=METRICS,iterations=ITERATIONS,random_state=RANDOM_SEED)

In [None]:
auc_scores = []
auprc_zero_scores = []
auprc_one_scores = []
spearmans = []
class_reports = []
lvo_results = {}

for question_nr in range(0, AMOUNT_QUESTIONS+1, 1):
    test_data = finalized_dataset[finalized_dataset.Question_Nr == question_nr]
    train_data = finalized_dataset[finalized_dataset.Question_Nr != question_nr]

    x_train = train_data.Response
    y_train = train_data.Labels
    x_test = test_data.Response
    y_test = test_data.Labels

    x_train, x_dev, y_train, y_dev = train_test_split(x_train, y_train, test_size=TEST_SIZE,stratify=y_train,random_state=RANDOM_SEED)

    if len(y_test.unique()) < 2:
        continue #only one class in the test data

    corr_data = test_data.Grade

    trn = t.preprocess_train(list(x_train), list(y_train))
    val = t.preprocess_test(list(x_dev), list(y_dev))
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=BATCH_SIZE)
    learner.fit_onecycle(LR,EPOCHS)
    learner.validate(class_names=t.get_classes())
    predictor = ktrain.get_predictor(learner.model, preproc=t)

    predictions = predictor.predict(list(x_test))
    predict_probas = predictor.predict_proba(list(x_test))

    pos_probas = np.array(predict_probas)[:,1]
    neg_probas = np.array(predict_probas)[:,0]

    auc = roc_auc_score(y_test, pos_probas)
    auprc_one = average_precision_score(y_test, pos_probas, pos_label=1)
    auprc_zero = average_precision_score(y_test, neg_probas, pos_label=0)
    spearman_corr, _ = spearmanr(pos_probas, corr_data)
    class_report = classification_report(y_test, predictions, output_dict=True)

    auc_scores.append(auc)
    auprc_one_scores.append(auprc_one)
    auprc_zero_scores.append(auprc_zero)
    spearmans.append(spearman_corr)
    class_reports.append(class_report)

    lvo_results['true_labels'] = lvo_results.get('true_labels',[]) + list(y_test)
    lvo_results['predicted_labels'] = lvo_results.get('predicted_labels',[]) + list(predictions)
    lvo_results['predicted_probas'] = lvo_results.get('predicted_probas',[]) + list(predict_probas)
    lvo_results['corr_data'] = lvo_results.get('corr_data',[]) + list(corr_data)

    print('q : ', question_nr)
    print('y_true: ', y_test)
    print('predictions: ', predictions)
    print('predict probas: ', predict_probas)
    print('corr data: ', corr_data)

    del trn, val, model, learner, predictor # delete for each pass to reduce unec. disk space

In [None]:
#PRINT RESULTS
true_labels = lvo_results['true_labels']
predicted_labels = lvo_results['predicted_labels']
predicted_probas = lvo_results['predicted_probas']
corr_data = lvo_results['corr_data']
positive_probas = np.array(predicted_probas)[:,1]
negative_probas = np.array(predicted_probas)[:,0]

auc = roc_auc_score(true_labels, positive_probas)
auprc_one = average_precision_score(true_labels, positive_probas, pos_label=1)
auprc_zero = average_precision_score(true_labels, negative_probas, pos_label=0)
spearman_corr, _ = spearmanr(positive_probas, corr_data)
class_report = classification_report(true_labels, predicted_labels)

print('Micro (concat):')
print('Model: XLM-R. AUC: {:0.2f}. AUPRC (Pos class: 0): {:0.2f}. AUPRC (Pos class: 1): {:0.2f}. Spearman {:0.2f}. \n'.
    format(auc, auprc_zero, auprc_one, spearman_corr))
print(class_report + '\n')


averaged_report = utils.average_report(class_reports)
print('Macro (Average):')
print('Model: XLM-R. Average AUC: {:0.2f}. Average AUPRC (Pos class: 0): {:0.2f}. Average AUPRC (Pos class: 1): {:0.2f}. Average Spearman {:0.2f}. \n'.
     format(mean(auc_scores), mean(auprc_zero_scores), mean(auprc_one_scores), mean(spearmans)))
print(utils.format_report(averaged_report))

print('***************************************************************')

In [None]:
#Again, Micro is what is presented in the paper. Code ran on COLAB, these were the results:
# Micro (concat):
# Model: XLM-R. AUC: 0.56. AUPRC (Pos class: 0): 0.39. AUPRC (Pos class: 1): 0.70. Spearman 0.17.
#
#               precision    recall  f1-score   support
#
#            0       0.40      0.44      0.42       723
#            1       0.69      0.65      0.67      1361
#
#     accuracy                           0.58      2084
#    macro avg       0.54      0.54      0.54      2084
# weighted avg       0.59      0.58      0.58      2084
#
#
# Macro (Average):
# Model: XLM-R. Average AUC: 0.57. Average AUPRC (Pos class: 0): 0.50. Average AUPRC (Pos class: 1): 0.72. Average Spearman 0.21.
#
# 0-> precision: 0.45, recall: 0.46, f1-score: 0.38, support: 15.38,
# 1-> precision: 0.65, recall: 0.65, f1-score: 0.62, support: 28.96,
# accuracy-> 0.56
# macro avg-> precision: 0.55, recall: 0.56, f1-score: 0.50, support: 44.34,
# weighted avg-> precision: 0.65, recall: 0.56, f1-score: 0.57, support: 44.34,
#
# ***************************************************************