### Table of Contents
* [1 Imports](#chapter1)
* [2 Experiment](#chapter2)
    * [2.1 Traditional methods](#section_2_1)
    * [2.2 XLM-R](#section_2_2)

# Imports <a class="anchor" id="chapter1"></a>
* Import necessary libraries and data

In [20]:
#libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from utils import configs
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from statistics import mean
from statistics import stdev
from utils.experiment_models import RepeatedBaselines
from utils.experiment_models import RepeatedBERT
from utils.experiment_models import RandomClassifier as RC
import numpy as np
import ktrain
from ktrain import text

#data
# The path is relative, so it is resolved against the current working directory.
DATASET_PATH = '../../data/arabic_dataset.csv'
# index_col=0: the first CSV column becomes the DataFrame index.
finalized_dataset = pd.read_csv(DATASET_PATH, index_col=0)

#config variables
# Seed value read from the project's configs module and reused by the cells below.
RANDOM_SEED = configs.RANDOM_SEED

# Experiment <a class="anchor" id="chapter2"></a>
* Train/test on questions individually

In [21]:
#renaming some columns to be the same name as other dataset
COLUMN_RENAMES = {
    'Number of Question': 'Question_Nr',
    'label': 'Labels',
    'Responses': 'Response',
}
finalized_dataset = finalized_dataset.rename(columns=COLUMN_RENAMES)
# Last expression of the cell: display the first five rows.
finalized_dataset.head(5)

Unnamed: 0,Question,Right_Answer,Grade,Number,Response,Question_Nr,Labels
0,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,3.0,[1],هي سلوك غير أخلاقي يتم عن طريق وسائل الكترونية...,1,1
1,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,5.0,[2],هي كل سلوك غير أخلاقي يتم بواسطة الاجهزة الالك...,1,1
2,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,2.625,[3],هي سلوك غير قانوني يحمل باستعمال الأجهزة الالك...,1,1
3,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,4.0,[4],هي سلوك غير قانوني تستخدم الوسائل الالكترونية ...,1,1
4,عرف مصطلح الجريمة الإلكترونية,هي كل سلوك غير قانوني يتم باستخدام الأجهزة ال...,3.5,[5],هي كل سلوك غير أخلاقي يتم باستخدام الوسائل الا...,1,1


In [22]:
#global parameters
# Split sizes handed to repeated_split as test_size / valid_size.
TEST_SIZE = 0.15
VALID_SIZE = 0.15
# Handed to the experiment objects as `iterations` and echoed in the printed results.
ITERATIONS = 10
# Metric names handed to the experiment objects; the same strings are used
# as keys when the results are read back for printing.
METRICS = [
    'average_precision',
    'spearman',
    'roc_auc',
    'averaged_classification_report',
]

## 2.1 Traditional methods  <a class="anchor" id="section_2_1"></a>

In [23]:
#for representation
# ngram_range=(1, 1): unigram features only.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

#MODELS
# Every estimator that accepts a seed receives RANDOM_SEED.
regressor = LogisticRegression(
    max_iter=400,
    random_state=RANDOM_SEED,
)  # logistic regressor
rf = RandomForestClassifier(random_state=RANDOM_SEED)  # random forest
one_NN = KNeighborsClassifier(n_neighbors=1)  # 1-nn
three_NN = KNeighborsClassifier(n_neighbors=3)  # 3-nn
rc = RC.RandomClassifier(
    random_state=RANDOM_SEED,
    change_state=True,
)  # random classifier

# Display name -> estimator, in the order the models are listed above.
MODELS_MAP = {
    'Logistic Regressor': regressor,
    'Random Forest': rf,
    '1-NN': one_NN,
    '3-NN': three_NN,
    'Random Classifier': rc,
}

#Create RepeatedBaseLines object for experiments. (See RepeatedBaseLines.py in utils>experiment_models>RepeatedBaselines)
rho = RepeatedBaselines.RepeatedBaselines(
    models=MODELS_MAP,
    metrics=METRICS,
    iterations=ITERATIONS,
    random_state=RANDOM_SEED,
)

#FOR SIMPLE FORMATTING OF CLASSIFICATION REPORT
def format_report(report):
    """Render a classification-report mapping as plain text, one entry per line.

    Args:
        report: mapping from a label to either a mapping of
            metric name -> numeric score, or a single numeric score.

    Returns:
        A string with one newline-terminated line per entry. Each line is
        ``label-> `` followed either by ``metric: score, `` for every metric
        (the trailing ", " is kept) or by the bare score. Scores are
        formatted with two decimals. An empty report yields ''.
    """
    lines = []
    for label, metrics in report.items():
        # isinstance instead of `type(...) == dict` so that dict subclasses
        # (e.g. OrderedDict) are expanded per metric rather than being passed
        # to the float format spec, which would raise TypeError.
        if isinstance(metrics, dict):
            body = ''.join(
                '{}: {:.2f}, '.format(metric, score)
                for metric, score in metrics.items()
            )
        else:
            body = '{:.2f}'.format(metrics)
        lines.append('{}-> {}\n'.format(label, body))

    return ''.join(lines)

#FOR PRINTING RESULTS
def print_results(results):
    """Print the averaged metrics of every model in `results`.

    Args:
        results: mapping of model name -> per-model result mapping with keys
            'spearman' (iterable of 2-item entries; the first item is the
            correlation, the second is ignored), 'average_precision'
            (iterable of (neg, pos) pairs; index 1 of each is the score that
            is averaged), 'roc_auc' (scores to average) and
            'averaged_classification_report' (handed to format_report).

    Returns:
        None. All output goes to stdout.
    """
    def _spread(values):
        # statistics.stdev raises StatisticsError for fewer than two values,
        # which happens with a single iteration or when only one Spearman
        # value is not NaN. Report 0 in that case instead of crashing.
        return stdev(values) if len(values) > 1 else 0

    for model, result in results.items():
        # NaN correlations are excluded from the average.
        spearmans = [spearman for spearman, _ in result['spearman'] if not np.isnan(spearman)]

        pos_label_scores = []
        neg_label_scores = []
        for neg, pos in result['average_precision']:
            pos_label_scores.append(pos[1])
            neg_label_scores.append(neg[1])

        roc_aucs = list(result['roc_auc'])

        print('Model: {}. Iterations: {}'.format(model,ITERATIONS))
        print('Average AUPRC (Positive: 0): {:0.2f}. Standard deviation: {:0.3f}.'.format(mean(neg_label_scores), _spread(neg_label_scores)))
        print('Average AUPRC (Positive: 1): {:0.2f}. Standard deviation: {:0.3f}.'.format(mean(pos_label_scores), _spread(pos_label_scores)))
        print('Average ROC AUC: {:0.2f}. Standard deviation: {:0.3f}.'.format(mean(roc_aucs), _spread(roc_aucs)))
        if spearmans:
            print('Average Spearman correlation: {:0.2f}. Standard deviation: {:0.3f}.'.format(mean(spearmans), _spread(spearmans)))
        else:
            # Every correlation was NaN, so there is nothing to average.
            print('Spearman: NaN')

        report = result['averaged_classification_report']
        formated_string = format_report(report)
        print(formated_string + '\n\n')

In [24]:
################QUESTION 13########################
# Filter the rows of question 13 once and take the three columns from that subset.
question_rows = finalized_dataset[finalized_dataset.Question_Nr == 13]
responses = question_rows.Response
labels = question_rows.Labels
corr_data = question_rows.Grade

repeated_splits = rho.repeated_split(
    X=responses,
    y=labels,
    test_size=TEST_SIZE,
    valid_size=VALID_SIZE,
    stratify=labels,
)
split_list = []
for split in repeated_splits:
    # Each split carries eight items; the dev pair (x_dev, y_dev) is left out
    # and the other six are reordered into the list appended below.
    x_train, y_train, x_dev, y_dev, x_test, y_test, idx1, idx2 = split
    split_list.append([x_train, x_test, y_train, y_test, idx1, idx2])
conv_repeated_splits = rho.convert_data(split_list=split_list, representation=vectorizer)
results = rho.fit_predict(split_list=conv_repeated_splits, correlation_data=corr_data)
print_results(results)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Model: Logistic Regressor. Iterations: 10
Average AUPRC (Positive: 0): 0.84. Standard deviation: 0.218.
Average AUPRC (Positive: 1): 0.98. Standard deviation: 0.029.
Average ROC AUC: 0.92. Standard deviation: 0.114.
Average Spearman correlation: 0.53. Standard deviation: 0.252.
0-> precision: 0.00, recall: 0.00, f1-score: 0.00, support: 2.00, 
1-> precision: 0.78, recall: 1.00, f1-score: 0.88, support: 7.00, 
accuracy-> 0.78
macro avg-> precision: 0.39, recall: 0.50, f1-score: 0.44, support: 9.00, 
weighted avg-> precision: 0.60, recall: 0.78, f1-score: 0.68, support: 9.00, 



Model: Random Forest. Iterations: 10
Average AUPRC (Positive: 0): 0.77. Standard deviation: 0.234.
Average AUPRC (Positive: 1): 0.97. Standard deviation: 0.035.
Average ROC AUC: 0.90. Standard deviation: 0.132.
Average Spearman correlation: 0.54. Standard deviation: 0.224.
0-> precision: 0.00, recall: 0.00, f1-score: 0.00, support: 2.00, 
1-> precision: 0.78, recall: 1.00, f1-score: 0.88, support: 7.00, 
accurac

In [25]:
################QUESTION 33########################
# Filter the rows of question 33 once and take the three columns from that subset.
question_rows = finalized_dataset[finalized_dataset.Question_Nr == 33]
responses = question_rows.Response
labels = question_rows.Labels
corr_data = question_rows.Grade

repeated_splits = rho.repeated_split(
    X=responses,
    y=labels,
    test_size=TEST_SIZE,
    valid_size=VALID_SIZE,
    stratify=labels,
)
split_list = []
for split in repeated_splits:
    # Each split carries eight items; the dev pair (x_dev, y_dev) is left out
    # and the other six are reordered into the list appended below.
    x_train, y_train, x_dev, y_dev, x_test, y_test, idx1, idx2 = split
    split_list.append([x_train, x_test, y_train, y_test, idx1, idx2])
conv_repeated_splits = rho.convert_data(split_list=split_list, representation=vectorizer)
results = rho.fit_predict(split_list=conv_repeated_splits, correlation_data=corr_data)
print_results(results)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: Logistic Regressor. Iterations: 10
Average AUPRC (Positive: 0): 0.81. Standard deviation: 0.198.
Average AUPRC (Positive: 1): 0.82. Standard deviation: 0.208.
Average ROC AUC: 0.77. Standard deviation: 0.251.
Average Spearman correlation: 0.55. Standard deviation: 0.440.
0-> precision: 0.82, recall: 0.59, f1-score: 0.63, support: 3.30, 
1-> precision: 0.61, recall: 0.78, f1-score: 0.68, support: 3.70, 
accuracy-> 0.69
macro avg-> precision: 0.72, recall: 0.69, f1-score: 0.66, support: 7.00, 
weighted avg-> precision: 0.71, recall: 0.69, f1-score: 0.66, support: 7.00, 



Model: Random Forest. Iterations: 10
Average AUPRC (Positive: 0): 0.91. Standard deviation: 0.154.
Average AUPRC (Positive: 1): 0.94. Standard deviation: 0.115.
Average ROC AUC: 0.91. Standard deviation: 0.153.
Average Spearman correlation: 0.78. Standard deviation: 0.159.
0-> precision: 0.77, recall: 1.00, f1-score: 0.85, support: 3.30, 
1-> precision: 0.80, recall: 0.62, f1-score: 0.68, support: 3.70, 
accurac

## 2.2 XLM-R <a class="anchor" id="section_2_2"></a>

In [26]:
#FOR PRINTING RESULTS
def print_results_bert(results):
    """Print the averaged metrics of the XLM-R runs.

    Args:
        results: mapping with keys 'spearman' (iterable of 2-item entries;
            the first item is the correlation, the second is ignored),
            'average_precision' (iterable of (neg, pos) pairs; index 1 of
            each is the score that is averaged), 'roc_auc' (scores to
            average) and 'averaged_classification_report' (handed to
            format_report).

    Returns:
        None. All output goes to stdout.
    """
    def _spread(values):
        # statistics.stdev raises StatisticsError for fewer than two values;
        # report 0 in that case. Each list is guarded by its own length.
        return stdev(values) if len(values) > 1 else 0

    # NaN correlations are dropped, matching print_results; otherwise a
    # single NaN would turn the reported average into NaN.
    spearmans = [spearman for spearman, _ in results['spearman'] if not np.isnan(spearman)]

    pos_label_scores = []
    neg_label_scores = []
    for neg, pos in results['average_precision']:
        pos_label_scores.append(pos[1])
        neg_label_scores.append(neg[1])

    roc_aucs = list(results['roc_auc'])

    print('Model: XLM-R. Iterations: {}'.format(ITERATIONS))
    print('Average AUPRC (Positive: 0): {:0.2f}. Standard deviation: {:0.3f}.'.format(mean(neg_label_scores), _spread(neg_label_scores)))
    print('Average AUPRC (Positive: 1): {:0.2f}. Standard deviation: {:0.3f}.'.format(mean(pos_label_scores), _spread(pos_label_scores)))
    print('Average ROC AUC: {:0.2f}. Standard deviation: {:0.3f}.'.format(mean(roc_aucs), _spread(roc_aucs)))
    if spearmans:
        print('Average Spearman correlation: {:0.2f}. Standard deviation: {:0.3f}.'.format(mean(spearmans), _spread(spearmans)))
    else:
        # Every correlation was NaN, so there is nothing to average.
        print('Spearman: NaN')

    report = results['averaged_classification_report']
    formated_string = format_report(report)
    print(formated_string + '\n\n')

#xlm-r parameters
MODEL_NAME = "xlm-roberta-base"
# BATCH_SIZE, EPOCHS and LR are handed to repeated_bert.fit_predict in the cells below.
BATCH_SIZE = 6
MAX_LEN = 200
CLASS_NAMES = [0,1]
EPOCHS = 30
LR = 1e-5
#xlm-r
# maxlen is taken from MAX_LEN (previously a repeated literal 200), so changing
# the constant actually changes the transformer's setting.
t = text.Transformer(MODEL_NAME, maxlen=MAX_LEN, class_names=CLASS_NAMES)
#Create RepeatedBert object for experiments. (See RepeatedBERT.py in utils>experiment_models>RepeatedBERT)
repeated_bert = RepeatedBERT.RepeatedBERT(transformer=t,metrics=METRICS,iterations=ITERATIONS,random_state=RANDOM_SEED)

In [27]:
################QUESTION 13########################
# Filter the rows of question 13 once and take the three columns from that subset.
question_rows = finalized_dataset[finalized_dataset.Question_Nr == 13]
responses = question_rows.Response
labels = question_rows.Labels
corr_data = question_rows.Grade

#create list of splits
split_list = repeated_bert.repeated_split(
    X=responses,
    y=labels,
    test_size=TEST_SIZE,
    valid_size=VALID_SIZE,
    stratify=labels,
)
#fit and predict each split
bert_results = repeated_bert.fit_predict(
    split_list=split_list,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    lr=LR,
    correlation_data=corr_data,
)
#print results
print_results_bert(bert_results)

In [28]:
################QUESTION 33########################
# Filter the rows of question 33 once and take the three columns from that subset.
question_rows = finalized_dataset[finalized_dataset.Question_Nr == 33]
responses = question_rows.Response
labels = question_rows.Labels
corr_data = question_rows.Grade

#create list of splits
split_list = repeated_bert.repeated_split(
    X=responses,
    y=labels,
    test_size=TEST_SIZE,
    valid_size=VALID_SIZE,
    stratify=labels,
)
#fit and predict each split
bert_results = repeated_bert.fit_predict(
    split_list=split_list,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    lr=LR,
    correlation_data=corr_data,
)
#print results
print_results_bert(bert_results)

In [29]:
#THE XLM-R CELLS ABOVE WERE RUN ON COLAB. THESE WERE THE RESULTS:
#RESULTS Q13:
# Model: XLM-R. Iterations: 10
# Average AUPRC (Positive: 0): 0.95. Standard deviation: 0.158.
# Average AUPRC (Positive: 1): 0.99. Standard deviation: 0.016.
# Average ROC AUC: 0.98. Standard deviation: 0.068.
# Average Spearman correlation: 0.65. Standard deviation: 0.148.
# 0-> precision: 0.85, recall: 0.75, f1-score: 0.78, support: 2.00,
# 1-> precision: 0.94, recall: 0.99, f1-score: 0.96, support: 7.00,
# accuracy-> 0.93
# macro avg-> precision: 0.89, recall: 0.87, f1-score: 0.87, support: 9.00,
# weighted avg-> precision: 0.92, recall: 0.93, f1-score: 0.92, support: 9.00,

#RESULTS Q33:
# Model: XLM-R. Iterations: 10
# Average AUPRC (Positive: 0): 0.94. Standard deviation: 0.071.
# Average AUPRC (Positive: 1): 0.95. Standard deviation: 0.064.
# Average ROC AUC: 0.93. Standard deviation: 0.092.
# Average Spearman correlation: 0.81. Standard deviation: 0.084.
# 0-> precision: 0.78, recall: 0.75, f1-score: 0.73, support: 3.30,
# 1-> precision: 0.86, recall: 0.81, f1-score: 0.77, support: 3.70,
# accuracy-> 0.79
# macro avg-> precision: 0.82, recall: 0.78, f1-score: 0.75, support: 7.00,
# weighted avg-> precision: 0.82, recall: 0.79, f1-score: 0.76, support: 7.00,