In [None]:
# to supress the output here
%%capture 
!pip install simpletransformers

In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from simpletransformers.classification import ClassificationModel, ClassificationArgs


In [None]:
# Experiment setup switch: 'in' selects the olid-* training files,
# 'cross' selects the hasoc-* training files in the cells below.
mode = 'in'  # 'in' or 'cross'

# Fail fast: every later cell branches on `mode`, and an unexpected value would
# otherwise only show up further down as a NameError or a stale variable.
if mode not in ('in', 'cross'):
    raise ValueError(f"mode must be 'in' or 'cross', got {mode!r}")

## Load data

In [None]:
# Read the training set that belongs to the selected mode, plus the test set
# (the same file for both modes).
train_csv_by_mode = {
    'in': 'data/olid-train-all.csv',
    'cross': 'data/hasoc-train-all.csv',
}

if mode in train_csv_by_mode:
    train_data = pd.read_csv(train_csv_by_mode[mode])

test_data = pd.read_csv('data/olid-test.csv')

## Make training data for meta model

In [None]:
# Fine-tuning settings for the three base models
# (model_args1 -> bert-base-uncased, model_args2 -> fBERT, model_args3 -> hateBERT).
# Everything except the epoch count and the learning rate is shared, so the shared
# part is written once and each model only lists what differs.
shared_model_args = {
    'max_seq_length': 64,
    'train_batch_size': 8,
    'eval_batch_size': 8,
    'adam_epsilon': 1e-8,
    'overwrite_output_dir': True,
    'manual_seed': 123,
}

# (num_train_epochs, learning_rate) for model 1, 2 and 3, per mode.
# <------------- SET THESE CORRECTLY
tuned_settings = {
    'in': [(1, 2e-5), (1, 2e-5), (2, 3e-5)],
    'cross': [(2, 2e-5), (3, 2e-5), (2, 3e-5)],
}

# An unknown mode used to leave model_args1..3 undefined without any message.
if mode not in tuned_settings:
    raise ValueError(f"mode must be one of {sorted(tuned_settings)}, got {mode!r}")

# One independent dict per model, so changing one never affects the others.
model_args1, model_args2, model_args3 = (
    {**shared_model_args, 'num_train_epochs': epochs, 'learning_rate': learning_rate}
    for epochs, learning_rate in tuned_settings[mode]
)

In [None]:
# Prepare the training data for the meta model.
#
# How it works:
# - the training data is split into 5 stratified folds;
# - for every fold, the three base models are fine-tuned on the other folds (80% of the
#   original training data) and then make predictions on the held-out fold (20%);
# - the predictions of each model are collected;
# - after the 5 folds we have collected predictions on the full training data;
# - these predictions are used to train the meta model later.

##code from slides##
ids = []           # never filled in this notebook; kept so the cell still defines the same names
index = []         # positional row numbers in train_data, in the order the folds were held out
gold_train_y = []  # gold labels, same order as `index`
train_x = []       # texts, same order as `index`
pred_model_1 = []  # held-out predictions of bert-base-uncased
pred_model_2 = []  # held-out predictions of diptanu/fBERT
pred_model_3 = []  # held-out predictions of GroNLP/hateBERT

# (pretrained checkpoint, fine-tuning args, list that collects its predictions)
base_models = [
    ('bert-base-uncased', model_args1, pred_model_1),
    ('diptanu/fBERT', model_args2, pred_model_2),
    ('GroNLP/hateBERT', model_args3, pred_model_3),
]

# shuffle=True without random_state produces different folds on every run;
# seeding it (same value as 'manual_seed' in the model args) makes the folds reproducible.
rskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

for train_index, test_index in rskf.split(train_data['text'], train_data['labels']):

    train_df = train_data.iloc[train_index]
    test_df = train_data.iloc[test_index]
    held_out_texts = list(test_df.text)

    index.extend(test_index)
    gold_train_y.extend(test_df['labels'])
    train_x.extend(held_out_texts)

    for checkpoint, model_args, collected in base_models:
        # a fresh model is created for every fold and fine-tuned on that fold's train split only
        model = ClassificationModel('bert', checkpoint, args=model_args, use_cuda=True)
        model.train_model(train_df)
        predictions, _ = model.predict(held_out_texts)  # second return value is not used
        collected.extend(predictions)
                           

   


In [None]:
# Put all the info from the lists of the k-fold cell into one dataframe.
# The frame is built in one go from a dict: attribute-style assignment on an empty frame
# is fragile (assigning to `.index` sets the row index, not a column called 'index').

# getting the OG ids back: `index` holds row positions in train_data
fold_order_ids = train_data['id'].iloc[index].tolist()

meta_train_data = pd.DataFrame(
    {
        # columns in the order we are used to
        'id': fold_order_ids,
        'text': train_x,
        'labels': gold_train_y,
        'predicted1': pred_model_1,
        'predicted2': pred_model_2,
        'predicted3': pred_model_3,
    },
    index=index,  # row index = position of the row in train_data
)

In [None]:
# Persist the collected predictions, so the expensive 5-fold cell does not have to be
# run again when only the code below changes. The file is tab-separated.
meta_train_path = f'data/meta_training_file_{mode}.csv'
meta_train_data.to_csv(meta_train_path, sep='\t')

## Load additional features (training data)

In [None]:
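### use this cell to read in the saved file from above instead of re-running the 5-fold cell
### (the file is written tab-separated with the row index as first column, so it is read back the same way)

# if mode == 'in':
#     meta_train_data = pd.read_csv('data/meta_training_file_in.csv', sep='\t', index_col=0)
    
# if mode == 'cross':
#     meta_train_data = pd.read_csv('data/meta_training_file_cross.csv', sep='\t', index_col=0)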


In [None]:
# Scaled additional features for the training set of the selected mode.
train_feature_csv_by_mode = {
    'in': 'data/olid_train_scaled_features.csv',
    'cross': 'data/hasoc_train_scaled_features.csv',
}

if mode in train_feature_csv_by_mode:
    additional_feat_train_data = pd.read_csv(train_feature_csv_by_mode[mode], sep=';')


In [None]:
# Attach the additional features to the base-model predictions; rows are matched on 'id'.
full_meta_train_data = meta_train_data.merge(additional_feat_train_data, on='id')

In [None]:
# Quick look at the merged training frame: the first rows are enough to check the columns.
full_meta_train_data.head()

In [None]:
# Turn the features into a matrix for the meta model.
# No real vectorizing is needed because every feature is already numeric.
meta_feature_columns = [
    'predicted1', 'predicted2', 'predicted3',
    'text_length', '#words', 'av_wordlen', 'Caps', 'Excl',
    'pronouns', 'unknownwords', 'Hate',
]
features_df = full_meta_train_data[meta_feature_columns]

# 2-D array: one row per training instance, one column per feature
train_features = features_df.to_numpy()

gold_labels = list(full_meta_train_data['labels'])

In [None]:
# Show only the first rows of the training feature matrix as a sanity check.
train_features[:5]

## Train meta model

In [None]:
# Fit the meta model on the base-model predictions plus the additional features.
# random_state is fixed (same value as 'manual_seed' in the base-model args) so that
# repeated runs on the same features give the same meta model.
meta_clf = LinearSVC(max_iter=10000, random_state=123)  # change parameters?
meta_clf.fit(train_features, gold_labels)


## Train (finetune) base models on full training data

In [None]:
#not done here 

## Make test data for meta model

In [None]:
# Step 1: predictions of the base models on the test data.
# They are not computed here; the output file of the previous assignment is read instead.
meta_test_data = pd.read_csv('data/finetuned-models-predictions.csv', sep=';')

# Step 2: additional features for the test data (output from add_features.ipynb).
test_feature_csv_by_mode = {
    'in': 'data/olid_test_olid_scaled_features.csv',
    'cross': 'data/olid_test_hasoc_scaled_features.csv',
}

if mode in test_feature_csv_by_mode:
    additional_feat_test_data = pd.read_csv(test_feature_csv_by_mode[mode], sep=';')


In [None]:
# Attach the additional features to the base-model test predictions; rows are matched on 'id'.
full_meta_test_data = meta_test_data.merge(additional_feat_test_data, on='id')

In [None]:
# Quick look at the merged test frame: the first rows are enough to check the columns.
full_meta_test_data.head()

In [None]:
# Represent the test features as a matrix, with the columns in the same order as the
# training features (three base-model predictions first, then the additional features).
if mode not in ('in', 'cross'):
    # Without this check an unknown mode would skip the selection below and silently
    # reuse the `features_df` that still holds the TRAINING features.
    raise ValueError(f"mode must be 'in' or 'cross', got {mode!r}")

# The base-model prediction columns of the test file carry the mode as suffix
# (bert_in / bert_cross, ...), so one column list serves both modes.
features_df = full_meta_test_data[[
    f'bert_{mode}', f'fbert_{mode}', f'hatebert_{mode}',
    'text_length', '#words', 'av_wordlen', 'Caps', 'Excl',
    'pronouns', 'unknownwords', 'Hate',
]]

test_features = features_df.values

gold_labels_test = list(full_meta_test_data.gold)

In [None]:
# Show only the first rows of the test feature matrix as a sanity check.
test_features[:5]

## Make final predictions on test data with meta model and evaluate

In [None]:
# predict: apply the fitted meta model to the test feature matrix built above
y_pred = meta_clf.predict(test_features)

In [None]:
# Save the stacking predictions next to the other ensemble outputs,
# to be safe and for the error analysis later.
# NOTE(review): the predictions are attached by position, so this assumes the rows of
# ensemble_output_all_models.csv are in the same order as full_meta_test_data -- confirm.
ensemble_output_path = 'ensemble_output_all_models.csv'
output = pd.read_csv(ensemble_output_path, sep=';')
output[f'stacking_pred_{mode}'] = y_pred

# index=False: otherwise every read/write round trip (one per mode, one per re-run)
# adds another 'Unnamed: 0' column to the file.
output.to_csv(ensemble_output_path, sep=';', index=False)

In [None]:
# Evaluation of the meta model on the test set: classification report, then confusion matrix.
print(f'{mode} domain setup')
for metric in (classification_report, confusion_matrix):
    print(metric(gold_labels_test, y_pred))