In [1]:
import configuration

from transformers import AutoTokenizer, AutoModel 
from src import settings, load_embeddings, classification
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
from tqdm.auto import tqdm
import seaborn as sns
import pandas as pd

from os.path import basename
import numpy as np
import glob

# Notebook-wide setup: pandas display options, tqdm/pandas integration and
# warning suppression.
import warnings

# Do not truncate long cell contents when frames are displayed.
pd.set_option("display.max_colwidth", None)

# Hook tqdm into pandas.
tqdm.pandas()

# Blanket filter: every warning raised after this point is ignored.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below; narrow the filter if those matter.
warnings.simplefilter("ignore")

In [2]:
# Read the two CSV inputs used by every experiment in this notebook.

# Tweets from the 'negative class', i.e. not related to a crisis.
negative_tweets = pd.read_csv('../data/selected/Subset_Not_related_tweets.csv')

# Subset of the experimental data.
data = pd.read_csv('../data/selected/Subset_Data_Uniques_LF.csv')

# Cell output: the (rows, columns) shape of each frame.
(data.shape, negative_tweets.shape)

((67001, 66), (13446, 66))

In [3]:
# Stack the experimental rows and the negative-class rows into one frame with
# a fresh 0..n-1 index. Per the original note, tweets from the negative subset
# are subsampled to balance the test data in each experiment.
# NOTE(review): this rebinds `data`, so running the cell a second time appends
# the negative tweets again; re-run the loading cell before re-running this one.
data = pd.concat(objs=[data, negative_tweets], ignore_index=True)

In [4]:
def evaluate(data, columns, features, model, experiments, runs=5, mkw=None,
             out_dir='../results/balanced', min_per_class=50):
    """Run every experiment `runs` times and write one result CSV per run.

    For each run, `classification.train_v2` is called once per
    (scenario, experiment) pair found in `experiments`, each time with a freshly
    constructed `model(**mkw)`. The collected results are filtered to the
    experiments that have at least `min_per_class` test instances in both
    classes (columns `test_1` and `test_0` of the result) and saved to
    `<out_dir>/RF_<features>_<run>.csv`.

    Parameters
    ----------
    data : the frame handed to `classification.train_v2` unchanged.
    columns : feature columns, forwarded to `classification.train_v2`.
    features : feature-set name; forwarded to `train_v2` and used in the
        output file name.
    model : callable returning a new estimator, called as `model(**mkw)`.
    experiments : mapping ``{scenario: {experiment: setting}}``.
    runs : number of repetitions; run indices start at 1.
    mkw : keyword arguments for `model`. Defaults to ``{'n_jobs': 3}``.
    out_dir : directory for the CSV files; created if it does not exist.
    min_per_class : minimum number of test instances required in each class
        for an experiment to be kept in the output.

    Returns
    -------
    None. The results are only written to disk.

    Notes
    -----
    The file prefix is always ``RF_``, whatever `model` is.
    """
    # Local import: the notebook's import cell only pulls `basename` from os.path.
    import os

    # A dict literal as a default argument is shared between calls; build it here.
    if mkw is None:
        mkw = {'n_jobs': 3}

    # Fail early (or create the folder) instead of discovering a missing
    # directory only after a complete training run.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for i in range(1, runs + 1):
        results = pd.DataFrame([
            # The trailing positional `True` is passed through as in the
            # original call; its meaning is defined by `train_v2`.
            classification.train_v2(data, columns, scenario, experiment, setting,
                                    model(**mkw), features, True)
            for scenario, experiments_ in experiments.items()
                for experiment, setting in tqdm(experiments_.items(), desc=scenario)
        ])
        # Experiments lacking enough test instances in either class are dropped.
        too_small = ((results['test_1'] < min_per_class)
                     | (results['test_0'] < min_per_class))
        target = os.path.join(out_dir, f'RF_{features}_{i}.csv')
        results[~too_small].to_csv(target, index=False)
        
def load_data(data, functor, col_index, params):
    """Return the frame to train on and the labels of its trailing feature columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Base frame.
    functor : callable or None
        When given, it is called as ``functor(data, params)`` and its result is
        concatenated column-wise to the right of `data`. When None, a copy of
        `data` is used as is.
    col_index : int
        Number of trailing columns of the resulting frame to report as
        feature columns.
    params : object
        Passed through to `functor` untouched.

    Returns
    -------
    (pandas.DataFrame, pandas.Index)
        The (possibly extended) frame and the labels of its last `col_index`
        columns. A `col_index` of 0 yields an empty index.
    """
    if functor is None:
        subset = data.copy()
    else:
        subset = pd.concat([data, functor(data, params)], axis=1, sort=False)

    # `columns[-0:]` is `columns[0:]`, i.e. *every* column, so zero needs its
    # own branch to mean "no feature columns".
    if col_index:
        feature_columns = subset.columns[-col_index:]
    else:
        feature_columns = subset.columns[:0]
    return subset, feature_columns

def evaluate_embeddings(data, functor, col_index, feature_name,
                        clf, experiments, add_lf=False, params=None,
                        lf_columns=None):
    """Build a feature set with `load_data` and run `evaluate` on it.

    Parameters
    ----------
    data : pandas.DataFrame
        Base frame.
    functor : callable or None
        Feature builder forwarded to `load_data`; None uses `data` as is.
    col_index : int
        Number of trailing columns of the built frame used as features.
    feature_name : str
        Name forwarded to `evaluate`.
    clf : callable
        Estimator factory forwarded to `evaluate`.
    experiments : mapping
        Experimental design forwarded to `evaluate`.
    add_lf : bool
        When True, extra columns of `data` are appended to the feature columns.
    params : object
        Forwarded to `functor` through `load_data`.
    lf_columns : iterable of column labels, optional
        Columns appended when `add_lf` is True. Defaults to
        ``data.columns[-49:-1]`` (the 48 columns before the last one).

    Returns
    -------
    None.
    """
    subset, columns = load_data(data, functor, col_index, params)
    if add_lf:
        if lf_columns is None:
            # NOTE(review): the 'LF'-only run in this notebook passes
            # col_index=48 with no functor, which selects data.columns[-48:].
            # That window is offset by one from this default; confirm which of
            # the two is the intended set of columns.
            lf_columns = data.columns[-49:-1]
        columns = list(columns) + list(lf_columns)
    evaluate(subset, columns, feature_name, clf, experiments)

In [6]:
# Estimator factory and experimental design shared by all runs below.
clf = RandomForestClassifier
experiments = settings.experimental_design()

# Each call evaluates one feature set; the integer is the number of trailing
# columns used as features.

# Columns already present in `data` (no feature builder).
evaluate_embeddings(data, None, 48, 'LF', clf, experiments)

# Features produced by the GloVe and MUSE loaders; the last run also appends
# columns of `data` (add_lf).
evaluate_embeddings(data, load_embeddings.glove_features, 100, 'MT+GloVe', clf, experiments)
evaluate_embeddings(data, load_embeddings.muse_features, 300, 'MUSE', clf, experiments)
evaluate_embeddings(data, load_embeddings.muse_features, 300, 'MUSE+LF', clf, experiments,
                    add_lf=True)

# Features produced by `load_embeddings.model_features`, one checkpoint per run.
# Presumably 'col_text' names the text column fed to the model -- TODO confirm.
XLMR_params = dict(ml='xlm-roberta-base', model=AutoModel,
                   tokenizer=AutoTokenizer, col_text='fixed_text_expanded')
evaluate_embeddings(data, load_embeddings.model_features, 768, 'XLM-R', clf, experiments,
                    params=XLMR_params)

XLMT_params = dict(ml='cardiffnlp/twitter-xlm-roberta-base', model=AutoModel,
                   tokenizer=AutoTokenizer, col_text='fixed_text_expanded')
evaluate_embeddings(data, load_embeddings.model_features, 768, 'XLM-T', clf, experiments,
                    params=XLMT_params)

BERT_params = dict(ml='bert-base-uncased', model=AutoModel,
                   tokenizer=AutoTokenizer, col_text='translated')
evaluate_embeddings(data, load_embeddings.model_features, 768, 'MT+BERT', clf, experiments,
                    params=BERT_params)

BERTM_params = dict(ml='bert-base-multilingual-cased', model=AutoModel,
                    tokenizer=AutoTokenizer, col_text='fixed_text_expanded')
evaluate_embeddings(data, load_embeddings.model_features, 768, 'mBERT', clf, experiments,
                    params=BERTM_params)