# Train DistilBERT Models
Train the DistilBERT emotion models and evaluate them on the hand-tagged sample.

In [None]:
import torch
import emoji
import transformers
from datasets import load_dataset, Dataset, DatasetDict,  concatenate_datasets
from transformers import Trainer, TrainingArguments, pipeline, EarlyStoppingCallback
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, log_loss
import pandas as pd
import numpy as np
import random as rn
from scipy.special import softmax
from sklearn.model_selection import train_test_split

# Fix the seed of every random number generator the notebook relies on so
# that repeated runs start from the same state.
seed_value = 0
# 1. Set `python` built-in pseudo-random generator at a fixed value
rn.seed(seed_value)
# 2. Set `numpy` pseudo-random generator at a fixed value
np.random.seed(seed_value)
# 3. Set the `torch` generators at a fixed value. The models are built with
#    PyTorch, so without this any randomly initialised weights would differ
#    between runs even though the two generators above are seeded.
torch.manual_seed(seed_value)

# Shared configuration for every model trained in this notebook.
MODEL_NAME = "distilbert-base-uncased"  # pretrained checkpoint to start from
SEQ_LENGTH = 64                         # tokens per example after padding/truncation
BATCH_SIZE = 128                        # per-device training batch size

# Emotion classes, in label order; the number of labels follows from the list.
EMOTIONS = [
    "neutral",
    "happy",
    "sad",
    "anger",
    "disgust",
    "surprise",
    "fear",
]
EMOTION_NUM_LABELS = len(EMOTIONS)

# Callbacks handed to every Trainer: early stopping with a patience of 1.
CALLBACK = [EarlyStoppingCallback(early_stopping_patience=1)]

def tokenize(batch):
    """Encode the ``text`` entries of *batch* with the module-level tokenizer.

    Sequences are truncated and padded to ``SEQ_LENGTH`` so that every
    encoded example has the same length. Returns whatever the tokenizer
    returns for the batch.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": SEQ_LENGTH,
    }
    return tokenizer(batch["text"], **encode_options)

# Select the compute device: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print(f'There are {gpu_count} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

def compute_metrics(pred):
    """Return accuracy and macro-averaged precision/recall/F1 for a prediction.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax of
            ``predictions`` over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element (support) is not reported.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    macro_precision, macro_recall, macro_f1 = macro_scores[:3]

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = macro_f1
    metrics['precision'] = macro_precision
    metrics['recall'] = macro_recall
    return metrics

# Hand-tagged sample: only the text and label columns are loaded.
test_df = pd.read_parquet(
    '../emtract/data/tagged_sample.parquet.snappy',
    columns=['text', 'label'],
)

# Extra vocabulary: the keys of emoji.UNICODE_EMOJI["en"] followed by the
# first column of the emoticon dictionary.
emoticons = pd.read_csv('../emtract/data/dictionaries/emoticons.csv').values
EMOJI_EMOTICONS = [*emoji.UNICODE_EMOJI["en"], *emoticons[:, 0]]

In [None]:
# Pretrained tokenizer, extended with the emoji/emoticon vocabulary
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(
    MODEL_NAME,
    model_max_length=SEQ_LENGTH,
)
num_added_toks = tokenizer.add_tokens(EMOJI_EMOTICONS)
print('We have added', num_added_toks, 'tokens')

# Sequence classifier with one output per emotion, moved to the selected device
model = transformers.DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=EMOTION_NUM_LABELS,
)
model = model.to(device)

# resize_token_embeddings expects the full size of the new vocabulary,
# i.e. the length of the tokenizer after the tokens above were added.
model.resize_token_embeddings(len(tokenizer))

## Train Emotion Model on Emotion Metadata

In [None]:
# Emotion source corpus (text + label); labels are cast to int
twitter_df = pd.read_parquet(
    '../emtract/data/emotion_sources.parquet.snappy',
    columns=['text', 'label'],
)
twitter_df['label'] = twitter_df['label'].astype(int)
dataset = Dataset.from_pandas(twitter_df)

# Hold out 20% of the rows; that part is used as the validation split
train_testvalid = dataset.train_test_split(test_size=0.2)
dataset = DatasetDict(
    {
        'train': train_testvalid['train'],
        'valid': train_testvalid['test'],
    }
)

# Tokenize both splits and expose only the model inputs and label as tensors
emotions_encoded = dataset.map(tokenize, batched=True, batch_size=None)
emotions_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [None]:
# Training examples divided by the batch size (computed for reference only;
# it is not passed to TrainingArguments below).
logging_steps = len(emotions_encoded["train"]) // BATCH_SIZE

# Evaluate and checkpoint once per epoch; when training ends, reload the
# checkpoint selected by the "loss" metric.
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["valid"],
    compute_metrics=compute_metrics,
    callbacks=CALLBACK,
)
trainer.train();

In [None]:
import os

# Create the output directory if it is missing. Unlike os.mkdir, this does not
# raise FileExistsError when the cell is re-run and "models" already exists.
os.makedirs("models", exist_ok=True)

# Persist the fine-tuned model together with its extended tokenizer so that
# both can be reloaded from the same directory.
model.save_pretrained('./models/emotion-twitter')
tokenizer.save_pretrained('./models/emotion-twitter')

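# Transfer it to the StockTwits Hand-Tagged Sample
In each of the five cross-validation folds, use 8k observations for training/validation and hold out 2k for testing. Both models (the Twitter-only model and the transfer-learned model) are compared on each fold's held-out test set.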

In [None]:
from sklearn.model_selection import KFold

# Grab tokenizer
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained('./models/emotion-twitter', model_max_length=SEQ_LENGTH)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# Twitter-only baseline: loaded once and only used for prediction below.
tw_model = transformers.DistilBertForSequenceClassification.from_pretrained("./models/emotion-twitter", num_labels=EMOTION_NUM_LABELS).to(device)

# Per-fold test metrics are collected in lists and concatenated after the loop
# (DataFrame.append was removed in pandas 2.0).
tw_frames, transfer_frames = [], []

# The training configuration does not depend on the fold, so build it once.
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=8,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE*4,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="loss",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False)

# K-fold Cross Validation model evaluation
for fold, (train_ids, test_ids) in enumerate(kfold.split(test_df)):
    # Every fold starts from a fresh copy of the Twitter-trained weights.
    model = transformers.DistilBertForSequenceClassification.from_pretrained("./models/emotion-twitter", num_labels=EMOTION_NUM_LABELS).to(device)

    # KFold yields positional indices, so rows must be selected with .iloc;
    # .loc would look the numbers up as index labels instead.
    # The first 1000 training rows of the fold form the validation split.
    train_data = Dataset.from_pandas(test_df.iloc[train_ids[1000:]])
    valid_data = Dataset.from_pandas(test_df.iloc[train_ids[:1000]])
    test_data = Dataset.from_pandas(test_df.iloc[test_ids])
    dataset = DatasetDict({'train': train_data, 'valid': valid_data, 'test': test_data})
    emotions_encoded = dataset.map(tokenize, batched=True, batch_size=None)
    emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    trainer = Trainer(model=model, args=training_args,
                      compute_metrics=compute_metrics,
                      train_dataset=emotions_encoded["train"],
                      eval_dataset=emotions_encoded["valid"],
                      callbacks=CALLBACK)
    trainer.train();

    # Keep the model fine-tuned on fold 2 as the saved transfer model.
    if fold == 2:
        model.save_pretrained('./models/emotion-transfer')
        tokenizer.save_pretrained('./models/emotion-transfer')

    print("Performance via transfer learning...")
    preds_output_transfer = trainer.predict(emotions_encoded["test"])
    print(preds_output_transfer.metrics)
    m = softmax(preds_output_transfer.predictions, axis=1)
    print(classification_report(preds_output_transfer.label_ids, np.argmax(m, axis=1), target_names=EMOTIONS))

    # STORE TEST SET PERFORMANCE
    transfer_frames.append(pd.DataFrame(preds_output_transfer.metrics, index=[0]))

    print("Performance via Twitter model learning...")
    # This Trainer is only used for prediction; tw_model is not trained here.
    trainer_tw = Trainer(model=tw_model, args=training_args,
                         compute_metrics=compute_metrics,
                         train_dataset=emotions_encoded["train"],
                         eval_dataset=emotions_encoded["valid"],
                         callbacks=CALLBACK)
    preds_output_tw = trainer_tw.predict(emotions_encoded["test"])
    print(preds_output_tw.metrics)
    m = softmax(preds_output_tw.predictions, axis=1)
    print(classification_report(preds_output_tw.label_ids, np.argmax(m, axis=1), target_names=EMOTIONS))

    tw_frames.append(pd.DataFrame(preds_output_tw.metrics, index=[0]))

# One row of test metrics per fold for each model.
tw_perf = pd.concat(tw_frames)
transfer_perf = pd.concat(transfer_frames)

tw_perf.to_csv("models/tw_perf.csv", index=False)
transfer_perf.to_csv("models/transfer_perf.csv", index=False)

## Explain Emotion Models

In [None]:
import transformers

# load emotion models
transfer_model = transformers.DistilBertForSequenceClassification.from_pretrained("./models/emotion-transfer", num_labels=EMOTION_NUM_LABELS).to(device)
tw_model = transformers.DistilBertForSequenceClassification.from_pretrained("./models/emotion-twitter", num_labels=EMOTION_NUM_LABELS).to(device)

# load tokenizer
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained('./models/emotion-twitter', model_max_length=SEQ_LENGTH)

# Pipeline device index: 0 selects the first GPU, -1 selects the CPU. Deriving
# it from `device` keeps the pipelines on the same device as the models and
# avoids a failure on hosts without a GPU (the index was hard-coded to 0).
pipeline_device = 0 if device.type == "cuda" else -1

# build a pipeline object to do predictions
pred_tw = transformers.pipeline("text-classification", model=tw_model, tokenizer=tokenizer, device=pipeline_device, return_all_scores=True)
pred_transfer = transformers.pipeline("text-classification", model=transfer_model, tokenizer=tokenizer, device=pipeline_device, return_all_scores=True)

In [None]:
import shap

# Compute SHAP values for both prediction pipelines on the same texts.
sample_texts = test_df['text']

# 1) Model trained on Twitter data only
explainer_tw = shap.Explainer(pred_tw)
shap_values_tw = explainer_tw(sample_texts)

# 2) Model transferred to the hand-tagged sample
explainer_transfer = shap.Explainer(pred_transfer)
shap_values_transfer = explainer_transfer(sample_texts)