# Train DistilBERT Models
Evaluate them on the hand-tagged sample.

In [None]:
import torch
import transformers
from datasets import load_dataset, Dataset, DatasetDict,  concatenate_datasets
from transformers import Trainer, TrainingArguments, pipeline, EarlyStoppingCallback
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, log_loss
import pandas as pd
import numpy as np
from scipy.special import softmax
from sklearn.model_selection import train_test_split

# Pretrained checkpoint used for every tokenizer and model in this notebook.
MODEL_NAME = 'distilbert-base-uncased'

# Per-device training batch size; evaluation uses a multiple of it.
BATCH_SIZE = 128

# Passed to the tokenizer as `model_max_length`.
SEQ_LENGTH = 30

# Class names; the list order is used as `target_names` in the reports below.
SENTIMENTS = ['bearish', 'bullish']
EMOTIONS = [
    'neutral',
    'happy',
    'sad',
    'anger',
    'disgust',
    'surprise',
    'fear',
]

# Sizes of the classification heads.
SENTIMENT_NUM_LABELS = 2
EMOTION_NUM_LABELS = len(EMOTIONS)
# Callback list handed to every Trainer below (early stopping, patience of 2).
CALLBACK = [EarlyStoppingCallback(early_stopping_patience=2)]

def tokenize(batch):
    """Tokenize the ``"text"`` field of *batch* with the global ``tokenizer``.

    Padding and truncation are both enabled. The module-level ``tokenizer``
    must be defined before this function is called.

    Args:
        batch: mapping with a ``"text"`` entry, as supplied by ``Dataset.map``.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Select the compute device once; the models below are moved to it via `.to(device)`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the
            last axis of ``predictions``.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }

In [None]:
# Emotion test sets, both built from the hand-tagged sample:
#   test_df  - every row with its original label (reported with EMOTIONS as target names)
#   test_set - 5-class subset with labels remapped for the Hugging Face emotion model
test_df = pd.read_parquet('../emtract/data/hand_tagged_sample.parquet.snappy', columns=['text','label'])
test_set = test_df.copy() # independent copy for the 5-class classification
# Drop labels 0 and 4 (presumably 'neutral' and 'disgust' per the EMOTIONS order -- confirm).
test_set = test_set[(test_set.label !=0) & (test_set.label != 4)].reset_index(drop=True)
# The three remaps below are order-dependent; do not reorder them.
# Step 1: 2 -> 0.
test_set['label'] = np.where(test_set.label ==2, 0, test_set.label)
# Step 2: park 5 at 7 so that the next step shifts it back down to 5.
test_set['label'] = np.where(test_set.label==5, 7, test_set.label)
# Step 3: shift every label >= 6 down by two (6 -> 4, 7 -> 5).
test_set['label'] = np.where(test_set.label>=6, test_set.label-2, test_set.label)
# Net effect on the kept labels: 1 -> 1, 2 -> 0, 3 -> 3, 5 -> 5, 6 -> 4.
# NOTE(review): this looks like the label ids of the Hugging Face "emotion"
# dataset, so that `change_labels` can later be applied to all splits alike -- confirm.

# Convert both frames to `datasets.Dataset` objects (rebinding the same names).
test_df = Dataset.from_pandas(test_df)
test_set = Dataset.from_pandas(test_set)

# Huggingface Twitter Emotion Model
The Hugging Face emotion dataset does not have the disgust and neutral classes. Love and joy are grouped together into a single class.

In [None]:
# Prepare data
def change_labels(examples):
    """Remap one example's label to the 5-class scheme used for evaluation.

    Mapping applied to ``examples['label']``:
        0      -> 1
        1, 2   -> 0   (two source classes merged into one)
        n >= 3 -> n - 1

    The resulting ids 0..4 line up with the ``target_names`` list used for
    the classification report of this model.

    Args:
        examples: a single example (mapping) with an integer ``'label'``.

    Returns:
        ``{'label': new_label}``, suitable for ``Dataset.map``.

    Raises:
        ValueError: if the label is not covered by the mapping (for example
            a negative value). The original code fell through to an
            undefined name here and failed with ``NameError``.
    """
    label = examples['label']
    if label == 0:
        return {'label': 1}
    if label in (1, 2):
        return {'label': 0}
    if label >= 3:
        return {'label': label - 1}
    raise ValueError(f"Unexpected emotion label: {label!r}")


emotions = load_dataset("emotion")

# Train on the dataset's train + validation splits and keep its test split
# out of training so it can serve as unseen validation data. The previous
# version concatenated emotions['test'] into 'train' while also using it as
# 'valid', which leaked the validation examples into training and made
# early stopping / best-model selection meaningless.
# The hand-tagged 5-class sample (`test_set`) remains the final test split.
dataset = DatasetDict({
    'train': concatenate_datasets([emotions['train'], emotions['validation']]),
    'test': test_set,
    'valid': emotions['test']})

tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(MODEL_NAME, model_max_length=SEQ_LENGTH)

# batch_size=None hands each split to `tokenize` as a single batch.
emotions_encoded = dataset.map(tokenize, batched=True, batch_size=None)
# Remap the labels of every split (including the hand-tagged test split).
emotions_encoded = emotions_encoded.map(change_labels)
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Five classes remain after `change_labels` merges and drops classes.
num_labels = 5
# Log the training loss roughly once per epoch (single-device step count).
# Clamped to 1 so a training set smaller than BATCH_SIZE does not yield 0.
# This value was previously computed but never passed to TrainingArguments.
logging_steps = max(1, len(emotions_encoded["train"]) // BATCH_SIZE)
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=8,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE*4,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="loss",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  logging_steps=logging_steps,
                                  disable_tqdm=False)
model = transformers.DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels).to(device)

In [None]:
# Fine-tune on the emotion data; the 'valid' split is evaluated every epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotions_encoded["train"],
    eval_dataset=emotions_encoded["valid"],
    compute_metrics=compute_metrics,
    callbacks=CALLBACK,
)
trainer.train();

In [None]:
# Uncomment to persist the fine-tuned model and its tokenizer.
#model.save_pretrained('./models/emotion-hf')
#tokenizer.save_pretrained('./models/emotion-hf')

# Score the hand-tagged 5-class test split.
preds_output = trainer.predict(emotions_encoded["test"])
print(preds_output.metrics)

# Row-wise softmax turns the raw model outputs into class probabilities.
m = softmax(preds_output.predictions, axis=1)

In [None]:
# Class names in the order of the label ids produced by `change_labels`.
target_names = ['happy', 'sad', 'anger', 'fear', 'surprise']
print(classification_report(
    y_true=preds_output.label_ids,
    y_pred=m.argmax(axis=1),
    target_names=target_names,
))

# StockTwits Hand-Tagged
Use 8k observations for training/validation and hold out 2k for testing.

In [None]:
# Start again from the pretrained checkpoint, now with a 7-class head.
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(
    MODEL_NAME,
    model_max_length=SEQ_LENGTH,
)
model = transformers.DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=EMOTION_NUM_LABELS,
).to(device)

In [None]:
# 60% train; the remaining 40% is halved into validation and test (20% each).
# NOTE: no seed is passed, so the split differs from run to run.
train_testvalid = test_df.train_test_split(test_size=0.4)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)

dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train'],
})

# Tokenize each split as a single batch, then expose the model inputs as tensors.
emotions_encoded = dataset.map(tokenize, batched=True, batch_size=None)
emotions_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [None]:
# Log the training loss roughly once per epoch (single-device step count).
# Clamped to 1 so a training set smaller than BATCH_SIZE does not yield 0.
# This value was previously computed but never passed to TrainingArguments.
logging_steps = max(1, len(emotions_encoded["train"]) // BATCH_SIZE)
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=8,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE*4,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="loss",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  logging_steps=logging_steps,
                                  disable_tqdm=False)

# Fine-tune on the hand-tagged training split; 'valid' is evaluated every epoch.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["valid"],
                  callbacks=CALLBACK)
trainer.train();

In [None]:
# Score the held-out hand-tagged split.
preds_output = trainer.predict(emotions_encoded["test"])
print(preds_output.metrics)

# Row-wise softmax turns the raw model outputs into class probabilities.
m = softmax(preds_output.predictions, axis=1)

print(classification_report(
    y_true=preds_output.label_ids,
    y_pred=m.argmax(axis=1),
    target_names=EMOTIONS,
))

# Twitter Emotion

In [None]:
dataset = Dataset.from_pandas(
    pd.read_parquet(
        '../emtract/data/twitter_emotion_cleaned.parquet.snappy',
        columns=['text', 'label'],
    )
)

# 80% train, 20% validation (test_size=0.2); no seed is passed.
train_testvalid = dataset.train_test_split(test_size=0.2)

# Gather the splits in a single DatasetDict; the full hand-tagged sample
# (`test_df`) is the test split.
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_df,
    'valid': train_testvalid['test'],
})

In [None]:
# Start again from the pretrained checkpoint with a 7-class head.
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(
    MODEL_NAME,
    model_max_length=SEQ_LENGTH,
)
model = transformers.DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=EMOTION_NUM_LABELS,
).to(device)

# Tokenize each split as a single batch, then expose the model inputs as tensors.
emotions_encoded = dataset.map(tokenize, batched=True, batch_size=None)
emotions_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)

In [None]:
# Log the training loss roughly once per epoch (single-device step count).
# Clamped to 1 so a training set smaller than BATCH_SIZE does not yield 0.
# This value was previously computed but never passed to TrainingArguments.
logging_steps = max(1, len(emotions_encoded["train"]) // BATCH_SIZE)
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=8,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE*4,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="loss",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  logging_steps=logging_steps,
                                  disable_tqdm=False)

# Fine-tune on the Twitter emotion training split; 'valid' is evaluated every epoch.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["valid"],
                  callbacks=CALLBACK)
trainer.train();

In [None]:
# Score the full hand-tagged sample with the Twitter-trained model.
preds_output = trainer.predict(emotions_encoded["test"])
print(preds_output.metrics)

# Row-wise softmax turns the raw model outputs into class probabilities.
m = softmax(preds_output.predictions, axis=1)

print(classification_report(
    y_true=preds_output.label_ids,
    y_pred=m.argmax(axis=1),
    target_names=EMOTIONS,
))