In [None]:
# Upgrade transformers by installing both upgrade and simple transformers
!pip install --upgrade transformers
!pip install simpletransformers

Requirement already up-to-date: transformers in /usr/local/lib/python3.7/dist-packages (4.5.1)


In [None]:
# Import packages needed 
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import classification_report, confusion_matrix
import argparse
import os
from google.colab import files

# Pre-processing

In [None]:
# Shared label parser used by all three tasks
def preprocess_label(x):
    """Turn an iterable of label strings into a list of ints.

    Newline characters are removed from each entry before it is passed to
    ``int``.

    Args:
        x: iterable of strings, e.g. the result of ``readlines()`` on a
            labels file.

    Returns:
        list of int, one per entry of ``x``, in the same order.
    """
    return [int(raw.replace("\n", "")) for raw in x]

# Emotion Tasks

In [None]:
# Read the emotion training tweets as a list of lines (each keeps its trailing newline).
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/emotion/train_text.txt", 'r', encoding="utf-8") as train_tweet_emotion:
    train_tweet_emotion_2 = train_tweet_emotion.readlines()

In [None]:
# Read the emotion training labels as a list of lines.
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/emotion/train_labels.txt", 'r', encoding="utf-8") as train_labels_emotion:
    train_labels_emotion_2 = train_labels_emotion.readlines()

In [None]:
# Parse the raw emotion training label lines into integers
train_label_emotion = preprocess_label(x=train_labels_emotion_2)

In [None]:
# Pair every emotion training tweet with its integer label (columns: 'tweet', 'label_no').
# NOTE(review): the tweets still carry the trailing newline left by readlines() - confirm intended.
df_train_emotion = pd.DataFrame(
    {
        'tweet': train_tweet_emotion_2,
        'label_no': train_label_emotion,
    }
)

In [None]:
# Training configuration: one epoch, fixed seed 42, overwrite_output_dir enabled
model_args = ClassificationArgs(
    num_train_epochs=1,
    overwrite_output_dir=True,
    manual_seed=42,
)
# roberta-base classifier for the emotion task (4 labels), with CUDA requested
model = ClassificationModel(
    model_type='roberta',
    model_name='roberta-base',
    use_cuda=True,
    num_labels=4,
    args=model_args,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [None]:
# Fine-tune the model on the emotion training set.
# simpletransformers looks for columns named "text" and "labels"; renaming on the way in
# avoids the positional "column 0 / column 1" fallback (and its warning) without
# modifying df_train_emotion itself.
model.train_model(df_train_emotion.rename(columns={'tweet': 'text', 'label_no': 'labels'}))

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/3257 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/408 [00:00<?, ?it/s]

(408, 0.8034599822598929)

In [None]:
# Read the emotion test tweets as a list of lines (each keeps its trailing newline).
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/emotion/test_text.txt", 'r', encoding="utf-8") as test_tweet_emotion:
    test_tweet_emotion_2 = test_tweet_emotion.readlines()

In [None]:
# Read the emotion test labels as a list of lines.
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
# The handle name test_label_emotion is rebound to the parsed labels later in the notebook.
with open("/content/datasets/emotion/test_labels.txt", 'r', encoding="utf-8") as test_label_emotion:
    test_label_emotion_2 = test_label_emotion.readlines()

In [None]:
# Parse the raw emotion test label lines into integers
test_label_emotion = preprocess_label(x=test_label_emotion_2)

In [None]:
# Pair every emotion test tweet with its integer label (columns: 'tweet', 'label_no').
# NOTE(review): the tweets still carry the trailing newline left by readlines() - confirm intended.
df_test_emotion = pd.DataFrame(
    {
        'tweet': test_tweet_emotion_2,
        'label_no': test_label_emotion,
    }
)

In [None]:
# Evaluate the fine-tuned model on the emotion test set.
# The columns are renamed to the "text" / "labels" names simpletransformers looks for, so it
# does not have to fall back to positional columns (and warn); df_test_emotion is unchanged.
result, model_outputs, wrong_predictions = model.eval_model(
    df_test_emotion.rename(columns={'tweet': 'text', 'label_no': 'labels'})
)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/1421 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/178 [00:00<?, ?it/s]

In [None]:
# Take the arg-max of each row of model outputs as the predicted label,
# then print the macro-averaged F1 against the emotion test labels
predictions = [np.argmax(scores) for scores in model_outputs]
print(
    "f1 score:",
    f1_score(df_test_emotion["label_no"], predictions, average='macro'),
)

f1 score: 0.7834674592128259


In [None]:
# Save the predictions for the TweetEval scorer: one label per line, no trailing newline.
# Each label is converted with int() individually instead of post-processing str(list),
# which only works for single-digit labels and breaks when NumPy scalars print as
# "np.int64(1)" (NumPy >= 2). `predictions` is left untouched so the cell can be re-run.
pred = [str(int(label)) for label in predictions]
os.makedirs("/content/predictions", exist_ok=True)  # create the target folder if it is missing
with open("/content/predictions/emotion.txt", "w") as outfile:
  outfile.write("\n".join(pred))

In [None]:
# Run evaluation_script.py for the emotion task on the saved predictions.
# NOTE(review): the predictions were written under /content/predictions while this uses the
# relative ./predictions/ - presumably the working directory is /content; confirm.
%run evaluation_script.py --tweeteval_path ./datasets/ --predictions_path ./predictions/ --task emotion

0 {'precision': 0.8155339805825242, 'recall': 0.9032258064516129, 'f1-score': 0.8571428571428571, 'support': 558}
1 {'precision': 0.816711590296496, 'recall': 0.8487394957983193, 'f1-score': 0.8324175824175825, 'support': 357}
2 {'precision': 0.7448979591836735, 'recall': 0.5934959349593496, 'f1-score': 0.6606334841628959, 'support': 123}
3 {'precision': 0.8408408408408409, 'recall': 0.7329842931937173, 'f1-score': 0.7832167832167832, 'support': 382}
accuracy 0.8169014084507042
macro avg {'precision': 0.8044960927258836, 'recall': 0.7696113826007498, 'f1-score': 0.7833526767350297, 'support': 1420}
weighted avg {'precision': 0.8165194711842892, 'recall': 0.8169014084507042, 'f1-score': 0.8140179724997455, 'support': 1420}
------------------------------
TweetEval Score (emotion): 0.7833526767350297


# Offensive Tasks

In [None]:
# Read the offensive-task training tweets as a list of lines (each keeps its trailing newline).
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/offensive/train_text.txt", 'r', encoding="utf-8") as train_tweet_offensive:
    train_tweet_offensive_2 = train_tweet_offensive.readlines()

In [None]:
# Read the offensive-task training labels as a list of lines.
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/offensive/train_labels.txt", 'r', encoding="utf-8") as train_labels_offensive:
    train_labels_offensive_2 = train_labels_offensive.readlines()

In [None]:
# Parse the raw offensive-task training label lines into integers
train_label_offensive = preprocess_label(x=train_labels_offensive_2)

In [None]:
# Pair every offensive-task training tweet with its integer label (columns: 'tweet', 'label_no').
# NOTE(review): the tweets still carry the trailing newline left by readlines() - confirm intended.
df_train_offensive = pd.DataFrame(
    {
        'tweet': train_tweet_offensive_2,
        'label_no': train_label_offensive,
    }
)

In [None]:
# Training configuration: one epoch, fixed seed 42, overwrite_output_dir enabled
model_args = ClassificationArgs(
    num_train_epochs=1,
    overwrite_output_dir=True,
    manual_seed=42,
)
# roberta-base classifier for the offensive task (2 labels), with CUDA requested
model = ClassificationModel(
    model_type='roberta',
    model_name='roberta-base',
    use_cuda=True,
    num_labels=2,
    args=model_args,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [None]:
# Fine-tune the model on the offensive-task training set.
# simpletransformers looks for columns named "text" and "labels"; renaming on the way in
# avoids the positional "column 0 / column 1" fallback (and its warning) without
# modifying df_train_offensive itself.
model.train_model(df_train_offensive.rename(columns={'tweet': 'text', 'label_no': 'labels'}))

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/11916 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1490 [00:00<?, ?it/s]

(1490, 0.5108337895107149)

In [None]:
# Read the offensive-task test tweets as a list of lines (each keeps its trailing newline).
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/offensive/test_text.txt", 'r', encoding="utf-8") as test_tweet_offensive:
    test_tweet_offensive_2 = test_tweet_offensive.readlines()

In [None]:
# Read the offensive-task test labels as a list of lines.
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/offensive/test_labels.txt", 'r', encoding="utf-8") as test_labels_offensive:
    test_labels_offensive_2 = test_labels_offensive.readlines()

In [None]:
# Parse the raw offensive-task test label lines into integers
test_label_offensive = preprocess_label(x=test_labels_offensive_2)

In [None]:
# Pair every offensive-task test tweet with its integer label (columns: 'tweet', 'label_no').
# NOTE(review): the tweets still carry the trailing newline left by readlines() - confirm intended.
df_test_offensive = pd.DataFrame(
    {
        'tweet': test_tweet_offensive_2,
        'label_no': test_label_offensive,
    }
)

In [None]:
# Evaluate the fine-tuned model on the offensive-task test set.
# The columns are renamed to the "text" / "labels" names simpletransformers looks for, so it
# does not have to fall back to positional columns (and warn); df_test_offensive is unchanged.
result, model_outputs, wrong_predictions = model.eval_model(
    df_test_offensive.rename(columns={'tweet': 'text', 'label_no': 'labels'})
)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/860 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/108 [00:00<?, ?it/s]

In [None]:
# Take the arg-max of each row of model outputs as the predicted label,
# then print the macro-averaged F1 against the offensive-task test labels
predictions = [np.argmax(scores) for scores in model_outputs]
print(
    "f1 score:",
    f1_score(df_test_offensive["label_no"], predictions, average='macro'),
)

f1 score: 0.8101821196509418


In [None]:
# Save the predictions for the TweetEval scorer: one label per line, no trailing newline.
# Each label is converted with int() individually instead of post-processing str(list),
# which only works for single-digit labels and breaks when NumPy scalars print as
# "np.int64(1)" (NumPy >= 2). `predictions` is left untouched so the cell can be re-run.
pred = [str(int(label)) for label in predictions]
os.makedirs("/content/predictions", exist_ok=True)  # create the target folder if it is missing
with open("/content/predictions/offensive.txt", "w") as outfile:
  outfile.write("\n".join(pred))

In [None]:
# Run evaluation_script.py for the offensive task on the saved predictions.
# NOTE(review): the predictions were written under /content/predictions while this uses the
# relative ./predictions/ - presumably the working directory is /content; confirm.
%run evaluation_script.py --tweeteval_path ./datasets/ --predictions_path ./predictions/ --task offensive

0 {'precision': 0.8723723723723724, 'recall': 0.938610662358643, 'f1-score': 0.9042801556420234, 'support': 619}
1 {'precision': 0.8031088082901554, 'recall': 0.6458333333333334, 'f1-score': 0.7159353348729792, 'support': 240}
accuracy 0.8568102444703143
macro avg {'precision': 0.8377405903312639, 'recall': 0.7922219978459881, 'f1-score': 0.8101077452575013, 'support': 859}
weighted avg {'precision': 0.8530205034786214, 'recall': 0.8568102444703143, 'f1-score': 0.851657621317727, 'support': 859}
------------------------------
TweetEval Score (offensive): 0.8101077452575013


# Sentiment Tasks

In [None]:
# Read the sentiment training tweets as a list of lines (each keeps its trailing newline).
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/sentiment/train_text.txt", 'r', encoding="utf-8") as train_tweet_sentiment:
    train_tweet_sentiment_2 = train_tweet_sentiment.readlines()

In [None]:
# Read the sentiment training labels as a list of lines.
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/sentiment/train_labels.txt", 'r', encoding="utf-8") as train_labels_sentiment:
    train_labels_sentiment_2 = train_labels_sentiment.readlines()

In [None]:
# Parse the raw sentiment training label lines into integers
train_label_sentiment = preprocess_label(x=train_labels_sentiment_2)

In [None]:
# Pair every sentiment training tweet with its integer label (columns: 'tweet', 'label_no').
# NOTE(review): the tweets still carry the trailing newline left by readlines() - confirm intended.
df_train_sentiment = pd.DataFrame(
    {
        'tweet': train_tweet_sentiment_2,
        'label_no': train_label_sentiment,
    }
)

In [None]:
# Training configuration: one epoch, fixed seed 42, overwrite_output_dir enabled
model_args = ClassificationArgs(
    num_train_epochs=1,
    overwrite_output_dir=True,
    manual_seed=42,
)
# roberta-base classifier for the sentiment task (3 labels), with CUDA requested
model = ClassificationModel(
    model_type='roberta',
    model_name='roberta-base',
    use_cuda=True,
    num_labels=3,
    args=model_args,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

In [None]:
# Fine-tune the model on the sentiment training set.
# simpletransformers looks for columns named "text" and "labels"; renaming on the way in
# avoids the positional "column 0 / column 1" fallback (and its warning) without
# modifying df_train_sentiment itself.
model.train_model(df_train_sentiment.rename(columns={'tweet': 'text', 'label_no': 'labels'}))

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/45615 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/5702 [00:00<?, ?it/s]

(5702, 0.6887746952991555)

In [None]:
# Read the sentiment test tweets as a list of lines (each keeps its trailing newline).
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/sentiment/test_text.txt", 'r', encoding="utf-8") as test_tweet_sentiment:
    test_tweet_sentiment_2 = test_tweet_sentiment.readlines()

In [None]:
# Read the sentiment test labels as a list of lines.
# The context manager closes the file once it has been read; the encoding is explicit so
# decoding does not depend on the platform default (assumes the dataset files are UTF-8).
with open("/content/datasets/sentiment/test_labels.txt", 'r', encoding="utf-8") as test_labels_sentiment:
    test_labels_sentiment_2 = test_labels_sentiment.readlines()

In [None]:
# Parse the raw sentiment test label lines into integers
test_label_sentiment = preprocess_label(x=test_labels_sentiment_2)

In [None]:
# Pair every sentiment test tweet with its integer label (columns: 'tweet', 'label_no').
# NOTE(review): the tweets still carry the trailing newline left by readlines() - confirm intended.
df_test_sentiment = pd.DataFrame(
    {
        'tweet': test_tweet_sentiment_2,
        'label_no': test_label_sentiment,
    }
)

In [None]:
# Evaluate the fine-tuned model on the sentiment test set.
# The columns are renamed to the "text" / "labels" names simpletransformers looks for, so it
# does not have to fall back to positional columns (and warn); df_test_sentiment is unchanged.
result, model_outputs, wrong_predictions = model.eval_model(
    df_test_sentiment.rename(columns={'tweet': 'text', 'label_no': 'labels'})
)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/12284 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1536 [00:00<?, ?it/s]

In [None]:
# Take the arg-max of each row of model outputs as the predicted label,
# then print the macro-averaged F1 against the sentiment test labels
predictions = [np.argmax(scores) for scores in model_outputs]
print(
    "f1 score:",
    f1_score(df_test_sentiment["label_no"], predictions, average='macro'),
)

f1 score: 0.7046589140170777


In [None]:
# Save the predictions for the TweetEval scorer: one label per line, no trailing newline.
# Each label is converted with int() individually instead of post-processing str(list),
# which only works for single-digit labels and breaks when NumPy scalars print as
# "np.int64(1)" (NumPy >= 2). `predictions` is left untouched so the cell can be re-run.
pred = [str(int(label)) for label in predictions]
os.makedirs("/content/predictions", exist_ok=True)  # create the target folder if it is missing
with open("/content/predictions/sentiment.txt", "w") as outfile:
  outfile.write("\n".join(pred))

In [None]:
# Run evaluation_script.py for the sentiment task on the saved predictions.
# NOTE(review): the predictions were written under /content/predictions while this uses the
# relative ./predictions/ - presumably the working directory is /content; confirm.
%run evaluation_script.py --tweeteval_path ./datasets/ --predictions_path ./predictions/ --task sentiment

0 {'precision': 0.6855791962174941, 'recall': 0.8031218529707955, 'f1-score': 0.7397101449275362, 'support': 3972}
1 {'precision': 0.7632346811171321, 'recall': 0.6168098366178204, 'f1-score': 0.6822543083372147, 'support': 5937}
2 {'precision': 0.6359463276836158, 'recall': 0.7586352148272957, 'f1-score': 0.6918939684978871, 'support': 2374}
accuracy 0.7044695921191891
macro avg {'precision': 0.694920068339414, 'recall': 0.7261889681386373, 'f1-score': 0.7046194739208792, 'support': 12283}
weighted avg {'precision': 0.7135212448985756, 'recall': 0.7044695921191891, 'f1-score': 0.7026971265541155, 'support': 12283}
------------------------------
TweetEval Score (sentiment): 0.7261889681386373
