In [1]:
pip install -U transformers

Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.4
    Uninstalling transformers-4.52.4:
      Successfully uninstalled transformers-4.52.4
Successfully installed transformers-4.53.0


In [2]:
# Mount Google Drive at /content/drive so files stored there (the Reddit CSV
# read later in this notebook) are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Relative parquet paths of the dair-ai/emotion dataset, keyed by split name.
splits = {
    split_name: f"split/{split_name}-00000-of-00001.parquet"
    for split_name in ("train", "validation", "test")
}
# Only the training split is read here.
dairdataset = pd.read_parquet(f"hf://datasets/dair-ai/emotion/{splits['train']}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Preview the first rows to confirm which columns were loaded.
dairdataset.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora needed below: WordNet for the lemmatizer and the stop-word lists.
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

# Module-level helpers read by clean_text: one lemmatizer instance and the
# English stop words held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one raw text value into a space-joined string of lemmatized tokens.

    Steps: lowercase, drop every character that is neither a word character
    nor whitespace, collapse whitespace, remove English stop words, and
    lemmatize what is left with the module-level ``lemmatizer``.

    Args:
        text: The value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing text.

    Returns:
        The cleaned text, or ``''`` when the input is missing or no token
        survives the stop-word filter.
    """
    # Without this guard a missing value would be stringified into the literal
    # token "none" / "nan" and passed on as if it were real text.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Strip punctuation: keep only word characters and whitespace.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return ' '.join(words)

# Apply cleaning: run clean_text on every value of 'text' and store the result
# in a new 'cleaned_text' column; the raw 'text' column is left as loaded.
dairdataset['cleaned_text'] = dairdataset['text'].apply(clean_text)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Compare 'text' with the new 'cleaned_text' column on the first rows.
dairdataset.head()

Unnamed: 0,text,label,cleaned_text
0,i didnt feel humiliated,0,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,0,go feeling hopeless damned hopeful around some...
2,im grabbing a minute to post i feel greedy wrong,3,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,2,ever feeling nostalgic fireplace know still pr...
4,i am feeling grouchy,3,feeling grouchy


In [7]:
# Class id -> emotion name; each id is the position of the name in this tuple.
label2emotion = dict(enumerate((
    'sadness',
    'joy',
    'love',
    'anger',
    'fear',
    'surprise',
)))

# Add the readable emotion name next to each numeric label (lookup through label2emotion).
dairdataset['emotion_label'] = dairdataset['label'].map(label2emotion)

In [8]:
# Check that every previewed row received an emotion name.
dairdataset.head()

Unnamed: 0,text,label,cleaned_text,emotion_label
0,i didnt feel humiliated,0,didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,0,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing a minute to post i feel greedy wrong,3,im grabbing minute post feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,2,ever feeling nostalgic fireplace know still pr...,love
4,i am feeling grouchy,3,feeling grouchy,anger


In [9]:
# Load the Reddit comment set from the mounted Drive folder and preview it.
# The path is absolute, so the Drive mount cell must have run first.
reddit_df = pd.read_csv("/content/drive/MyDrive/reddit_emotion_test_set.csv")
reddit_df.head()

Unnamed: 0,post_id,title,selftext,post_score,upvote_ratio,num_comments,created_utc,comment_id,comment_body,comment_score,comment_awards,comment_created_utc,full_content,cleaned_text,normalized_text,emotion,top_emotion
0,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0j5twr,Raped. He raped a minor entrusted under his ca...,27,0,1733404000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.545, 'love': 0.828, 'sadness': 0.969}",sadness
1,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hiscs,Apparently this dude is bro of Dr halina wife ...,74,0,1733371000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.309, 'love': 0.945, 'sadness': 0.95}",sadness
2,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hs8q8,"Like my mom always asked, 'Anak siapa ni?'",18,0,1733375000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'fear': 0.318, 'love': 0.965, 'sadness': 0.931}",love
3,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hpmsh,> She reportedly said the married Amirul Arif ...,28,0,1733374000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'fear': 0.956, 'love': 0.591, 'sadness': 0.386}",fear
4,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hjkkv,nerakazens are doing their job at x. hehehe ...,12,0,1733371000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.431, 'love': 0.917, 'sadness': 0.943}",sadness


In [10]:
# Invert label2emotion (id -> name) into name -> id so the Reddit set can use
# the same integer class ids as the training data.
emotion2label = {name: class_id for class_id, name in label2emotion.items()}

reddit_labels = reddit_df['top_emotion'].map(emotion2label)

# Series.map yields NaN for any value missing from the dict, which would
# silently turn the label column into floats with holes. Fail loudly instead.
unmapped_emotions = reddit_df.loc[reddit_labels.isna(), 'top_emotion'].unique()
if len(unmapped_emotions):
    raise ValueError(
        f"top_emotion values without a class id: {list(unmapped_emotions)}"
    )

# All rows are mapped at this point, so an integer dtype is safe.
reddit_df['label'] = reddit_labels.astype('int64')
reddit_df.head()

Unnamed: 0,post_id,title,selftext,post_score,upvote_ratio,num_comments,created_utc,comment_id,comment_body,comment_score,comment_awards,comment_created_utc,full_content,cleaned_text,normalized_text,emotion,top_emotion,label
0,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0j5twr,Raped. He raped a minor entrusted under his ca...,27,0,1733404000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.545, 'love': 0.828, 'sadness': 0.969}",sadness,0
1,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hiscs,Apparently this dude is bro of Dr halina wife ...,74,0,1733371000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.309, 'love': 0.945, 'sadness': 0.95}",sadness,0
2,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hs8q8,"Like my mom always asked, 'Anak siapa ni?'",18,0,1733375000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'fear': 0.318, 'love': 0.965, 'sadness': 0.931}",love,2
3,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hpmsh,> She reportedly said the married Amirul Arif ...,28,0,1733374000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'fear': 0.956, 'love': 0.591, 'sadness': 0.386}",fear,4
4,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hjkkv,nerakazens are doing their job at x. hehehe ...,12,0,1733371000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.431, 'love': 0.917, 'sadness': 0.943}",sadness,0


In [11]:
# Give both frames the same text column name ('test_text') so the later
# Dataset conversion and tokenizer cells can select it from either frame.
# rename() returns a new frame; rebinding the name avoids inplace mutation.
dairdataset = dairdataset.rename(columns={'cleaned_text': 'test_text'})
reddit_df = reddit_df.rename(columns={'normalized_text': 'test_text'})

In [12]:
# Confirm that 'normalized_text' now appears as 'test_text'.
reddit_df.head()

Unnamed: 0,post_id,title,selftext,post_score,upvote_ratio,num_comments,created_utc,comment_id,comment_body,comment_score,comment_awards,comment_created_utc,full_content,cleaned_text,test_text,emotion,top_emotion,label
0,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0j5twr,Raped. He raped a minor entrusted under his ca...,27,0,1733404000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.545, 'love': 0.828, 'sadness': 0.969}",sadness,0
1,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hiscs,Apparently this dude is bro of Dr halina wife ...,74,0,1733371000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.309, 'love': 0.945, 'sadness': 0.95}",sadness,0
2,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hs8q8,"Like my mom always asked, 'Anak siapa ni?'",18,0,1733375000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'fear': 0.318, 'love': 0.965, 'sadness': 0.931}",love,2
3,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hpmsh,> She reportedly said the married Amirul Arif ...,28,0,1733374000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'fear': 0.956, 'love': 0.591, 'sadness': 0.386}",fear,4
4,1h704et,Malaysian psychiatrist with ‘promising career’...,,137,0.94,46,1733371000.0,m0hjkkv,nerakazens are doing their job at x. hehehe ...,12,0,1733371000.0,Malaysian psychiatrist with ‘promising career’...,malaysian psychiatrist with promising career c...,malaysian psychiatrist promising career convic...,"{'anger': 0.431, 'love': 0.917, 'sadness': 0.943}",sadness,0


In [13]:
# Confirm that 'cleaned_text' now appears as 'test_text'.
dairdataset.head()

Unnamed: 0,text,label,test_text,emotion_label
0,i didnt feel humiliated,0,didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,0,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing a minute to post i feel greedy wrong,3,im grabbing minute post feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,2,ever feeling nostalgic fireplace know still pr...,love
4,i am feeling grouchy,3,feeling grouchy,anger


In [14]:
from datasets import Dataset, DatasetDict

# Build the DatasetDict directly from the two pandas frames, keeping only the
# text and label columns: dair-ai/emotion is the train split, Reddit the test split.
full_dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame[['test_text', 'label']])
    for split_name, frame in (('train', dairdataset), ('test', reddit_df))
})

# Keep the per-split handles available under their own names.
train_dataset = full_dataset['train']
test_dataset = full_dataset['test']

In [15]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint id on the Hugging Face Hub. The tokenizer is loaded from the same
# id that the model-loading cell passes to from_pretrained.
model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [16]:
import os
# Set the environment flag named HF_HUB_DISABLE_SYMLINKS_WARNING.
# NOTE(review): transformers was already imported in an earlier cell; confirm
# the flag is still read when it is set this late.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

from transformers.utils import is_tf_available
# NOTE(review): the assignment below only rebinds the name `is_tf_available`
# in this notebook's namespace. Nothing in this cell hands that name back to
# transformers, so it does not appear to affect how the model is loaded.
# Confirm whether this patch is needed at all.
def patched_is_tf_available() -> bool:
    return False
is_tf_available = patched_is_tf_available

# Load the sequence-classification model for `model_name` with six classes and
# the id <-> emotion-name mappings derived from label2emotion.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=6,
    id2label=label2emotion,
    label2id={v: k for k, v in label2emotion.items()}
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [17]:
def tokenize_function(examples):
    """Tokenize the 'test_text' field of a batch with the module-level tokenizer.

    Requests truncation and max-length padding with a length of 128.
    """
    texts = examples['test_text']
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(texts, **encoding_options)

# Tokenize every split, handing rows to tokenize_function in batches of 32.
tokenized_datasets = full_dataset.map(tokenize_function, batched=True, batch_size=32)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/13723 [00:00<?, ? examples/s]

In [18]:
# Drop the raw text column now that it is tokenized, then expose the model
# inputs and the label in 'torch' format.
tokenized_datasets = tokenized_datasets.remove_columns(['test_text'])
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [19]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import torch

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores; the index of
            the largest score along the last axis is taken as the predicted
            class) and ``label_ids`` (the reference class ids).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(pred.label_ids, predicted_ids),
        'f1': f1_score(pred.label_ids, predicted_ids, average='macro'),
    }

In [20]:
training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="./results",
    # Optimisation schedule.
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # No external experiment tracker.
    report_to="none",
)

In [21]:
# Wire the model, the training arguments, both splits and the metrics callback together.
# NOTE(review): eval_dataset is the 'test' split, and training_args sets
# load_best_model_at_end=True, so the checkpoint that is kept is selected on
# the same data that is reported as the test result. The `splits` dict lists a
# 'validation' parquet file that is never loaded; consider using it for
# per-epoch evaluation and keeping the Reddit set for a final evaluation only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

In [1]:
pip install numpy==1.26.0

Collecting numpy==1.26.0
  Downloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but yo

In [22]:
# Fine-tune on the train split; metrics are computed on the 'test' split after each epoch.
# NOTE(review): in the recorded run, training loss falls (0.1506 -> 0.0274)
# while validation loss rises (5.96 -> 7.64) and accuracy stays around
# 0.06-0.08. The model is not transferring to the Reddit labels; check how
# 'top_emotion' was produced before trusting these numbers.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1506,5.962795,0.063179,0.159101
2,0.1045,6.807471,0.059025,0.172983
3,0.0684,7.260326,0.071996,0.173721
4,0.048,7.244201,0.075202,0.176945
5,0.0274,7.639999,0.066239,0.182285


TrainOutput(global_step=5000, training_loss=0.08029843940734863, metrics={'train_runtime': 1119.1039, 'train_samples_per_second': 71.486, 'train_steps_per_second': 4.468, 'total_flos': 2649536962560000.0, 'train_loss': 0.08029843940734863, 'epoch': 5.0})