In [11]:
import os
import re
import pickle

import pandas as pd
import numpy as np

from transformers import pipeline
from pretraining.pipeline import *
from pretraining.utils import *

## Pipeline
- get data
- apply preprocessing
- infer emotions
- aggregate and save results

In [22]:
# Directory containing the raw corpus files; switch the active line to change dataset.
# NOTE(review): the recorded outputs and the pickle paths further down refer to
# "tinystories" while this cell selects babylm -- keep them in sync before re-running.
dir_path = "data/babylm_data/babylm_100M/"
# dir_path = "data/tinystories/"

# Every non-hidden entry of the directory is treated as one data source.
# Sorted because os.listdir() returns entries in arbitrary order, which would
# otherwise make the source order differ between machines/runs.
data_sources = sorted(f for f in os.listdir(dir_path) if not f.startswith("."))
data_sources

In [4]:
# Load the texts for every data source (sample_subset presumably caps the number
# of samples drawn -- confirm against pretraining.utils / pretraining.pipeline).
texts = get_data(dir_path, data_sources, sample_subset=10000)

# Per-source sample counts ...
for source, ds in texts.items():
    print(f"{source}: {len(ds)}")

# ... followed by the grand total across all sources.
total = sum(map(len, texts.values()))
print(f"Total: {total}")

In [6]:
# Choose the emotion-classification checkpoint.
#
# Option A -- faster model, fewer emotion categories, worse accuracy:
# pipe = pipeline(
#     "text-classification", 
#     model="j-hartmann/emotion-english-distilroberta-base", 
#     top_k=None,
#     framework="pt", # pytorch
#     )
#
# Option B (active) -- larger model, more emotion categories, better accuracy,
# slower inference.
# NOTE: this model often predicts the "neutral" emotion.
pipeline_options = dict(
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,      # presumably yields a score for every label -- see transformers docs
    framework="pt",  # PyTorch backend
    device="cpu",    # alternatives: "cuda" / "mps"
)
pipe = pipeline("text-classification", **pipeline_options)

In [7]:
# Run the emotion classifier over all loaded texts. This is the expensive step:
# the recorded run below took roughly 7 minutes for 10,000 texts (~24 it/s).
# The result is pickled in the next cell; later cells read "label_0" / "score_0"
# from each element (presumably the top-ranked prediction -- confirm in classify_emotion).
scores = classify_emotion(texts, pipe)

Processing data/tinystories/train.csv


  0%|          | 0/10000 [00:00<?, ?it/s]

100%|██████████| 10000/10000 [06:53<00:00, 24.18it/s]


In [17]:
# Persist the raw classifier output so the filtering section can start from disk.
# NOTE(review): the file name is hard-coded; make sure it matches the dataset
# selected via dir_path.
# scores_path = "processed_data/babylm.pkl"
scores_path = "processed_data/tinystories.pkl"

# Create the output folder up front so a missing directory cannot discard the
# result of the long inference run.
os.makedirs(os.path.dirname(scores_path), exist_ok=True)

# The context manager closes (and therefore flushes) the file; the previous
# inline open() left the handle open until garbage collection.
with open(scores_path, "wb") as fh:
    pickle.dump(scores, fh)

In [15]:
# Alternatively, create a dataframe and save to csv or parquet
# df = pd.DataFrame(scores)
# df.to_csv("processed_data/babylm.csv", index=False)
# df.to_parquet("processed_data/babylm.parquet", index=False)

## Further filter emotion labels

In [18]:
# Reload the raw classifier output saved earlier.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this notebook.
# scores_path = "processed_data/babylm.pkl"
scores_path = "processed_data/tinystories.pkl"

# Use a context manager so the file handle is closed right after reading;
# the previous inline open() never closed it.
with open(scores_path, "rb") as fh:
    scores = pickle.load(fh)

# Displayed as the cell output: number of loaded records.
len(scores)

10000

In [19]:
# Keep only records whose "label_0" is a non-neutral emotion with "score_0" above 0.5
# (label_0 / score_0 are presumably the top-ranked prediction -- confirm in classify_emotion).
scores_filtered = []
for record in scores:
    # The label is checked first, exactly as before, so the score is only
    # read for non-neutral records.
    if record["label_0"] == "neutral":
        continue
    if record["score_0"] > 0.5:
        scores_filtered.append(record)

# Displayed as the cell output: number of records that survived the filter.
len(scores_filtered)

4637

In [20]:
# Persist the filtered (non-neutral, confident) records.
# NOTE(review): the file name is hard-coded; make sure it matches the dataset
# that `scores` was loaded from.
# filtered_path = "processed_data/babylm_filtered.pkl"
filtered_path = "processed_data/tinystories_filtered.pkl"

# Make sure the output folder exists before writing.
os.makedirs(os.path.dirname(filtered_path), exist_ok=True)

# The context manager closes (and therefore flushes) the file; the previous
# inline open() left the handle open until garbage collection.
with open(filtered_path, "wb") as fh:
    pickle.dump(scores_filtered, fh)

In [21]:
# df = pd.DataFrame(scores)
# set(df.iloc[0].filter(regex="^label_").values) # all possible emotions

In [22]:
# def filter_emotion(df, threshold=0.5):
#     filtered_df = df.copy()
#     scores_df = df.filter(regex="^score_")
#     labels_df = df.filter(regex="^label_")
#     # retrieve confident label(s) with scores >= threshold
#     labels = labels_df.where((scores_df >= threshold).values).values
#     # remove nans
#     labels = [tuple(set(l[~pd.isnull(l)]))for l in labels]
#     filtered_df["filtered_labels"] = labels
#     filtered_df["num_labels"] = [len(l) for l in labels]
#     return filtered_df

# filtered_df = filter_emotion(df)

In [23]:
# emotion_freq = filtered_df.filtered_labels.value_counts()
# emotion_freq.head(40) # label statistics

In [24]:
# emotion_freq.iloc[2:].sum()

In [25]:
# filtered_df.num_labels.value_counts() # number of labels per text

In [26]:
# drop if neutral or no emotion
# tmp_df = filtered_df[~filtered_df.filtered_labels.apply(lambda x: "neutral" in x or not x)]
# tmp_df.groupby("source").filtered_labels.value_counts().groupby(level=0).head(5)

In [27]:
# get samples with specific label
# emo = "optimism"
# filtered_df.loc[filtered_df.filtered_labels.apply(lambda x: emo in x), "text"].head().values