In [1]:
import polars as pl
import numpy as np
import spacy
import matplotlib.pyplot as plt

In [2]:
# Relative parquet path of each dataset split; joined onto the dataset URL when loading.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'validation': 'data/validation-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}

In [3]:
# Read the train and test splits; both paths share the same dataset URL prefix.
_dataset_root = 'hf://datasets/Jsevisal/go_emotions_wheel/'
df_train = pl.read_parquet(_dataset_root + splits['train'])
df_test = pl.read_parquet(_dataset_root + splits['test'])

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

In [5]:
def decontract(sentence):
    """Expand common English contractions in ``sentence``.

    Irregular negatives ("won't", "can't", "shan't") are expanded first,
    because the generic ``n't -> " not"`` rule would otherwise turn them into
    "wo not", "ca not" and "sha not". The remaining suffix rules are plain,
    case-sensitive regex substitutions applied in a fixed order.

    Note: ``'s`` is always expanded to " is", so possessives such as "John's"
    are rewritten too.

    Args:
        sentence: Text to expand.

    Returns:
        The text with the recognised contractions expanded.
    """
    # Irregular stems; the captured first letter preserves the original casing.
    sentence = re.sub(r"\b([Ww])on't\b", r"\1ill not", sentence)
    sentence = re.sub(r"\b([Cc])an't\b", r"\1an not", sentence)
    sentence = re.sub(r"\b([Ss])han't\b", r"\1hall not", sentence)

    # Generic suffix rules. Order matters: "n't" must run before the bare "'t".
    suffix_rules = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in suffix_rules:
        sentence = re.sub(pattern, expansion, sentence)
    return sentence

def removePunctuation(sentence):
    """Remove or blank out punctuation and flatten newlines.

    Two passes are made:

    * ``? ! ' " # |`` are deleted outright (no space is inserted).
    * ``. , ( ) / \\`` are replaced by a single space so that the words on
      either side stay separated.

    Inside a regex character class ``|`` is a literal pipe rather than an
    alternation operator, so the classes list their members directly. The
    backslash is matched explicitly (``\\\\`` in the pattern); written as
    ``\\|`` it would only be an escaped pipe.

    Leading/trailing whitespace is stripped, then remaining newlines become
    spaces.

    Args:
        sentence: Text to clean.

    Returns:
        The cleaned text. Runs of spaces may remain where punctuation was
        replaced.
    """
    # Characters removed without leaving a gap.
    sentence = re.sub(r'[?!\'"#|]', '', sentence)
    # Separator-like characters become spaces.
    sentence = re.sub(r'[.,()/\\]', ' ', sentence)
    sentence = sentence.strip()
    return sentence.replace("\n", " ")

def removeNumber(sentence):
    """Strip every non-alphabetic character from each word of ``sentence``.

    The sentence is split on whitespace, all characters other than ASCII
    letters are removed from each word, and the results are joined with single
    spaces. Words that end up empty (for example a purely numeric token) are
    dropped so they do not leave doubled spaces behind.

    Args:
        sentence: Text to clean.

    Returns:
        The letters-only words separated by single spaces; "" if none remain.
    """
    letters_only = (re.sub('[^a-z A-Z]+', '', word) for word in sentence.split())
    return " ".join(token for token in letters_only if token)

def removeStopWords(sentence):
    """Remove English stop words from ``sentence``.

    ``stopwords`` is the NLTK corpus reader, not a compiled regex, so it has
    no ``sub`` method. The word list is obtained with
    ``stopwords.words("english")`` and whitespace-separated tokens are
    filtered against it (case-insensitively). Surviving tokens are joined with
    single spaces.

    Requires the NLTK stop-word corpus to be installed locally, e.g. via
    ``nltk.download("stopwords")``.

    Args:
        sentence: Text to filter.

    Returns:
        The text without stop-word tokens.
    """
    stop_words = getattr(removeStopWords, "_stop_words", None)
    if stop_words is None:
        # Load the list once and cache it on the function object, so the corpus
        # is not re-read for every sentence.
        stop_words = frozenset(stopwords.words("english"))
        removeStopWords._stop_words = stop_words
    return " ".join(
        token for token in sentence.split() if token.lower() not in stop_words
    )
def stemming(sentence):
    """Stem every whitespace-separated word of ``sentence``.

    Each word is passed through an English Snowball stemmer and the stems are
    re-joined with single spaces.

    Args:
        sentence: Text to stem.

    Returns:
        The stemmed words separated by single spaces.
    """
    snowball = SnowballStemmer("english")
    stems = [snowball.stem(token) for token in sentence.split()]
    return " ".join(stems).strip()

In [6]:
# Pull the 'text' and 'labels' columns of each split out as plain Python lists.
x_train = df_train['text'].to_list()
y_train = df_train['labels'].to_list()
x_test = df_test['text'].to_list()
y_test = df_test['labels'].to_list()

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# Word-level TF-IDF over 1- to 3-grams, with unicode accent stripping and L2 normalisation.
tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    strip_accents='unicode',
    norm='l2',
)

In [11]:
# Fit the vectorizer on the training texts only (the test split is not seen here).
tfidf.fit(x_train)

In [12]:
# NOTE(review): x_train / x_test are rebound here from the raw inputs to their
# TF-IDF transforms, so this cell is not idempotent: re-running it (or the
# preceding tfidf.fit cell) without reloading the inputs would operate on the
# already-transformed matrices instead of text.
x_train = tfidf.transform(x_train)
x_test = tfidf.transform(x_test)

In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [14]:
# Label collections of each split, one entry per sample.
y_train_raw = df_train['labels'].to_list()
y_test_raw = df_test['labels'].to_list()

# Every distinct label that occurs in either split, in ascending order.
all_labels_combined = [*y_train_raw, *y_test_raw]
all_unique_label_ids = sorted(
    {label for sample_labels in all_labels_combined for label in sample_labels}
)

In [15]:
# Binarize the label lists; the column order follows all_unique_label_ids.
mlb = MultiLabelBinarizer(classes=all_unique_label_ids).fit(all_labels_combined)

y_train_multilabel, y_test_multilabel = (
    mlb.transform(raw_labels) for raw_labels in (y_train_raw, y_test_raw)
)

In [16]:
# Inspect the binarized training targets produced by mlb.transform.
y_train_multilabel

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [17]:
# Shape of the first TF-IDF-transformed training row.
x_train[0].shape

(1, 605291)

# Random Forest

In [18]:
# Row count of the training target matrix, and its column-wise sums
# (how many training rows carry each label).
total_samples = len(y_train_multilabel)
class_counts = y_train_multilabel.sum(axis=0)

In [19]:
# Inverse-frequency weight per label column: total_samples / positive count.
# A label with no positive training row gets weight 0.0 instead of dividing by
# zero (which would yield inf and a RuntimeWarning); this can happen because
# the label set is built from the train and test splits together.
# NOTE(review): weights_dict is not referenced by the cells visible below; the
# model is constructed with class_weight='balanced' instead.
weights_dict = {
    i: (total_samples / count) if count > 0 else 0.0
    for i, count in enumerate(class_counts)
}

In [25]:
# Random forest with 'balanced' class weighting and a fixed seed.
model = RandomForestClassifier(
    random_state=42,
    class_weight='balanced',
)

In [None]:
# Train on the TF-IDF features against the binarized multi-label targets.
model.fit(x_train, y_train_multilabel)

In [None]:
# Predict labels for the TF-IDF-transformed test split.
y_pred = model.predict(x_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Text report comparing the test targets with the model's predictions.
report = classification_report(y_test_multilabel, y_pred)
print(report)