In [None]:
import pandas as pd
import numpy as np
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import CountVectorizer

# Add the system-wide directory to NLTK's resource search path.
nltk.data.path.append('/usr/nltk_data')

# 'punkt' is requested with force=True; the remaining resources use the
# default download behaviour.
nltk.download('punkt', force=True)
for _package in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_package)

# Print where the punkt tokenizer resolves to, or a hint if the lookup fails.
try:
    print(nltk.data.find('tokenizers/punkt'))
except Exception as e:
    print("Error downloading punkt:", e)
    print("Trying alternate locations or manual download.")

# Read the three GoEmotions CSV parts, in order.
go_emotions_1, go_emotions_2, go_emotions_3 = map(
    pd.read_csv,
    ('goemotions_1.csv', 'goemotions_2.csv', 'goemotions_3.csv'),
)

# Keep a reproducible 20% sample of each part (same seed for every part).
go_emotions_1_sampled, go_emotions_2_sampled, go_emotions_3_sampled = (
    part.sample(frac=0.2, random_state=42)
    for part in (go_emotions_1, go_emotions_2, go_emotions_3)
)

# Stack the samples into one frame with a fresh 0..n-1 index.
all_go_emotions = pd.concat(
    [go_emotions_1_sampled, go_emotions_2_sampled, go_emotions_3_sampled],
    ignore_index=True,
)

# Emotion indicator columns, listed in the order labels are emitted.
emotion_columns = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral',
]


def get_emotion_labels(row):
    """Return the emotions flagged in ``row`` as one comma-separated string.

    ``row`` is indexed by emotion name. Every entry of ``emotion_columns``
    whose value equals 1 contributes its name, in ``emotion_columns`` order.
    A row with no flagged emotion yields an empty string.
    """
    flagged = (name for name in emotion_columns if row[name] == 1)
    return ','.join(flagged)

# Derive one comma-separated label string per row from the indicator columns.
all_go_emotions['emotion'] = all_go_emotions.apply(
    get_emotion_labels,
    axis='columns',
)

# Only the text and its label string are used from here on.
all_go_emotions = all_go_emotions[['text', 'emotion']]

# Preprocess an independent copy so all_go_emotions itself is left as-is.
combined_df = all_go_emotions.copy()

def normalize_text(text):
    """Lower-case ``text`` and strip URLs, @mentions, '#' and punctuation.

    Steps, in order:
      1. lower-case the text;
      2. delete every run of non-whitespace characters that begins with
         ``http`` or ``www``;
      3. delete ``@name`` mentions and bare ``#`` characters (the word after
         a ``#`` is kept);
      4. delete every character that is not an ASCII letter, a digit or
         whitespace.

    Whitespace is not collapsed, so a removed token can leave a run of spaces.

    Args:
        text: The raw value. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        The normalized string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # 'http\S+' already matches https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    return text

# Clean the raw text before tokenizing.
combined_df['text'] = combined_df['text'].map(normalize_text)

# Split each cleaned text into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
combined_df['tokens'] = combined_df['text'].map(tokenizer.tokenize)

# Held as a set so the per-token membership checks in remove_stopwords are cheap.
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    return [
        token
        for token in tokens
        if token not in stop_words
    ]

# Drop stop words from every token list.
combined_df['tokens'] = combined_df['tokens'].map(remove_stopwords)

# Shared lemmatizer instance used by lemmatize_tokens.
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    """Return a list holding ``lemmatizer.lemmatize(token)`` for each token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))

# Replace every token with its lemma.
combined_df['tokens'] = combined_df['tokens'].apply(lemmatize_tokens)

# Split the comma-separated label string so that a text carrying several
# emotions produces one row per emotion after explode().
combined_df['emotion_list'] = combined_df['emotion'].apply(lambda x: x.split(','))

exploded_df = combined_df.explode('emotion_list')

exploded_df['emotion_list'] = exploded_df['emotion_list'].str.strip()

# A text with no emotion flagged has emotion == '', and ''.split(',') is [''],
# so it would show up as a blank "class" that is counted, oversampled and
# written to the output file. Such rows carry no label, so drop them here.
exploded_df = exploded_df[exploded_df['emotion_list'] != '']

emotion_counts = exploded_df['emotion_list'].value_counts()
print("Class Distribution Before Handling Minority Classes:")
print(emotion_counts)

# Labels with fewer than 6 rows are removed before resampling.
class_counts = exploded_df['emotion_list'].value_counts()
minority_classes = (
    class_counts[class_counts < 6]
    .index
    .tolist()
)
print(f"\nClasses with less than 6 samples: {minority_classes}")

# Keep only rows whose label is not a minority class, then renumber the index.
filtered_df = (
    exploded_df[~exploded_df['emotion_list'].isin(minority_classes)]
    .reset_index(drop=True)
)

print("\nClass Distribution After Removing Minority Classes:")
print(filtered_df['emotion_list'].value_counts())

# Re-join each token list into a space-separated document for the vectorizer.
filtered_df['processed_text'] = filtered_df['tokens'].map(' '.join)

# Features: token counts per document. Target: the emotion label.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(filtered_df['processed_text'])
y = filtered_df['emotion_list']

# Oversample with SMOTE, using a fixed seed for reproducibility.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Map each resampled feature row back to vocabulary terms and join them into
# text again; the result is paired with the resampled labels.
resampled_df = pd.DataFrame({
    'processed_text': list(map(' '.join, vectorizer.inverse_transform(X_resampled))),
    'emotion': y_resampled,
})

resampled_df.to_csv('preprocessed_data.csv', index=False)

print("\nData preprocessing complete. Preprocessed data saved to 'preprocessed_data.csv'.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


/root/nltk_data/tokenizers/punkt
Class Distribution Before Handling Minority Classes:
emotion_list
neutral           11020
approval           3523
admiration         3413
annoyance          2707
gratitude          2356
disapproval        2329
curiosity          1993
amusement          1824
realization        1760
optimism           1749
disappointment     1672
anger              1651
joy                1588
love               1582
confusion          1485
sadness            1358
caring             1232
excitement         1160
surprise           1110
disgust            1033
desire              754
                    700
fear                608
remorse             527
embarrassment       474
nervousness         350
relief              284
pride               280
grief               116
Name: count, dtype: int64

Classes with less than 6 samples: []

Class Distribution After Removing Minority Classes:
emotion_list
neutral           11020
approval           3523
admiration         3413
ann