In this notebook we process the files inside the DatasetsInUse folder.

First, we process the data from the emotion_tweets_2020 folder.

In [53]:
import re
import inflect
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

def read_file(file_path):
    """Read a UTF-8 text file and return its lines without terminators.

    Args:
        file_path: Path of the file to read.

    Returns:
        list[str]: One entry per line, as split by ``str.splitlines``.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return content.splitlines()

def mapping(key_list):
    """Translate numeric label strings into emotion names.

    Args:
        key_list: Iterable of label strings, each one of ``'0'``, ``'1'``,
            ``'2'`` or ``'3'``.

    Returns:
        list[str]: The emotion name of every label, in input order.

    Raises:
        KeyError: If a label is not one of the four known keys.
    """
    label_names = {'0': "anger", '1': "joy", '2': "optimism", '3': "sadness"}
    return list(map(label_names.__getitem__, key_list))

def specific_case(text):
    """Apply the hand-picked fixes needed by the tweet dataset.

    In order: ``&gt;&gt;&gt;`` becomes ``is better than``, ``szn`` becomes
    ``season``, leftover ``&gt``/``&lt`` entities (with or without the
    trailing ``;``) are removed, and a few accented letters are replaced
    by their plain counterparts.

    Args:
        text: Raw tweet text.

    Returns:
        str: The text with the substitutions applied.
    """
    cleaned = re.sub(r'(&gt;){3}', 'is better than', text)
    cleaned = cleaned.replace("szn", "season")
    cleaned = re.sub(r'&[gl]t;?', '', cleaned)
    for accented, plain in (("ó", "o"), ("ñ", "n"), ("é", "e")):
        cleaned = cleaned.replace(accented, plain)
    return cleaned

def normalize_repeated_characters(text):
    """Collapse every run of three or more identical characters to one.

    Runs of exactly two characters are kept as they are. Newlines are never
    collapsed because ``.`` does not match them.
    """
    return re.sub(r'(.)\1{2,}', lambda run: run.group(1), text)

def remove_user_mentions(text):
    """Strip the ``@`` from mentions while keeping the handle itself.

    ``@user`` becomes ``user``; an ``@`` that is not directly followed by a
    word character is left untouched.
    """
    return re.sub(r'@(\w+)', lambda mention: mention.group(1), text)

def process_more_sign(text):
    """Rewrite the different uses of ``+`` as words.

    The rules run in this order, each on the output of the previous one:
    ``user +`` (with any leading whitespace) becomes ``user``; ``#`` followed
    by plus signs is dropped; a ``+`` after a digit becomes `` more ``; a
    ``+`` after whitespace becomes `` plus ``; any remaining ``+1`` becomes
    `` plus one ``.
    """
    substitutions = (
        (r'\s*user \+', 'user'),
        (r'#\++', ''),
        (r'(?<=\d)\+', ' more '),
        (r'(?<=\s)\+', ' plus '),
        (r'\+1', ' plus one '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def process_dollar(text):
    """Spell out dollar amounts and drop every other dollar sign.

    Rules, applied in order:

    1. Two or more consecutive ``$`` become the word ``cash``.
    2. Prefix amounts (``$5``, ``$3.50``) become ``5 dollars ``.
    3. Suffix amounts (``5$``, ``20 $``) become ``5 dollars ``.
    4. Any ``$`` still left is removed.

    Args:
        text: Text that may contain dollar signs.

    Returns:
        str: The text without any ``$`` character.
    """
    result = re.sub(r'\${2,}', 'cash', text)
    result = re.sub(r'\$(\d+(?:\.\d{2})?)', r'\1 dollars ', result)
    # Suffix amounts must be handled here: the final strip below removes every
    # '$', so no later pipeline step can still recognise "5$" as money.
    result = re.sub(r'(\d+(?:\.\d{2})?)\s*\$', r'\1 dollars ', result)
    return result.replace('$', '')

def process_euro(text):
    """Rewrite prefix euro amounts such as ``€5`` or ``€2.50`` as ``5 euros ``.

    Only a euro sign directly followed by digits is handled; any other euro
    sign is left in place.
    """
    return re.sub(r'\€(\d+(?:\.\d{2})?)', r'\1 euros ', text)

def process_pounds(text):
    """Rewrite prefix pound amounts such as ``£5`` or ``£1.99`` as ``5 pounds ``.

    Only a pound sign directly followed by digits is handled; any other pound
    sign is left in place.
    """
    return re.sub(r'\£(\d+(?:\.\d{2})?)', r'\1 pounds ', text)

def process_percent(text):
    """Spell out ``%`` after a number or whitespace and drop any other ``%``.

    A percent sign preceded by whitespace or by a number (with up to two
    decimals) becomes `` percent ``; every remaining ``%`` is deleted.
    """
    def _spell_out(match):
        return match.group(0).replace('%', ' percent ')

    spelled = re.sub(r'(?:\s+|\d+(?:\.\d{0,2})?)%', _spell_out, text)
    return re.sub(r'%', '', spelled)

def process_equal(text):
    """Replace every ``=`` with the word ``equals`` padded by spaces."""
    equals_sign = re.compile(r'=')
    return equals_sign.sub(' equals ', text)

def process_at(text):
    """Replace a free-standing ``@`` (whitespace on both sides) with `` at ``."""
    standalone_at = re.compile(r'(?<=\s)@(?=\s)')
    return standalone_at.sub(' at ', text)

def remove_newlines(text):
    """Replace each escaped newline (a backslash followed by ``n``) with a space.

    Real newline characters are not touched; only the two-character escape
    sequence is matched.
    """
    escaped_newline = re.compile(r'\\n')
    return escaped_newline.sub(' ', text)

def process_amp(text):
    """Replace the HTML entity ``&amp`` (trailing ``;`` optional) with `` and ``."""
    amp_entity = re.compile(r'&amp;?')
    return amp_entity.sub(' and ', text)

def process_hyphen(text):
    """Rewrite integer ranges such as ``9-5`` or ``10 - 20`` as ``9 to 5``."""
    return re.sub(
        r'(\d+)\s*-\s*(\d+)',
        lambda span: f"{span.group(1)} to {span.group(2)}",
        text,
    )

def replace_numbers_with_words(text):
    """Spell out every integer or decimal number in ``text`` as English words.

    Each number is converted exactly where it occurs, in one left-to-right
    pass. Substituting number by number across the whole string would let a
    short number rewrite the digits of a longer one (``"1 and 11"`` would
    turn into ``"one and oneone"``).

    Args:
        text: Text that may contain digits.

    Returns:
        str: The text with each number replaced by the output of
        ``inflect``'s ``number_to_words``.
    """
    engine = inflect.engine()

    def _spell_out(match):
        return engine.number_to_words(match.group(0))

    # Decimals are listed first so "3.14" is converted as one number.
    return re.sub(r'\d+\.\d+|\d+', _spell_out, text)

def clear_special_characters(text):
    """Replace every character that is not an ASCII letter, digit or whitespace with a space."""
    not_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
    return not_alphanumeric.sub(' ', text)

def lowercase_text(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def lemmatize_text(text):
    """Tokenize ``text`` and lemmatize every token as a verb.

    Tokens come from ``nltk.word_tokenize``; each one is passed to the
    WordNet lemmatizer with the verb part of speech, and the results are
    joined with single spaces.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = [lemmatize(token, wordnet.VERB) for token in nltk.word_tokenize(text)]
    return ' '.join(lemmas)

def remove_stopwords(text):
    """Drop English stop words from ``text``.

    The text is tokenized with ``nltk.word_tokenize``; tokens whose
    lower-cased form is in NLTK's English stop-word list are removed and the
    rest are joined with single spaces.
    """
    english_stopwords = set(stopwords.words("english"))
    kept = [
        token
        for token in nltk.word_tokenize(text)
        if token.lower() not in english_stopwords
    ]
    return ' '.join(kept)

def process_text(text):
    """Run the full cleaning pipeline for one tweet.

    The steps are applied strictly in the listed order; each one receives
    the output of the previous one.

    Args:
        text: Raw tweet text.

    Returns:
        str: Lower-cased, lemmatized text without stop words.
    """
    pipeline = (
        specific_case,
        remove_user_mentions,
        process_more_sign,
        process_dollar,
        process_euro,
        process_pounds,
        process_percent,
        process_hyphen,
        process_equal,
        process_at,
        remove_newlines,
        process_amp,
        # Numbers are spelled out before repeated characters are collapsed,
        # so the digits of e.g. "1000" are not squeezed first.
        replace_numbers_with_words,
        normalize_repeated_characters,
        clear_special_characters,
        lowercase_text,
        lemmatize_text,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text

def create_df(keys_file, values_file):
    """Build a cleaned ``text``/``emotions`` DataFrame from two parallel files.

    Args:
        keys_file: Path of the label file, one numeric label per line.
        values_file: Path of the text file, one text per line.

    Returns:
        pandas.DataFrame: Column ``text`` holds the processed texts and
        column ``emotions`` the mapped emotion names.
    """
    emotions = mapping(read_file(keys_file))
    cleaned_texts = [process_text(raw_text) for raw_text in read_file(values_file)]
    return pd.DataFrame({"text": cleaned_texts, "emotions": emotions})

# Label and text files of the emotion_tweets_2020 training split
keys_file = "DatasetsInUse/emotion_tweets_2020/train_labels.txt"
values_file = "DatasetsInUse/emotion_tweets_2020/train_text.txt"

# Cleaned tweet dataset; df1 is concatenated with the other datasets further down
df1 = create_df(keys_file, values_file)

Secondly, we will process the data from the emotion folder

In [54]:
import pandas as pd

def get_data(file):
    """Load a pickled pandas object from ``file``.

    NOTE(review): unpickling can execute arbitrary code, so only pass files
    that come from a trusted source.
    """
    loaded = pd.read_pickle(file)
    return loaded

# Pre-built pickle of the "emotion" dataset; get_data unpickles it, so the file must be trusted
file = "DatasetsInUse/emotion/merged_training.pkl"
df2 = get_data(file)
# Number of rows loaded
print(df2.shape[0])

416809


Lastly, we will process the data from the go_emotion folder

In [55]:
import re

def process_slash_data(text):
    """Clean up slashes in Reddit comments.

    The ``r/`` / ``R/`` subreddit prefix is removed when it follows
    whitespace or starts the text; after that ``word / word`` becomes
    ``word or word``.
    """
    rules = (
        (r'(?<=\s)[rR]/', ''),
        (r"^(?:r/|R/)", ''),
        (r'(\w+)\s*/\s*(\w+)', r'\1 or \2'),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

def process_more_data(text):
    """Rewrite ``+`` as words.

    In order: ``+N`` becomes `` more N``, a ``+`` after a digit becomes
    `` more ``, and a ``+`` after whitespace becomes `` plus ``.
    """
    rules = (
        (r'\+(\d+)', r' more \1'),
        (r'(?<=\d)\+', ' more '),
        (r'(?<=\s)\+', ' plus '),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

def process_and_data(text):
    """Replace every ``&``, together with the whitespace around it, by `` and ``."""
    ampersand = re.compile(r'\s*&\s*')
    return ampersand.sub(' and ', text)

def specific_case_data(text):
    """Apply the hand-written, dataset-specific fixes for the go_emotion texts.

    The substitutions run top to bottom, each on the result of the previous
    one: a handful of literal rewrites and removals, phonetic transcriptions
    dropped, the ``[NAME]`` placeholder turned into ``user``, one very long
    digit string replaced by ``very big``, and accented / decorative letters
    mapped to plain ASCII letters.

    Args:
        text: Raw comment text.

    Returns:
        str: The text with all substitutions applied.
    """
    # Only matches "bi" with whitespace on both sides, so a leading or trailing "bi" is kept
    text = re.sub(r'\sbi\s', ' bisexual ', text)
    text = re.sub(r'9-1-1', '911', text)
    text = re.sub(r'0-0-0-0-0-10-0-0-01-0-01-0-0-10-0-0', '', text)
    # Drop the "ish" suffix from approximate numbers ("30ish" -> "30")
    text = re.sub(r'(\d+)ish', r'\1', text)
    text = re.sub(r't@gged', 'tagged', text)
    # Removes any three-character sequence of the form @x@
    text = re.sub(r'@.@', '', text)
    text = re.sub(r'🐇', 'rabbit', text)
    text = re.sub(r'I\'m', 'I am', text)
    text = re.sub(r'≠', ' does not equal ', text)
    text = re.sub(r'you[´\']re', ' you are ', text)
    text = re.sub(r"🤰", ' emoji ', text)
    # Phonetic transcriptions that are removed entirely
    patterns = [r'\(fæ-shē\)', r'\/ˈsatʌɪə\/', r'\/ˈteCHē\/', r'\/ˈbɪɡət\/']
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    # Upside-down text, rewritten in readable form
    text = re.sub(r"xıs :ɹǝʍsuɐ", ' answer: six ', text)
    text = re.sub(r"\[NAME\]", ' user ', text)
    text = re.sub(r"pathetic-ness", ' patheticness ', text)
    # One specific, extremely long number literal is summarised as "very big"
    text = re.sub(r'999999999999999999999999999999999999999999999999999999999999999999999999999999999999999991000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001234567898765432345676543345678987654345678909876543234567898765432345678909876543234567898765432345678987654323456787654345676543456543456434543434343434323456765434567654323454323456543345678987654323456789876565656565656565656565656565454545654565454323456765432345678765456', ' very big ', text)
    # Map accented and decorative letters to plain ASCII letters
    text = text.replace("ó", "o")
    text = text.replace("ñ", "n")
    text = text.replace("é", "e")
    text = text.replace("ň", "n")
    text = text.replace("Я", "r")
    text = text.replace("ø", "o")
    text = text.replace("á", "a")
    text = text.replace("ī", "i")
    text = text.replace("ï", "i")
    text = text.replace("🅱", "b")
    text = text.replace("ò", "o")
    text = text.replace("ā", "a")
    text = text.replace("ú", "u")
    text = text.replace("è", "e")
    text = text.replace("Á", "A")
    text = text.replace("ç", "c")
    # NOTE(review): plain substring replace -- also rewrites "abt" inside longer words
    text = text.replace("abt", "about")
    
    return text

def process_money_data(text):
    """Spell out suffix currency amounts.

    ``5€`` / ``5 €`` becomes `` 5 euros `` and ``5$`` / ``5 $`` becomes
    `` 5 dollars ``. Only the integer digits directly before the sign are
    captured.
    """
    rules = (
        (r"(\d+)\s*\€", r' \1 euros '),
        (r"(\d+)\s*\$", r' \1 dollars '),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

def process_hyphen_decimal_places(text):
    """Rewrite numeric ranges, decimals included, as ``A to B``.

    ``1.5-2.5`` becomes ``1.5 to 2.5`` and ``3 - 4`` becomes ``3 to 4``.
    """
    return re.sub(
        r'(\d+(?:\.\d*)?)\s*-\s*(\d+(?:\.\d*)?)',
        lambda span: f"{span.group(1)} to {span.group(2)}",
        text,
    )

def process_at_data(text):
    """Strip the ``@`` from mentions, then turn every remaining ``@`` into `` at ``."""
    without_mentions = re.sub(r'@(\w+)', lambda mention: mention.group(1), text)
    return re.sub(r'@', ' at ', without_mentions)

def length_word_bigger_one(text):
    """Drop every whitespace-separated word that is a single character long.

    The surviving words are joined with single spaces.
    """
    return ' '.join(filter(lambda word: len(word) > 1, text.split()))

def final_processing(text):
    """Run the full cleaning pipeline for one go_emotion comment.

    The steps are applied strictly in the listed order; each one receives
    the output of the previous one.

    Args:
        text: Raw comment text.

    Returns:
        str: Lower-cased, lemmatized text without stop words and without
        single-character words.
    """
    pipeline = (
        specific_case_data,
        process_slash_data,
        process_more_data,
        process_dollar,
        process_euro,
        process_pounds,
        # NOTE(review): process_dollar has already removed every '$' by the
        # time this step runs, so only its euro rule can still match here.
        process_money_data,
        process_percent,
        process_hyphen_decimal_places,
        process_equal,
        process_at_data,
        remove_newlines,
        process_amp,
        process_and_data,
        replace_numbers_with_words,
        normalize_repeated_characters,
        clear_special_characters,
        lowercase_text,
        lemmatize_text,
        remove_stopwords,
        length_word_bigger_one,
    )
    for step in pipeline:
        text = step(text)
    return text

In [56]:
import pandas as pd
import json

def emotion_mapping(emotions, emotion_map, ekman_map):
    """Reduce a comma-separated list of emotion ids to one grouped emotion.

    Each id is mapped to its emotion name through ``emotion_map``; each name
    is then mapped to the value of every ``ekman_map`` entry whose key
    contains it. The result is chosen as follows:

    * no group found -> ``'neutral'``;
    * some group occurs more than once -> the most frequent group, ties
      going to the group that appears first;
    * otherwise -> ``'neutral'`` if it is among the groups, else the first
      group.

    Args:
        emotions: Comma-separated emotion ids, e.g. ``"2,17"``.
        emotion_map: Mapping from integer id to emotion name.
        ekman_map: Mapping from a collection of emotion names to the name of
            their group.

    Returns:
        str: The selected group name.

    Raises:
        KeyError: If an id is missing from ``emotion_map``.
        ValueError: If an id is not an integer.
    """
    from collections import Counter

    labels = [emotion_map[int(num)] for num in str(emotions).split(',')]
    groups = [
        group
        for label in labels
        for members, group in ekman_map.items()
        if label in members
    ]
    if not groups:
        return 'neutral'

    # Counter keeps first-seen order among equal counts, so ties are resolved
    # deterministically instead of by set iteration order.
    top_group, top_count = Counter(groups).most_common(1)[0]
    if top_count > 1:
        return top_group
    if 'neutral' in groups:
        return 'neutral'
    return groups[0]

def read_goemotions(
    df_file,
    emotions_file='DatasetsInUse/go_emotion/emotions.txt',
    ekman_file='DatasetsInUse/go_emotion/ekman_mapping.json',
):
    """Load a go_emotion TSV split and return cleaned texts with grouped emotions.

    Args:
        df_file: Path of the tab-separated file with the columns text,
            emotion ids and ID (no header row).
        emotions_file: Text file with one emotion name per line; the line
            index is the emotion id.
        ekman_file: JSON file mapping each group name to the list of emotion
            names that belong to it.

    Returns:
        pandas.DataFrame: Column ``text`` holds the processed texts and
        column ``emotions`` the grouped emotion of each row.

    Raises:
        FileNotFoundError: If one of the input files does not exist.
        json.JSONDecodeError: If ``ekman_file`` is not valid JSON.
    """
    df3 = pd.read_csv(df_file, delimiter='\t', names=['text', 'emotions', 'ID'])
    df3 = df3.drop(columns=['ID']).reset_index(drop=True)

    # splitlines() avoids a phantom empty label when the file ends with a newline
    with open(emotions_file, 'r', encoding='utf-8') as labels_file:
        emotion_map = dict(enumerate(labels_file.read().splitlines()))

    # Errors are left to propagate: without the mapping nothing below can run
    with open(ekman_file, 'r', encoding='utf-8') as json_file:
        ekman_groups = json.load(json_file)

    # Reverse lookup: tuple of member emotions -> group name
    ekman_map = {tuple(members): group for group, members in ekman_groups.items()}
    ekman_map[("neutral", )] = "neutral"

    df3['emotions'] = df3['emotions'].apply(emotion_mapping, args=(emotion_map, ekman_map))
    df3['text'] = df3['text'].apply(final_processing)

    return df3

# Load and clean the go_emotion training split
df_file = 'DatasetsInUse/go_emotion/train.tsv'
df3 = read_goemotions(df_file)
# Notebook display of the first rows
df3.head()

Unnamed: 0,text,emotions
0,favourite food anything cook,neutral
1,everyone think hes laugh screw people instead ...,neutral
2,fuck bayless isoing,anger
3,make feel threaten,fear
4,dirty southern wankers,anger


Finally, we will combine the datasets

Training data

In [57]:
def combine_df(df1, df2):
    """Stack two DataFrames vertically and renumber the rows from 0.

    Args:
        df1: Frame whose rows come first.
        df2: Frame whose rows are appended.

    Returns:
        pandas.DataFrame: The concatenation with a fresh integer index.
    """
    frames = [df1, df2]
    return pd.concat(frames, ignore_index=True)

# Stack the three cleaned datasets into one frame
df = combine_df(df1, df2)
df = combine_df(df, df3)
# combine_df already concatenates with ignore_index=True; the index is reset once more here
df = df.reset_index(drop=True)
# Total number of rows, followed by a preview
print(df.shape[0])
print(df.head())
# df.emotions.unique()

463476
                                                text  emotions
0  worry payment problem may never joyce meyer mo...  optimism
1  roommate okay spell autocorrect terrible first...     anger
2      cute atsu probably shy photos cherry help uwu       joy
3  rooneys fuck untouchable fuck dreadful depay l...     anger
4  pretty depress u hit pan ur favourite highlighter   sadness


Selecting data

In [61]:
# Emotion classes kept in the final training set
class_names = ['joy', 'sadness', 'anger', 'fear', 'love', 'surprise', 'neutral']
selected_data = []

for emotion in class_names:
    # Rows of the current class, in their existing order
    class_data = df[df['emotions'] == emotion]
    # Keep at most the first 6000 rows; head() returns every row when the
    # class has fewer than that
    selected_data.append(class_data.head(6000))

# Merge the per-class selections, then shuffle reproducibly
df = pd.concat(selected_data)
df = df.sample(frac=1, random_state=42)

# Number of samples per class
df.emotions.value_counts()


emotions
joy         6000
neutral     6000
love        6000
surprise    6000
anger       6000
sadness     6000
fear        6000
Name: count, dtype: int64

Saving the training data

In [62]:
import pandas as pd

# Output path of the pickle file, relative to the working directory
file_name_pkl = 'clean_training_data.pkl'

# Persist the current DataFrame so it can be reloaded later without re-running the cleaning
df.to_pickle(file_name_pkl)