In this notebook we will process the files inside the folder DatasetsInUse

First, we process the data in the emotion_tweets_2020 folder.

In [192]:
import re
import inflect
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

def read_file(file_path):
    """Read a UTF-8 text file and return its lines.

    Args:
        file_path: path of the file to open for reading.

    Returns:
        list[str]: the file content split with ``str.splitlines``, so the
        line terminators are not included.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return content.splitlines()

def mapping(key_list):
    """Translate label codes into emotion names.

    Args:
        key_list: iterable of label strings; each must be one of
            '0', '1', '2' or '3'.

    Returns:
        list[str]: the emotion name for every label, in input order.

    Raises:
        KeyError: if a label is not present in the lookup table.
    """
    names_by_code = {
        '0': "anger",
        '1': "joy",
        '2': "optimism",
        '3': "sadness",
    }
    return list(map(names_by_code.__getitem__, key_list))

def specific_case(text):
    """Apply a handful of dataset-specific clean-ups to one tweet.

    In order:
      1. three consecutive ``&gt;`` entities become 'is better than';
      2. the substring "szn" becomes "season";
      3. any remaining ``&gt``/``&lt`` entity (semicolon optional) is dropped;
      4. the accented letters ó, ñ and é are replaced by o, n and e.

    Args:
        text: the raw tweet string.

    Returns:
        str: the cleaned string.
    """
    cleaned = re.sub(r'(&gt;){3}', 'is better than', text)
    cleaned = cleaned.replace("szn", "season")
    # Must run after step 1, otherwise the '>>>' idiom would be erased.
    cleaned = re.sub(r'&[gl]t;?', '', cleaned)
    for accented, plain in (("ó", "o"), ("ñ", "n"), ("é", "e")):
        cleaned = cleaned.replace(accented, plain)
    return cleaned

def normalize_repeated_characters(text):
    """Collapse runs of three or more identical characters into one.

    Runs of exactly two characters are left untouched. Newline characters
    are never collapsed because ``.`` does not match them.

    Args:
        text: input string.

    Returns:
        str: the string with long character runs reduced to one character.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r'\1', text)

def remove_user_mentions(text):
    """Strip the '@' from @handles while keeping the handle text itself.

    An '@' that is not directly followed by a word character is left as is.

    Args:
        text: input string.

    Returns:
        str: the string with '@' removed in front of every handle.
    """
    handle = re.compile(r'@(\w+)')
    return handle.sub(r'\1', text)

def process_more_sign(text):
    """Rewrite the different uses of '+' found in the tweets.

    The substitutions are applied one after another, each on the output of
    the previous one:
      * 'user +' (with any leading whitespace) -> 'user'
      * '#' followed by one or more '+'        -> removed
      * '+' right after a digit                -> ' more '
      * '+' right after whitespace             -> ' plus '
      * any remaining '+1'                     -> ' plus one '

    Args:
        text: input string.

    Returns:
        str: the rewritten string.
    """
    # Order matters: later rules only see what earlier rules left behind.
    rewrites = (
        (r'\s*user \+', 'user'),
        (r'#\++', ''),
        (r'(?<=\d)\+', ' more '),
        (r'(?<=\s)\+', ' plus '),
        (r'\+1', ' plus one '),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text

def process_dollar(text):
    """Spell out dollar notation.

    Steps, in order:
      1. two or more consecutive '$' become the word 'cash';
      2. '$' followed by a number (optionally with exactly two decimals)
         becomes '<number> dollars ';
      3. every '$' still present is removed.

    Args:
        text: input string.

    Returns:
        str: the string without any '$' character.
    """
    repeated_sign = re.compile(r'\${2,}')
    priced_amount = re.compile(r'\$(\d+(?:\.\d{2})?)')
    leftover_sign = re.compile(r'\$*')

    def _amount_in_words(match):
        return match.group(1) + ' dollars '

    without_slang = repeated_sign.sub('cash', text)
    with_amounts = priced_amount.sub(_amount_in_words, without_slang)
    return leftover_sign.sub('', with_amounts)

def process_euro(text):
    """Rewrite '€<number>' as '<number> euros '.

    The number may carry exactly two decimals. A '€' that is not directly
    followed by a digit is left unchanged.

    Args:
        text: input string.

    Returns:
        str: the string with euro amounts spelled out.
    """
    euro_amount = re.compile(r'\€(\d+(?:\.\d{2})?)')

    def _spell_out(match):
        return match.group(1) + ' euros '

    return euro_amount.sub(_spell_out, text)

def process_pounds(text):
    """Rewrite '£<number>' as '<number> pounds '.

    The number may carry exactly two decimals. A '£' that is not directly
    followed by a digit is left unchanged.

    Args:
        text: input string.

    Returns:
        str: the string with pound amounts spelled out.
    """
    pound_amount = re.compile(r'\£(\d+(?:\.\d{2})?)')

    def _spell_out(match):
        return match.group(1) + ' pounds '

    return pound_amount.sub(_spell_out, text)

def process_percent(text):
    """Spell out '%' after a number or whitespace, and drop any other '%'.

    A '%' preceded by whitespace or by a number (with up to two decimals)
    becomes ' percent '; the text in front of it is kept. Any '%' that
    remains afterwards is deleted.

    Args:
        text: input string.

    Returns:
        str: the string without any '%' character.
    """
    percent_after_number_or_space = re.compile(r'(?:\s+|\d+(?:\.\d{0,2})?)%')

    def _spell_out(match):
        return match.group(0).replace('%', ' percent ')

    spelled = percent_after_number_or_space.sub(_spell_out, text)
    return re.sub(r'%', '', spelled)

def process_equal(text):
    """Replace every '=' with ' equals ' (surrounded by spaces).

    Args:
        text: input string.

    Returns:
        str: the string with each equals sign spelled out.
    """
    equals_sign = re.compile(r'=')
    return equals_sign.sub(' equals ', text)

def process_at(text):
    """Replace a stand-alone '@' with ' at '.

    Only an '@' that has whitespace on both sides is rewritten; the
    surrounding whitespace is kept because it is matched by look-arounds.

    Args:
        text: input string.

    Returns:
        str: the string with isolated '@' signs spelled out.
    """
    isolated_at = re.compile(r'(?<=\s)@(?=\s)')
    return isolated_at.sub(' at ', text)

def remove_newlines(text):
    """Replace the literal two-character sequence backslash + 'n' by a space.

    Real newline characters are not touched: the pattern matches an
    escaped newline written out in the text, not a line break.

    Args:
        text: input string.

    Returns:
        str: the string with every written-out '\\n' replaced by ' '.
    """
    escaped_newline = re.compile(r'\\n')
    return escaped_newline.sub(' ', text)

def process_amp(text):
    """Replace the HTML entity '&amp' (semicolon optional) with ' and '.

    Args:
        text: input string.

    Returns:
        str: the string with ampersand entities spelled out.
    """
    amp_entity = re.compile(r'&amp;?')
    return amp_entity.sub(' and ', text)

def process_hyphen(text):
    """Rewrite a numeric range 'A-B' as 'A to B'.

    Whitespace around the hyphen is allowed and is dropped. Hyphens that
    are not between two numbers are left unchanged. Matches do not
    overlap, so in '1-2-3' only the first pair is rewritten.

    Args:
        text: input string.

    Returns:
        str: the string with numeric ranges spelled out.
    """
    numeric_range = re.compile(r'(\d+)\s*-\s*(\d+)')
    return numeric_range.sub(r'\1 to \2', text)

def replace_numbers_with_words(text):
    """Replace every number in `text` by its English wording.

    Integers and decimals (digits '.' digits) are matched and each match is
    converted with ``inflect``'s ``number_to_words``.

    The substitution is done in a single pass with a callback so that each
    number is converted exactly where it was matched. Substituting the
    numbers one at a time over the whole string corrupts longer numbers
    that contain a shorter one (for example "1 and 10" would turn into
    "one and one0").

    Args:
        text: input string.

    Returns:
        str: the string with every number spelled out.
    """
    p = inflect.engine()

    number_pattern = r'(\d+\.\d+|\d+)'

    def _to_words(match):
        return p.number_to_words(match.group(0))

    return re.sub(number_pattern, _to_words, text)

def clear_special_characters(text):
    """Replace every character that is not an ASCII letter, digit or
    whitespace with a single space.

    Args:
        text: input string.

    Returns:
        str: a string of the same length containing only ASCII
        alphanumerics and whitespace.
    """
    not_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
    return not_alphanumeric.sub(' ', text)

def lowercase_text(text):
    """Return `text` converted to lower case.

    Args:
        text: input string.

    Returns:
        str: the result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

def lemmatize_text(text):
    """Lemmatize every token of `text`, treating each one as a verb.

    The text is tokenized with ``nltk.word_tokenize``; each token is passed
    to ``WordNetLemmatizer.lemmatize`` with the ``wordnet.VERB`` part of
    speech, and the results are joined with single spaces.

    Args:
        text: input string.

    Returns:
        str: the space-joined lemmas.
    """
    verb_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(verb_lemmatizer.lemmatize(token, wordnet.VERB))
    return ' '.join(lemmas)

def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is tokenized with ``nltk.word_tokenize``; a token is kept when
    its lower-cased form is not in NLTK's English stop-word list. Kept
    tokens are joined with single spaces in their original casing.

    Args:
        text: input string.

    Returns:
        str: the space-joined remaining tokens.
    """
    english_stopwords = set(stopwords.words("english"))
    kept_tokens = [
        token
        for token in nltk.word_tokenize(text)
        if token.lower() not in english_stopwords
    ]
    return ' '.join(kept_tokens)

def process_text(text):
    """Run one tweet through the whole cleaning pipeline.

    Every step takes a string and returns a string; the steps are applied
    strictly in the order listed below, each on the output of the previous
    one.

    Args:
        text: the raw tweet string.

    Returns:
        str: the fully processed text.
    """
    # Order matters: symbol rewrites come first, numbers are spelled out
    # before remaining special characters are cleared, and lemmatization
    # and stop-word removal run last on lower-cased text.
    pipeline = (
        specific_case,
        remove_user_mentions,
        process_more_sign,
        process_dollar,
        process_euro,
        process_pounds,
        process_percent,
        process_hyphen,
        process_equal,
        process_at,
        remove_newlines,
        process_amp,
        replace_numbers_with_words,
        normalize_repeated_characters,
        clear_special_characters,
        lowercase_text,
        lemmatize_text,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text

def create_df(keys_file, values_file):
    """Build a DataFrame of processed tweets and their emotion labels.

    Args:
        keys_file: path of the label file; its lines are read with
            ``read_file`` and translated with ``mapping``.
        values_file: path of the text file; each line is cleaned with
            ``process_text``.

    Returns:
        pandas.DataFrame: two columns, "text" (processed tweets) and
        "emotions" (emotion names), one row per input line.
    """
    emotions = mapping(read_file(keys_file))
    raw_texts = read_file(values_file)

    cleaned_texts = list(map(process_text, raw_texts))

    return pd.DataFrame({
        "text": cleaned_texts,
        "emotions": emotions,
    })

def create_dictionary(keys_file, values_file):
    """Collect the non-alphanumeric characters that occur in the raw texts.

    For every character that is not an ASCII letter, digit or whitespace,
    the first text line containing it is remembered as an example. The
    characters and their example lines are printed, then returned.

    Args:
        keys_file: path of the label file; it is read and mapped so that
            its length can be checked against the text file.
        values_file: path of the text file to inspect.

    Returns:
        dict[str, str] | None: a mapping from each special character to
        the first line in which it appears, or None (after printing an
        error) when the two files do not have the same number of lines.
    """
    keys = mapping(read_file(keys_file))
    values = read_file(values_file)

    if len(keys) != len(values):
        print("Error: The number of keys and values does not match.")
        return None

    non_alphanumeric_characters = {}

    for value in values:
        for character in re.findall(r'[^a-zA-Z0-9\s]', value):
            # setdefault keeps the first example seen for each character.
            non_alphanumeric_characters.setdefault(character, value)

    print("Non-alphanumeric characters in the text:")
    for ch, sent in non_alphanumeric_characters.items():
        print(ch)
        print(sent)

    # The previous code returned an undefined name here (NameError);
    # return the dictionary that was actually built.
    return non_alphanumeric_characters

# Driver for the emotion_tweets_2020 training split: paths are relative to
# the notebook's working directory.
keys_file = "DatasetsInUse/emotion_tweets_2020/train_labels.txt"
values_file = "DatasetsInUse/emotion_tweets_2020/train_text.txt"

# Runs the full cleaning pipeline on every line of the text file.
df = create_df(keys_file, values_file)

# Number of rows in the resulting frame.
print(df.shape[0])

# Last expression of the cell, so the notebook renders it as a table.
df.head()

# Optional character survey of the raw text, left disabled:
# resulting_dict = create_dictionary(keys_file, values_file)

#for key in list(resulting_dict.keys())[:3]:
    #value = resulting_dict[key]
    #print(key, value)

3257


Unnamed: 0,text,emotions
0,worry payment problem may never joyce meyer mo...,optimism
1,roommate okay spell autocorrect terrible first...,anger
2,cute atsu probably shy photos cherry help uwu,joy
3,rooneys fuck untouchable fuck dreadful depay l...,anger
4,pretty depress u hit pan ur favourite highlighter,sadness


Second, we process the data in the emotion folder.

In [178]:
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
# This rebinds the notebook-global `df`.
df = pd.read_pickle("DatasetsInUse/emotion/merged_training.pkl")

# Maps each special character to the first text in which it was seen.
non_alphanumeric_characters = {}

# NOTE(review): `re` is not imported in this cell; it relies on an import
# executed in another cell.
for row in df.itertuples():
    # Everything that is not an ASCII letter, digit or whitespace.
    characters = re.findall(r'[^a-zA-Z0-9\s]', row.text)
    for character in characters:
            if character not in non_alphanumeric_characters:
                non_alphanumeric_characters[character] = row.text
    #print(row.emotions, row.text)

# print("Non-alphanumeric characters in the text:")
for ch, sent in non_alphanumeric_characters.items():
    print(ch)
    print(sent)

# Not the last expression of the cell, so this value is computed but not
# displayed by the notebook.
df.emotions.unique()
df.head()

Unnamed: 0,text,emotions
27383,i feel awful about it too because it s my job ...,sadness
110083,im alone i feel awful,sadness
140764,ive probably mentioned this before but i reall...,joy
100071,i was feeling a little low few days back,sadness
2837,i beleive that i am much more sensitive to oth...,love


In [45]:
import re

def normalize_repeated_characters(text):
    """Collapse runs of three or more identical characters into one.

    Runs of exactly two characters are left untouched.
    """
    long_run = re.compile(r'(.)\1{2,}')
    return long_run.sub(r'\1', text)

# Example: elongated words and repeated punctuation are shortened.
text = "I'm sooooo happyyyy today!!!"
normalized_text = normalize_repeated_characters(text)

print(normalized_text)

I'm so happy today!


In [53]:
def remove_user_mentions(text):
    """Replace every @handle with the literal word 'user'.

    Note: the `remove_user_mentions` defined in the first code cell keeps
    the handle text and only drops the '@'. Both share the same name, so
    whichever cell was executed last is the one in effect.
    """
    handle = re.compile(r'@\w+')
    return handle.sub('user', text)

# Example: the mention is anonymized, the rest of the sentence is kept.
text = "Hey @user, great post!"
cleaned_text = remove_user_mentions(text)

print(cleaned_text)

Hey user, great post!


In [65]:
import string

def remove_special_characters(text):
    """Delete every character listed in ``string.punctuation`` (ASCII
    punctuation only); all other characters are kept as they are.
    """
    kept = [char for char in text if char not in string.punctuation]
    return ''.join(kept)

# Example: ASCII punctuation is deleted by the function, then the
# remaining non-ASCII symbols are replaced by spaces with a regex.
text = "Hello, Twitter! £§£€5 #NLP"
cleaned_text = remove_special_characters(text)
cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', ' ', cleaned_text)

print(cleaned_text)

Hello Twitter     5 NLP


In [185]:
def replace_szn_with_season(text):
    """Replace every 'é' with 'e'.

    Despite its name, this version does not touch "szn": the
    "szn" -> "season" replacement was commented out while experimenting
    with accent removal.
    """
    return text.replace("é", "e")

# Example (original_text is kept from the earlier "szn" experiment and is
# not used below).
original_text = "@user szn 3 &gt;&gt;&gt; szn 1 &gt;&gt;&gt; szn 2. Just to warn you. Don't let szn 2 discourage you. "
text = "bdjdb berbéu é jhdbé éhdhdh ggdéff"
modified_text = replace_szn_with_season(text)

print("Original Text:", text)
print("Modified Text:", modified_text)

Original Text: bdjdb berbéu é jhdbé éhdhdh ggdéff
Modified Text: bdjdb berbeu e jhdbe ehdhdh ggdeff


In [100]:
import re

def replace_is_better_than(text):
    """Replace three consecutive '&gt;' entities (an escaped '>>>') with
    the phrase 'is better than'.
    """
    triple_gt = re.compile(r'(&gt;){3}')
    return triple_gt.sub('is better than', text)

# Example: each escaped '>>>' in the tweet is spelled out.
original_text = "@user szn 3 &gt;&gt;&gt; szn 1 &gt;&gt;&gt; szn 2. Just to warn you. Don't let szn 2 discourage you. "
modified_text = replace_is_better_than(original_text)

print("Original Text:", original_text)
print("Modified Text:", modified_text)

Original Text: @user szn 3 &gt;&gt;&gt; szn 1 &gt;&gt;&gt; szn 2. Just to warn you. Don't let szn 2 discourage you. 
Modified Text: @user szn 3 is better than szn 1 is better than szn 2. Just to warn you. Don't let szn 2 discourage you. 


In [108]:
import re

def replace_user_plus_at_beginning(text):
    """Replace 'user +' (with any whitespace in front of it) by 'user'.

    The pattern is not anchored, so the replacement happens wherever the
    sequence occurs, not only at the beginning of the string.
    """
    user_plus = re.compile(r'\s*user \+')
    return user_plus.sub('user', text)

# Examples: leading whitespace is optional and is removed with the match.
text1 = ' user + account created'
text2 = 'user + account created'
text3 = '    user + account created'

print(replace_user_plus_at_beginning(text1))  # 'user account created'
print(replace_user_plus_at_beginning(text2))  # 'user account created'
print(replace_user_plus_at_beginning(text3))  # 'user account created'


user account created
user account created
user account created


In [112]:
import re

def find_more_than_one_dollar_signs(text):
    """Return every run of two or more consecutive '$' found in `text`,
    in order of appearance.
    """
    repeated_sign = re.compile(r'\${2,}')
    return repeated_sign.findall(text)

# Example: each run is reported as one string.
text = "The price is $$10.00, $$$25.50, and $$$$50.75."
result = find_more_than_one_dollar_signs(text)
print(result)  # ['$$', '$$$', '$$$$']

['$$', '$$$', '$$$$']


In [152]:
import re

def add_dollars_to_numbers(text):
    """Rewrite '$<number>' as '<number> dollars'.

    The number may carry exactly two decimals; a '$' not followed by a
    digit is left unchanged.
    """
    priced_amount = re.compile(r'\$(\d+(?:\.\d{2})?)')

    def _spell_out(match):
        # Keep the captured number, drop the sign, append the currency.
        return match.group(1) + ' dollars'

    return priced_amount.sub(_spell_out, text)

# Example
text = "The price is $10.00, but the total is $50.75."
result = add_dollars_to_numbers(text)
print(result)  # "The price is 10.00 dollars, but the total is 50.75 dollars."


The price is 10.00 dollars, but the total is 50.75 dollars.


In [119]:
import re

def replace_percentages_with_string(text):
    """Replace a '%' that follows a number or whitespace with ' percent '.

    The text in front of the sign is kept, and the replacement carries its
    own surrounding spaces, so extra spaces can appear in the result.
    """
    percent_after_number_or_space = re.compile(r'(?:\s+|\d+(?:\.\d{0,2})?)%')

    def _spell_out(match):
        return match.group(0).replace('%', ' percent ')

    return percent_after_number_or_space.sub(_spell_out, text)

# Example
text = "The interest rate is 5%, and the discount is 10.25 %."
result = replace_percentages_with_string(text)
print(result)  # "The interest rate is 5 percent , and the discount is 10.25  percent ."


The interest rate is 5 percent , and the discount is 10.25  percent .


In [124]:
import re

# Check that only an '@' surrounded by whitespace is found; the one inside
# the e-mail address must not match.
text = "Send me an email at user@example.com. Mention me @ mention if you have questions."

isolated_at = re.compile(r'\s@\s')
matches = isolated_at.findall(text)

print(matches)


[' @ ']


In [132]:
import re

# Dropping '@' in front of a word also affects e-mail addresses, as the
# output below shows.
text = "Send me an email at user@example.com. Mention me @mention if you have questions."

# The capturing group holds the word after '@'; the whole match is
# replaced by that word alone.
handle = re.compile(r'@(\w+)')
result = handle.sub(r'\1', text)

print(result)


Send me an email at userexample.com. Mention me mention if you have questions.


In [151]:
import re

def process_at(text):
    """Replace an '@' that has whitespace on both sides with ' at '.

    The neighbouring whitespace is matched by look-arounds and therefore
    kept, which is why the result contains double spaces.
    """
    isolated_at = re.compile(r'(?<=\s)@(?=\s)')
    return isolated_at.sub(' at ', text)

# Example usage
text = "Send me an email at user @ example.com. Mention me @ mention if you have questions."

result = process_at(text)
print(result)


Send me an email at user  at  example.com. Mention me  at  mention if you have questions.


In [158]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    """Tokenize `text` with ``nltk.word_tokenize`` and lemmatize every
    token as a verb; the lemmas are joined with single spaces.
    """
    verb_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(verb_lemmatizer.lemmatize(token, wordnet.VERB))
    return ' '.join(lemmas)

# Example: the irregular spacing disappears because tokens are re-joined
# with one space each.
text = "I am                running in        the           park               ."
lemmatized_text = lemmatize_text(text)

print(lemmatized_text)

I be run in the park .


In [179]:
import pandas as pd

# Toy labels and sentences used to prototype the DataFrame layout.
keys = ["Anger", "Joy", "Optimism", "Sadness", "Joy", "Joy"]
values = ["I am Rachel", "You are out of your mind", "Lalalala", "No more", "Yay", "Let's go!"]


def process_value(value):
    """Placeholder for the real text processing: upper-case the value."""
    return value.upper()


# Process every sentence, preserving order so rows stay aligned with keys.
processed_values = list(map(process_value, values))

# Column name -> column content.
data_dict = {
    "text": processed_values,
    "emotions": keys,
}

df = pd.DataFrame(data_dict)

# Last expression of the cell: rendered as a table by the notebook.
df.head()


Unnamed: 0,text,emotions
0,I AM RACHEL,Anger
1,YOU ARE OUT OF YOUR MIND,Joy
2,LALALALA,Optimism
3,NO MORE,Sadness
4,YAY,Joy
