In [4]:
import pandas as pd
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [5]:
# Read the combined dataset CSV into the DataFrame used by every cell below
df_combine = pd.read_csv('../datasets/00_output_datasets/dataset_combined.csv')

## Standard Pre-processing Workflow

### Text Cleaning (Removing special characters & numbers and handling contractions)

In [6]:
# Contractions that clean_text expands before punctuation is stripped.
# Keys are lowercase because matching is case-insensitive and the matched
# text is lowercased before the lookup.
_CONTRACTIONS = {
    "i'm": "I am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "isn't": "is not",
    "aren't": "are not",
    "didn't": "did not",
    "haven't": "have not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "couldn't": "could not",
    # Add more contractions as needed
}

# Built once instead of on every call. re.ASCII keeps the case-insensitive
# match to ASCII letters, so lowercasing a match always yields a key above.
_CONTRACTIONS_RE = re.compile(
    '(%s)' % '|'.join(re.escape(key) for key in _CONTRACTIONS),
    flags=re.IGNORECASE | re.ASCII,
)

# Every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


# Function to clean text: expand contractions, then remove special characters and numbers
def clean_text(text):
    """Expand known English contractions and strip non-letter characters.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        ``text`` with every contraction listed in ``_CONTRACTIONS`` replaced
        by its expansion (matched regardless of case, e.g. "Don't" becomes
        "do not") and with every character other than ASCII letters and
        whitespace removed.

    Raises
    ------
    TypeError
        If ``text`` is not a string (for example a NaN from a missing cell).
    """
    def replace(match):
        return _CONTRACTIONS[match.group(0).lower()]

    # Expand contractions first, while the apostrophes are still present
    text = _CONTRACTIONS_RE.sub(replace, text)

    # Remove special characters and numbers.
    # The earlier call passed ``re.I | re.A`` as the fourth positional
    # argument of re.sub, which is ``count`` -- so only the first 258 matches
    # were removed and the flags never applied. No flags are passed here:
    # re.I is redundant for [a-zA-Z], and re.A would newly delete non-ASCII
    # whitespace, gluing neighbouring words together.
    return _NON_LETTER_RE.sub('', text)

# Apply the cleaning function to every entry of the 'text' column of the combined DataFrame
df_combine['text'] = df_combine['text'].apply(clean_text)

# Display the head of the combined DataFrame to verify the changes
display(df_combine.head())

Unnamed: 0,label,text
0,hate,I hate women
1,hate,I hate trans people
2,hate,I hate gay people
3,hate,I hate black people
4,hate,I hate disabled people


### Normalization

In [7]:
# Fetch the NLTK resources used below:
# 'punkt' for tokenization and 'wordnet' for lemmatization
for resource_name in ('punkt', 'wordnet'):
    nltk.download(resource_name)

# Create the WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /Users/jlangela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jlangela/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Download the 'omw-1.4' NLTK resource. According to the original author's
# note, a cell further down raised an error without it.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jlangela/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
# Convert text to lowercase (overwrites the 'text' column)
df_combine['text'] = df_combine['text'].str.lower()

# Initialize the NLTK WordNet Lemmatizer
# (an earlier cell already bound `lemmatizer`; this rebinds the same name to a new instance)
lemmatizer = WordNetLemmatizer()

# Lemmatization helper applied to each entry of the 'text' column
def lemmatize_text(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by single spaces.

    Uses ``word_tokenize`` to split the text and the module-level
    ``lemmatizer`` (called with its default arguments) on every token.
    """
    words = word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# Apply the lemmatization function to the 'text' column (overwrites it with the lemmatized strings)
df_combine['text'] = df_combine['text'].apply(lemmatize_text)

# Display the head of the DataFrame to verify the changes
df_combine.head()

Unnamed: 0,label,text
0,hate,i hate woman
1,hate,i hate trans people
2,hate,i hate gay people
3,hate,i hate black people
4,hate,i hate disabled people


In [10]:
# Drop every row whose 'text' is the empty string.
# The affected index labels are found with one vectorized comparison and
# removed by a single in-place drop, rather than dropping row by row while
# iterating over the frame (which rebuilds the frame once per empty row).
empty_text_labels = df_combine.index[df_combine['text'] == '']
df_combine.drop(index=empty_text_labels, inplace=True)

In [11]:
## Save the frame as it stands at this point (before the tokenization and
## stop-word cells below) for the BERT model, which tokenizes the text on its own
df_combine.to_csv('../datasets/01_preprocessed_datasets/dataset_preprocessed_no_transformation.csv', index=False)

### Tokenization

In [12]:
# Ensure the NLTK 'punkt' resource needed by word_tokenize is downloaded
# (an earlier cell already requests 'punkt', so this call is a repeat)
nltk.download('punkt')

# Tokenizer for the normalized strings stored in df_combine['text']
def tokenize_text(text):
    """Return the tokens produced by NLTK's ``word_tokenize`` for ``text``."""
    return word_tokenize(text)

# Apply the tokenization function to each row in the 'text' column and
# store the resulting token lists in a new 'tokens' column
df_combine['tokens'] = df_combine['text'].apply(tokenize_text)

# Display the first few rows to check the tokenized text
display(df_combine.head())

[nltk_data] Downloading package punkt to /Users/jlangela/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,label,text,tokens
0,hate,i hate woman,"[i, hate, woman]"
1,hate,i hate trans people,"[i, hate, trans, people]"
2,hate,i hate gay people,"[i, hate, gay, people]"
3,hate,i hate black people,"[i, hate, black, people]"
4,hate,i hate disabled people,"[i, hate, disabled, people]"


### Removing Stop Words

In [13]:
# Download the NLTK 'stopwords' corpus read by stopwords.words('english') in the next cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jlangela/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords

# Get the list of English stop words and keep it as a set,
# which remove_stop_words uses for its membership checks
stop_words = set(stopwords.words('english'))

def remove_stop_words(tokens):
    """Return a new list with the tokens that are not in ``stop_words``.

    The original order of the remaining tokens is preserved; the
    module-level ``stop_words`` collection decides what is filtered out.
    """
    kept_tokens = []
    for candidate in tokens:
        if candidate in stop_words:
            continue
        kept_tokens.append(candidate)
    return kept_tokens

In [15]:
# Filter the stop words out of each token list, join the remaining tokens
# back into a string, and let that string replace the old 'text' column.
# Rows whose filtered text is empty are discarded.
df_combine = (
    df_combine
    .assign(
        text_processed=lambda frame: frame['tokens']
        .apply(remove_stop_words)
        .apply(lambda kept: ' '.join(kept).replace(',', ''))
    )
    .drop(columns=['tokens', 'text'])
    .loc[lambda frame: frame['text_processed'] != ""]
    .rename(columns={'text_processed': 'text'})
)

# Show the first rows to confirm the stop words are gone
display(df_combine.head())

Unnamed: 0,label,text
0,hate,hate woman
1,hate,hate trans people
2,hate,hate gay people
3,hate,hate black people
4,hate,hate disabled people


In [16]:
## Save the dataset with stop words removed.
## NOTE(review): the previous comment here ("saving for BERT Model which tokenizes the text on its own")
## duplicates the one on the earlier save cell -- confirm which model consumes this file.
df_combine.to_csv('../datasets/01_preprocessed_datasets/dataset_preprocessed_stopwords.csv', index=False)