# Data Preprocessing

In [None]:
import re
import pandas as pd
from contractions import fix
import matplotlib.pyplot as plt
import seaborn as sns
import nltk


# Fetch the NLTK resources used further down: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
# 'punkt_tab' is the tokenizer resource word_tokenize looks up on NLTK >= 3.9;
# 'punkt' is kept so older NLTK releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
sns.set()

## X dataset

In [None]:
# Load the dehatebert-annotated X dataset and print its column / dtype summary.
x_df = pd.read_csv('../data/annotated/dehatebert/classified_x_df_CNERG.csv')
x_df.info()

In [None]:
# Descriptive statistics of the X frame (pandas' default column selection).
x_df.describe()

In [None]:
# Number of missing values in each column.
x_df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
print(f"Duplicates: {x_df.duplicated().sum()}")

In [None]:
# Display the raw frame for a visual sanity check.
x_df

In [None]:
def censor_words(text, banned_words):
    """Mask banned words in *text* by starring out their first vowel.

    Every whole-word, case-insensitive occurrence of an entry in
    *banned_words* has its first vowel (a/e/i/o/u, either case) replaced
    with ``*``. Occurrences without a vowel are left untouched, as is any
    longer word that merely contains a banned word.

    Args:
        text: The string to censor.
        banned_words: Iterable of literal words to mask.

    Returns:
        A copy of *text* with the banned words masked.
    """
    # Entries are escaped so they are matched literally, never as regex syntax.
    alternation = "|".join(map(re.escape, banned_words))
    banned_re = re.compile(r"\b(" + alternation + r")\b", flags=re.IGNORECASE)

    def _mask_first_vowel(hit):
        # count=1 -> only the first vowel of the matched word is replaced.
        return re.sub(r'([aeiouAEIOU])', '*', hit.group(0), count=1)

    return banned_re.sub(_mask_first_vowel, text)


In [None]:
from wordcloud import WordCloud

banned_words = {"fuck", "fucking", "shit", "bitch", "faggot", "nigga"}

# One large document built from every raw X post, with banned words masked
# before anything is drawn.
text_data = " ".join(x_df["Full_Text"].astype(str))
censored_text = censor_words(text_data, banned_words)

# The token pattern admits '*' so the mask character stays part of a token.
wordcloud = WordCloud(
    regexp=r"\b[a-zA-Z*]+\b",
    max_words=200,
    colormap="viridis",
    background_color="black",
    height=400,
    width=800,
)
wordcloud.generate(censored_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("X Word Cloud Before Text Processing", fontsize=14)
plt.show()

In [None]:
def clean_text(text):
    """
    Normalise a raw post to lower-case letters and whitespace only.

    Steps, in order: lower-case the text, replace @mentions with a space,
    delete URLs (``http...`` / ``www...``), delete every character that is
    not ``a-z`` or whitespace (digits, punctuation, ``#``, apostrophes, ...),
    then strip leading and trailing whitespace. Inner whitespace runs are
    not collapsed.
    """
    lowered = text.lower()
    # Order matters: mentions and URLs must go before the catch-all filter,
    # otherwise their punctuation is stripped and their letters are kept.
    substitutions = (
        (r'@\w+', ' '),
        (r'http\S+|www\S+', ''),
        (r'[^a-z\s]', ''),
    )
    for pattern, replacement in substitutions:
        lowered = re.sub(pattern, replacement, lowered)
    return lowered.strip()

In [None]:
from wordsegment import load, segment

# Load wordsegment's word-frequency data once; segment() (used by
# fix_hashtags below) relies on it having been loaded.
load()

def fix_hashtags(text):
    """Expand hashtags in *text* into plain, space-separated words.

    The text is split on whitespace. Tokens that do not start with ``#`` are
    kept as they are. For a hashtag token the leading ``#`` characters are
    dropped and the remainder is split into words:

    * if it contains an upper-case letter, a space is inserted at every
      lower-case -> upper-case boundary (``#BlackLivesMatter`` ->
      ``Black Lives Matter``);
    * otherwise it is handed to ``wordsegment.segment`` to guess the word
      boundaries.

    Hashtags that contain no text (a bare ``#``, or a tag that segments to
    nothing) are dropped instead of leaving an empty token behind.

    Args:
        text: Input string; must still contain its ``#`` characters and
            original casing for the expansion to have any effect.

    Returns:
        The tokens re-joined with single spaces.
    """
    processed_words = []

    for word in text.split():
        if not word.startswith("#"):
            processed_words.append(word)
            continue

        # lstrip so that "##tag" does not leave a stray '#' behind.
        tag_body = word.lstrip("#")
        if not tag_body:
            # A bare "#" has nothing to expand; appending "" would only
            # produce a double space in the joined result.
            continue

        if re.search(r'[A-Z]', tag_body):
            # CamelCase tag: the capitals already mark the word boundaries.
            tag_body = re.sub(r'([a-z])([A-Z])', r'\1 \2', tag_body)
        else:
            # No casing hints: let wordsegment infer the boundaries.
            tag_body = " ".join(segment(tag_body))

        if tag_body:
            processed_words.append(tag_body)

    return " ".join(processed_words)

In [None]:
# Expand hashtags on the RAW text first, then clean. clean_text lower-cases
# the text and strips '#', so running fix_hashtags only afterwards can no
# longer recognise a hashtag or its CamelCase word boundaries.
x_df["cleaned_text"] = (
    x_df["Full_Text"].astype(str).apply(fix_hashtags).apply(clean_text)
)
x_df.tail(30)

In [None]:
# Run fix_hashtags over the cleaned text.
# NOTE(review): cleaned_text has been through clean_text, which removes '#',
# so no token starts with '#' here; the visible effect of this call is that
# whitespace runs are collapsed to single spaces by its split/join.
x_df["cleaned_text"] = x_df["cleaned_text"].astype(str).apply(fix_hashtags)
x_df

In [None]:
# Expand contractions with contractions.fix.
# NOTE(review): clean_text has already stripped apostrophes, so only the
# apostrophe-less spellings the library recognises can be expanded - verify.
x_df["cleaned_text"] = x_df["cleaned_text"].apply(fix)
x_df

In [None]:
# Split every cleaned post into a list of word tokens and preview the result.
x_df["tokens"] = x_df["cleaned_text"].apply(word_tokenize)
print(x_df[["cleaned_text", "tokens"]].head())

In [None]:
# English stop words minus "not", so negations survive the filtering below.
stop_words = set(stopwords.words('english')) - {"not"}
x_df["tokens"] = x_df["tokens"].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatize token by token, then glue the tokens back into one string per row.
x_df["tokens"] = x_df["tokens"].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)
x_df["processed_text"] = x_df["tokens"].apply(' '.join)

In [None]:
# Preview the first rows with the new cleaned / tokens / processed columns.
x_df.head()

In [None]:
# Persist only the processed text and its label; index=False keeps the row
# index out of the CSV.
x_df[["processed_text", "label"]].to_csv("../data/processed/processed_x_dataset.csv", index=False)

In [None]:
from wordcloud import WordCloud

banned_words = {"fuck", "fucking", "shit", "bitch", "faggot", "nigga"}

# Same plot as before preprocessing, but fed from the processed text column.
text_data = " ".join(x_df["processed_text"].astype(str))
censored_text = censor_words(text_data, banned_words)

# The token pattern admits '*' so the mask character stays part of a token.
wordcloud = WordCloud(
    regexp=r"\b[a-zA-Z*]+\b",
    max_words=200,
    colormap="viridis",
    background_color="black",
    height=400,
    width=800,
)
wordcloud.generate(censored_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("X Word Cloud After Text Processing", fontsize=14)
plt.show()


In [None]:
# Rows per label, most frequent first (value_counts' default ordering).
class_count = x_df['label'].value_counts()

plt.figure(figsize=(3,3))
bars = class_count.plot.bar(color=['lightgreen','orange','red'])

# Write each bar's count centred just above the top of the bar.
for bar in bars.patches:
    bars.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar.get_height(),
        s=f"{bar.get_height():,}",
        ha="center",
        va="bottom",
        fontsize=10,
    )

bars.set_xlabel("Class")
bars.set_ylabel("Count")
bars.set_title("Class Distribution for the X Dataset (dehatebert)")
plt.xticks(rotation=0)
plt.show()

## Reddit dataset

In [None]:
# Load the dehatebert-annotated Reddit dataset and print its column / dtype summary.
reddit_df = pd.read_csv("../data/annotated/dehatebert/classified_reddit_df_CNERG.csv")
reddit_df.info()

In [None]:
# Descriptive statistics of the Reddit frame (pandas' default column selection).
reddit_df.describe()

In [None]:
# Number of missing values in each column.
reddit_df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
print(f"Duplicates: {reddit_df.duplicated().sum()}")

In [None]:
from wordcloud import WordCloud

banned_words = {"fuck", "fucking", "shit", "bitch", "faggot", "nigga"}

# One large document built from every raw Reddit comment, with banned words
# masked before anything is drawn.
text_data = " ".join(reddit_df["comment"].astype(str))
censored_text = censor_words(text_data, banned_words)

# The token pattern admits '*' so the mask character stays part of a token.
wordcloud = WordCloud(
    regexp=r"\b[a-zA-Z*]+\b",
    max_words=200,
    colormap="viridis",
    background_color="black",
    height=400,
    width=800,
)
wordcloud.generate(censored_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Reddit Word Cloud Before Text Processing", fontsize=14)
plt.show()

In [None]:
# Expand hashtags on the RAW comment first, then clean. clean_text lower-cases
# the text and strips '#', so running fix_hashtags only afterwards can no
# longer recognise a hashtag or its CamelCase word boundaries.
reddit_df["cleaned_text"] = (
    reddit_df["comment"].astype(str).apply(fix_hashtags).apply(clean_text)
)
reddit_df.tail(30)

In [None]:
# Run fix_hashtags over the cleaned text.
# NOTE(review): cleaned_text has been through clean_text, which removes '#',
# so no token starts with '#' here; the visible effect of this call is that
# whitespace runs are collapsed to single spaces by its split/join.
reddit_df["cleaned_text"] = reddit_df["cleaned_text"].astype(str).apply(fix_hashtags)
reddit_df

In [None]:
# Expand contractions with contractions.fix.
# NOTE(review): clean_text has already stripped apostrophes, so only the
# apostrophe-less spellings the library recognises can be expanded - verify.
reddit_df["cleaned_text"] = reddit_df["cleaned_text"].apply(fix)
reddit_df

In [None]:
# Split every cleaned comment into a list of word tokens and preview the result.
reddit_df["tokens"] = reddit_df["cleaned_text"].apply(word_tokenize)
print(reddit_df[["cleaned_text", "tokens"]].head())

In [None]:
# English stop words minus "not", so negations survive the filtering below.
stop_words = set(stopwords.words('english')) - {"not"}
reddit_df["tokens"] = reddit_df["tokens"].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)
reddit_df["tokens"]

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatize token by token, then glue the tokens back into one string per row.
reddit_df["tokens"] = reddit_df["tokens"].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)
reddit_df["processed_text"] = reddit_df["tokens"].apply(' '.join)
reddit_df["processed_text"].tail(30)

In [None]:
# Processed texts that stand for comments which were removed or deleted on
# Reddit rather than for real content.
to_be_removed = ["removed","deleted"]

# reindex (not .loc) so that a placeholder which never occurs is plotted as 0
# instead of raising KeyError.
bad_values = (
    reddit_df["processed_text"]
    .value_counts()
    .reindex(to_be_removed, fill_value=0)
)
plt.figure(figsize=(7,5))
bars = bad_values.plot(kind='bar', color=['lightblue','orange'])
# Write each bar's count centred just above the top of the bar.
for bar in bars.patches:
    plt.text(bar.get_x() + bar.get_width() / 2,
             bar.get_height(),
             f"{bar.get_height():,}",
             ha="center", va="bottom", fontsize=10)
plt.xlabel('Type of record')
plt.ylabel('Count')
plt.title('No. of rows scraped from Reddit that have been removed or deleted')
plt.xticks(rotation=0)
plt.show()



In [None]:
# Keep only rows whose processed text is not one of the removed/deleted
# placeholders listed in to_be_removed.
reddit_df = reddit_df[~reddit_df["processed_text"].isin(to_be_removed)]
reddit_df

In [None]:
from wordcloud import WordCloud

# Build the "after" cloud from the processed text. Using the raw "comment"
# column here would show un-preprocessed text under an "After Text
# Processing" title.
text_data = " ".join(reddit_df["processed_text"].astype(str))
banned_words = {"fuck", "fucking", "shit", "bitch", "faggot", "nigga"}

censored_text = censor_words(text_data, banned_words)

# The token pattern admits '*' so the mask character stays part of a token.
wordcloud = WordCloud(
    width=800, height=400,
    background_color="black",
    colormap="viridis",
    max_words=200,
    regexp=r"\b[a-zA-Z*]+\b"
).generate(censored_text)


plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Reddit Word Cloud After Text Processing", fontsize=14)
plt.show()

In [None]:
# Rows per label in the filtered Reddit frame.
reddit_class = reddit_df['label'].value_counts()

In [None]:
plt.figure(figsize=(3,3))
reddit_bar = reddit_class.plot.bar(color=['lightgreen','orange','red'])

# Write each bar's count centred just above the top of the bar.
for bar in reddit_bar.patches:
    reddit_bar.text(
        x=bar.get_x() + bar.get_width() / 2,
        y=bar.get_height(),
        s=f"{bar.get_height():,}",
        ha="center",
        va="bottom",
        fontsize=10,
    )

reddit_bar.set_xlabel('Class')
reddit_bar.set_ylabel('Count')
reddit_bar.set_title("Class Distribution for the Reddit Dataset (dehatebert)")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Persist only the processed text and its label. index=False matches the
# other dataset exports and keeps the row index (no longer contiguous after
# the placeholder rows were filtered out) from being written as an unnamed
# first column.
reddit_df[['processed_text', 'label']].to_csv('../data/processed/processed_reddit_dataset.csv', index=False)

## Kaggle dataset

In [None]:
# Load the Kaggle hate-speech dataset (balanced variant, per the file name).
k_df = pd.read_csv('../data/HateSpeechDatasetBalanced.csv')


In [None]:
# Report the row count before and after dropping rows whose 'Content' value
# repeats (drop_duplicates keeps the first occurrence by default).
print(len(k_df))
k_df.drop_duplicates(subset='Content', inplace=True)
print(f"Length w/o duplicates {len(k_df)}")


In [None]:
# Expand hashtags on the RAW content first, then clean. clean_text lower-cases
# the text and strips '#', so running fix_hashtags only afterwards can no
# longer recognise a hashtag or its CamelCase word boundaries.
k_df["cleaned_text"] = (
    k_df["Content"].astype(str).apply(fix_hashtags).apply(clean_text)
)
k_df.tail()

In [None]:
# Continue from the running cleaned_text column. Reading from "Content" here
# would overwrite cleaned_text and silently discard the clean_text step.
k_df["cleaned_text"] = k_df["cleaned_text"].astype(str).apply(fix_hashtags)
k_df.head()

In [None]:
# Expand contractions on the running cleaned_text column rather than on raw
# "Content", so the earlier preprocessing steps are not overwritten (and so
# non-string values in "Content" never reach contractions.fix).
k_df["cleaned_text"] = k_df["cleaned_text"].apply(fix)
k_df

In [None]:
# Split every cleaned text into a list of word tokens and preview the result.
k_df["tokens"] = k_df["cleaned_text"].apply(word_tokenize)
print(k_df[["cleaned_text", "tokens"]].head())

In [None]:
# English stop words minus "not", so negations survive the filtering below.
stop_words = set(stopwords.words('english')) - {"not"}
k_df["tokens"] = k_df["tokens"].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatize token by token, then glue the tokens back into one string per row.
k_df["tokens"] = k_df["tokens"].apply(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)
k_df["processed_text"] = k_df["tokens"].apply(' '.join)

In [None]:
# Rename 'Label' to 'label', the column name the X and Reddit exports use.
k_df.rename(columns={'Label':'label'}, inplace=True)

In [None]:
# Display the processed Kaggle frame for a visual sanity check.
k_df

In [None]:
# Persist only the processed text and its label; index=False keeps the row
# index out of the CSV.
k_df[["processed_text", "label"]].to_csv("../data/processed/processed_kaggle_dataset.csv", index=False)