# Data Preprocessing

In [2]:
import re
import pandas as pd
from contractions import fix
import matplotlib.pyplot as plt
import seaborn as sns
import nltk


# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer models,
# the 'stopwords' corpus and the 'wordnet' corpus. Packages that are already
# present are reported as up-to-date and not downloaded again.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arbru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arbru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arbru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## X dataset

In [None]:
# Load the classified X posts (per the file path: dehatebert / CNERG annotations)
# and print column dtypes and non-null counts as a first sanity check.
x_df = pd.read_csv('../data/annotated/dehatebert/classified_x_df_CNERG.csv')
x_df.info()

In [None]:
# Summary statistics (describe() with its default column selection).
x_df.describe()

In [None]:
# Number of missing values in each column.
x_df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
print(f"Duplicates: {x_df.duplicated().sum()}")

In [None]:
# Display the raw frame for a visual check before any cleaning.
x_df

In [16]:
def censor_words(text, banned_words):
    """
    Mask banned words in ``text`` by replacing their first vowel with '*'.

    Matching is case-insensitive and restricted to whole words (``\\b`` on both
    sides), so a banned word embedded in a longer word is left untouched. A
    matched word that contains no vowel is returned unchanged.

    Parameters:
        text: The string to censor.
        banned_words: Iterable of words to mask; each is regex-escaped.

    Returns:
        A copy of ``text`` with every whole-word match masked.
    """
    alternatives = "|".join(map(re.escape, banned_words))
    banned_re = re.compile(r"\b(" + alternatives + r")\b", flags=re.IGNORECASE)

    # count=1: only the first vowel of each matched word is starred out.
    return banned_re.sub(
        lambda hit: re.sub(r'([aeiouAEIOU])', '*', hit.group(0), count=1),
        text,
    )


In [None]:
from wordcloud import WordCloud

# Join every raw post into a single string; this is the word cloud's input.
text_data = " ".join(x_df["Full_Text"].astype(str))
# Words masked by censor_words (first vowel -> '*') before they are plotted.
banned_words = {"fuck", "fucking", "shit", "bitch", "faggot", "nigga"}

censored_text = censor_words(text_data, banned_words)

wordcloud = WordCloud(
    width=800, height=400,
    background_color="black",
    colormap="viridis",
    max_words=200,  
    # '*' is included in the token pattern so censored words stay one token.
    regexp=r"\b[a-zA-Z*]+\b" 
).generate(censored_text)


# Render the generated cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("X Word Cloud Before Text Processing", fontsize=14)
plt.show()

In [17]:
def clean_text(text):
    """
    Normalise a raw post to lower-case letters and whitespace only.

    Steps, applied in order to the lower-cased text:
      1. each @mention is replaced by a single space,
      2. URLs (tokens starting with ``http`` or ``www``) are deleted,
      3. every character that is not ``a-z`` or whitespace is deleted; this
         covers digits, punctuation, apostrophes and the '#' of hashtags.

    Leading and trailing whitespace is stripped; interior runs of whitespace
    are left as they are.

    Parameters:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'@\w+', ' '),
        (r'http\S+|www\S+', ''),
        (r'[^a-z\s]', ''),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [18]:
from wordsegment import load, segment

# Initialise wordsegment once, before segment() is used in fix_hashtags
# (presumably this loads the library's word-frequency data - confirm in its docs).
load()

def fix_hashtags(text):
    """
    Expand hashtags in ``text`` into space-separated words.

    The text is split on whitespace. Tokens that do not start with '#' are kept
    as they are. For a hashtag the leading '#' is dropped and then:
      * if the tag contains an upper-case letter, a space is inserted at every
        lower-case -> upper-case boundary (camelCase splitting);
      * otherwise the tag is passed to wordsegment's ``segment`` and the
        resulting words are joined with spaces.

    Parameters:
        text: The string to process.

    Returns:
        The tokens re-joined with single spaces, so any run of whitespace in
        the input collapses to one space.
    """
    def _expand(token):
        if not token.startswith("#"):
            return token

        tag = token[1:]
        if re.search(r'[A-Z]', tag):
            return re.sub(r'([a-z])([A-Z])', r'\1 \2', tag)
        return " ".join(segment(tag))

    return " ".join(_expand(token) for token in text.split())

In [None]:
# Cleaning pass on the raw posts: lower-case, drop @mentions and URLs, and delete
# every non-letter character (see clean_text). astype(str) makes sure clean_text
# always receives a string.
x_df["cleaned_text"] = x_df["Full_Text"].astype(str).apply(clean_text)
x_df.tail(30)

In [None]:
# fix_hashtags needs the '#' marker (and the original casing for camelCase tags),
# but clean_text lower-cases the text and deletes every non-letter character,
# '#' included. Applying fix_hashtags to already-cleaned text therefore never
# expands anything. Expand hashtags on the raw posts first, then clean the result.
x_df["cleaned_text"] = (
    x_df["Full_Text"]
    .astype(str)
    .apply(fix_hashtags)
    .apply(clean_text)
)
x_df

In [None]:
# Expand contractions with contractions.fix.
# NOTE(review): clean_text has already deleted apostrophes by this point, so fix
# only ever sees apostrophe-less forms such as "dont" - confirm that the
# contractions package expands those as intended.
x_df["cleaned_text"] = x_df["cleaned_text"].apply(fix)
x_df

In [None]:
# Split each cleaned post into a list of word tokens and preview the first rows
# next to the text they came from.
x_df["tokens"] = x_df["cleaned_text"].apply(word_tokenize)
print(x_df[["cleaned_text", "tokens"]].head())

In [None]:
# English stop-word list as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# "not" is taken out of the stop list, so it is kept in the tokens
# (presumably to preserve negation for the classifier).
stop_words.discard("not")
x_df["tokens"] = x_df["tokens"].apply(lambda words: [w for w in words if w not in stop_words])

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatise every token. No part-of-speech tag is passed, so the lemmatizer's
# default part of speech is used for all words.
x_df["tokens"] = x_df["tokens"].apply(lambda words: [lemmatizer.lemmatize(w) for w in words])
# Re-join the tokens into one space-separated string per post.
x_df["processed_text"] = x_df["tokens"].apply(lambda words: ' '.join(words))

In [None]:
# Preview the frame with the new cleaned_text / tokens / processed_text columns.
x_df.head()

In [None]:
# Persist only the processed text and its label; index=False keeps the row index
# out of the CSV.
x_df[["processed_text", "label"]].to_csv("../data/processed/processed_x_dataset.csv", index=False)

In [None]:
from wordcloud import WordCloud

# Same word cloud as before, but built from the processed text so the effect of
# cleaning, stop-word removal and lemmatisation is visible.
text_data = " ".join(x_df["processed_text"].astype(str))
# Words masked by censor_words (first vowel -> '*') before they are plotted.
banned_words = {"fuck", "fucking", "shit", "bitch", "faggot", "nigga"}
censored_text = censor_words(text_data, banned_words)

wordcloud = WordCloud(
    width=800, height=400,
    background_color="black",
    colormap="viridis",
    max_words=200,
    # '*' is included in the token pattern so censored words stay one token.
    regexp=r"\b[a-zA-Z*]+\b"   
).generate(censored_text)


# Render the generated cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("X Word Cloud After Text Processing", fontsize=14)
plt.show()


In [None]:
# Number of posts per label, most frequent first (value_counts ordering).
class_count = x_df['label'].value_counts()
plt.figure(figsize=(3,3))
# NOTE(review): colours are assigned by bar position, i.e. by frequency rank, not
# by label name - the same label may get a different colour in another dataset.
bars = class_count.plot(kind='bar', color=['lightgreen','orange','red'])

# Write each bar's count just above it, centred horizontally.
for bar in bars.patches:
    plt.text(bar.get_x() + bar.get_width() / 2, 
             bar.get_height(), 
             f"{bar.get_height():,}",
             ha="center", va="bottom", fontsize=10)
    
plt.xlabel("Class")
plt.ylabel("Count")
plt.title("Class Distribution for the X Dataset (dehatebert)")
plt.xticks(rotation=0)
plt.show()

## Reddit dataset

In [4]:
# Load the classified Reddit comments (per the file path: dehatebert / CNERG
# annotations) and print column dtypes and non-null counts.
reddit_df = pd.read_csv("../data/annotated/dehatebert/classified_reddit_df_CNERG.csv")
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46638 entries, 0 to 46637
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      46638 non-null  int64  
 1   title           46638 non-null  object 
 2   body            19125 non-null  object 
 3   author          46638 non-null  object 
 4   comment         46638 non-null  object 
 5   comment_author  38974 non-null  object 
 6   score           46638 non-null  int64  
 7   upvote_ratio    46638 non-null  float64
 8   created_utc     46638 non-null  object 
 9   subreddit       46638 non-null  object 
 10  label           46638 non-null  object 
 11  confidence      46638 non-null  float64
dtypes: float64(2), int64(2), object(8)
memory usage: 4.3+ MB


In [None]:
# Summary statistics (describe() with its default column selection).
reddit_df.describe()

In [None]:
# Number of missing values in each column.
reddit_df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
print(f"Duplicates: {reddit_df.duplicated().sum()}")

In [None]:
from wordcloud import WordCloud

# Join every raw comment into a single string; this is the word cloud's input.
text_data = " ".join(reddit_df["comment"].astype(str))
# Words masked by censor_words (first vowel -> '*') before they are plotted.
banned_words = {"fuck", "fucking", "shit", "bitch", "faggot", "nigga"}

censored_text = censor_words(text_data, banned_words)

wordcloud = WordCloud(
    width=800, height=400,
    background_color="black",
    colormap="viridis",
    max_words=200,  
    # '*' is included in the token pattern so censored words stay one token.
    regexp=r"\b[a-zA-Z*]+\b" 
).generate(censored_text)


# Render the generated cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Reddit Word Cloud Before Text Processing", fontsize=14)
plt.show()

In [None]:
# Cleaning pass on the raw comments: lower-case, drop @mentions and URLs, and
# delete every non-letter character (see clean_text). astype(str) makes sure
# clean_text always receives a string.
reddit_df["cleaned_text"] = reddit_df["comment"].astype(str).apply(clean_text)
reddit_df.tail(30)

In [None]:
# fix_hashtags needs the '#' marker (and the original casing for camelCase tags),
# but clean_text lower-cases the text and deletes every non-letter character,
# '#' included. Applying fix_hashtags to already-cleaned text therefore never
# expands anything. Expand hashtags on the raw comments first, then clean.
reddit_df["cleaned_text"] = (
    reddit_df["comment"]
    .astype(str)
    .apply(fix_hashtags)
    .apply(clean_text)
)
reddit_df

In [None]:
# Expand contractions with contractions.fix.
# NOTE(review): clean_text has already deleted apostrophes by this point, so fix
# only ever sees apostrophe-less forms such as "dont" - confirm that the
# contractions package expands those as intended.
reddit_df["cleaned_text"] = reddit_df["cleaned_text"].apply(fix)
reddit_df

In [None]:
# Split each cleaned comment into a list of word tokens and preview the first
# rows next to the text they came from.
reddit_df["tokens"] = reddit_df["cleaned_text"].apply(word_tokenize)
print(reddit_df[["cleaned_text", "tokens"]].head())

In [None]:
# English stop-word list as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# "not" is taken out of the stop list, so it is kept in the tokens
# (presumably to preserve negation for the classifier).
stop_words.discard("not")
reddit_df["tokens"] = reddit_df["tokens"].apply(lambda words: [w for w in words if w not in stop_words])
reddit_df["tokens"]

In [None]:
lemmatizer = WordNetLemmatizer()

# Lemmatise every token. No part-of-speech tag is passed, so the lemmatizer's
# default part of speech is used for all words.
reddit_df["tokens"] = reddit_df["tokens"].apply(lambda words: [lemmatizer.lemmatize(w) for w in words])
# Re-join the tokens into one space-separated string per comment.
reddit_df["processed_text"] = reddit_df["tokens"].apply(lambda words: ' '.join(words))
reddit_df["processed_text"].tail(30)

In [None]:
# Placeholder texts to count and later drop: rows whose entire processed text
# is exactly one of these values.
to_be_removed = ["removed","deleted"]

# reindex instead of .loc: a placeholder that does not occur (for example when
# this cell is re-run after the rows were filtered out) is reported as 0 rather
# than raising KeyError.
bad_values = reddit_df["processed_text"].value_counts().reindex(to_be_removed, fill_value=0)
plt.figure(figsize=(7,5))
bars = bad_values.plot(kind='bar', color=['lightblue','orange'])
# Write each bar's count just above it, centred horizontally.
for bar in bars.patches:
    plt.text(bar.get_x() + bar.get_width() / 2, 
             bar.get_height(), 
             f"{bar.get_height():,}",
             ha="center", va="bottom", fontsize=10)
plt.xlabel('Type of record')
plt.ylabel('Count')
plt.title('No. of rows scraped from Reddit that have been removed or deleted')
plt.xticks(rotation=0)
plt.show()



In [None]:
# Drop rows whose processed text is exactly one of the to_be_removed values.
# Only whole-string matches are removed; longer comments that merely contain
# those words are kept.
reddit_df = reddit_df[~reddit_df["processed_text"].isin(to_be_removed)]
reddit_df

In [None]:
from wordcloud import WordCloud

# This figure is titled "After Text Processing", so it is built from the
# processed column. Using the raw "comment" column here would only redraw the
# "before" cloud, minus the filtered rows.
text_data = " ".join(reddit_df["processed_text"].astype(str))
# Words masked by censor_words (first vowel -> '*') before they are plotted.
banned_words = {"fuck", "fucking", "shit", "bitch", "faggot", "nigga"}

censored_text = censor_words(text_data, banned_words)

wordcloud = WordCloud(
    width=800, height=400,
    background_color="black",
    colormap="viridis",
    max_words=200,
    # '*' is included in the token pattern so censored words stay one token.
    regexp=r"\b[a-zA-Z*]+\b"
).generate(censored_text)


# Render the generated cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Reddit Word Cloud After Text Processing", fontsize=14)
plt.show()

In [None]:
# Number of comments per label, most frequent first (value_counts ordering).
reddit_class = reddit_df['label'].value_counts()

In [None]:
plt.figure(figsize=(3,3))
# NOTE(review): colours are assigned by bar position, i.e. by frequency rank, not
# by label name - the same label may get a different colour in another dataset.
reddit_bar = reddit_class.plot(kind='bar', color=['lightgreen','orange','red'])
# Write each bar's count just above it, centred horizontally.
for bar in reddit_bar.patches:
    plt.text(bar.get_x() + bar.get_width() / 2, 
             bar.get_height(), 
             f"{bar.get_height():,}",
             ha="center", va="bottom", fontsize=10)
plt.xlabel('Class')
plt.ylabel('Count')
plt.title("Class Distribution for the Reddit Dataset (dehatebert)")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Persist only the processed text and its label. index=False matches the X and
# Kaggle exports and keeps the (gapped, post-filter) row index from being written
# as an extra unnamed column.
reddit_df[['processed_text', 'label']].to_csv('../data/processed/processed_reddit_dataset.csv', index=False)

## Kaggle dataset

In [12]:
# Load the Kaggle hate-speech dataset (the columns used below are Content and Label).
k_df = pd.read_csv('../data/HateSpeechDatasetBalanced.csv')


In [13]:
# Row count before de-duplication.
print(len(k_df))
# Drop rows with repeated Content in place; the first occurrence of each text is
# kept (drop_duplicates default).
k_df.drop_duplicates(subset='Content', inplace=True)
print(f"Length w/o duplicates {len(k_df)}")


726119
Length w/o duplicates 700067


In [21]:
# Cleaning pass on the raw Content: lower-case, drop @mentions and URLs, and
# delete every non-letter character (see clean_text). astype(str) makes sure
# clean_text always receives a string.
k_df["cleaned_text"] = k_df["Content"].astype(str).apply(clean_text)
k_df.tail()

Unnamed: 0,Content,Label,cleaned_text
726114,i mute this telecasting and played kanye west ...,1,i mute this telecasting and played kanye west ...
726115,but hell yeah he s not a bachelor but looooooo...,1,but hell yeah he s not a bachelor but looooooo...
726116,great video musician but s not my musician lol...,1,great video musician but s not my musician lol...
726117,not great pop video yeah he s not a pedophile ...,1,not great pop video yeah he s not a pedophile ...
726118,great video yeah he s non a paedophile lolllll...,1,great video yeah he s non a paedophile lolllll...


In [22]:
# Expand hashtags on the raw Content (fix_hashtags needs the '#' marker, which
# clean_text deletes) and then clean the result. Assigning
# fix_hashtags(Content) on its own would overwrite cleaned_text with uncleaned
# text and silently discard the clean_text step.
k_df["cleaned_text"] = (
    k_df["Content"]
    .astype(str)
    .apply(fix_hashtags)
    .apply(clean_text)
)
k_df.head()

Unnamed: 0,Content,Label,cleaned_text
0,denial of normal the con be asked to comment o...,1,denial of normal the con be asked to comment o...
1,just by being able to tweet this insufferable ...,1,just by being able to tweet this insufferable ...
2,that is retarded you too cute to be single tha...,1,that is retarded you too cute to be single tha...
3,thought of a real badass mongol style declarat...,1,thought of a real badass mongol style declarat...
4,afro american basho,1,afro american basho


In [23]:
# Expand contractions on the running cleaned_text column, as the X and Reddit
# pipelines do. Reading from the raw Content column here would overwrite
# cleaned_text and throw away every earlier cleaning step.
k_df["cleaned_text"] = k_df["cleaned_text"].apply(fix)
k_df

Unnamed: 0,Content,Label,cleaned_text
0,denial of normal the con be asked to comment o...,1,denial of normal the con be asked to comment o...
1,just by being able to tweet this insufferable ...,1,just by being able to tweet this insufferable ...
2,that is retarded you too cute to be single tha...,1,that is retarded you too cute to be single tha...
3,thought of a real badass mongol style declarat...,1,thought of a real badass mongol style declarat...
4,afro american basho,1,afro american basho
...,...,...,...
726114,i mute this telecasting and played kanye west ...,1,i mute this telecasting and played kanye west ...
726115,but hell yeah he s not a bachelor but looooooo...,1,but hell yeah he s not a bachelor but looooooo...
726116,great video musician but s not my musician lol...,1,great video musician but s not my musician lol...
726117,not great pop video yeah he s not a pedophile ...,1,not great pop video yeah he s not a pedophile ...


In [24]:
# Split each cleaned text into a list of word tokens and preview the first rows
# next to the text they came from.
k_df["tokens"] = k_df["cleaned_text"].apply(word_tokenize)
print(k_df[["cleaned_text", "tokens"]].head())

                                        cleaned_text  \
0  denial of normal the con be asked to comment o...   
1  just by being able to tweet this insufferable ...   
2  that is retarded you too cute to be single tha...   
3  thought of a real badass mongol style declarat...   
4                                afro american basho   

                                              tokens  
0  [denial, of, normal, the, con, be, asked, to, ...  
1  [just, by, being, able, to, tweet, this, insuf...  
2  [that, is, retarded, you, too, cute, to, be, s...  
3  [thought, of, a, real, badass, mongol, style, ...  
4                            [afro, american, basho]  


In [25]:
# English stop-word list as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# "not" is taken out of the stop list, so it is kept in the tokens
# (presumably to preserve negation for the classifier).
stop_words.discard("not")
k_df["tokens"] = k_df["tokens"].apply(lambda words: [w for w in words if w not in stop_words])

In [26]:
lemmatizer = WordNetLemmatizer()

# Lemmatise every token. No part-of-speech tag is passed, so the lemmatizer's
# default part of speech is used for all words.
k_df["tokens"] = k_df["tokens"].apply(lambda words: [lemmatizer.lemmatize(w) for w in words])
# Re-join the tokens into one space-separated string per row.
k_df["processed_text"] = k_df["tokens"].apply(lambda words: ' '.join(words))

In [31]:
# Rename 'Label' to lower-case 'label' so the column name matches the one
# exported for the X and Reddit datasets.
k_df.rename(columns={'Label':'label'}, inplace=True)

In [32]:
# Display the frame with all derived columns for a final visual check.
k_df

Unnamed: 0,Content,label,cleaned_text,tokens,processed_text
0,denial of normal the con be asked to comment o...,1,denial of normal the con be asked to comment o...,"[denial, normal, con, asked, comment, tragedy,...",denial normal con asked comment tragedy emotio...
1,just by being able to tweet this insufferable ...,1,just by being able to tweet this insufferable ...,"[able, tweet, insufferable, bullshit, prof, tr...",able tweet insufferable bullshit prof trump na...
2,that is retarded you too cute to be single tha...,1,that is retarded you too cute to be single tha...,"[retarded, cute, single, life]",retarded cute single life
3,thought of a real badass mongol style declarat...,1,thought of a real badass mongol style declarat...,"[thought, real, badass, mongol, style, declara...",thought real badass mongol style declaration w...
4,afro american basho,1,afro american basho,"[afro, american, basho]",afro american basho
...,...,...,...,...,...
726114,i mute this telecasting and played kanye west ...,1,i mute this telecasting and played kanye west ...,"[mute, telecasting, played, kanye, west, cliqu...",mute telecasting played kanye west clique know...
726115,but hell yeah he s not a bachelor but looooooo...,1,but hell yeah he s not a bachelor but looooooo...,"[hell, yeah, not, bachelor, looooooooooooooooo...",hell yeah not bachelor loooooooooooooooooooooo...
726116,great video musician but s not my musician lol...,1,great video musician but s not my musician lol...,"[great, video, musician, not, musician, lollll...",great video musician not musician lollllllllll...
726117,not great pop video yeah he s not a pedophile ...,1,not great pop video yeah he s not a pedophile ...,"[not, great, pop, video, yeah, not, pedophile,...",not great pop video yeah not pedophile yeah lo...


In [33]:
# Persist only the processed text and its label; index=False keeps the row index
# out of the CSV.
k_df[["processed_text", "label"]].to_csv("../data/processed/processed_kaggle_dataset.csv", index=False)