Load Data & Fix Encoding Issues

In [1]:
import pandas as pd
import unicodedata

# Load the dataset
df = pd.read_csv("CLAN_data.csv")


# Fix encoding issues
def fix_encoding(text):
    if isinstance(text, str):
        return (
            unicodedata.normalize("NFKD", text)
            .encode("utf-8", "ignore")
            .decode("utf-8", "ignore")
        )
    return text


df["Social Media Post"] = df["Social Media Post"].apply(fix_encoding)
df["Normalized Claim"] = df["Normalized Claim"].apply(fix_encoding)

# Display a sample
df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President \n @realDonaldTrump\n : ""Biden's pla...",Biden’s energy plan would get rid of seniors’ ...
1,2,IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY\nDr. D. Mensah Asare says t...,A widely popular social media post claims that...


Remove Mentions, Hashtags, URLs

In [2]:
import re


# Remove Twitter Handles
def remove_handles(text):
    return re.sub(r"@\w+", "", text)


# Remove URLs & Hashtags
def remove_urls_hashtags(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"#\w+", "", text)  # Remove hashtags
    return text.strip()


df["Social Media Post"] = df["Social Media Post"].apply(remove_handles)
df["Social Media Post"] = df["Social Media Post"].apply(remove_urls_hashtags)

df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President \n \n : ""Biden's plan would mean Ame...",Biden’s energy plan would get rid of seniors’ ...
1,2,IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY\nDr. D. Mensah Asare says t...,A widely popular social media post claims that...


Expand Contractions (Normalize Text)
```bash
"can't" → "cannot"

"it's" → "it is"
```

In [3]:
import contractions


# Expand Contractions
def expand_contractions(text):
    return contractions.fix(text) if isinstance(text, str) else text


df["Social Media Post"] = df["Social Media Post"].apply(expand_contractions)
df["Normalized Claim"] = df["Normalized Claim"].apply(expand_contractions)

df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President \n \n : ""Biden's plan would mean Ame...",Biden’s energy plan would get rid of seniors’ ...
1,2,IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY\nDr. D. Mensah Asare says t...,A widely popular social media post claims that...


Remove Special Characters & Extra Spaces

In [4]:
# Remove special characters, punctuations, and extra spaces
def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9.,!?\'\"\s]", " ", text)  # Keep basic punctuations
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text.lower()  # Convert to lowercase for consistency


df["Social Media Post"] = df["Social Media Post"].apply(clean_text)
df["Normalized Claim"] = df["Normalized Claim"].apply(clean_text)

df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"president ""biden's plan would mean america's s...",biden s energy plan would get rid of seniors a...
1,2,important announcement coronavirus last evenin...,"if someone with the new coronavirus sneezes, i..."
2,3,heart is delighted to hear,heart is delighted to hear
3,4,an allowed appeal is one where the initial ref...,the vast majority of people coming across the ...
4,5,warm water therapy dr. d. mensah asare says th...,a widely popular social media post claims that...


Handle Missing Values & Duplicates

In [5]:
# Remove rows with missing data
df.dropna(subset=["Social Media Post", "Normalized Claim"], inplace=True)

# Remove duplicate posts
df.drop_duplicates(subset=["Social Media Post"], keep="first", inplace=True)

df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"president ""biden's plan would mean america's s...",biden s energy plan would get rid of seniors a...
1,2,important announcement coronavirus last evenin...,"if someone with the new coronavirus sneezes, i..."
2,3,heart is delighted to hear,heart is delighted to hear
3,4,an allowed appeal is one where the initial ref...,the vast majority of people coming across the ...
4,5,warm water therapy dr. d. mensah asare says th...,a widely popular social media post claims that...


Save the Final Processed Data

In [6]:
# Save the cleaned dataset
df.to_csv("CLAN_data_cleaned.csv", index=False)

print("Data preprocessing complete. Cleaned file saved as CLAN_data_cleaned.csv")

Data preprocessing complete. Cleaned file saved as CLAN_data_cleaned.csv
