Cleaning & Normalization of data

In [7]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-2.1.0-cp38-cp38-win_amd64.whl (39 kB)
Collecting anyascii
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24


In [8]:
import pandas as pd
import re
import unicodedata
import contractions

# Load Data
df = pd.read_csv("CLAN_data.csv")
df

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President \n @realDonaldTrump\n : ""Biden's pla...",Biden’s energy plan would get rid of seniors’ ...
1,2,IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY\nDr. D. Mensah Asare says t...,A widely popular social media post claims that...
...,...,...,...
2806,2807,Kamala Harris failed the bar 1st time\nAmy Con...,Kamala Harris failed the bar exam on her first...
2807,2808,We warned you. Illegals can now become police ...,Those in the country illegally can be police o...
2808,2809,"Four weeks ago, all UK PRIMARY aged children g...",Nasal flu vaccines given to children contain s...
2809,2810,There is NOTHING WRONG with asking why SWING S...,The claim: Democrat-led states where Trump was...


In [9]:
import unicodedata


def fix_encoding(text):
    if isinstance(text, str):
        return (
            unicodedata.normalize("NFKD", text)
            .encode("utf-8", "ignore")
            .decode("utf-8", "ignore")
        )
    return text


df["Social Media Post"] = df["Social Media Post"].apply(fix_encoding)
df["Normalized Claim"] = df["Normalized Claim"].apply(fix_encoding)

# Check a sample after encoding fix
df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President \n @realDonaldTrump\n : ""Biden's pla...",Biden’s energy plan would get rid of seniors’ ...
1,2,IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY\nDr. D. Mensah Asare says t...,A widely popular social media post claims that...


In [10]:
def remove_handles(text):
    return re.sub(r"@\w+", "", text) if isinstance(text, str) else text


df["Social Media Post"] = df["Social Media Post"].apply(remove_handles)

# Check a sample after removing handles
df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President \n \n : ""Biden's plan would mean Ame...",Biden’s energy plan would get rid of seniors’ ...
1,2,IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY\nDr. D. Mensah Asare says t...,A widely popular social media post claims that...


In [11]:
def expand_contractions(text):
    return contractions.fix(text) if isinstance(text, str) else text


df["Social Media Post"] = df["Social Media Post"].apply(expand_contractions)
df["Normalized Claim"] = df["Normalized Claim"].apply(expand_contractions)

# Check a sample after expanding contractions
df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President \n \n : ""Biden's plan would mean Ame...",Biden’s energy plan would get rid of seniors’ ...
1,2,IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY\nDr. D. Mensah Asare says t...,A widely popular social media post claims that...


In [12]:
def remove_urls_hashtags(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"#\w+", "", text)  # Remove hashtags
    return text.strip()


df["Social Media Post"] = df["Social Media Post"].apply(remove_urls_hashtags)

# Check a sample after removing URLs & hashtags
df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President \n \n : ""Biden's plan would mean Ame...",Biden’s energy plan would get rid of seniors’ ...
1,2,IMPORTANT ANNOUNCEMENT - CORONAVIRUS\nLast eve...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY\nDr. D. Mensah Asare says t...,A widely popular social media post claims that...


In [13]:
def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9.,!?\'\"\s]", " ", text)  # Keep basic punctuations
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text


df["Social Media Post"] = df["Social Media Post"].apply(clean_text)
df["Normalized Claim"] = df["Normalized Claim"].apply(clean_text)

# Check a sample after text cleaning
df.head()

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President ""Biden's plan would mean America's s...",Biden s energy plan would get rid of seniors a...
1,2,IMPORTANT ANNOUNCEMENT CORONAVIRUS Last evenin...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY Dr. D. Mensah Asare says th...,A widely popular social media post claims that...


In [14]:
df.dropna(subset=["Social Media Post", "Normalized Claim"], inplace=True)

# Check if missing values exist
print(df.isnull().sum())

PID                  0
Social Media Post    0
Normalized Claim     0
dtype: int64


In [15]:
df.drop_duplicates(subset=["Social Media Post"], keep="first", inplace=True)

# Check shape after removing duplicates
print("Dataset size after deduplication:", df.shape)

Dataset size after deduplication: (2292, 3)


In [16]:
df

Unnamed: 0,PID,Social Media Post,Normalized Claim
0,1,"President ""Biden's plan would mean America's s...",Biden s energy plan would get rid of seniors a...
1,2,IMPORTANT ANNOUNCEMENT CORONAVIRUS Last evenin...,"If someone with the new coronavirus sneezes, i..."
2,3,Heart is delighted to hear,Heart is delighted to hear
3,4,An allowed appeal is one where the initial ref...,The vast majority of people coming across the ...
4,5,WARM WATER THERAPY Dr. D. Mensah Asare says th...,A widely popular social media post claims that...
...,...,...,...
2797,2798,"Foh with that weak malala slander, Twitter fin...",Malala Yousafzai has been silent about the cri...
2801,2802,The plight of Hindu girls madly in love with M...,The plight of Hindu girls madly in love with M...
2803,2804,I am hearing that James Comey has 50 counts of...,James Comey has 50 counts of TREASON. And John...
2805,2806,Dr. Lonnie Herman reviews how your food is pre...,Irradiated food causes cancer


In [17]:
df.to_csv("CLAN_data_cleaned.csv", index=False)
print("Data preprocessing complete. Cleaned file saved as CLAN_data_cleaned.csv")

Data preprocessing complete. Cleaned file saved as CLAN_data_cleaned.csv
