In [None]:
import pandas as pd
import re
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the annotated clause-level dataset (path is relative to the notebook's
# working directory).
df = pd.read_csv('Final_Annotation.csv')

# Summary lines, emitted one per print() call in this order.
for summary_line in (
    "Dataset loaded successfully!\n",
    f"Total number of tweets: {len(df)}",
    f"\nDataset shape: {df.shape}",
    f"\nColumn names: {list(df.columns)}",
):
    print(summary_line)

# Banner announcing the preview table rendered below.
banner_rule = "="*80
print("\n" + banner_rule)
print("SAMPLE ROWS")
print(banner_rule)

# Last expression of the cell: the notebook renders the first ten rows.
df.head(10)

Dataset loaded successfully!

Total number of tweets: 37314

Dataset shape: (37314, 4)

Column names: ['Tweet Number', 'Clause Number', 'Clause', 'Final Sentiment']

SAMPLE ROWS


Unnamed: 0,Tweet Number,Clause Number,Clause,Final Sentiment
0,1,1,wag naman sana,Neutral
1,1,2,at maapektuhan kaming tuwid bumoto,Negative
2,2,1,kung sino man yang 12k na bumoto kay quiboloy ...,Negative
3,3,1,na dinaya ang botohan kaloka,Negative
4,4,1,so far bam kiko luke amp heidi frontrunner sa ...,Positive
5,5,1,inanunsyo ng commission on elections comelec a...,Neutral
6,5,2,at minority party sa bilangpilipino2025 ang la...,Neutral
7,6,1,done na hindi ko lang na ss nun nag umpisa na ...,Neutral
8,7,1,eleksyon 2025 vico sotto vs sarah discaya ang ...,Neutral
9,8,1,just in the supreme court has issued new tempo...,Neutral


In [6]:

# Count missing values per column. A notebook cell only renders its final
# expression, so a bare `df.isnull().sum()` in the middle of the cell is
# evaluated and then thrown away; display() (injected by the IPython kernel)
# makes the result visible.
display(df.isnull().sum())

# View sentiment distribution (rendered as the cell's last expression).
df['Final Sentiment'].value_counts()

Final Sentiment
Neutral     21226
Negative    12074
Positive     4014
Name: count, dtype: int64

In [7]:
# Token -> replacement table used by clean_tweet(). Lookups happen on
# lower-cased, whitespace-separated tokens; an empty replacement drops the
# token. Entries that map a word to itself are no-ops.
SLANG_DICT = {
    # English slang
    "omg": "oh my god",
    # Expansions are written without apostrophes so that they do not put
    # punctuation back into text that has already been stripped of it.
    "idk": "i do not know",
    "btw": "by the way",
    "lol": "laughing out loud",
    "lmao": "laughing my ass off",
    "rofl": "rolling on floor laughing",
    "tbh": "to be honest",
    "brb": "be right back",
    "smh": "shaking my head",
    "ikr": "i know right",
    "afaik": "as far as i know",
    "asap": "as soon as possible",
    "fyi": "for your information",
    "imo": "in my opinion",
    "imho": "in my humble opinion",
    "pls": "please",
    "plz": "please",
    "thx": "thanks",
    "ty": "thank you",
    "tysm": "thank you so much",
    "yw": "you are welcome",
    "np": "no problem",
    "u": "you",
    "ur": "your",
    "r": "are",
    "b": "be",
    "bc": "because",
    "cuz": "because",
    # NOTE(review): "rn" used to be listed twice ("right now" here and "rin"
    # in the Filipino section). In a dict literal the later entry wins, so the
    # English one was dead; it has been removed and the effective value
    # ("rin") kept. Confirm which reading is wanted for this corpus.
    "w": "with",
    "w/": "with",
    "wo": "without",
    "w/o": "without",

    # Taglish/Filipino slang
    "k": "ka",
    "q": "ko",
    "d": "de",
    "di": "hindi",
    "daw": "daw",
    "raw": "raw",
    "ung": "yung",
    "yng": "yang",
    "lng": "lang",
    "nman": "naman",
    "nmn": "naman",
    "pra": "para",
    "cguro": "siguro",
    "kc": "kasi",
    "ksi": "kasi",
    "kse": "kasi",
    "s": "sa",
    "dn": "din",
    "rn": "rin",
    "mganda": "maganda",
    "gnda": "ganda",
    "hnd": "hindi",
    "ayaw": "ayaw",
    "gnto": "ganito",
    "gnyan": "ganyan",
    "amp": "",
    "char": "",
    "chz": "",
    "haha": "haha",
    "hehe": "hehe",
    "hihi": "hihi",
    "huhu": "huhu",
    # Never matched by clean_tweet(): runs of three or more identical letters
    # are shortened to two before the lookup, which already yields "waa".
    "waaa": "waa",
}

# "http..." anywhere, or a "www." host at the start of a word. The word
# boundary and the literal dot keep elongated words such as "awwww" intact.
_URL_RE = re.compile(r"http\S+|\bwww\.\S+")
_MENTION_RE = re.compile(r"@\w+")
_SPECIAL_CHAR_RE = re.compile(r"[^\w\s]")
# Three or more repeats of the same character, digits and whitespace
# excluded: numbers such as "10000" must not be shortened, and whitespace is
# collapsed later by split().
_REPEAT_RE = re.compile(r"([^\d\s])\1{2,}")

# Slang keys that contain punctuation ("w/", "w/o") cannot survive the
# special-character pass, so they are expanded beforehand. Longest key first
# so that "w/o" wins over "w/".
_PUNCTUATED_SLANG_KEYS = sorted(
    (key for key in SLANG_DICT if _SPECIAL_CHAR_RE.search(key)),
    key=len,
    reverse=True,
)
_PUNCTUATED_SLANG_RE = (
    re.compile(
        r"(?<![\w/])(?:%s)(?![\w/])"
        % "|".join(re.escape(key) for key in _PUNCTUATED_SLANG_KEYS)
    )
    if _PUNCTUATED_SLANG_KEYS
    else None
)


def clean_tweet(text):
    """Clean and normalize a Taglish tweet clause.

    Steps, in order: lower-case; drop URLs and @mentions; drop the ``#``
    symbol (the hashtag word itself is kept); expand punctuated slang such as
    ``w/o``; replace every remaining non-word, non-space character with a
    space; shorten runs of three or more identical non-digit characters to
    two; expand slang tokens through ``SLANG_DICT``; collapse whitespace.

    Args:
        text: Raw clause text. Any non-string value (for example the NaN
            pandas uses for a missing cell) is treated as empty.

    Returns:
        The cleaned text as a single-space-separated string, or ``""`` when
        the input is not a string or nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = _URL_RE.sub("", text)      # Remove URLs
    text = _MENTION_RE.sub("", text)  # Remove mentions
    text = text.replace("#", "")      # Drop the hashtag symbol, keep the word

    if _PUNCTUATED_SLANG_RE is not None:
        # Pad with spaces so the expansion stays a separate token.
        text = _PUNCTUATED_SLANG_RE.sub(
            lambda match: " " + SLANG_DICT[match.group(0)] + " ", text
        )

    text = _SPECIAL_CHAR_RE.sub(" ", text)  # Remove special characters
    text = _REPEAT_RE.sub(r"\1\1", text)    # Normalize repeated chars

    # Normalize slang token by token; split() also collapses whitespace, and
    # tokens mapped to "" are dropped so no double spaces are produced.
    expanded = (SLANG_DICT.get(token, token) for token in text.split())
    return " ".join(token for token in expanded if token)


In [12]:
# Work on a copy so the frame loaded earlier is left untouched.
df_cleaned = df.copy()

# Store original for comparison
df_cleaned['Original_Clause'] = df_cleaned['Clause']

# Apply cleaning and overwrite Clause column
df_cleaned['Clause'] = df_cleaned['Clause'].apply(clean_tweet)

# Remove clauses that are empty after cleaning
df_cleaned = df_cleaned[df_cleaned['Clause'].str.strip() != '']

# View before/after examples. random_state pins the draw so a re-run shows the
# same rows, and the sample size is capped so a very small frame does not make
# sample() raise. display() (injected by the IPython kernel) is needed because
# a cell only renders its last expression.
sample = df_cleaned.sample(min(5, len(df_cleaned)), random_state=42)
display(sample[['Original_Clause', 'Clause', 'Final Sentiment']])

# Prepare final dataset: keep the original four columns, with the cleaned
# text in 'Clause' and the helper 'Original_Clause' column left out.
df_final = df_cleaned[['Tweet Number', 'Clause Number', 'Clause', 'Final Sentiment']].copy()

# View final dataset
display(df_final.head(10))

# Check final sentiment distribution (rendered as the cell's last expression)
df_final['Final Sentiment'].value_counts()


Final Sentiment
Neutral     21226
Negative    12074
Positive     4014
Name: count, dtype: int64

In [13]:
# Write the cleaned dataset next to the notebook, without the index column.
cleaned_csv_path = 'cFinal_Annotation.csv'
df_final.to_csv(cleaned_csv_path, index=False)

# Read the file straight back and preview it to confirm the save worked.
df_verify = pd.read_csv(cleaned_csv_path)
df_verify.head()

Unnamed: 0,Tweet Number,Clause Number,Clause,Final Sentiment
0,1,1,wag naman sana,Neutral
1,1,2,at maapektuhan kaming tuwid bumoto,Negative
2,2,1,kung sino man yang 12k na bumoto kay quiboloy ...,Negative
3,3,1,na dinaya ang botohan kaloka,Negative
4,4,1,so far bam kiko luke heidi frontrunner sa bara...,Positive
