# Load libraries

In [1]:
# %pip install scikit-learn
# %pip install nltk
# %pip install emoji

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Load data

In [3]:
# Load the datasets
rappler_docs = pd.read_excel('rappler_corpus.xlsx', index_col=0)
youtube_docs = pd.read_excel('youtube_corpus.xlsx', index_col=0)

# Standardize date_published column
# Rappler docs come in 2025-08-07T14:01:01Z format
rappler_docs['date_published'] = pd.to_datetime(
    rappler_docs['date_published']).dt.tz_convert(tz='Asia/Manila').dt.tz_localize(None)
# Youtube docs come in UTC timezone
youtube_docs['date_published'] = pd.to_datetime(
    youtube_docs['date_published']).dt.tz_convert(tz='Asia/Manila').dt.tz_localize(None)

# Assign source to each dataset
rappler_docs['source'] = 'rappler'
youtube_docs['source'] = 'youtube'

# Combine the datasets
corpus = pd.concat([
  rappler_docs,
  youtube_docs
], ignore_index=True, axis=0)

corpus

Unnamed: 0,title,link,date_published,text,source,like_count,reply_parent_id
0,Hindi Ito Marites: The Hague and Filipinos' pu...,https://www.rappler.com/newsbreak/podcasts-vid...,2025-08-15 16:00:29,"When local courts offer no refuge, or when fac...",rappler,,
1,FACT CHECK: No Guinness record of Duterte as ‘...,https://www.rappler.com/newsbreak/fact-check/n...,2025-08-12 10:00:00,Claim: Former president Rodrigo Duterte holds ...,rappler,,
2,WATCH: Tuloy ba ang pre-trial hearing ni Duter...,https://www.rappler.com/philippines/video-will...,2025-08-11 19:00:00,"MANILA, Philippines – May bagong hiling ang de...",rappler,,
3,Duterte team wants ICC Prosecutor Khan disqual...,https://www.rappler.com/video/daily-wrap/augus...,2025-08-08 22:07:53,Here are today’s headlines – the latest news i...,rappler,,
4,Duterte team wants Prosecutor Khan disqualifie...,https://www.rappler.com/philippines/duterte-de...,2025-08-08 07:00:00,"MANILA, Philippines – Prosecutor Karim Khan, o...",rappler,,
...,...,...,...,...,...,...,...
846,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 19:24:46,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg
847,"That vp is rude, entitled and arrogant like he...",https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 21:13:32,"That vp is rude, entitled and arrogant like he...",youtube,3.0,UgwBBzeGtqWna6y4KUJ4AaABAg
848,What country is this? Very interesting.,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 22:16:52,What country is this? Very interesting.,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg
849,@@cvoutdoors9859palamunin,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 23:14:26,@@cvoutdoors9859palamunin,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg


# Preprocess text

## Load stopwords

In [4]:
from pandas.errors import EmptyDataError
from nltk.corpus import stopwords

nltk.download('stopwords')

try:
  BASIC_STOPWORDS = list(
    # Handle empty data error
    pd.read_csv('basic_stopwords.txt', header=None).values.flatten()
  )
except EmptyDataError:
  BASIC_STOPWORDS = []

try:
  DOMAIN_STOPWORDS = list(
    pd.read_csv('domain_stopwords.txt', header=None).values.flatten()
  )
except EmptyDataError:
  DOMAIN_STOPWORDS = []

EN_STOPWORDS_LIST = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Clean corpus

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import emoji


def clean_corpus(corpus, text_column='text'):
  '''
  Clean the text data in the specified column of the DataFrame.
  '''
  cleaned_corpus = corpus.copy()

  # Force text_column as string
  cleaned_corpus['cleaned_text'] = cleaned_corpus[text_column].astype(str)

  # Transform into lowercase
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.lower()

  # Remove usernames, non-alphanumeric characters, and links
  # docs['cleaned_text'] = docs['cleaned_text'].str.replace(r'(@[A-Za-z0-9_]+)|([^A-Za-z0-9_ \t])|(\w+:\/\/\S+)', '')

  # # Lemmatize (by default, lemmatize nouns)
  # # Other options:
  # #   'v' for verbs
  # #   'a' for adjectives
  # #   'r' for adverbs
  # #   's' for satellites adjectives (adjectives that appear after verbs)
  # lemmatizer = WordNetLemmatizer()
  # cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
  #   lambda text: ' '.join(
  #     [lemmatizer.lemmatize(word, pos='n') for word in str(text).split()]
  #   )
  # )

  # # Stemmer
  # stemmer = PorterStemmer()
  # cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
  #     lambda text: ' '.join(
  #       [stemmer.stem(word) for word in str(text).split()]
  #     )
  # )

  # Remove NLTK stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [
        word for word in text.split() if word not in EN_STOPWORDS_LIST
      ]
    )
  )

  # Remove basic stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in BASIC_STOPWORDS]
    )
  )

  # Remove domain stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in DOMAIN_STOPWORDS]
    )
  )

  # Remove trailing and leading whitespaces
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.strip()

  # Remove non-alphanumeric characters
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.replace(r'\W', ' ', regex=True)

  # Remove numbers
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.replace(r'\d+', ' ', regex=True)

  # Remove emojis using emoji library
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      emoji.replace_emoji(text, replace=' ').split()
    )
  )

  # Remove NaN values
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].replace(np.nan, '', regex=True)

  return cleaned_corpus

In [6]:
cleaned_corpus = clean_corpus(corpus, text_column='text')
cleaned_corpus

Unnamed: 0,title,link,date_published,text,source,like_count,reply_parent_id,cleaned_text
0,Hindi Ito Marites: The Hague and Filipinos' pu...,https://www.rappler.com/newsbreak/podcasts-vid...,2025-08-15 16:00:29,"When local courts offer no refuge, or when fac...",rappler,,,local courts offer refuge faced formidable for...
1,FACT CHECK: No Guinness record of Duterte as ‘...,https://www.rappler.com/newsbreak/fact-check/n...,2025-08-12 10:00:00,Claim: Former president Rodrigo Duterte holds ...,rappler,,,claim former president rodrigo duterte holds g...
2,WATCH: Tuloy ba ang pre-trial hearing ni Duter...,https://www.rappler.com/philippines/video-will...,2025-08-11 19:00:00,"MANILA, Philippines – May bagong hiling ang de...",rappler,,,manila philippines may bagong hiling ang defen...
3,Duterte team wants ICC Prosecutor Khan disqual...,https://www.rappler.com/video/daily-wrap/augus...,2025-08-08 22:07:53,Here are today’s headlines – the latest news i...,rappler,,,today s headlines latest news philippines arou...
4,Duterte team wants Prosecutor Khan disqualifie...,https://www.rappler.com/philippines/duterte-de...,2025-08-08 07:00:00,"MANILA, Philippines – Prosecutor Karim Khan, o...",rappler,,,manila philippines prosecutor karim khan leave...
...,...,...,...,...,...,...,...,...
846,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 19:24:46,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,ul l dignified pinapakyuhan nga si bong daza s...
847,"That vp is rude, entitled and arrogant like he...",https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 21:13:32,"That vp is rude, entitled and arrogant like he...",youtube,3.0,UgwBBzeGtqWna6y4KUJ4AaABAg,vp rude entitled arrogant like father used spe...
848,What country is this? Very interesting.,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 22:16:52,What country is this? Very interesting.,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,country this interesting
849,@@cvoutdoors9859palamunin,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 23:14:26,@@cvoutdoors9859palamunin,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,cvoutdoors palamunin


### Cleaning emojis

In [7]:
sample_sentence = "hello world @helloWorld 😅"
sample_sentence_2 = "Wait ko si dugong mag salita na JOKE LNG😁☺️<br>Kayu naman naniniwla agad😂"

# Clean emojis
sample_sentence_without_emoji = emoji.replace_emoji(
  sample_sentence, replace=' EMOJI')
sample_sentence_2_without_emoji = emoji.replace_emoji(
  sample_sentence_2, replace=' EMOJI')

print(sample_sentence_2)
print(sample_sentence_2_without_emoji)

Wait ko si dugong mag salita na JOKE LNG😁☺️<br>Kayu naman naniniwla agad😂
Wait ko si dugong mag salita na JOKE LNG EMOJI EMOJI<br>Kayu naman naniniwla agad EMOJI


In [8]:
cleaned_corpus.to_excel('cleaned_corpus.xlsx', index=False)