# Load libraries

In [None]:
%pip install scikit-learn
%pip install nltk
%pip install emoji

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import emoji

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load data

In [16]:
# Load the datasets
rappler_docs = pd.read_excel('rappler_corpus.xlsx', index_col=0)
youtube_docs = pd.read_excel('youtube_corpus.xlsx', index_col=0)

# Standardize date_published column
# Rappler docs come in 2025-08-07T14:01:01Z format
rappler_docs['date_published'] = pd.to_datetime(
    rappler_docs['date_published']).dt.tz_convert(tz='Asia/Manila').dt.tz_localize(None)
# Youtube docs come in UTC timezone
youtube_docs['date_published'] = pd.to_datetime(
    youtube_docs['date_published']).dt.tz_convert(tz='Asia/Manila').dt.tz_localize(None)

# Assign source to each dataset
rappler_docs['source'] = 'rappler'
youtube_docs['source'] = 'youtube'

# Combine the datasets
corpus = pd.concat([
  rappler_docs,
  youtube_docs
], ignore_index=True, axis=0)

corpus

Unnamed: 0,title,link,date_published,text,source,like_count,reply_parent_id
0,Hindi Ito Marites: The Hague and Filipinos' pu...,https://www.rappler.com/newsbreak/podcasts-vid...,2025-08-15 16:00:29,"When local courts offer no refuge, or when fac...",rappler,,
1,FACT CHECK: No Guinness record of Duterte as ‘...,https://www.rappler.com/newsbreak/fact-check/n...,2025-08-12 10:00:00,Claim: Former president Rodrigo Duterte holds ...,rappler,,
2,WATCH: Tuloy ba ang pre-trial hearing ni Duter...,https://www.rappler.com/philippines/video-will...,2025-08-11 19:00:00,"MANILA, Philippines – May bagong hiling ang de...",rappler,,
3,Duterte team wants ICC Prosecutor Khan disqual...,https://www.rappler.com/video/daily-wrap/augus...,2025-08-08 22:07:53,Here are today’s headlines – the latest news i...,rappler,,
4,Duterte team wants Prosecutor Khan disqualifie...,https://www.rappler.com/philippines/duterte-de...,2025-08-08 07:00:00,"MANILA, Philippines – Prosecutor Karim Khan, o...",rappler,,
...,...,...,...,...,...,...,...
846,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 19:24:46,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg
847,"That vp is rude, entitled and arrogant like he...",https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 21:13:32,"That vp is rude, entitled and arrogant like he...",youtube,3.0,UgwBBzeGtqWna6y4KUJ4AaABAg
848,What country is this? Very interesting.,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 22:16:52,What country is this? Very interesting.,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg
849,@@cvoutdoors9859palamunin,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 23:14:26,@@cvoutdoors9859palamunin,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg


# Preprocess text

## Load stopwords

In [19]:
from pandas.errors import EmptyDataError

try:
  basic_stopwords = list(
    # Handle empty data error
    pd.read_csv('basic_stopwords.txt', header=None).values.flatten()
  )
except EmptyDataError:
  basic_stopwords = []

try:
  domain_stopwords = list(
    pd.read_csv('domain_stopwords.txt', header=None).values.flatten()
  )
except EmptyDataError:
  domain_stopwords = []

In [33]:
def preprocess_text(corpus, text_column='text'):
  cleaned_corpus = corpus.copy()

  # Lowercase
  cleaned_corpus['cleaned_text'] = cleaned_corpus[text_column].str.lower()

  # Lemmatize (by default, lemmatize nouns)
  # Other options:
  #   'v' for verbs
  #   'a' for adjectives
  #   'r' for adverbs
  #   's' for satellites adjectives (adjectives that appear after verbs)
  lemmatizer = WordNetLemmatizer()
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [lemmatizer.lemmatize(word, pos='n') for word in str(text).split()]
    )
  )

  # Stemmer
  stemmer = PorterStemmer()
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
      lambda text: ' '.join(
        [stemmer.stem(word) for word in str(text).split()]
      )
  )

  # Remove NLTK stopwords
  en_stopwords_list = stopwords.words('english')
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [
        word for word in text.split() if word not in en_stopwords_list
      ]
    )
  )

  # Remove basic stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in basic_stopwords]
    )
  )

  # Remove domain stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in domain_stopwords]
    )
  )

  # Remove trailing and leading whitespaces
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.strip()

  # Remove non-alphanumeric characters
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.replace(r'\W', ' ', regex=True)

  # Remove numbers
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.replace(r'\d+', ' ', regex=True)

  # Remove emojis using emoji library
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in list(emoji.EMOJI_DATA.keys())]
    )
  )

  return cleaned_corpus['cleaned_text']

In [35]:
corpus['cleaned_text'] = preprocess_text(corpus)
corpus

Unnamed: 0,title,link,date_published,text,source,like_count,reply_parent_id,cleaned_text
0,Hindi Ito Marites: The Hague and Filipinos' pu...,https://www.rappler.com/newsbreak/podcasts-vid...,2025-08-15 16:00:29,"When local courts offer no refuge, or when fac...",rappler,,,local court offer refuge face formid foreign a...
1,FACT CHECK: No Guinness record of Duterte as ‘...,https://www.rappler.com/newsbreak/fact-check/n...,2025-08-12 10:00:00,Claim: Former president Rodrigo Duterte holds ...,rappler,,,claim former presid rodrigo dutert hold guin w...
2,WATCH: Tuloy ba ang pre-trial hearing ni Duter...,https://www.rappler.com/philippines/video-will...,2025-08-11 19:00:00,"MANILA, Philippines – May bagong hiling ang de...",rappler,,,manila philippin may bagong hile ang defens te...
3,Duterte team wants ICC Prosecutor Khan disqual...,https://www.rappler.com/video/daily-wrap/augus...,2025-08-08 22:07:53,Here are today’s headlines – the latest news i...,rappler,,,today headlin latest news philippin around wor...
4,Duterte team wants Prosecutor Khan disqualifie...,https://www.rappler.com/philippines/duterte-de...,2025-08-08 07:00:00,"MANILA, Philippines – Prosecutor Karim Khan, o...",rappler,,,manila philippin prosecutor karim khan leav in...
...,...,...,...,...,...,...,...,...
846,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 19:24:46,Ul*l dignified. Pinapakyuhan nga si Bong Daza ...,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,ul l dignified pinapakyuhan nga si bong daza s...
847,"That vp is rude, entitled and arrogant like he...",https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 21:13:32,"That vp is rude, entitled and arrogant like he...",youtube,3.0,UgwBBzeGtqWna6y4KUJ4AaABAg,vp rude entitl arrog like father they r use sp...
848,What country is this? Very interesting.,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 22:16:52,What country is this? Very interesting.,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,countri this veri interesting
849,@@cvoutdoors9859palamunin,https://www.youtube.com/watch?v=yfoq-0gGTLM&lc...,2024-08-27 23:14:26,@@cvoutdoors9859palamunin,youtube,0.0,UgwBBzeGtqWna6y4KUJ4AaABAg,cvoutdoors palamunin


In [36]:
corpus.to_excel('cleaned_corpus.xlsx', index=False)