In [2]:
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import re
import nltk
import requests
import pyLDAvis
import pyLDAvis.gensim_models

# Fetch the NLTK resources this notebook relies on (the stop-word corpus and the
# Punkt tokenizer data behind word_tokenize).
# "punkt_tab" is requested in addition to "punkt" because recent NLTK releases
# load the tokenizer data from that package; without it word_tokenize raises
# LookupError on a fresh environment.
for nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dnaso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dnaso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Define paths to transcripts and tags directories
transcripts_dir = '../cleaned_transcripts/'
tags_dir = '../tags/'
csv_path = '../cleaned_results.xlsx'

# Load the results spreadsheet (an Excel workbook, despite the variable name)
# and keep only the rows whose "related" column is "yes".
csv_data = pd.read_excel(csv_path)
related_videos = csv_data[csv_data["related"] == "yes"]

# Parallel lists describing the videos that were actually loaded.
# They are only appended to together, so they always have the same length.
kept_index = []      # original row labels from related_videos
kept_video_ids = []  # video ids whose transcript AND tag file were found
transcripts = []
tags = []

# Process each related video based on its video_id
for row_label, video_id in related_videos["Video Id"].items():
    # Construct paths based on video ID naming conventions
    transcript_file = os.path.join(transcripts_dir, f"{video_id}_captions.txt")
    tag_file = os.path.join(tags_dir, f"{video_id}.txt")

    # Read both files before recording anything: a video that has only one of
    # the two files must not leave the lists with different lengths.
    try:
        with open(transcript_file, "r", encoding="utf-8") as file:
            transcript_text = file.read()
        with open(tag_file, "r", encoding="utf-8") as file:
            tag_text = file.read().strip()
    except FileNotFoundError:
        print(f"Files for video ID {video_id} not found, skipping.")
        continue

    kept_index.append(row_label)
    kept_video_ids.append(video_id)
    transcripts.append(transcript_text)
    tags.append(tag_text)

# Combine the loaded data into a DataFrame. Only successfully loaded videos are
# included, and each keeps the row label it had in the spreadsheet.
data = pd.DataFrame(
    {"video_id": kept_video_ids, "tags": tags, "transcripts": transcripts},
    index=kept_index,
)
data["text"] = data["tags"] + " " + data["transcripts"]

# Display the first few rows to verify
print(data.head())

       video_id                                               tags  \
0   M0lKXfyJh-I  ['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...   
1   cgnBQ9qOqmY  ['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...   
2   kgG1oJiE5qs  ['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...   
3   LB20skQI1_k  ['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...   
51  EKNuv99XeCA  ['Chinese',, 'Sassa, Dagdag',, 'La, Vie, En, R...   

                                          transcripts  \
0   This is Pinoy life,\nHere in the Philippines,\...   
1   I \nam now a total germaphobe,\nAlways disinfe...   
2   Manila, oh nana\nI love to party with my frien...   
3   It's Mikey b*tch!\nAt the palengke (Filipino m...   
51  Hello.my name is max, from china!\nthanks for ...   

                                                 text  
0   ['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...  
1   ['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...  
2   ['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...  
3   ['Mikey,

In [15]:
# Step 2: Text Preprocessing
# Start the working stop-word set from NLTK's English list.
stop_words = {word for word in stopwords.words("english")}

# Function to fetch stopwords from GitHub URL
def fetch_stopwords_from_github(url, timeout=10):
    """Download a newline-separated stop-word list and return it as a set.

    Args:
        url: Address of a plain-text file with one stop word per line.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        set[str]: The stop words, with surrounding whitespace removed and
        blank lines dropped.

    Raises:
        requests.HTTPError: If the server answers with an error status. Without
            this check the body of an error page would be split into lines and
            silently treated as stop words.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    stripped_lines = (line.strip() for line in response.text.splitlines())
    return {word for word in stripped_lines if word}

# English stop-word list from the stopwords-iso GitHub repository, downloaded at run time.
github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

# Hand-picked additions to the stop-word set, grouped into four lists.
# NOTE(review): preprocess_text replaces every run of non-word characters with a
# space before tokenizing, so entries containing a space or hyphen
# ('stray kids', 'breaking news', 'current events', 'special report', 'k-pop')
# can never equal a single token and therefore never filter anything.
custom_stop_words = [
    'like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go',
    'right', 'okay', 'well', 'said', 'going', 'got', 'na', 'always',
    'every', 'each', 'say', 'el', 'little', 'still', 'best', 'dutch',
    'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow',
]
broad_terms = [
    'philippines', 'philippine', 'british', 'filipino', 'video',
    'http', 'korea', 'korean', 'youtube', 'google',
    'united', 'america', 'american',
]
kpop_keywords = [
    'kpop', '필리핀', 'bts', 'blackpink', 'twice',
    'exo', 'k-pop', 'seventeen', 'stray kids', 'nct',
    'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook',
]
more_keywords = [
    'breaking news', 'report', 'coverage', 'investigation', 'interview',
    'documentary', 'journalist', 'headline', 'reporter', 'current events',
    'special report', 'analysis', 'documented', 'broadcast', 'reporting',
    'v', 'food', 'travel', 'react', 'reacts',
    'reaction', 'foreigner', 'thing', 'visit', 'dc',
    'japan', 'first', 'fast', 'asia', 'ang',
    'indian', 'thai', 'vietnamese', 'russia', 'gon',
    'canada', 'canadian', 'russian', 'russia', 'guy',
    'lot', 'bit', 'diba', 'ola', 'cuz',
    'thai', 'thailand', 'person', 'citizen', 'foreigner',
    'foreign', 'foreigners', 'facebook', 'filipinos', 'filipinas',
    'vlog', 'vlogs', 'vlogging', 'hashtag', 'india',
    'bro', 'dito', 'people', 'time', 'music',
    'guys',
]

# Merge every extra list (and the downloaded list) into the working set in place.
stop_words |= set().union(
    custom_stop_words,
    broad_terms,
    kpop_keywords,
    more_keywords,
    github_stopwords,
)

def preprocess_text(text):
    """Turn a raw string into a list of lowercase tokens without stop words.

    The text is lowercased and every run of non-word characters is replaced by
    a single space, so letters, digits and underscores are kept. The result is
    tokenized with ``word_tokenize``; tokens that are one character long or
    present in the module-level ``stop_words`` set are discarded.

    Args:
        text: The string to clean.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    normalized = re.sub(r'\W+', ' ', text.lower())
    kept_tokens = []
    for token in word_tokenize(normalized):
        if len(token) > 1 and token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

# Tokenize every combined tags + transcript string into its cleaned word list.
data["cleaned_text"] = data["text"].map(preprocess_text)

# Fit TF-IDF on the cleaned documents (re-joined into strings). The vocabulary
# excludes terms found in more than 85% of the documents (max_df) or in fewer
# than 5 documents (min_df).
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.85)
tfidf_matrix = tfidf_vectorizer.fit_transform(data["cleaned_text"].map(" ".join))

# Filter words in each transcript based on TF-IDF scores
def filter_by_tfidf(doc, top_n=15):
    """Return the highest-scoring TF-IDF terms of one tokenized document.

    The document is scored with the module-level, already fitted
    ``tfidf_vectorizer``.

    Args:
        doc: List of tokens making up the document.
        top_n: Maximum number of terms to return.

    Returns:
        list[str]: Up to ``top_n`` vocabulary terms ordered from lowest to
        highest score. Terms scoring zero are never returned, so a document
        with fewer than ``top_n`` in-vocabulary words yields a shorter list
        instead of being padded with words it does not contain.
    """
    if top_n <= 0:
        # A slice of [-0:] would select the whole vocabulary, not nothing.
        return []
    features = tfidf_vectorizer.get_feature_names_out()
    vector = tfidf_vectorizer.transform([" ".join(doc)]).toarray()[0]
    top_indices = vector.argsort()[-top_n:]
    # argsort still returns top_n indices when most scores are zero; keep only
    # the terms that actually occur in this document.
    return [features[i] for i in top_indices if vector[i] > 0]

# Store, for every document, the term list returned by filter_by_tfidf.
data["filtered_words"] = data["cleaned_text"].map(filter_by_tfidf)

# Bare expression so the notebook renders the resulting DataFrame.
data

Unnamed: 0,video_id,tags,transcripts,text,cleaned_text,filtered_words
0,M0lKXfyJh-I,"['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...","This is Pinoy life,\nHere in the Philippines,\...","['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...","[mikey, bustos, pinoy, manila, comedy, lol, pi...","[para, tondo, ulam, mikey, parody, bustos, sil..."
1,cgnBQ9qOqmY,"['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...","I \nam now a total germaphobe,\nAlways disinfe...","['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...","[mikey, bustos, pinoy, manila, comedy, lifesty...","[cover, vaccines, fluids, panic, coronavirus, ..."
2,kgG1oJiE5qs,"['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...","Manila, oh nana\nI love to party with my frien...","['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...","[mikey, bustos, pinoy, manila, comedy, lifesty...","[hot, fort, bonifacio, cinnamon, shot, mabuhay..."
3,LB20skQI1_k,"['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...",It's Mikey b*tch!\nAt the palengke (Filipino m...,"['Mikey, Bustos',, 'Pinoy',, 'Filipino',, 'Phi...","[mikey, bustos, pinoy, manila, comedy, lifesty...","[veggies, sending, pilipinas, plants, bicol, b..."
51,EKNuv99XeCA,"['Chinese',, 'Sassa, Dagdag',, 'La, Vie, En, R...","Hello.my name is max, from china!\nthanks for ...","['Chinese',, 'Sassa, Dagdag',, 'La, Vie, En, R...","[chinese, sassa, dagdag, vie, en, rose, 107, b...","[feel, 107, lyrics, coz, language, chinese, so..."
...,...,...,...,...,...,...
2080,AI7Us_5-IaQ,"['El, Nido',, 'Palawan',, 'philippines',, 'isl...","We've got to El Nido, the weather is terrible ...","['El, Nido',, 'Palawan',, 'philippines',, 'isl...","[nido, palawan, island, hopping, island, backp...","[walking, scuba, lagoons, cheers, barefoot, of..."
2081,dbdGpkHLx9Y,"['coron',, 'coron, palawan',, 'coron, philippi...",Welcome to Ocam ocam beach. To get here you ca...,"['coron',, 'coron, palawan',, 'coron, philippi...","[coron, coron, palawan, coron, malcapuya, beac...","[beaches, saltwater, lagoon, palawan, safari, ..."
2082,V-r2M6E4iDM,"['red, rock, waterfall',, 'pulang, bato',, 'pu...",welcome to dumaguete\nalso known as the city o...,"['red, rock, waterfall',, 'pulang, bato',, 'pu...","[red, rock, waterfall, pulang, bato, pulang, b...","[motorcycle, result, tourists, presence, drive..."
2083,8X_d-IVydP4,[],foreign\n[Music]\ngood morning it is half past...,[] foreign\n[Music]\ngood morning it is half p...,"[morning, woken, nido, day, nido, plan, action...","[ping, calmer, film, island, paradise, fresh, ..."


In [None]:
# Write one document per line, tokens separated by spaces, to two text files:
# first the full cleaned text, then the TF-IDF filtered words.
for output_name, column in (
    ('cleaned_aggregate.txt', 'cleaned_text'),
    ('filtered_aggregate.txt', 'filtered_words'),
):
    with open(output_name, 'w', encoding='utf-8') as file:
        # Any commas are removed from the joined line before it is written.
        file.writelines(" ".join(row).replace(',', '') + '\n' for row in data[column])


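# To use HLTM, run the HLTA jar on one of the text files written above:
# java -cp HLTA.jar;HLTA-deps.jar tm.hlta.HTD "file name of text" "output name"
# (";" is the Windows classpath separator; use ":" instead on Linux/macOS.)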