In [None]:
import pandas as pd
import os
import random

# Load the Excel file
file_path = "cleaned_results.xlsx"  # Change this to your actual file path
sheet_name = "Sheet1"  # Change if your sheet has a different name

# Read the Excel file
df = pd.read_excel(file_path, sheet_name=sheet_name, engine="openpyxl")

# Ensure columns exist
if "Video Id" not in df.columns or "Video Title" not in df.columns:
    raise ValueError("Columns 'Video Id' and 'Video Title' not found in the Excel file.")

# Randomly select 100 rows
random_rows = df.sample(n=10)  # `random_state` ensures reproducibility

# Define directories
output_dir = "YouTube_Video_Details"
transcripts_dir = 'cleaned_transcripts/'  # Folder containing transcript files

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Iterate over selected rows and create text files
for _, row in random_rows.iterrows():
    video_id = str(row["Video Id"]).strip()
    video_title = str(row["Video Title"]).strip()

    # Generate a safe filename
    safe_title = "".join(c if c.isalnum() or c in " -_()" else "_" for c in video_title)
    filename = f"{video_id} - {safe_title}.txt"
    
    # Define output file path
    output_file_path = os.path.join(output_dir, filename)

    # Try to find the transcript file
    transcript_file_path = os.path.join(transcripts_dir, f"{video_id}_captions.txt")
    
    # Read transcript content if available
    transcript_content = ""
    if os.path.exists(transcript_file_path):
        with open(transcript_file_path, "r", encoding="utf-8") as transcript_file:
            transcript_content = transcript_file.read()
    else:
        transcript_content = "Transcript not available."

    # Write the video details along with transcript
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(f"Video ID: {video_id}\n")
        output_file.write(f"Title: {video_title}\n\n")
        output_file.write("Transcript:\n")
        output_file.write(transcript_content)

print(f"Successfully created {len(random_rows)} text files in '{output_dir}' directory.")


In [67]:
import os
import random
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import numpy as np
import spacy
import string
from wordcloud import WordCloud
import unicodedata
import matplotlib.pyplot as plt
from collections import Counter
from nltk.util import ngrams
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import FastText

# Load the FastText model
#fasttext_model = FastText.load("fasttext_300dim_20epochs.model")

# Folder paths
transcripts_folder_path = 'standard_dataset/'
tags_folder_path = 'tags/'

# Function to load random documents
def load_random_documents(folder_path, sample_size=100):
    all_files = [file_name for file_name in os.listdir(folder_path) if file_name.endswith('.txt')]
    sampled_files = random.sample(all_files, min(sample_size, len(all_files)))

    documents = []
    file_names = []
    for file_name in sampled_files:
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
            content = file.read().lower()
            if len(content.split()) >= 50:
                documents.append(content)
                file_names.append(file_name)
    return documents, file_names

def load_filipino_context_words():
    with open("filipino_context_words.txt", "r", encoding="utf-8") as file:
        filipino__words = [line.strip() for line in file.readlines()]
    return filipino__words
    
documents, file_names = load_random_documents(transcripts_folder_path)

filipino_context_words = load_filipino_context_words()

# Function to load video tags
def load_video_tags(folder_path):
    video_tags = {}
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            video_id = file_name.split('_')[0]
            with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()
    return video_tags

video_tags = load_video_tags(tags_folder_path)

# Fetch stopwords
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    return set(response.text.splitlines())

github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

stop_words = set(stopwords.words('english'))
custom_stop_words = ['like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go', 'right', 'okay', 'well', 'said', 
                     'going', 'got', 'na', 'always', 'every', 'each', 'say', 'el', 'little', 'still', 
                     'best', 'dutch', 'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow' ]
broad_terms = ['philippines', 'philippine', 'british', 'filipino', 'video', 'http', 'korea', 'korean', 
               'youtube', 'google', 'united', 'america', 'american']
kpop_keywords = ['kpop', '필리핀', 'bts', 'blackpink', 'twice', 'exo', 'k-pop', 'seventeen', 
                 'stray kids', 'nct', 'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook']
more_keywords = [
    'breaking news', 'report', 'coverage', 'investigation', 'interview', 'documentary', 'news', 'netherlands', 'psy', 'subtitle', 'description', 'link', 
    'journalist', 'headline', 'reporter', 'current events', 'special report', 
    'analysis', 'documented', 'broadcast', 'reporting', 'v', 'food', 'travel', 'react', 
    'reacts', 'reaction', 'foreigner', 'thing', 'visit', 'dc', 'japan', 'first', 'fast', 
    'asia', 'ang', 'indian', 'thai', 'vietnamese', 'russia', 'gon', 'canada', 'canadian', 'russian', 
    'russia', 'guy', 'lot', 'bit', 'diba', 'ola', 'cuz', 'thai', 'thailand', 'person', 'citizen', 'foreigner', 'foreign', 'foreigners',
    'facebook', 'filipinos', 'filipinas', 'vlog', 'vlogs', 'vlogging', 'hashtag', 'india', 'bro', 'dito', 'people', 'time', 'music', 'gonna', 'life', 
    'lol', 'guys', 'tho', 'cute', 'hmm', 'huh', 'channel', 'subscribe', 'day6', 'mandarin', 'chinese', 'beautiful',
    'chuckles', 'fbe', 'hit', 'laughs', 'yo', 'ka', 'word', 'living', 'boi', 'minimum', 'ya', 'successful', 'perfectly', 'yeap', 
    'wondering', 'fantastic', 'hurry', 'german', 'age', 'country', 'subscribing', 'bluesy', 'jump', 'pretty', 'understanding', 'personalized',
    'and', 'the', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'up', 'about', 'over', 'into', 'through', 'between', 'under', 'against', 'all',
    'you', 'haha', 'hahaha', 'ha', 'hey', 'bye', 'hello', 'hi', 'oh', 'blah', 'easy', 'alright', 'ta', 'day', 'ooh', 'en', 'do', 'lot', 'comment', 'notification', 
    'korean', 'jjajangmyeon', 'jajangmyeon', 'damn', 'yall', 'month', 'week', 'year', 'ohhh', 'pvf', 'dude', 'mmm', 'kagilagilalas', 'ofcourse', 'australia', 'uxo', 
    'atleast', 'yusuf', 'bangkok', 'ot', 'anytime', 'allover', 'kala', 'nope', 'wan', 'brazil', 'smooth', 'ot', 'timeshere', 'batchof', 'yep', 'opo', 'del',
    'gosh', 'po', 'ourself', 'wo', 'wait', 'ugh', 'nyc', 'whoa', 'nicaragua', 'yup', 'em', 'bout', 'le', 'omg', 'overwhelm', 'maam', 'nicer', 'haha', 'hahaha', 'ha', 
    'nbcs', 'lana', 'rc', 'whatsoever', 'oxy', 'decade', 'whyd', 'unknown', 'ahhhhh', 'ohoh', 'ohto', 'ohhhh', 'bruh', 'ooe', 'ahmedabad', 'mexico', 
    'understand', 'excuse', 'kinda', 'applause', 'oooh', 'thiswhat', 'nevermind', 'ahh', 'againthank', 'toto', 'aww', 'nah', 'bbmas', 'ay', 'op', 'huh', 'huhu',
    'tada', 'beacuse', 'voila', 'upstairs', 'thatswhy', 'yea', 'that', 'armenia', 'or', 'not', 'funwhat', 'aka', 'armeniathat', 'woosexy', 'worth', 'laugh', 'box', 
    'xd', 'vb', 'eff', 'ananya', 'welsh', 'latron', 'shout', 'whatwhat', 'what', 'pause', 'why', 'thats', 'byebye', 'iv', 'bye', 'ado', 'ownup', 'dom', 'jomm', 'sir', 
    'budgie', 'nomac', 'lavocha', 'germany', 'why', 'walang', 'superduper', 'philip', 'mom', 'jre', 'giddy', 'intro', 'dupe', 'europe', 'dream', 'team', 'dislike', 'content', 
    'yoongi', 'royale', 'ilu', 'jhope', 'day', 'jin', 'ecc', 'nyhs', 'nego', 'chavez', 'pb', 'everyones', 'epic', 'matter', 'oneonone', 'region', 'change', 'ho', 'seetoh', 
    'atin', 'vpn', 'facetune', 'busu', 'mackie', 'clyd', 'china', 'rest', 'friend', 'woah', 'dindins', 'poster', 'vibe', 'woman', 'boss', 'woah', 'type', 'mahana', 'joke', 
    'taller', 'insane', 'whang', 'psa', 'manatee', 'recommend', 'caesar', 'mmmhmm', 'mosul', 'dun', 'clue', 'naysayer', 'hindi', 'ko', 'pero', 'bulgaria', 'question', 'video', 
    'yobi', 'hindu', 'expat', 'option', 'gap', 'eu', 'simo', 'kouignamann', 'bct', 'month', 'cfo'
]
stop_words.update(custom_stop_words, kpop_keywords, broad_terms, more_keywords, github_stopwords)

lemmatizer = WordNetLemmatizer()

# Identify Filipino words using SpaCy model
# def is_filipino_word(word):
#     doc = nlp(word)
#     embedding = doc.vector
#     return np.linalg.norm(embedding) >= 0.85  # Adjust threshold as needed

# Helper function to check if a word contains only Latin script characters
def is_latin_script(word):
    for char in word:
        if not ('LATIN' in unicodedata.name(char, '') or char.isdigit()):
            return False
    return True

def preprocess_text_with_tags(doc, video_id, ngram_range=(1, 2), tag_weight=1, filipino_weight=2, ngram_weight_factor=1):
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(doc)
    tokens_with_pos = pos_tag(tokens)

    # Step 1: Generate n-grams
    ngram_tokens = []
    for n in range(ngram_range[0], ngram_range[1] + 1):
        ngram_tokens.extend([' '.join(gram) for gram in ngrams(tokens, n)])

    # Step 2: Count n-gram frequency
    ngram_frequencies = Counter(ngram_tokens)

    # Step 3: Filter tokens based on stop words and POS tags
    filtered_tokens = []
    for word, pos in tokens_with_pos:
        if not word.isalpha() or not is_latin_script(word) or word in stop_words:
            continue
        if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
            continue  # Remove verbs and adjectives
        lemmatized_word = lemmatizer.lemmatize(word)
        filtered_tokens.append(lemmatized_word)

    # Step 4: Assign weight based on n-grams and Filipino similarity
    weighted_tokens = []
    for token in filtered_tokens:
        token_weight = 1 + ngram_frequencies.get(token, 0) * ngram_weight_factor  # Weight based on n-grams

        # Apply weight based on FastText similarity to Filipino words
        #filipino_similarity = get_filipino_similarity(token)
        #if filipino_similarity > 0.3:  # Lower threshold to capture more words
            #weighted_tokens.extend([token] * int(filipino_weight * filipino_similarity * token_weight))
        #else:
        weighted_tokens.extend([token] * int(token_weight))

    # Step 5: Include video tags
    if video_id in video_tags:
        tags = video_tags[video_id]
        for tag in tags:
            if tag.isalpha() and is_latin_script(tag):
                tag_weight_multiplier = tag_weight
                #filipino_similarity = get_filipino_similarity(tag)
                #if filipino_similarity > 0.3:
                    #tag_weight_multiplier *= filipino_weight * filipino_similarity
                weighted_tokens.extend([tag] * int(tag_weight_multiplier))

    return ' '.join(weighted_tokens)

def get_filipino_similarity(word):
    """Check if a word belongs to the Filipino context based on clustering."""
    if word in filipino_context_words:
        return 1  # Maximum weight if the word is in the discovered Filipino words list
    try:
        # Compute similarity to Filipino words
        similarities = [fasttext_model.wv.similarity(word, ref_word) for ref_word in filipino_context_words if ref_word in fasttext_model.wv]
        return max(similarities) if similarities else 0
    except KeyError:
        return 0


# Function to calculate the number of topics dynamically
def dynamic_num_topics(preprocessed_doc, words_per_topic=200, min_topics=1, max_topics=5):
    num_tokens = len(preprocessed_doc.split())
    return max(min_topics, min(num_tokens // words_per_topic, max_topics))

# Function to display filtered/non-filtered topic-word distributions
def display_topic_word_distributions(lda_model, num_words=20):
    """
    Displays topic-word distributions before and after filtering repeated words.
    
    Args:
        lda_model: Trained LDA model.
        num_words: Number of top words to display per topic.
    """
    # Get non-filtered topics
    non_filtered_topics = lda_model.show_topics(num_words=num_words, formatted=False)
    
    # Get filtered topics
    filtered_topics = filter_repeated_words_across_topics(lda_model, num_words=num_words)
    
    # Display non-filtered topics
    print("\nNon-Filtered Topic-Word Distributions:")
    for t, words in non_filtered_topics:
        print(f"Topic {t}: {words}")
    
    # Display filtered topics
    print("\nFiltered Topic-Word Distributions:")
    for t, words in filtered_topics:
        print(f"Topic {t}: {words}")

# Function to display both filtered and non-filtered WordClouds in a grid layout
def display_wordclouds_grid(lda_model, video_id, num_words=50):
    """
    Displays both filtered and non-filtered WordClouds for each topic in a grid layout.
    Skips topics with no words to plot.
    
    Args:
        lda_model: Trained LDA model.
        video_id: Identifier for the video/document.
        num_words: Number of top words to consider for WordCloud.
    """
    num_words = int(num_words)  # Ensure num_words is an integer
    # Get non-filtered topics
    non_filtered_topics = lda_model.show_topics(num_words=num_words, formatted=False)
    
    # Get filtered topics
    filtered_topics = filter_repeated_words_across_topics(lda_model, num_words=num_words)
    
    # Remove topics with no words to plot (for filtered topics)
    filtered_topics = [(t, words) for t, words in filtered_topics if words]
    
    # If no topics left to plot, return
    if not filtered_topics and not non_filtered_topics:
        print(f"No WordClouds to display for {video_id} (all topics empty).")
        return
    
    # Calculate number of rows and columns for the grid
    num_topics = max(len(non_filtered_topics), len(filtered_topics))
    num_rows = num_topics  # One row per topic
    num_cols = 2  # Two columns: one for non-filtered, one for filtered
    
    # Create subplots
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 5 * num_rows))
    fig.suptitle(f"WordClouds for {video_id} (Left: Non-Filtered, Right: Filtered)", fontsize=16)
    
    # Flatten axes for easy iteration
    axes = axes.flatten()
    
    # Generate and display WordClouds for non-filtered topics
    for i, (t, words) in enumerate(non_filtered_topics):
        # Generate word frequency dictionary for the WordCloud
        word_freq = {word[0]: word[1] for word in words}
        wordcloud = WordCloud(width=400, height=200, background_color="white").generate_from_frequencies(word_freq)
        
        # Display the WordCloud in the left column
        axes[2 * i].imshow(wordcloud, interpolation="bilinear")
        axes[2 * i].set_title(f"Topic {t} (Non-Filtered)", fontsize=12)
        axes[2 * i].axis("off")
    
    # Generate and display WordClouds for filtered topics
    for i, (t, words) in enumerate(filtered_topics):
        # Generate word frequency dictionary for the WordCloud
        word_freq = {word[0]: word[1] for word in words}
        wordcloud = WordCloud(width=400, height=200, background_color="white").generate_from_frequencies(word_freq)
        
        # Display the WordCloud in the right column
        axes[2 * i + 1].imshow(wordcloud, interpolation="bilinear")
        axes[2 * i + 1].set_title(f"Topic {t} (Filtered)", fontsize=12)
        axes[2 * i + 1].axis("off")
    
    # Hide unused subplots
    for j in range(2 * i + 2, len(axes)):
        axes[j].axis("off")
    
    plt.tight_layout()
    plt.show()

    # Display the preprocessed document
    plt.figure(figsize=(8, 6))
    plt.text(0.5, 0.5, preprocessed_doc, fontsize=12, wrap=True, ha='center', va='center', multialignment='left')
    plt.axis("off")
    plt.title(f"{video_id}_preprocessedDoc", fontsize=16)
    plt.show()

# Function to display raw LDA results
def display_raw_lda_results(lda_model):
    """
    Displays raw topic-word distributions from the LDA model.
    
    Args:
        lda_model: Trained LDA model.
    """
    print("\nRaw LDA Topic-Word Distributions:")
    topics = lda_model.show_topics(formatted=False)
    for t, words in topics:
        print(f"Topic {t}: {words}")

# Function to display LDA evaluation metrics
def display_lda_evaluation_metrics(lda_model, corpus, dictionary, preprocessed_doc):
    """
    Displays evaluation metrics for the LDA model.
    
    Args:
        lda_model: Trained LDA model.
        corpus: Gensim corpus.
        dictionary: Gensim dictionary.
        preprocessed_doc: Preprocessed document content.
    """
    # Calculate perplexity
    perplexity = lda_model.log_perplexity(corpus)
    print(f"\nPerplexity: {perplexity}")
    
    # Calculate coherence score
    coherence_model = CoherenceModel(model=lda_model, texts=[preprocessed_doc.split()], dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    print(f"Coherence Score: {coherence_score}")

# Dictionary to store preprocessed documents mapped to video IDs
preprocessed_docs = {}

# LDA topic modeling
lda_models = []
topic_vectors = []

# Function to filter repeated words across topics
def filter_repeated_words_across_topics(lda_model, num_words=20):
    """
    Filters repeated words across topics for a single document.
    Ensures each word appears in only one topic.

    Args:
        lda_model: Trained LDA model for a document.
        num_words: Number of top words to consider per topic.

    Returns:
        filtered_topics: List of filtered topics where words don't repeat.
    """
    topics = lda_model.show_topics(num_words=num_words, formatted=False)
    used_words = set()
    filtered_topics = []

    for t, words in topics:
        filtered_words = []
        for word, prob in words:
            if word not in used_words:
                filtered_words.append((word, prob))
                used_words.add(word)
        filtered_topics.append((t, filtered_words))

    return filtered_topics

In [68]:
for i, doc in enumerate(documents):
    video_id = file_names[i].split('_captions')[0]
    print(f"Processing file: {file_names[i]}")

    preprocessed_doc = preprocess_text_with_tags(doc, video_id)
    preprocessed_docs[video_id] = preprocessed_doc

    if len(preprocessed_doc) > 0:
        dictionary = corpora.Dictionary([preprocessed_doc.split()])
        corpus = [dictionary.doc2bow(preprocessed_doc.split())]
        
        num_topics = dynamic_num_topics(preprocessed_doc, words_per_topic=200, min_topics=1, max_topics=5)
        #print(f"Number of topics for {video_id}: {num_topics}")

        lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, eta=0.01, alpha='auto', passes=100)
        lda_models.append(lda_model)

        topic_distribution = lda_model.get_document_topics(corpus[0], minimum_probability=0)
        topic_vector = [prob for _, prob in topic_distribution]
        topic_vectors.append(topic_vector)

# Main loop to process and display results
for i, lda_model in enumerate(lda_models):
    video_id = file_names[i].split('_captions')[0]
    preprocessed_doc = preprocessed_docs[video_id]
    
    print(f"\nResults for {video_id}:")

    # Display topic-word distributions (filtered vs non-filtered)
    display_topic_word_distributions(lda_model)
    
    # Display evaluation metrics
    dictionary = corpora.Dictionary([preprocessed_doc.split()])
    corpus = [dictionary.doc2bow(preprocessed_doc.split())]
    display_lda_evaluation_metrics(lda_model, corpus, dictionary, preprocessed_doc)
    
    # Display WordClouds in a grid layout
    # display_wordclouds_grid(lda_model, video_id)

Processing file: W9OyX8n8cRk - _1 Halo-Halo In Rural Siargao Philippines ðŸ_µðŸ__.txt
Processing file: 8DMKzuGsBao - Foreigners Try Filipino Food_ðŸ_µðŸ__ PINOY SPAGHETTI _ LUMPIANG SHANGHAI - BEST Food in the Philippines.txt
Processing file: EKNuv99XeCA - Chinese guy_s reaction to Sassa Dagdag covers La Vie En Rose LIVE on Wish 107_5 Bus.txt
Processing file: 3vouclvykqE - Jeremy Scahill on Trump_s Embrace of Duterte_s Deadly War on Drugs in the Philippines.txt
Processing file: EUkFRzflw20 - My Brother And I Will Be Trying 3 Exciting Pinakbet Variations.txt
Processing file: 3DJpy-yikrc - MOST DANGEROUS CITY IN ASIA_ (Manila Philippines).txt
Processing file: lH-ebkL3l9w - Americans React to Philippines 101_ Filipino Drinking Etiquette.txt
Processing file: pikBCAGvTQQ - Our FIRST DAY in EL NIDO PALAWAN_ ðŸ_µðŸ__ We were IMPRESSED_.txt
Processing file: ubO784YDsN0 - _Filipino Subs_ What REALLY happened when Justin (SB19) Replied to us on Twitter.txt
Processing file: r_sGpp6plHQ - COOKING 

In [74]:
import numpy as np
import umap
import hdbscan
from collections import defaultdict
from collections import Counter

# Ensure all topic vectors have the same length by padding with zeros
max_topics = max(len(vec) for vec in topic_vectors)
topic_matrix = np.array([vec + [0] * (max_topics - len(vec)) for vec in topic_vectors])

# Reduce dimensions using UMAP
umap_reducer = umap.UMAP(n_neighbors=3, n_components=2)
reduced_embeddings = umap_reducer.fit_transform(topic_matrix)

# Apply HDBSCAN clustering
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=5, cluster_selection_epsilon=0.1)
cluster_labels = hdbscan_clusterer.fit_predict(reduced_embeddings)

# Store clustering results
clustered_videos = defaultdict(list)
for i, cluster in enumerate(cluster_labels):
    video_id = file_names[i].split('_captions')[0]
    clustered_videos[cluster].append(video_id)

# Print cluster info
print("\nVideos grouped into clusters:")
for cluster, videos in clustered_videos.items():
    if cluster == -1:
        print(f"Cluster {cluster} (Outliers): {len(videos)} videos")
    else:
        print(f"Cluster {cluster}: {len(videos)} videos")

# Extract representative keywords for each cluster
cluster_keywords = {}

for cluster, videos in clustered_videos.items():
    if cluster == -1:  # Skip outliers
        continue

    word_counts = Counter()
    for video_id in videos:
        try:
            video_index = file_names.index(f"{video_id}")
            lda_model = lda_models[video_index]

            # Extract top words from topics
            for topic in lda_model.show_topics(num_words=10, formatted=False):
                for word, prob in topic[1]:
                    word_counts[word] += prob  # Weight by probability

        except ValueError:
            print(f"Warning: Skipping {video_id}, file not found.")

    # Select top representative words for the cluster
    cluster_keywords[cluster] = [word for word, _ in word_counts.most_common(10)]

# Print cluster keywords
print("\nCluster Keywords:")
for cluster, keywords in cluster_keywords.items():
    print(f"Cluster {cluster} Keywords: {', '.join(keywords)}")



Videos grouped into clusters:
Cluster 2: 10 videos
Cluster 0: 19 videos
Cluster 5: 35 videos
Cluster 3: 7 videos
Cluster 4: 11 videos
Cluster 1: 7 videos
Cluster -1 (Outliers): 5 videos

Cluster Keywords:
Cluster 2 Keywords: lechon, voice, pinakbet, christmas, morissette, lady, spicy, dish, cebu, mix
Cluster 0 Keywords: market, fish, nido, sauce, horse, beer, street, chicken, mango, mall
Cluster 5 Keywords: voice, song, pork, time, sauce, ferry, school, title, rice, phone
Cluster 3 Keywords: salad, trump, egg, duterte, chicken, street, sort, health, insurance, manila
Cluster 4 Keywords: lechon, city, christmas, comfort, beef, business, dish, island, manila, school
Cluster 1 Keywords: sapin, history, month, pun, rice, chicken, war, sisig, crispy, twitter


In [None]:
import os
import random
import requests
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import FastText,  CoherenceModel
import numpy as np
import spacy
import string
from wordcloud import WordCloud
import unicodedata
import matplotlib.pyplot as plt
from collections import Counter
from nltk.util import ngrams
from collections import defaultdict

# Load FastText Model
#fasttext_model = FastText.load("fasttext_300dim_20epochs.model")

def load_filipino_context_words():
    """Load Filipino-related words from a file."""
    with open("filipino_context_words.txt", "r", encoding="utf-8") as file:
        return [line.strip() for line in file.readlines()]

#filipino_context_words = load_filipino_context_words()

# Folder paths
transcripts_folder_path = 'standard_dataset/'
tags_folder_path = 'tags/'

# Function to load random documents
def load_random_documents(folder_path, sample_size=100):
    all_files = [file_name for file_name in os.listdir(folder_path) if file_name.endswith('.txt')]
    sampled_files = random.sample(all_files, min(sample_size, len(all_files)))

    documents = []
    file_names = []
    for file_name in sampled_files:
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
            content = file.read().lower()
            if len(content.split()) >= 100:
                documents.append(content)
                file_names.append(file_name)
    return documents, file_names

documents, file_names = load_random_documents(transcripts_folder_path)

# Extract video IDs from fetched transcript filenames
video_ids = {file_name.split("_")[0] for file_name in file_names}

# Function to load video tags only for fetched video IDs
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  # Store as list of words
    return video_tags

video_tags = load_video_tags(tags_folder_path, video_ids)

# Fetch stopwords
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    return set(response.text.splitlines())

github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

stop_words = set(stopwords.words('english'))
custom_stop_words = ['like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go', 'right', 'okay', 'well', 'said', 
                     'going', 'got', 'na', 'always', 'every', 'each', 'say', 'el', 'little', 'still', 
                     'best', 'dutch', 'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow' ]
broad_terms = ['philippines', 'philippine', 'british', 'filipino', 'video', 'http', 'korea', 'korean', 
               'youtube', 'google', 'united', 'america', 'american']
kpop_keywords = ['kpop', '필리핀', 'bts', 'blackpink', 'twice', 'exo', 'k-pop', 'seventeen', 
                 'stray kids', 'nct', 'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook']
more_keywords = [
    'breaking news', 'report', 'coverage', 'investigation', 'interview', 'documentary', 'news', 'netherlands', 'psy', 'subtitle', 'description', 'link', 
    'journalist', 'headline', 'reporter', 'current events', 'special report', 
    'analysis', 'documented', 'broadcast', 'reporting', 'v', 'food', 'travel', 'react', 
    'reacts', 'reaction', 'foreigner', 'thing', 'visit', 'dc', 'japan', 'first', 'fast', 
    'asia', 'ang', 'indian', 'thai', 'vietnamese', 'russia', 'gon', 'canada', 'canadian', 'russian', 
    'russia', 'guy', 'lot', 'bit', 'diba', 'ola', 'cuz', 'thai', 'thailand', 'person', 'citizen', 'foreigner', 'foreign', 'foreigners',
    'facebook', 'filipinos', 'filipinas', 'vlog', 'vlogs', 'vlogging', 'hashtag', 'india', 'bro', 'dito', 'people', 'time', 'music', 'gonna', 'life', 
    'lol', 'guys', 'tho', 'cute', 'hmm', 'huh', 'channel', 'subscribe', 'day6', 'mandarin', 'chinese', 'beautiful',
    'chuckles', 'fbe', 'hit', 'laughs', 'yo', 'ka', 'word', 'living', 'boi', 'minimum', 'ya', 'successful', 'perfectly', 'yeap', 
    'wondering', 'fantastic', 'hurry', 'german', 'age', 'country', 'subscribing', 'bluesy', 'jump', 'pretty', 'understanding', 'personalized',
    'and', 'the', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'up', 'about', 'over', 'into', 'through', 'between', 'under', 'against', 'all',
    'you', 'haha', 'hahaha', 'ha', 'hey', 'bye', 'hello', 'hi', 'oh', 'blah', 'easy', 'alright', 'ta', 'day', 'ooh', 'en', 'do', 'lot', 'comment', 'notification', 
    'korean', 'jjajangmyeon', 'jajangmyeon', 'damn', 'yall', 'month', 'week', 'year', 'ohhh', 'pvf', 'dude', 'mmm', 'kagilagilalas', 'ofcourse', 'australia', 'uxo', 
    'atleast', 'yusuf', 'bangkok', 'ot', 'anytime', 'allover', 'kala', 'nope', 'wan', 'brazil', 'smooth', 'ot', 'timeshere', 'batchof', 'yep', 'opo', 'del',
    'gosh', 'po', 'ourself', 'wo', 'wait', 'ugh', 'nyc', 'whoa', 'nicaragua', 'yup', 'em', 'bout', 'le', 'omg', 'overwhelm', 'maam', 'nicer', 'haha', 'hahaha', 'ha', 
    'nbcs', 'lana', 'rc', 'whatsoever', 'oxy', 'decade', 'whyd', 'unknown', 'ahhhhh', 'ohoh', 'ohto', 'ohhhh', 'bruh', 'ooe', 'ahmedabad', 'mexico', 
    'understand', 'excuse', 'kinda', 'applause', 'oooh', 'thiswhat', 'nevermind', 'ahh', 'againthank', 'toto', 'aww', 'nah', 'bbmas', 'ay', 'op', 'huh', 'huhu',
    'tada', 'beacuse', 'voila', 'upstairs', 'thatswhy', 'yea', 'that', 'armenia', 'or', 'not', 'funwhat', 'aka', 'armeniathat', 'woosexy', 'worth', 'laugh', 'box', 
    'xd', 'vb', 'eff', 'ananya', 'welsh', 'latron', 'shout', 'whatwhat', 'what', 'pause', 'why', 'thats', 'byebye', 'iv', 'bye', 'ado', 'ownup', 'dom', 'jomm', 'sir', 
    'budgie', 'nomac', 'lavocha', 'germany', 'why', 'walang', 'superduper', 'philip', 'mom', 'jre', 'giddy', 'intro', 'dupe', 'europe', 'dream', 'team', 'dislike', 'content', 
    'yoongi', 'royale', 'ilu', 'jhope', 'day', 'jin', 'ecc', 'nyhs', 'nego', 'chavez', 'pb', 'everyones', 'epic', 'matter', 'oneonone', 'region', 'change', 'ho', 'seetoh', 
    'atin', 'vpn', 'facetune', 'busu', 'mackie', 'clyd', 'china', 'rest', 'friend', 'woah', 'dindins', 'poster', 'vibe', 'woman', 'boss', 'woah', 'type', 'mahana', 'joke', 
    'taller', 'insane', 'whang', 'psa', 'manatee', 'recommend', 'caesar', 'mmmhmm', 'mosul', 'dun', 'clue', 'naysayer', 'hindi', 'ko', 'pero', 'bulgaria', 'question', 'video', 
    'yobi', 'hindu', 'expat', 'option', 'gap', 'eu', 'simo', 'kouignamann', 'bct', 'month', 'cfo'
]
stop_words.update(custom_stop_words, kpop_keywords, broad_terms, more_keywords, github_stopwords)

lemmatizer = WordNetLemmatizer()

# Helper function to check if a word contains only Latin script characters
def is_latin_script(word):
    for char in word:
        if not ('LATIN' in unicodedata.name(char, '') or char.isdigit()):
            return False
    return True

def preprocess_text(doc, video_id, ngram_range=(1, 2), tag_weight=1, filipino_weight=2, ngram_weight_factor=1):
    """Preprocess text while considering Filipino context and video tags."""
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(doc)
    tokens_with_pos = pos_tag(tokens)

    # Apply lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tokens_with_pos]

    # Generate n-grams
    ngram_tokens = []
    for n in range(ngram_range[0], ngram_range[1] + 1):
        ngram_tokens.extend([' '.join(gram) for gram in ngrams(lemmatized_tokens, n)])

    # Count n-gram frequency
    ngram_frequencies = Counter(ngram_tokens)

    # Filter stopwords and non-useful words
    filtered_tokens = []
    for word, pos in tokens_with_pos:
        if not word.isalpha() or word in stop_words:
            continue
        if pos in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:
            continue  # Remove verbs and adjectives
        filtered_tokens.append(word)

    # Assign weight based on n-grams and Filipino similarity
    weighted_tokens = []
    for token in filtered_tokens:
        token_weight = 1 + ngram_frequencies.get(token, 0) * ngram_weight_factor
        weighted_tokens.extend([token] * int(token_weight))

    # Include video tags
    # if video_id in video_tags:
    #     tags = video_tags[video_id]
    #     for tag in tags:
    #         if tag.isalpha():
    #             tag_weight_multiplier = tag_weight
    #             #filipino_similarity = get_filipino_similarity(tag)
    #             #if filipino_similarity > 0.3:
    #                 #tag_weight_multiplier *= filipino_weight * filipino_similarity
    #             weighted_tokens.extend([tag] * int(tag_weight_multiplier))

    return ' '.join(weighted_tokens)

# def get_filipino_similarity(word):
#     """Compute similarity to Filipino words using FastText."""
#     if word in filipino_context_words:
#         return 1
#     try:
#         similarities = [fasttext_model.wv.similarity(word, ref_word) for ref_word in filipino_context_words if ref_word in fasttext_model.wv]
#         return max(similarities) if similarities else 0
#     except KeyError:
#         return 0
    
def get_wordnet_pos(treebank_tag):
    """Convert POS tag to WordNet format for lemmatization."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun
    
# Load and preprocess all transcripts together
all_documents = []
video_ids = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
            content = file.read().lower()
            if len(content.split()) >= 100:  # Minimum word count check
                processed_text = preprocess_text(content, video_id)
                all_documents.append(processed_text)
                video_ids.append(video_id)

# Create Dictionary and Corpus for LDA
dictionary = corpora.Dictionary([doc.split() for doc in all_documents])
corpus = [dictionary.doc2bow(doc.split()) for doc in all_documents]

# Train LDA on all transcripts
num_topics = 7  # Adjust based on coherence scoring if needed
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, eta='auto', alpha='auto', passes=100)

# Compute Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=[doc.split() for doc in all_documents], dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

# Dictionary to store the number of videos per topic
topic_video_count = defaultdict(int)
video_topic_mapping = {}

for idx, doc_bow in enumerate(corpus):
    video_id = video_ids[idx]
    topic_distribution = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    assigned_topics = [topic for topic, prob in topic_distribution if prob > 0.2]  # Threshold can be tuned
    video_topic_mapping[video_id] = assigned_topics
    for topic in assigned_topics:
        topic_video_count[topic] += 1

# Print results
print("\nVideo-Topic Mapping:")
for video_id, topics in video_topic_mapping.items():
    print(f"Video {video_id} belongs to topics: {topics}")

print("\nNumber of videos per topic:")
for topic, count in sorted(topic_video_count.items()):
    print(f"Topic {topic}: {count} videos")

# Topic-Word Distribution
print("\nTopic-Word Distribution:")
topic_word_distribution = {}
for topic_id in range(num_topics):
    words = lda_model.show_topic(topic_id, topn=15)
    topic_word_distribution[topic_id] = words
    print(f"\nTopic {topic_id}:")
    for word, prob in words:
        print(f"  {word}: {prob:.4f}")

# Representative words per topic
print("\nRepresentative Words Per Topic:")
for topic_id, words in topic_word_distribution.items():
    representative_words = [word for word, _ in words[:5]]  # Take the top 5 words
    print(f"Topic {topic_id}: {', '.join(representative_words)}")


# Print Coherence Score
print("\nCoherence Score (C_v): {:.4f}".format(coherence_score))


Video-Topic Mapping:
Video -U5-r29G-t0 - First time trying Filipino Street Food_.txt belongs to topics: [5]
Video 27yofdZnH9s - School Life in Japan _ What do you think about Filipinos_ _ VLOG_1.txt belongs to topics: [0, 3]
Video 3DJpy-yikrc - MOST DANGEROUS CITY IN ASIA_ (Manila Philippines).txt belongs to topics: [0]
Video 3vouclvykqE - Jeremy Scahill on Trump_s Embrace of Duterte_s Deadly War on Drugs in the Philippines.txt belongs to topics: [1]
Video 4DEqwJI-vdg - TRYING JOLLIBEE FOR THE FIRST TIME IN THE PHILIPPINES ðŸ_µðŸ__ (Jollibee Mukbang in Manila).txt belongs to topics: [4]
Video 57TY0y5zpe0 - SB19 MAPA FILIPINO High School GRADUATION song __ LATINA REACTS.txt belongs to topics: [0]
Video 6lZVXJlqN7s - FIRST DAY in MANILA ðŸ_µðŸ__ Searching Paradise In THE PHILIPPINES.txt belongs to topics: [0, 5]
Video 8d8tfV1uBn4 - Baon Kainan is Portland_s newest Filipino food cart _ Here is Oregon.txt belongs to topics: [5]
Video 8DMKzuGsBao - Foreigners Try Filipino Food_ðŸ_µðŸ__ PIN

In [3]:
# Representative words per topic
print("\nRepresentative Words Per Topic:")
for topic_id, words in topic_word_distribution.items():
    representative_words = [word for word, _ in words[:20]]  # Take the top 5 words
    print(f"Topic {topic_id}: {', '.join(representative_words)}")



Representative Words Per Topic:
Topic 0: coffee, city, island, manila, park, nido, school, market, soup, cebu, fruit, theme, starbucks, bulalo, song
Topic 1: trump, duterte, president, jeremy, ferry, scahill, house, phone, shop, war, drug, obama, mango, experience, gonzález
Topic 2: lady, voice, stuff, horse, hour, egg, beer, skin, fish, incident, nation, morissette, happen, mall, reason
Topic 3: mango, island, history, arena, health, village, mix, halo, insurance, ice, war, lumpia, water, drinking, rice
Topic 4: rice, burger, sort, chicken, spicy, calamansi, pun, beef, sauce, sapin, dish, flavour, singapore, taste, vinegar
Topic 5: salad, jollibee, pinakbet, street, dish, god, market, sauce, manila, sweet, pork, birthday, culture, cream, taste
Topic 6: chicken, christmas, lechon, dish, comfort, boracay, rice, mall, pig, menudo, airport, pancit, meat, cebu, hotel
