In [None]:
# import csv
# import os

# # Paths to output directories
# titles_dir = 'video_titles'
# tags_dir = 'video_tags'

# # Create output directories if they don't exist
# os.makedirs(titles_dir, exist_ok=True)
# os.makedirs(tags_dir, exist_ok=True)

# # Input CSV file path
# csv_file_path = 'Final_PB_Videos.csv'  # Replace with your actual file name

# # Open and read the CSV
# with open(csv_file_path, newline='', encoding='utf-8') as csvfile:
#     reader = csv.reader(csvfile)
#     header = next(reader)  # Skip header row

#     for row in reader:
#         video_id = row[0].strip()
#         video_title = row[1].strip()
#         raw_tags = row[3].strip()

#         # Clean tag string
#         cleaned_tags = raw_tags.strip("[]")  # remove square brackets
#         cleaned_tags = cleaned_tags.replace("'", "").replace('"', "")  # remove quotes
#         cleaned_tags = ", ".join([tag.strip() for tag in cleaned_tags.split(",") if tag.strip()])

#         # Write the title to a file
#         with open(os.path.join(titles_dir, f'{video_id}.txt'), 'w', encoding='utf-8') as title_file:
#             title_file.write(video_title)

#         # Write the cleaned tags to a file
#         with open(os.path.join(tags_dir, f'{video_id}.txt'), 'w', encoding='utf-8') as tags_file:
#             tags_file.write(cleaned_tags)


In [None]:
# import os
# from langdetect import detect, LangDetectException

# # Set the path to your folder with transcripts
# transcripts_dir = 'T3_Transcripts/'  # e.g., 'transcripts/'

# non_english_files = []

# for filename in os.listdir(transcripts_dir):
#     if filename.endswith('.txt'):
#         file_path = os.path.join(transcripts_dir, filename)

#         try:
#             with open(file_path, 'r', encoding='utf-8') as file:
#                 text = file.read().strip()

#                 if not text:
#                     non_english_files.append((filename, 'EMPTY'))
#                 else:
#                     lang = detect(text)
#                     if lang != 'en':
#                         non_english_files.append((filename, lang))

#         except LangDetectException:
#             non_english_files.append((filename, 'UNDETECTABLE'))
#         except Exception as e:
#             non_english_files.append((filename, f'ERROR: {str(e)}'))

# # Print the collected results
# print("=== Non-English / Empty / Undetectable Files ===")
# for fname, reason in non_english_files:
#     print(f"{fname}: {reason}")

In [None]:
# import langid

# non_english_files = []

# for filename in os.listdir(transcripts_dir):
#     if filename.endswith('.txt'):
#         file_path = os.path.join(transcripts_dir, filename)

#         try:
#             with open(file_path, 'r', encoding='utf-8') as file:
#                 text = file.read().strip()

#                 if not text:
#                     non_english_files.append((filename, 'EMPTY'))
#                 else:
#                     lang, confidence = langid.classify(text)

#                     # If not English (en), record it
#                     if lang != 'en':
#                         non_english_files.append((filename, f'{lang} ({confidence:.2f})'))

#         except Exception as e:
#             non_english_files.append((filename, f'ERROR: {str(e)}'))

# # Print the results
# print("=== Non-English / Empty / Problematic Files ===")
# for fname, reason in non_english_files:
#     print(f"{fname}: {reason}")


In [4]:
import os
import string
import unicodedata
import requests
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk import pos_tag
from nltk.util import ngrams
from collections import Counter, defaultdict
from itertools import product, combinations

In [5]:
stop_words = set(stopwords.words("english"))

# Fetch stopwords
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    return set(response.text.splitlines())

github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

stop_words = set(stopwords.words('english'))

with open("stop_words.txt", "r", encoding="utf-8") as file:
    custom_stop_words = set(file.read().splitlines())

stop_words.update(custom_stop_words, github_stopwords)

# Folder paths
transcripts_folder_path = 'T3_Transcripts/'

tags_folder_path = 'video_tags/'
titles_folder_path = 'video_titles/'

# Function to load video tags only for fetched video IDs
def load_video_tags(folder_path, video_ids):
    video_tags = {}
    for video_id in video_ids:
        tag_file = os.path.join(folder_path, f"{video_id}.txt")
        if os.path.exists(tag_file):
            with open(tag_file, "r", encoding="utf-8") as file:
                tags_content = file.read().lower()
                video_tags[video_id] = tags_content.split()  # Store as list of words
        else:
            video_tags[video_id] = []  # Default to empty list if no tags
    return video_tags

# Function to load video titles
def load_video_titles(folder_path, video_ids):
    video_titles = {}
    for video_id in video_ids:
        title_file = os.path.join(folder_path, f"{video_id}")
        if os.path.exists(title_file):
            with open(title_file, "r", encoding="utf-8") as file:
                video_titles[video_id] = file.read().strip()  # Read full title
        else:
            video_titles[video_id] = "Unknown Title"  # Default if no title file
    return video_titles

video_ids = []
transcript_files = []
for file_name in os.listdir(transcripts_folder_path):
    if file_name.endswith('.txt'):
        video_id = file_name.split('_captions')[0]
        video_ids.append(video_id)
        transcript_files.append((video_id, file_name)) 

video_tags = load_video_tags(tags_folder_path, video_ids)
video_titles = load_video_titles(titles_folder_path, video_ids)

## Helper Functions

In [6]:
def is_latin_script(word):
    return all('LATIN' in unicodedata.name(char, '') or char.isdigit() for char in word)

# Function to detect both bigram and trigram collocations
def detect_collocations(tokens, min_freq=2):
    bigram_measures = BigramAssocMeasures()
    trigram_measures = TrigramAssocMeasures()

    # Find bigrams
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigram_finder.apply_freq_filter(min_freq)
    bigrams = set(['_'.join(bigram) for bigram in bigram_finder.nbest(bigram_measures.pmi, 10)])

    # Find trigrams
    trigram_finder = TrigramCollocationFinder.from_words(tokens)
    trigram_finder.apply_freq_filter(min_freq)
    trigrams = set(['_'.join(trigram) for trigram in trigram_finder.nbest(trigram_measures.pmi, 10)])

    return bigrams, trigrams

def is_valid_ngram(ngram, existing_ngrams):
    """ Check if the n-gram contains alternating or duplicate words """
    words = ngram.split('_')
    unique_words = set(words)
    
    if len(unique_words) == 1:
        return False  
    
    if len(words) > 2 and words[0] == words[2]:  
        return False  
    
    if len(words) > 2 and words[0] == words[1]:
        return False

    if len(words) == 3 and words[1] == words[2]:
        return False 
    
    if len(words) == 2:
        for existing_ngram in existing_ngrams:
            if ngram in existing_ngram:
                return False  
    
    return True

def preprocess_text(doc, video_id, tag_weight=2, ngram_weight_factor=2):
    # Clean punctuation at the end of words
    doc = re.sub(r'([a-zA-Z]+)[,;:!?.]', r'\1', doc)

    # Lowercase and remove punctuation
    doc = doc.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(doc)

    # Remove stopwords & non-latin words
    tokens = [word for word in tokens if word not in stop_words and word.isalpha() and is_latin_script(word)]

    # POS tagging
    tokens_with_pos = pos_tag(tokens)

    # Remove verbs and adjectives before n-gram detection
    pos_exclude = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
    filtered_tokens = [word for word, pos in tokens_with_pos if pos not in pos_exclude]

    # Include video tags in filtered tokens
    if video_id in video_tags:
        tags = video_tags[video_id]
        cleaned_tags = []
        
        for tag in tags:
            tag = re.sub(r'^[,;:!?.\'\"]*([a-zA-Z]+)[,;:!?.\'\"]*$', r'\1', tag)
            tag = tag.lower().translate(str.maketrans('', '', string.punctuation))  
            if tag.isalpha() and tag not in stop_words and is_latin_script(tag):  
                cleaned_tags.append(tag)

        filtered_tokens.extend(cleaned_tags)  

    # Detect meaningful bigram and trigram collocations (tags are included)
    bigrams, trigrams = detect_collocations(filtered_tokens)

    # Generate n-grams
    bigram_tokens = ['_'.join(gram) for gram in ngrams(filtered_tokens, 2)]
    trigram_tokens = ['_'.join(gram) for gram in ngrams(filtered_tokens, 3)]

    # Remove invalid n-grams (duplicates, alternating patterns)
    bigram_tokens = [bigram for bigram in bigram_tokens if is_valid_ngram(bigram, set())]
    trigram_tokens = [trigram for trigram in trigram_tokens if is_valid_ngram(trigram, bigram_tokens)]

    # Keep only meaningful n-grams from detect_collocations()
    bigram_tokens = [bigram for bigram in bigram_tokens if bigram in bigrams]
    trigram_tokens = [trigram for trigram in trigram_tokens if trigram in trigrams]

    # Remove n-grams that are just reordered versions
    unique_ngrams = set()
    filtered_bigrams = []
    filtered_trigrams = []

    for bigram in bigram_tokens:
        sorted_bigram = '_'.join(sorted(bigram.split('_')))
        if sorted_bigram not in unique_ngrams:
            unique_ngrams.add(sorted_bigram)
            filtered_bigrams.append(bigram)

    for trigram in trigram_tokens:
        sorted_trigram = '_'.join(sorted(trigram.split('_')))
        if sorted_trigram not in unique_ngrams:
            unique_ngrams.add(sorted_trigram)
            filtered_trigrams.append(trigram)
    
    # Count n-gram frequency
    bigram_frequencies = Counter(filtered_bigrams)
    trigram_frequencies = Counter(filtered_trigrams)

    # Merge n-grams into single tokens
    bigram_trigram_words = set()
    merged_tokens = []
    i = 0
    while i < len(filtered_tokens) - 2:  # Check for trigrams first
        trigram = f"{filtered_tokens[i]}_{filtered_tokens[i+1]}_{filtered_tokens[i+2]}"
        bigram = f"{filtered_tokens[i]}_{filtered_tokens[i+1]}"

        if trigram in filtered_trigrams:
            merged_tokens.append(trigram)
            bigram_trigram_words.add(trigram)
            i += 3  # Skip next two words since it's part of the trigram
        elif bigram in filtered_bigrams:
            merged_tokens.append(bigram)
            bigram_trigram_words.add(bigram)
            i += 2  # Skip next word since it's part of the bigram
        else:
            merged_tokens.append(filtered_tokens[i])
            i += 1
    
    # Append any remaining words
    while i < len(filtered_tokens):
        merged_tokens.append(filtered_tokens[i])
        i += 1

    # Store n-gram components to remove single tokens later
    ngram_components = set(word for bigram in bigram_tokens for word in bigram.split('_'))
    ngram_components.update(word for trigram in trigram_tokens for word in trigram.split('_'))

    # Remove single tokens if they appear in any n-gram
    filtered_tokens = [word for word in filtered_tokens if word not in ngram_components]

    # Remove duplicates before assigning weight
    unique_tokens = list(set(merged_tokens))

    # Assign weight based on n-gram occurrence
    weighted_tokens = []
    for token in unique_tokens:
        if token in trigram_frequencies:
            token_weight = trigram_frequencies[token] * 2 + ngram_weight_factor  
        elif token in bigram_frequencies:
            token_weight = bigram_frequencies[token] * ngram_weight_factor
        else:
            token_weight = 1
        weighted_tokens.extend([token] * int(token_weight))

    # # Include video tags with weight
    # if video_id in video_tags:
    #     for tag in cleaned_tags:
    #         weighted_tokens.extend([tag] * int(tag_weight))

    return ' '.join(weighted_tokens), list(bigram_trigram_words)

def topic_diversity(model, top_n=10):
    topic_words = [set([word for word, _ in model.show_topic(topic_id, top_n)]) for topic_id in range(model.num_topics)]
    unique_words = set().union(*topic_words)
    return len(unique_words) / (top_n * len(topic_words))

def jaccard_similarity(topic1, topic2):
    return len(set(topic1) & set(topic2)) / len(set(topic1) | set(topic2))

def avg_jaccard_similarity(model, num_words=10):
    topics = [set([word for word, _ in model.show_topic(topic_id, num_words)]) for topic_id in range(model.num_topics)]
    similarities = [jaccard_similarity(t1, t2) for t1, t2 in combinations(topics, 2)]
    return sum(similarities) / len(similarities)

## Fetching Transcripts

In [7]:
all_documents = []
preprocessed_text = []
bigram_trigram_text = {}

for video_id, file_name in transcript_files:
    with open(os.path.join(transcripts_folder_path, file_name), 'r', encoding='utf-8') as file:
        content = file.read().lower()
        if len(content.split()) >= 100: 
            processed_text, bigram_trigram = preprocess_text(content, video_id)
            preprocessed_text.append((video_id, processed_text))
            all_documents.append(processed_text)
            bigram_trigram_text[video_id] = bigram_trigram

all_tokens = [token for doc in all_documents for token in doc.split()]
token_freq = Counter(all_tokens)

high_freq_tokens = {token for token, freq in token_freq.items() if freq > 200 and '_' not in token}

filtered_documents_with_id = []
for video_id, doc in preprocessed_text:
    filtered_doc = [token for token in doc.split() if token not in high_freq_tokens]
    filtered_documents_with_id.append((video_id, filtered_doc))  

filtered_documents_only = [doc for _, doc in filtered_documents_with_id]

filtered_documents = []
for doc in all_documents:
    filtered_doc = [token for token in doc.split() if token not in high_freq_tokens]
    filtered_documents.append(filtered_doc)


dictionary = corpora.Dictionary(filtered_documents)
corpus = [dictionary.doc2bow(doc) for doc in filtered_documents]

In [8]:
print("\nPreprocessed Text Per Video:")
for video_id, processed_text in preprocessed_text:
    video_title = video_titles.get(video_id, "Unknown Title") 
    print(f"\nVideo ID: {video_id} | {video_title}\n- {processed_text}\n")


Preprocessed Text Per Video:

Video ID: --8n6A8Q6M0.txt | $200 Luxury Beach Hotel in The Philippines 🇵🇭
- smells conditioner karachi cools hows agile sky idea masarap resort level crazy doubt camouflage cent summer ride margarita rock service breakfast woohoo doors lobby orange modala flight power balcony nippers comfy strength worm difference_beginner difference_beginner daddy path nap souvenir bread paratha chocolate_muffin chocolate_muffin sweet juice tarzan approve experience workstation greenery bathrobe beginner difference dollar attabad crabs options weekday god fingers volleyball chop destination bacon singlet rocks sleep swim brand tricycle masks waist mall boat ingredients joseph presume donut bathroom switch hang smell quick moadto spending beer_bottle beer_bottle layers watch luke sugar shots bar moadto_strip moadto_strip hot ski bull turquoise map mini kilometers camera mark context cloud formations pink bird license skip bliss waters holes acceleration jeepney gathering 

In [9]:
sorted_high_freq_tokens = sorted(high_freq_tokens, key=lambda token: token_freq[token], reverse=True)
for token in sorted_high_freq_tokens:
    print(token + ":", token_freq[token])


god: 1003
water: 882
fun: 878
videos: 851
family: 771
times: 771
morning: 769
watch: 767
house: 760
money: 740
idea: 733
friends: 731
hope: 728
experience: 700
rice: 687
head: 611
stay: 599
manila: 586
reason: 584
kids: 554
street: 552
culture: 545
moment: 532
care: 530
chicken: 529
couple: 529
job: 526
mind: 519
road: 515
school: 509
walk: 504
baby: 503
pesos: 488
meet: 482
camera: 471
heard: 467
island: 465
car: 458
girl: 435
restaurant: 435
hand: 435
pay: 430
feels: 428
coffee: 426
story: 424
sauce: 421
heart: 419
sense: 413
trip: 403
song: 403
sit: 391
beach: 380
market: 376
pork: 370
play: 370
air: 368
meat: 367
spicy: 363
mix: 359
difference: 359
business: 353
phone: 353
color: 352
support: 349
plan: 346
crazy: 345
todays: 343
view: 341
piece: 337
drink: 335
cook: 332
spot: 332
eyes: 332
pick: 331
size: 325
dinner: 324
forget: 323
language: 321
breakfast: 317
picture: 313
cut: 313
store: 305
peace: 305
egg: 304
lunch: 304
version: 301
restaurants: 300
girls: 300
game: 295
fan: 29

In [10]:
token_freq_dict = {dictionary[id]: freq for id, freq in dictionary.cfs.items()}

print("Top 100 Most Frequent Tokens:")
sorted_tokens = sorted(token_freq_dict.items(), key=lambda x: x[1], reverse=True)
for token, freq in sorted_tokens[:1000]:
    print(f"{token}: {freq}")


Top 100 Most Frequent Tokens:
ice_cream: 202
bite: 200
soup: 197
vinegar: 196
salty: 196
catch: 194
salt: 194
pop: 193
songs: 193
adventure: 193
cebu: 192
banana: 191
snack: 190
mother: 190
bread: 189
beaches: 188
fish: 187
nature: 186
heat: 185
class: 185
rock: 183
service: 183
flavors: 183
airport: 180
tea: 180
church: 179
government: 179
ingredients: 178
meeting: 178
dessert: 178
war: 176
add: 174
cars: 173
card: 172
snacks: 171
dish: 171
flight: 170
buildings: 170
shoes: 170
mountain: 170
tourists: 169
security: 169
bike: 168
challenge: 168
barbecue: 167
stories: 166
options: 165
prices: 164
wall: 164
summer: 163
tourist: 163
chili: 163
environment: 163
bathroom: 161
opportunity: 161
pictures: 161
issue: 159
star: 159
menu: 158
stage: 158
cities: 157
conversation: 156
plate: 155
pepper: 155
noodles: 155
butter: 155
eggs: 155
stick: 153
combination: 152
paper: 152
health: 151
shot: 150
husband: 150
cold: 149
houses: 149
watching: 149
stand: 148
board: 148
games: 148
americans: 148
h

### Generated N-Grams

In [11]:
for n in video_ids:
    video_title = video_titles.get(n, "Unknown Title")
    print("Video ID: ", n, video_title)
    
    if n in bigram_trigram_text:  
        print(f"{bigram_trigram_text[n]}")
    else:
        print("No bigram/trigram data available for this video.")

    print()

Video ID:  --8n6A8Q6M0.txt $200 Luxury Beach Hotel in The Philippines 🇵🇭
['holes_bits', 'rainy_season', 'wet_clothes', 'wear_shoes', 'moadto_strip', 'coffee_wake', 'infinity_pool_beach', 'journey_tara', 'difference_beginner', 'chocolate_muffin', 'beer_bottle']

Video ID:  -1B7cVoZr1c.txt Marine reacts to the Philippine Light Reaction Regiment (LRR)
['creation_unit_company', 'history_lesson', 'unit_experience', 'relationship_support', 'training_equipment', 'task_force', 'command_control', 'scout_rangers_forces']

Video ID:  -7vF5F-1btE.txt Ultimate Filipino Food Festival In The Netherlands!
['pork_barbecue_chicken', 'pork_barbecue', 'rice_bangus']

Video ID:  -9bfDHHneyU.txt SHOWING MY SISTER SB19 'GENTO' Music Video
['party_track', 'dubstep_mix']

Video ID:  -C5iB25BRsA.txt 10 Reasons/Do not Retire TO the Philippines/Moving to the Philippines/Philippine/Dumaguete
['plane_ticket', 'access_money', 'reason_chanep', 'exit_plan', 'heart_attack_hospital', 'bonus_reason']

Video ID:  -cOD9Ggl

## Running LDA 

In [12]:
lda_model_30 = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=30,
        alpha=0.1,
        eta=0.1,
        passes=50,  
        random_state=42
    )

    # Compute Coherence Scores
coherence_c_v = CoherenceModel(model=lda_model_30, texts=filtered_documents, dictionary=dictionary, coherence='c_v').get_coherence()
coherence_u_mass = CoherenceModel(model=lda_model_30, corpus=corpus, dictionary=dictionary, coherence='u_mass').get_coherence()
coherence_npmi = CoherenceModel(model=lda_model_30, texts=filtered_documents, dictionary=dictionary, coherence='c_npmi').get_coherence()

# Compute Topic Diversity
diversity_score = topic_diversity(lda_model_30)

# Compute Jaccard Similarity
jaccard_score = avg_jaccard_similarity(lda_model_30)

print(f"Coherence Score c_v: {coherence_c_v}")
print(f"Coherence Score u_mass: {coherence_u_mass}")
print(f"Coherence Score npmi: {coherence_npmi}")
print(f"Topic Diversity: {diversity_score}")
print(f"Jaccard Similarity: {jaccard_score}")

Coherence Score c_v: 0.5296113674506044
Coherence Score u_mass: -13.593814451445649
Coherence Score npmi: -0.3201796368754099
Topic Diversity: 0.99
Jaccard Similarity: 0.0003629764065335753


### Map a video to multiple topics

In [13]:
# Initialize required structures
video_topic_mapping_30 = {}
video_topic_words_LDA1_30 = {}
video_topic_probabilities = {}
video_topic_to_preprocessed_text = defaultdict(dict)
topic_to_videos = defaultdict(list)
topic_counts_30 = Counter()

prob_threshold = 0.2

for idx, doc_bow in enumerate(corpus):
    # Extract video ID and preprocessed tokens (if available)
    try:
        video_id, preprocessed_tokens = filtered_documents_with_id[idx]
    except:
        video_id = video_ids[idx]
        preprocessed_tokens = None

    # Get topic distribution
    topic_distribution = lda_model_30.get_document_topics(doc_bow, minimum_probability=0)
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]

    # Save topic mapping and probability distribution
    video_topic_mapping_30[video_id] = assigned_topics
    video_topic_probabilities[video_id] = topic_distribution

    # Update video list per topic and topic count
    for topic in assigned_topics:
        topic_to_videos[topic].append(video_id)
        topic_counts_30[topic] += 1

    # Build topic words string
    topic_words_list = []
    for topic in assigned_topics:
        topic_words = [word for word, _ in lda_model_30.show_topic(topic, topn=100)]
        topic_words_list.append(", ".join(topic_words))

        # Map topic words to overlapping preprocessed text if available
        if preprocessed_tokens:
            overlapping_words = set(topic_words).intersection(set(preprocessed_tokens))
            video_topic_to_preprocessed_text[video_id][topic] = list(overlapping_words)

    # Store topic words per video
    video_topic_words_LDA1_30[video_id] = "; ".join(topic_words_list)

# Print number of videos per topic
print("\nNumber of Videos Per Topic:")
for topic, count in sorted(topic_counts_30.items()):
    print(f"Topic {topic}: {count} videos")



Number of Videos Per Topic:
Topic 0: 42 videos
Topic 1: 179 videos
Topic 2: 43 videos
Topic 3: 144 videos
Topic 4: 58 videos
Topic 5: 43 videos
Topic 6: 363 videos
Topic 7: 40 videos
Topic 8: 36 videos
Topic 9: 47 videos
Topic 10: 34 videos
Topic 11: 80 videos
Topic 12: 52 videos
Topic 13: 46 videos
Topic 14: 62 videos
Topic 15: 75 videos
Topic 16: 32 videos
Topic 17: 175 videos
Topic 18: 20 videos
Topic 19: 136 videos
Topic 20: 60 videos
Topic 21: 33 videos
Topic 22: 42 videos
Topic 23: 61 videos
Topic 24: 921 videos
Topic 25: 1059 videos
Topic 26: 46 videos
Topic 27: 715 videos
Topic 28: 147 videos
Topic 29: 74 videos


### Topic-Word distribution per Video

In [14]:
# Dictionary to store the mapping of topic words to preprocessed text for each video
video_topic_to_preprocessed_text = defaultdict(dict)

# Iterate through the corpus and map topic words to preprocessed text
for idx, doc_bow in enumerate(corpus):
    video_id, preprocessed_tokens = filtered_documents_with_id[idx]  # Get video_id and preprocessed tokens
    topic_distribution = lda_model_30.get_document_topics(doc_bow, minimum_probability=0)
    assigned_topics = [topic for topic, prob in topic_distribution if prob >= prob_threshold]

    for topic in assigned_topics:
        # Get the top 100 words for the topic
        topic_words = [word for word, _ in lda_model_30.show_topic(topic, topn=100)]
        
        # Find intersection between topic words and preprocessed tokens
        overlapping_words = set(topic_words).intersection(set(preprocessed_tokens))
        
        # Store the overlapping words
        video_topic_to_preprocessed_text[video_id][topic] = list(overlapping_words)

# Now you can analyze the results
for video_id, topic_mapping in video_topic_to_preprocessed_text.items():
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")
    for topic, words in topic_mapping.items():
        print(f"  Topic {topic}: {words}")


Video ID: --8n6A8Q6M0.txt | $200 Luxury Beach Hotel in The Philippines 🇵🇭
  Topic 6: ['bathroom', 'paradise', 'grass', 'catch', 'direction', 'hat', 'cebu', 'hop', 'blue', 'ocean', 'boats', 'hole', 'cold', 'fingers', 'horse', 'destination', 'photo', 'sand', 'resort', 'birds', 'rocks', 'board', 'hotels', 'mountains', 'motorbike', 'swim', 'activities', 'beaches', 'tourist', 'tricycle', 'meters', 'coconuts', 'cooler', 'rock', 'hills', 'bohol', 'bird', 'waters', 'tourists']

Video ID: -1B7cVoZr1c.txt | Marine reacts to the Philippine Light Reaction Regiment (LRR)
  Topic 27: ['heat']

Video ID: -7vF5F-1btE.txt | Ultimate Filipino Food Festival In The Netherlands!
  Topic 17: ['bbq', 'families', 'kitchen']
  Topic 27: ['combination', 'fruits', 'vinegar', 'pineapple', 'adobo', 'tomato', 'chili', 'salty', 'cake']

Video ID: -9bfDHHneyU.txt | SHOWING MY SISTER SB19 'GENTO' Music Video
  Topic 25: ['pop', 'songs', 'band', 'track', 'lyrics', 'fans', 'performance', 'industry', 'scene', 'beat']

V

In [15]:
# Print videos per topic
print("\nTop Words Per Topic:")

for topic_id in sorted(topic_to_videos.keys()): 
    top_words = lda_model_30.show_topic(topic_id, 50)
    words_str = ', '.join([word for word, prob in top_words])
    print(f"Topic {topic_id}: {words_str}")


Top Words Per Topic:
Topic 0: isang, nga, din, manok_pula_mukang, kulok_palawan_inalo, mama_mas_merry, inalo_ampera_kulok, ampera_kulok_palawan, skin_color_beauty, color_height_status, kimi_datu_sakebi, saan, robots, pag, talaga, pilipino, sya, bakit, founders, brides, wala, songwriter, meron, di, nila, kay, niya, iba, kami, namin, sabi, prawn_crackers, sila, conductor, tama, ramon, dapat, ika, phases, kaila, norte, notions, nung, elephant, quoteunquote, para, tama_naksimula, capital_moscow, iba_din, mama_mas
Topic 1: wall, tank, property, concrete, truck, james, cement, bamboo, wood, sand, grass, position, pipes, field, wire, neighbors, dirt, jay, bedroom, roof, build, materials, walls, issues, plant, mountain, typhoon, paint, wash, jazz, sky, pump, houses, mark, hang, steps, timber, starts, touch, ceiling, porch, issue, dirty, kitchen, highway, pipe, babe, metal, fence, shower
Topic 2: kids_school, bullets_baloi_bridge, cross_bridge_enemys, enemys_snipers_pilots, pastry_school_bread

#### Topic-words including probabilities

In [16]:
# Print the mapping of topic words to preprocessed text for each video, including topic and word probabilities
print("\nMapping of Topic Words to Preprocessed Text (with Topic and Word Probabilities):")
for video_id, topic_mapping in video_topic_to_preprocessed_text.items():
    video_title = video_titles.get(video_id, "Unknown Title")
    print(f"\nVideo ID: {video_id} | {video_title}")
    
    # Get the topic probability distribution for the current video
    topic_distribution = video_topic_probabilities[video_id]
    
    # Iterate through the assigned topics for the video
    for topic, words in topic_mapping.items():
        # Get the probability of the current topic for the video
        topic_prob = next((prob for t, prob in topic_distribution if t == topic), 0.0)
        
        # Print the topic and its probability
        print(f"  Topic {topic} (Probability: {topic_prob:.4f}):")
        
        # Get the word probabilities for the current topic
        topic_words_with_probs = lda_model_30.show_topic(topic, topn=100)
        
        # Create a dictionary of word probabilities for easy lookup
        word_prob_dict = {word: prob for word, prob in topic_words_with_probs}
        
        # Filter to include only the overlapping words and their probabilities
        overlapping_words_with_probs = [(word, word_prob_dict[word]) for word in words if word in word_prob_dict]
        
        # Print the overlapping words and their probabilities
        for word, prob in overlapping_words_with_probs:
            print(f"    - {word}: {prob:.4f}")  # Print word and its probability


Mapping of Topic Words to Preprocessed Text (with Topic and Word Probabilities):

Video ID: --8n6A8Q6M0.txt | $200 Luxury Beach Hotel in The Philippines 🇵🇭
  Topic 6 (Probability: 0.3930):
    - bathroom: 0.0016
    - paradise: 0.0014
    - grass: 0.0011
    - catch: 0.0016
    - direction: 0.0011
    - hat: 0.0010
    - cebu: 0.0024
    - hop: 0.0015
    - blue: 0.0010
    - ocean: 0.0015
    - boats: 0.0027
    - hole: 0.0012
    - cold: 0.0011
    - fingers: 0.0012
    - horse: 0.0020
    - destination: 0.0022
    - photo: 0.0011
    - sand: 0.0011
    - resort: 0.0013
    - birds: 0.0011
    - rocks: 0.0016
    - board: 0.0011
    - hotels: 0.0010
    - mountains: 0.0020
    - motorbike: 0.0017
    - swim: 0.0034
    - activities: 0.0012
    - beaches: 0.0030
    - tourist: 0.0027
    - tricycle: 0.0017
    - meters: 0.0019
    - coconuts: 0.0011
    - cooler: 0.0010
    - rock: 0.0014
    - hills: 0.0016
    - bohol: 0.0013
    - bird: 0.0011
    - waters: 0.0020
    - tourists: 

### Videos under each topic

In [17]:
# Print the videos under each topic in the desired format
print("\nVideos Under Each Topic:")
for topic, videos in sorted(topic_to_videos.items()):
    # Remove duplicate video IDs
    unique_videos = list(set(videos))  

    # Get the number of unique videos for the current topic
    num_videos = len(unique_videos)
    print(f"Topic {topic} ({num_videos} videos):")

    for video_id in unique_videos:
        video_title = video_titles.get(video_id, "Unknown Title")  # Get the title or default to "Unknown Title"
        print(f"  - {video_id} | {video_title}")



Videos Under Each Topic:
Topic 0 (42 videos):
  - hfXFgWZdQFE.txt | FINNISH GIRL TRYING FILIPINO JOLLIBEE FOR THE FIRST TIME IN PHILIPPINES | Filipino Jollibee Mukbang!
  - ypeYMJuMoSM.txt | How to Speak Conyo: Understanding Conyo Language with foreigner Chris Urbano (TOTES FUN!)
  - baIR7eMg5Ug.txt | filipino dishes from the kitchen - 3 (corned beef)
  - 3Wh8qKGc5K0.txt | #1 thing foreigners WISH they brought with them to the Philippines 🇵🇭 (street interviews)
  - 2G4w9gH8iEs.txt | NYC Strangers React to Top Filipino Singers?!
  - cmlM0cS98qo.txt | JAPANESE VOCAL COACH REACTION / Mona Gonzales Anime Songs(Philippines)
  - SuMb7QO8vh8.txt | Foreigner speaking Tagalog | By Kuya Klim
  - 4W9lohe4EYM.txt | AMERICAN VISITOR SHARE THE DIFFERENCE OF LIFE IN AMERICA COMPARE TO THE PHILIPPINES EXPAT
  - fIi3T50w1KA.txt | WE ARE IN MANILA!... First Impressions... 🇵🇭 | Philippines Travel | Foreigner and Filipina VLOG
  - 0eQRhbi0AcM.txt | Foreigner & Filipina House Build Philippines 29Sep22. @L

### Probability distribution of videos per topic

In [18]:
# Print the probability distribution of topics for each video
print("\nProbability Distribution of Topics Per Video:")
for video_id, topic_distribution in video_topic_probabilities.items():
    video_title = video_titles.get(video_id, "Unknown Title")  # Get the title or default to "Unknown Title"
    print(f"Video ID: {video_id} | {video_title}")
    for topic, prob in sorted(topic_distribution, key=lambda x: x[0]):  # Sort by topic number
        print(f"  - Topic {topic}: {prob:.4f}")


Probability Distribution of Topics Per Video:
Video ID: --8n6A8Q6M0.txt | $200 Luxury Beach Hotel in The Philippines 🇵🇭
  - Topic 0: 0.0003
  - Topic 1: 0.0348
  - Topic 2: 0.0003
  - Topic 3: 0.0003
  - Topic 4: 0.0097
  - Topic 5: 0.0003
  - Topic 6: 0.3930
  - Topic 7: 0.0003
  - Topic 8: 0.0003
  - Topic 9: 0.0003
  - Topic 10: 0.0003
  - Topic 11: 0.0003
  - Topic 12: 0.0003
  - Topic 13: 0.0003
  - Topic 14: 0.0003
  - Topic 15: 0.0003
  - Topic 16: 0.0003
  - Topic 17: 0.0452
  - Topic 18: 0.0003
  - Topic 19: 0.0696
  - Topic 20: 0.0003
  - Topic 21: 0.0028
  - Topic 22: 0.0003
  - Topic 23: 0.0003
  - Topic 24: 0.1980
  - Topic 25: 0.0760
  - Topic 26: 0.0003
  - Topic 27: 0.1061
  - Topic 28: 0.0519
  - Topic 29: 0.0074
Video ID: -1B7cVoZr1c.txt | Marine reacts to the Philippine Light Reaction Regiment (LRR)
  - Topic 0: 0.0006
  - Topic 1: 0.0479
  - Topic 2: 0.0006
  - Topic 3: 0.0006
  - Topic 4: 0.0006
  - Topic 5: 0.0006
  - Topic 6: 0.0526
  - Topic 7: 0.0006
  - Topic

### Identifying words of each video that contributed per topic

In [20]:
# Print the mapping of topic words to preprocessed text for each video
print("\nMapping of Topic Words to Preprocessed Text:")
for video_id, topic_mapping in video_topic_to_preprocessed_text.items():
    video_title = video_titles.get(video_id, "Unknown Title")  # Get the title or default to "Unknown Title"
    print(f"Video ID: {video_id} | {video_title}")
    for topic, words in topic_mapping.items():
        print(f"  Topic {topic}: {words}")



Mapping of Topic Words to Preprocessed Text:
Video ID: --8n6A8Q6M0.txt | $200 Luxury Beach Hotel in The Philippines 🇵🇭
  Topic 6: ['bathroom', 'paradise', 'grass', 'catch', 'direction', 'hat', 'cebu', 'hop', 'blue', 'ocean', 'boats', 'hole', 'cold', 'fingers', 'horse', 'destination', 'photo', 'sand', 'resort', 'birds', 'rocks', 'board', 'hotels', 'mountains', 'motorbike', 'swim', 'activities', 'beaches', 'tourist', 'tricycle', 'meters', 'coconuts', 'cooler', 'rock', 'hills', 'bohol', 'bird', 'waters', 'tourists']
Video ID: -1B7cVoZr1c.txt | Marine reacts to the Philippine Light Reaction Regiment (LRR)
  Topic 27: ['heat']
Video ID: -7vF5F-1btE.txt | Ultimate Filipino Food Festival In The Netherlands!
  Topic 17: ['bbq', 'families', 'kitchen']
  Topic 27: ['combination', 'fruits', 'vinegar', 'pineapple', 'adobo', 'tomato', 'chili', 'salty', 'cake']
Video ID: -9bfDHHneyU.txt | SHOWING MY SISTER SB19 'GENTO' Music Video
  Topic 25: ['pop', 'songs', 'band', 'track', 'lyrics', 'fans', 'per