In [1]:
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import re
import nltk
import requests
import pyLDAvis
import pyLDAvis.gensim_models
from nltk.stem import WordNetLemmatizer
import spacy
import numpy as np
import os

# Make sure to download nltk stopwords if not already
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dnaso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dnaso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# # READING FROM OLD DATASET

# # Define paths to transcripts and tags directories
# transcripts_dir = '../cleaned_transcripts/'
# tags_dir = '../tags/'
# csv_path = '../cleaned_results.xlsx'

# # Load the CSV file and filter for "related" videos
# csv_data = pd.read_excel(csv_path)
# related_videos = csv_data[csv_data["related"] == "yes"]

# # Load transcripts
# transcripts = []
# tags = []

# # Process each related video based on its video_id
# for video_id in related_videos["Video Id"]:
#     # Construct paths based on video ID naming conventions
#     transcript_file = os.path.join(transcripts_dir, f"{video_id}_captions.txt")
#     tag_file = os.path.join(tags_dir, f"{video_id}.txt")
    
#     # Read the transcript and tag files if they exist
#     try:
#         with open(transcript_file, "r", encoding="utf-8") as file:
#             transcripts.append(file.read())
#         with open(tag_file, "r", encoding="utf-8") as file:
#             tags.append(file.read().strip())
#     except FileNotFoundError:
#         print(f"Files for video ID {video_id} not found, skipping.")

# # Combine the filtered data into a DataFrame
# data = pd.DataFrame({"video_id": related_videos["Video Id"], "tags": tags, "transcripts": transcripts})
# data["text"] = data["tags"] + " " + data["transcripts"]

# # Display the first few rows to verify
# print(data.head())


In [3]:
# Define the dataset folder path
dataset_folder = "../standard_dataset"
tags_dir = '../tags/'

# List to store extracted data
data_records = []
tag = []

# Regex pattern to extract Video Id and Title from the filename
filename_pattern = re.compile(r"^(.*)_(.*?)_captions\.txt$")

for file in os.listdir(dataset_folder):
    if file.endswith("_captions.txt"):  # Ensure it's a valid transcript file
        match = filename_pattern.match(file)
        if match:
            video_id, video_title = match.groups()  # Extract Video Id and Title
            file_path = os.path.join(dataset_folder, file)

            # Read transcript content
            with open(file_path, "r", encoding="utf-8") as f:
                transcript = f.read().strip()

            tag_file = os.path.join(tags_dir, f"{video_id}.txt")

            # Reset tag list for each file
            tag = []

            try:
                with open(tag_file, "r", encoding="utf-8") as f:
                    tag.append(f.read().strip())
            except FileNotFoundError:
                print(f"Files for video ID {video_id} not found, skipping.")

            # Append data to the list
            data_records.append({
                "Video Id": video_id,
                "Video Title": video_title,
                "Tags": tag,  # Ensures it's always a list
                "Transcript": transcript
            })

# Convert to DataFrame
data = pd.DataFrame(data_records)

data["text"] = data["Tags"].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x)) + " " + data["Transcript"]


# Display summary
print(f"Loaded {len(data)} transcripts into data.")
print(data.head())

Loaded 100 transcripts into data.
      Video Id                                        Video Title  \
0  --8n6A8Q6M0  $200 Luxury Beach Hotel in The Philippines ðŸ‡...   
1  0IMWasj76yU                  Philippines Army vs Thailand Army   
2  1kErCqgIVMk      Tour of The House We Built in The Philippines   
3  2ftG8JuMzz4  Top 5 Exotic Foods in the Philippines The BRAV...   
4  2TmagN6RhkI  VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi   

                                                Tags  \
0  [['luke',, 'luke, damant',, 'damant',, 'travel...   
1                        [['Teacher, Paul, reacts']]   
2  [['House, build, in, the, Philippines',, 'Hous...   
3  [['best, ever, food, review, show',, 'exotic, ...   
4  [['regine, velasquez',, 'velasquez',, 'regine,...   

                                          Transcript  \
0  Oh, my God.\nIt is just the perfect recipe for...   
1  this\nthis is\nthis is the\nthis\nthis is\nthi...   
2  okay so some of you have asked for a house 

In [4]:
# Step 2: Text Preprocessing
stop_words = set(stopwords.words("english"))

nlp = spacy.load("tl_fasttext_transition")

# Function to fetch stopwords from GitHub URL
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    github_stopwords = response.text.splitlines()  # Split by new lines
    return set(github_stopwords)

# GitHub URL for stopwords
github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)

custom_stop_words = ['like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go', 'right', 'okay', 'well', 'said', 
                     'going', 'got', 'na', 'always', 'every', 'each', 'say', 'el', 'little', 'still', 
                     'best', 'dutch', 'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow' ]
broad_terms = ['philippines', 'philippine', 'british', 'filipino', 'video', 'http', 'korea', 'korean', 
               'youtube', 'google', 'united', 'america', 'american']
kpop_keywords = ['kpop', '필리핀', 'bts', 'blackpink', 'twice', 'exo', 'k-pop', 'seventeen', 
                 'stray kids', 'nct', 'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook']
more_keywords = [
    'breaking news', 'report', 'coverage', 'investigation', 'interview', 'documentary', 
    'journalist', 'headline', 'reporter', 'current events', 'special report', 
    'analysis', 'documented', 'broadcast', 'reporting', 'v', 'food', 'travel', 'react', 
    'reacts', 'reaction', 'foreigner', 'thing', 'visit', 'dc', 'japan', 'first', 'fast', 
    'asia', 'ang', 'indian', 'thai', 'vietnamese', 'russia', 'gon', 'canada', 'canadian', 'russian', 
    'russia', 'guy', 'lot', 'bit', 'diba', 'ola', 'cuz', 'thai', 'thailand', 'person', 'citizen', 'foreigner', 'foreign', 'foreigners',
    'facebook', 'filipinos', 'filipinas', 'vlog', 'vlogs', 'vlogging', 'hashtag', 'india', 'bro', 'dito', 'people', 'time', 'music', 'gonna', 'life', 
    'lol', 'guys', 'tho', 'cute', 'hmm', 'huh', 'channel', 'subscribe', 'day6', 'mandarin', 'chinese', 'beautiful',
    'chuckles', 'fbe', 'hit', 'laughs', 'yo', 'ka', 'word', 'living', 'boi', 'minimum', 'ya', 'successful', 'perfectly', 'yeap', 
    'wondering', 'fantastic', 'hurry', 'german', 'age', 'country', 'subscribing', 'bluesy', 'jump', 'pretty', 'understanding', 'personalized',
    'and', 'the', 'to', 'of', 'in', 'for', 'on', 'with', 'at', 'by', 'from', 'up', 'about', 'over', 'into', 'through', 'between', 'under', 'against', 'all',
    'you', 'haha', 'hahaha', 'ha', 'hey', 'bye', 'hello', 'hi', 'oh', 'blah', 'easy', 'alright', 'ta', 'day', 'ooh', 'en', 'do', 'lot', 'comment', 'notification', 
    'korean', 'jjajangmyeon', 'jajangmyeon', 'damn', 'yall', 'month', 'week', 'year', 'ohhh', 'pvf', 'dude', 'mmm', 'kagilagilalas', 'ofcourse', 'australia', 'uxo', 
    'atleast', 'yusuf', 'bangkok', 'ot', 'anytime', 'allover', 'kala', 'nope', 'wan', 'brazil', 'smooth', 'ot', 'timeshere', 'batchof', 'yep', 'opo', 'del',
    'gosh', 'po', 'ourself', 'wo', 'wait', 'ugh', 'nyc', 'whoa', 'nicaragua', 'yup', 'em', 'bout', 'le', 'omg', 'overwhelm', 'maam', 'nicer', 'haha', 'hahaha', 'ha', 
    'nbcs', 'lana', 'rc', 'whatsoever', 'oxy', 'decade', 'whyd', 'unknown', 'ahhhhh', 'ohoh', 'ohto', 'ohhhh', 'bruh', 'ooe', 'ahmedabad', 'mexico', 
    'understand', 'excuse', 'kinda', 'applause', 'oooh', 'thiswhat', 'nevermind', 'ahh', 'againthank', 'toto', 'aww', 'nah', 'bbmas', 'ay', 'op', 'huh', 'huhu',
    'tada', 'beacuse', 'voila', 'upstairs', 'thatswhy', 'yea', 'that', 'armenia', 'or', 'not', 'funwhat', 'aka', 'armeniathat', 'woosexy', 'worth', 'laugh', 'box', 
    'xd', 'vb', 'eff', 'ananya', 'welsh', 'latron', 'shout', 'whatwhat', 'what', 'pause', 'why', 'thats', 'byebye', 'iv', 'bye'  
]
stop_words.update(custom_stop_words, kpop_keywords, broad_terms, more_keywords, github_stopwords)

lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase and remove non-alphabet characters
    text = re.sub(r'[^a-z0-9\s]', '', text.lower())
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and filter out short words
    words = [word for word in words if word not in stop_words and len(word) > 1 and word.isalpha()]

    # for word in words:
    #     if is_not_filipino_word(word):
    #         words.remove(word)


    words = [lemmatizer.lemmatize(word) for word in words]
    return " ".join(words)

def is_not_filipino_word(word):
    # Process the word using SpaCy
    doc = nlp(word)
    # Use the word vector from SpaCy to determine if it's a Filipino word
    embedding = doc.vector
    return np.linalg.norm(embedding) < 0.25  # This threshold might need adjustment



data["cleaned_text"] = data["text"].apply(preprocess_text)

tfidf = TfidfVectorizer(max_df=0.8, min_df=2, stop_words=custom_stop_words)
temp = data["cleaned_text"]
X = tfidf.fit_transform(temp)

vocabulary = tfidf.get_feature_names_out()
vocabulary

def filter_words(text):
    words = word_tokenize(text) # Split text into words
    print(words)
    filtered_words = [word for word in words if word in vocabulary] 
    print(filtered_words) # Keep only words in vocab
    return " ".join(filtered_words)  # Join back into a string

data["tfidf"] = data["cleaned_text"].apply(filter_words)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


['luke', 'luke', 'damant', 'damant', 'traveler', 'traveller', 'luke', 'damant', 'vlogger', 'luke', 'damant', 'chase', 'dream', 'tip', 'manila', 'manila', 'fun', 'phmanila', 'cebu', 'nido', 'palawan', 'cebu', 'city', 'manila', 'city', 'manila', 'god', 'perfect', 'recipe', 'beer', 'bottle', 'goodness', 'tarzan', 'morning', 'bohol', 'gorgeous', 'crystal', 'water', 'honestly', 'vibe', 'perfect', 'temperature', 'spending', 'beach', 'staying', 'meter', 'night', 'hotel', 'couple', 'night', 'arrived', 'night', 'raining', 'miserable', 'weather', 'boy', 'boy', 'weather', 'water', 'australian', 'live', 'sydney', 'live', 'beach', 'absolutely', 'beach', 'absolute', 'paradise', 'beach', 'hotel', 'super', 'overlooking', 'water', 'mall', 'kayaking', 'paddle', 'boarding', 'paddle', 'board', 'beach', 'tide', 'tide', 'walk', 'basically', 'boat', 'couple', 'hour', 'beached', 'expanding', 'enjoying', 'luxury', 'bring', 'journey', 'tara', 'explore', 'luxury', 'hotel', 'cebu', 'night', 'cent', 'photo', 'shoo

In [None]:
# import calamancy
# for model in calamancy.models():
#     print(model)

# nlp = calamancy.load("tl_calamancy_lg-0.2.0")

# def extract_entities(text):
#     doc = nlp(text)
#     entities = [ent.text for ent in doc.ents]
#     scrapped_words = [token.text for token in doc if token.ent_type_ == ""]
#     return entities, scrapped_words

# # Apply the function to each item in data["tfidf"]
# data["entities"], data["scrapped"] = zip(*data["tfidf"].apply(extract_entities))

# # Display the first few rows to verify
# print(data.head())

# # Write scrapped words to a text file
# with open('scrapped_words_fil_2.txt', 'w', encoding='utf-8') as file:
#     for scrapped in data['scrapped']:
#         file.write(" ".join(scrapped) + '\n')

tl_calamancy_md-0.2.0
tl_calamancy_lg-0.2.0
tl_calamancy_trf-0.2.0
tl_calamancy_md-0.1.0
tl_calamancy_lg-0.1.0
tl_calamancy_trf-0.1.0


  self._model.load_state_dict(torch.load(filelike, map_location=device))


      Video Id                                        Video Title  \
0  --8n6A8Q6M0  $200 Luxury Beach Hotel in The Philippines ðŸ‡...   
1  0IMWasj76yU                  Philippines Army vs Thailand Army   
2  1kErCqgIVMk      Tour of The House We Built in The Philippines   
3  2ftG8JuMzz4  Top 5 Exotic Foods in the Philippines The BRAV...   
4  2TmagN6RhkI  VOCAL COACH REACTS - REGINE VELASQUEZ - Araw-Gabi   

                                                Tags  \
0  [['luke',, 'luke, damant',, 'damant',, 'travel...   
1                        [['Teacher, Paul, reacts']]   
2  [['House, build, in, the, Philippines',, 'Hous...   
3  [['best, ever, food, review, show',, 'exotic, ...   
4  [['regine, velasquez',, 'velasquez',, 'regine,...   

                                          Transcript  \
0  Oh, my God.\nIt is just the perfect recipe for...   
1  this\nthis is\nthis is the\nthis\nthis is\nthi...   
2  okay so some of you have asked for a house tou...   
3  Hello there, and welc

In [None]:
# for item in data["entities"]:
#     print(item)

['luke luke damant damant traveler luke damant vlogger luke damant chase', 'manila', 'cebu nido', 'palawan', 'cebu city', 'manila city manila', 'cebu island', 'tropical island', 'cebu airport', 'tropical island cebu', 'luke damant', 'pakistan lake', 'manila', 'virgin island', 'chocolate hill']
[]
[]
[]
['regine velasquez velasquez regine velasquez', 'regine velasquez', 'regine velasquez', 'regine velasquez', 'regine velasquez', 'regine velasquez']
['manila market', 'cebu']
['south china sea', 'south china sea', 'manila', 'sri lanka', 'pakistan', 'manila lot express', 'south china sea', 'south china sea', 'south china sea', 'david michael white', 'taiwan', 'chris']
[]
['netherlands']
['mama sandoval', 'mia sandoval', 'mama mia sandoval', 'christian grey jamill tagalog', 'john recognize', 'john art']
['manila', 'bonifacio global city', 'manila', 'bonifacio street', 'manila', 'makati', 'manila metro', 'manila makati city bgc', 'mia colt', 'aaron happy', 'africa', 'manila']
[]
['manila sea

In [8]:
# Created File Using only Cleaned Text
with open('standard1.txt', 'w', encoding='utf-8') as file:
    for row in data['tfidf']:
        file.write(row + '\n')


# to use HLTM:
# java -cp HLTA.jar;HLTA-deps.jar tm.hlta.HTD "file name of text" "output name"

In [9]:
# Generate text file for titles only

def clean_tags(text):
    # Lowercase and remove non-alphabet characters
    text = re.sub(r'\W+', ' ', text.lower())
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and filter out short words
    words = [word for word in words if word not in stop_words and len(word) > 1 and word.isalpha()]
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

data["cleaned_tags"] = data["Tags"].apply(clean_tags)

with open('tags.txt', 'w', encoding='utf-8') as file:
    for row in data['cleaned_tags']:
        file.write(" ".join(row).replace(',', '') + '\n')

AttributeError: 'list' object has no attribute 'lower'