In [None]:
import os
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
import re
import nltk
import requests
import pyLDAvis
import pyLDAvis.gensim_models

# Make sure to download nltk stopwords if not already
nltk.download("stopwords")
nltk.download("punkt")

In [None]:
# Define paths to transcripts and tags directories
transcripts_dir = 'cleaned_transcripts/'
tags_dir = 'tags/'
csv_path = 'cleaned_results.xlsx'

# Load the CSV file and filter for "related" videos
csv_data = pd.read_excel(csv_path)
related_videos = csv_data[csv_data["related"] == "yes"]

# Load transcripts
transcripts = []
tags = []

# Process each related video based on its video_id
for video_id in related_videos["Video Id"]:
    # Construct paths based on video ID naming conventions
    transcript_file = os.path.join(transcripts_dir, f"{video_id}_captions.txt")
    tag_file = os.path.join(tags_dir, f"{video_id}.txt")
    
    # Read the transcript and tag files if they exist
    try:
        with open(transcript_file, "r", encoding="utf-8") as file:
            transcripts.append(file.read())
        with open(tag_file, "r", encoding="utf-8") as file:
            tags.append(file.read().strip())
    except FileNotFoundError:
        print(f"Files for video ID {video_id} not found, skipping.")

# Combine the filtered data into a DataFrame
data = pd.DataFrame({"video_id": related_videos["Video Id"], "tags": tags, "transcripts": transcripts})
data["text"] = data["tags"] + " " + data["transcripts"]

# Display the first few rows to verify
print(data.head())

In [3]:
# Step 2: Text Preprocessing
stop_words = set(stopwords.words("english"))

# Function to fetch stopwords from GitHub URL
def fetch_stopwords_from_github(url):
    response = requests.get(url)
    github_stopwords = response.text.splitlines()  # Split by new lines
    return set(github_stopwords)

# GitHub URL for stopwords
github_stopwords_url = 'https://raw.githubusercontent.com/stopwords-iso/stopwords-en/master/stopwords-en.txt'
github_stopwords = fetch_stopwords_from_github(github_stopwords_url)


custom_stop_words = ['like', 'yeah', 'know', 'um', 'uh', 'really', 'one', 'go', 'right', 'okay', 'well', 'said', 
                     'going', 'got', 'na', 'always', 'every', 'each', 'say', 'el', 'little', 'still', 
                     'best', 'dutch', 'nice', 'great', 'awesome', 'good', 'cool', 'love', 'amazing', 'wow' ]
broad_terms = ['philippines', 'philippine', 'british', 'filipino', 'video', 'http', 'korea', 'korean', 
               'youtube', 'google', 'united', 'america', 'american']
kpop_keywords = ['kpop', '필리핀', 'bts', 'blackpink', 'twice', 'exo', 'k-pop', 'seventeen', 
                 'stray kids', 'nct', 'kdrama', 'aespa', 'taehyung', 'jimin', 'jungkook']
more_keywords = [
    'breaking news', 'report', 'coverage', 'investigation', 'interview', 'documentary', 
    'journalist', 'headline', 'reporter', 'current events', 'special report', 
    'analysis', 'documented', 'broadcast', 'reporting', 'v', 'food', 'travel', 'react', 
    'reacts', 'reaction', 'foreigner', 'thing', 'visit', 'dc', 'japan', 'first', 'fast', 
    'asia', 'ang', 'indian', 'thai', 'vietnamese', 'russia', 'gon', 'canada', 'canadian', 'russian', 
    'russia', 'guy', 'lot', 'bit', 'diba', 'ola', 'cuz', 'thai', 'thailand', 'person', 'citizen', 'foreigner', 'foreign', 'foreigners',
    'facebook', 'filipinos', 'filipinas', 'vlog', 'vlogs', 'vlogging', 'hashtag', 'india', 'bro', 'dito', 'people', 'time', 'music', 'guys'
]

# Add custom and broad terms
stop_words.update(custom_stop_words, broad_terms, kpop_keywords, more_keywords, github_stopwords)

def preprocess_text(text):
    # Lowercase and remove non-alphabet characters
    text = re.sub(r'\W+', ' ', text.lower())
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and filter out short words
    words = [word for word in words if word not in stop_words and len(word) > 1]
    return words

data["cleaned_text"] = data["text"].apply(preprocess_text)

# Vectorize text with TF-IDF to remove low-impact words
tfidf_vectorizer = TfidfVectorizer(max_df=0.85, min_df=5)
tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(doc) for doc in data["cleaned_text"]])

# Filter words in each transcript based on TF-IDF scores
def filter_by_tfidf(doc):
    features = tfidf_vectorizer.get_feature_names_out()
    vector = tfidf_vectorizer.transform([" ".join(doc)]).toarray()[0]
    return [features[i] for i in vector.argsort()[-15:]]  # Top 15 tf-idf terms

data["filtered_words"] = data["cleaned_text"].apply(filter_by_tfidf)


In [None]:
data

In [None]:
from wordcloud import WordCloud
import random

# Assuming 'data' is a DataFrame that contains your preprocessed text data

# Step 3: Train the LDA Model

# Create dictionary and corpus for LDA
dictionary = corpora.Dictionary(data["filtered_words"])
corpus = [dictionary.doc2bow(text) for text in data["filtered_words"]]

# General LDA model across all transcripts
lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=42, passes=100)

# Display general topics
print("General Topics Across All Videos:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

# Visualization for General Topics
pyLDAvis.enable_notebook()
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

# Create a WordCloud for general topics
def create_wordcloud_for_topic(topic_model, topic_idx):
    topic_words = topic_model.show_topic(topic_idx, topn=30)  # Get the top 30 words for the topic
    word_freq = dict(topic_words)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
    
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"WordCloud for Topic {topic_idx}")
    plt.show()

# Display WordClouds for general topics
for i in range(5):  # Adjust to the number of topics
    create_wordcloud_for_topic(lda_model, i)

# Per-video LDA (randomly selecting 50 videos)
video_topics = []
max_videos = 50  # Limit to 50 videos

# Get a random sample of video indices
random_indices = random.sample(range(len(data["filtered_words"])), min(max_videos, len(data["filtered_words"])))

for i in random_indices:
    text = data["filtered_words"][i]
    corpus_per_video = [dictionary.doc2bow(text)]
    lda_per_video = gensim.models.LdaModel(corpus=corpus_per_video, id2word=dictionary, num_topics=5, passes=100)
    video_topics.append(lda_per_video.print_topics(-1))

    # Prepare data for visualization for each video
    vis_data_video = pyLDAvis.gensim_models.prepare(lda_per_video, corpus_per_video, dictionary)

    # Create a WordCloud for each video's topics
    for j in range(5):  # Adjust to the number of topics
        create_wordcloud_for_topic(lda_per_video, j)

# Show per-video topics
for idx, video_topic in enumerate(video_topics):
    print(f"\nTopics for Video {random_indices[idx] + 1}:")  # Display the actual index + 1
    for topic in video_topic:
        print(topic)


In [None]:
pyLDAvis.display(vis_data)

In [None]:
# Step 4: Topic Validation and Coherence Checking

# Coherence Score for General LDA Model
coherence_model_lda = CoherenceModel(model=lda_model, texts=data["filtered_words"], dictionary=dictionary, coherence="c_v")
coherence_lda = coherence_model_lda.get_coherence()
print("\nGeneral LDA Model Coherence Score:", coherence_lda)

# Filter topics based on coherence threshold if necessary (e.g., only retain if > 0.5)
if coherence_lda < 0.5:
    print("Warning: Topic coherence is low, consider re-evaluating the topic terms or increasing the number of topics.")

In [None]:
# Step 5: Interpretability Verification

def validate_topics_with_tags(tags, topics):
    for idx, topic in enumerate(topics):
        print(f"\nValidating Topic {idx+1}")
        keywords = [word.split('*')[1].strip('"') for word in topic[1].split(" + ")]
        tag_matches = [tag for tag in tags.split() if tag in keywords]
        print("Keywords:", keywords)
        print("Matching Tags:", tag_matches)
        if len(tag_matches) < len(keywords) * 0.3:
            print("Warning: Topic may be too generic for this video.")

# Validate each video's topics with its tags
for i, tags in enumerate(data["tags"]):
    print(f"\nValidating topics for Video {i+1}")
    validate_topics_with_tags(tags, video_topics[i])

In [3]:
from transformers import pipeline
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
import glob
import random

# Setup the summarization pipeline
# Choose between T5 for abstractive or DistilBERT for extractive summarization
# summarizer_abstractive = pipeline("summarization", model="t5-small", tokenizer="t5-small", framework="pt", torch_dtype="float16")
# summarizer_extractive = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
summarizer_abstractive = pipeline("summarization", model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn", framework="pt")
# summarizer_abstractive = pipeline("summarization", model="google/pegasus-xsum", tokenizer="google/pegasus-xsum", framework="pt")

# Path to the folder containing text files
text_folder = 'cleaned_transcripts/'


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed
# Set the number of samples to process
sample_size = 50
def load_and_filter_text_files(folder_path, min_word_count=100):
    file_paths = glob.glob(os.path.join(folder_path, "*.txt"))
    text_data = {}
    for file_path in file_paths:
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                text = file.read()
                text = text.encode("utf-8", "ignore").decode()  # Sanitize non-UTF-8 characters
                if len(text.split()) >= min_word_count:
                    text_data[file_path] = text
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")
    sampled_files = {k: text_data[k] for k in random.sample(list(text_data.keys()), min(len(text_data), sample_size))}
    return sampled_files

def chunk_text(text, chunk_size=512):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield " ".join(words[i:i + chunk_size])

def summarize_text(text, chunk_size=512, max_length=None, min_length=30, length_penalty=2.0):
    summaries = []
    try:
        for chunk in chunk_text(text, chunk_size):
            input_length = len(chunk.split())
            if max_length is None or max_length > input_length:
                max_length = min(input_length, 130)  # Set a reasonable max length based on the input length
            summary = summarizer_abstractive(chunk, max_length=max_length, min_length=min_length, length_penalty=length_penalty, do_sample=False)
            summaries.append(summary[0]['summary_text'])
    except Exception as e:
        print(f"Error summarizing text: {e}")
        return None
    return " ".join(summaries)


def parallel_summarization(text_data, chunk_size=512, max_workers=4):
    summaries = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(summarize_text, text, chunk_size): file_name for file_name, text in text_data.items()}
        for future in as_completed(futures):
            file_name = futures[future]
            try:
                result = future.result()
                if result:
                    summaries[file_name] = result
                else:
                    print(f"No summary generated for {file_name}.")
            except Exception as e:
                print(f"Error processing {file_name}: {e}")
    return summaries

In [5]:
# Load and summarize files
text_data = load_and_filter_text_files(text_folder)
summarized_texts = parallel_summarization(text_data, chunk_size=512)

# Save summaries to output files
output_folder = 'summaries/'
os.makedirs(output_folder, exist_ok=True)

for file_name, summary in summarized_texts.items():
    base_name = os.path.basename(file_name)
    output_path = os.path.join(output_folder, f"{base_name}_summary.txt")
    with open(output_path, "w") as f:
        f.write(summary)

print("Summarization complete. Check the output folder for results.")

Your min_length=30 must be inferior than your max_length=16.
Your min_length=30 must be inferior than your max_length=27.


Summarization complete. Check the output folder for results.
