In [0]:
%pip install vaderSentiment textblob nltk
%pip install datasets
%pip install transformers
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
%pip install tensorflow
%pip install praw
%pip install tf-keras
%pip install gensim spacy nltk wordcloud
%pip install spacy
%pip install inflect
%pip install nltk
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

In [0]:
pip install youtube-comment-downloader

In [0]:
from youtube_comment_downloader import YoutubeCommentDownloader
import pandas as pd

downloader = YoutubeCommentDownloader()

# Videos whose comments make up the corpus. Each video must appear only once,
# otherwise its comments are downloaded twice and double-counted downstream.
urls = [
    'https://www.youtube.com/watch?v=8r6FyCMbqow',
    'https://www.youtube.com/watch?v=oPD3KR_PtfY',
    'https://www.youtube.com/watch?v=QrCqsYqbuAc',
    'https://www.youtube.com/watch?v=-Wr-RuoV0pU&ab_channel=BALR',
    'https://www.youtube.com/watch?v=nHj8LrOmAgE&ab_channel=ChrisWilliamson',
    'https://www.youtube.com/watch?v=mKs0Uyj0q-w&ab_channel=HowMoneyWorks',
]

# Order-preserving de-duplication, in case a URL is pasted twice again later.
unique_urls = list(dict.fromkeys(urls))

# Collect every comment record returned for every video into one flat list.
comment_list = []
for url in unique_urls:
    comment_list.extend(downloader.get_comments_from_url(url))

# Keep only the comment text and its vote count. Passing `columns=` yields an
# empty frame with these two columns when nothing was downloaded, instead of
# raising a KeyError on the column selection.
df = pd.DataFrame(comment_list, columns=['text', 'votes'])

In [0]:
# spark_df = spark.createDataFrame(df)
# spark_df.createOrReplaceTempView("youtube_comments")

In [0]:
# Alias only (no copy): both names refer to the same DataFrame object, so the
# columns added to `df` in later cells are visible through this name as well.
youtube_comments_df = df

## Topic Modelling

In [0]:
import nltk

# WordNet data backs the WordNetLemmatizer used during preprocessing.
nltk.download('wordnet')
# Some NLTK releases also require the Open Multilingual WordNet package to load
# the WordNet corpus; downloading it is harmless where it is not needed.
nltk.download('omw-1.4')

In [0]:
import numpy as np
import inflect
import gensim
import matplotlib.pyplot as plt
import nltk
import inflect
from nltk.stem import WordNetLemmatizer
from gensim.utils import simple_preprocess
from wordcloud import STOPWORDS as wc_stopwords
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from gensim.utils import simple_preprocess

In [0]:

# Custom stopword set: wordcloud's built-in list plus corpus-specific additions.
# Every extra entry is listed exactly once, grouped by kind.
_extra_stopwords = {
    # filler words, common verbs and chat slang
    "uk", "like", "would", "also", "getting", "got", "get", "go", "day",
    "now", "ask", "want", "know", "think", "say", "said", "saying", "says",
    "bn", "email", "new", "year", "person", "people", "try", "look", "tell",
    "thing", "good", "use", "will", "see", "back", "come", "one", "really",
    "take", "put", "oh", "feel", "make", "today", "still", "even", "seem",
    "etc", "hi", "let", "bro", "lol", "me", "im",
    # contraction fragments left behind by tokenization
    "ll", "ve", "re", "don", "didn", "isn", "wasn", "weren",
    # truncated forms (presumably normalisation artefacts of was/this/has)
    "wa", "thi", "ha",
    # underscore-joined bigram tokens
    "i_am", "i_don", "the_uk", "m_not", "due_to", "we_re", "do_you",
    "https_www", "would_be", "if_you",
    # upper-case entry kept so the set is unchanged; NOTE(review): tokens are
    # lower-cased by simple_preprocess, so this likely never matches
    "I",
}
custom_stopwords = set(wc_stopwords) | _extra_stopwords

# Shared normalisers used by preprocess(): a WordNet lemmatizer and an inflect engine.
lemmatizer = WordNetLemmatizer()
p = inflect.engine()

# Tokenize every comment with gensim's simple_preprocess before learning phrases
# (cleaner input than a plain str.split()).
sentences = df["text"].astype(str).apply(simple_preprocess)

# Learn bigram statistics from the tokenized comments, then wrap them in a Phraser.
bigram = Phrases(sentences, min_count=5, threshold=10)
bigram_model = Phraser(bigram)

# Run each token list through the bigram model and re-join it into a single
# space-separated string per comment.
df["bigram_text"] = [" ".join(bigram_model[toks]) for toks in sentences]

def preprocess(text: str) -> list:
    """Tokenize, normalise and stopword-filter one comment.

    Parameters
    ----------
    text : str
        Comment text (here: the bigram-joined string from ``df["bigram_text"]``).

    Returns
    -------
    list
        Normalised tokens with every entry of ``custom_stopwords`` removed.

    Notes
    -----
    Relies on the module-level ``p`` (inflect engine), ``lemmatizer`` and
    ``custom_stopwords``.
    """
    cleaned = []
    for t in simple_preprocess(text):  # lower, deaccent, min_len=2 by default
        # Filter on the RAW token first: singularisation can mangle a stopword
        # into a form that is no longer in the stopword set, letting it slip
        # through a normalise-then-filter pipeline.
        if t in custom_stopwords:
            continue
        # Normalise: singular form (falling back to the token itself), then
        # lemmatize as a noun and as a verb.
        t_norm = p.singular_noun(t) or t
        t_norm = lemmatizer.lemmatize(t_norm, 'n')
        t_norm = lemmatizer.lemmatize(t_norm, 'v')
        # Filter again on the normalised form (e.g. "getting" -> "get").
        if t_norm not in custom_stopwords:
            cleaned.append(t_norm)
    return cleaned

# One cleaned token list per comment, computed from the bigram-joined text.
df["processed_text_new"] = df["bigram_text"].map(preprocess)


In [0]:
%python
# Inputs for model selection: token lists, vocabulary and bag-of-words corpus.
texts = df["processed_text_new"]
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# Sweep settings shared by every candidate model.
K_RANGE = range(4, 13)  # candidate topic counts 4..12
RANDOM_STATE, PASSES = 42, 10
ALPHA = ETA = 'symmetric'

def fit_lda(k):
    """Train an LDA model with ``k`` topics and score it.

    Uses the module-level ``corpus``, ``dictionary`` and ``texts`` together
    with the sweep settings ``RANDOM_STATE``, ``PASSES``, ``ALPHA`` and ``ETA``.

    Returns
    -------
    tuple
        ``(model, cv, umass, perplexity)`` where ``cv`` / ``umass`` are the
        'c_v' and 'u_mass' coherence scores and ``perplexity`` is
        ``2 ** (-model.log_perplexity(corpus))`` on the same corpus.
    """
    lda = LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=k,
        random_state=RANDOM_STATE, passes=PASSES,
        alpha=ALPHA, eta=ETA, minimum_probability=0.0,
    )

    # Perplexity derived from the value returned by log_perplexity.
    bound = lda.log_perplexity(corpus)
    pplx = np.exp2(-bound)

    # Coherence from the token lists ('c_v') ...
    score_cv = CoherenceModel(
        model=lda, texts=texts, dictionary=dictionary, coherence='c_v'
    ).get_coherence()
    # ... and from the bag-of-words corpus ('u_mass').
    score_umass = CoherenceModel(
        model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass'
    ).get_coherence()

    return lda, score_cv, score_umass, pplx

# Fit one model per candidate k; keep each model and one metrics row per k.
results, models = [], {}

for k in K_RANGE:
    mdl, cv, umass, pplx = fit_lda(k)
    models[k] = mdl
    results.append(
        dict(k=k, coherence_cv=cv, coherence_umass=umass, perplexity=pplx)
    )
    print(f"k={k:>2}  Cv={cv:.3f}  UMass={umass:.3f}  Perplexity={pplx:.1f}")

# Metrics table, one row per k, in sweep order.
res_df = pd.DataFrame(results)

# Model-selection chart: coherence scores on the left axis, perplexity on the right.
fig, ax1 = plt.subplots(figsize=(7,4))
ax1.plot(res_df["k"], res_df["coherence_cv"], marker="o", label="Coherence (Cv)")
ax1.plot(res_df["k"], res_df["coherence_umass"], marker="o", linestyle="--", label="Coherence (UMass)")
ax1.set_xlabel("Number of topics (k)")
ax1.set_ylabel("Coherence (higher is better)")

ax2 = ax1.twinx()
ax2.plot(res_df["k"], res_df["perplexity"], marker="s", color="gray", alpha=0.5, label="Perplexity")
ax2.set_ylabel("Perplexity (lower is better)")

# Build ONE legend from both axes: a legend created from ax1 alone never shows
# the "Perplexity" entry because that line lives on the twin axis. Drawing it
# on ax2 (the top-most axes) keeps the legend above all plotted lines.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(handles1 + handles2, labels1 + labels2, loc="upper left")

ax1.set_title("LDA model selection")
fig.tight_layout()
plt.show()

# Row label with the highest Cv score, then the topic count stored on that row.
best_row = res_df["coherence_cv"].idxmax()
best_k = int(res_df.at[best_row, "k"])
print(f"\nSuggested k by Cv peak: {best_k}")

# The model for best_k was already trained during the sweep; reuse it.
final_model = models[best_k]

# Topic prevalence (mean theta across docs).
# Fill a dense (n_docs, n_topics) matrix by topic id rather than relying on
# get_document_topics returning every topic for every document: gensim floors
# minimum_probability at a tiny positive value, so a near-zero topic can be
# omitted, which would give ragged or misaligned rows. Omitted topics stay 0.
doc_topic = np.zeros((len(corpus), final_model.num_topics))
for doc_idx, bow in enumerate(corpus):
    for topic_id, prob in final_model.get_document_topics(bow, minimum_probability=0.0):
        doc_topic[doc_idx, topic_id] = prob

# Average weight of each topic over all documents.
prevalence = doc_topic.mean(axis=0)

# 'c_v' coherence of the selected model: per-topic scores first, then the overall score.
_cv_kwargs = dict(model=final_model, texts=texts, dictionary=dictionary, coherence='c_v')
cm_final = CoherenceModel(**_cv_kwargs)
coh_per_topic, overall_cv = cm_final.get_coherence_per_topic(), cm_final.get_coherence()

# Helper for the tidy summary table
def top_terms(model, topic_id, topn=8):
    """Return the ``topn`` leading words of one topic as a comma-separated string.

    ``model`` must provide ``show_topic(topic_id, topn=...)`` yielding
    ``(word, weight)`` pairs; the weights are discarded.
    """
    words = (word for word, _weight in model.show_topic(topic_id, topn=topn))
    return ", ".join(words)

# One row per topic: id, leading terms, prevalence and per-topic Cv coherence.
topic_ids = list(range(best_k))
summary = pd.DataFrame({
    "topic_id": topic_ids,
    "top_terms": [top_terms(final_model, tid, topn=8) for tid in topic_ids],
    "prevalence": [prevalence[tid] for tid in topic_ids],
    "coherence_Cv": coh_per_topic,
})

# Most prevalent topics first, with a fresh 0..n-1 index.
summary = summary.sort_values("prevalence", ascending=False).reset_index(drop=True)

# Render both numeric columns as fixed three-decimal strings for printing
# (done after sorting, so the sort is numeric).
for metric_col in ("prevalence", "coherence_Cv"):
    summary[metric_col] = summary[metric_col].map("{:.3f}".format)

print(f"\nFinal model (k={best_k}) overall coherence Cv = {overall_cv:.3f}")
print(summary.to_string(index=False))


In [0]:

# Vocabulary and bag-of-words corpus built from the preprocessed comment tokens.
processed_docs = df["processed_text_new"]
post_dictionary = corpora.Dictionary(processed_docs)
post_corpus = list(map(post_dictionary.doc2bow, processed_docs))

# Fixed-size LDA model; the topic count here is set by hand (adjust as needed).
lda_params = dict(
    corpus=post_corpus,
    id2word=post_dictionary,
    num_topics=6,
    random_state=42,
    passes=10,
)
lda_model_posts = LdaModel(**lda_params)

# Display the word mixture of every topic found in the YouTube comments.
print("\n🔹 Top Topics in Youtube Comments:")
for idx, topic in lda_model_posts.print_topics(-1):
    print(f"Topic {idx}: {topic}")