In [1]:
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound
from pytube import Playlist
from pytube import YouTube
import time
import umap.umap_ as UMAP
from bertopic import BERTopic
import nltk
from youtube_transcript_api._errors import TranscriptsDisabled
from bertopic.representation import KeyBERTInspired
import pandas as pd
import matplotlib.pyplot as plt
import plotly.io as pio
import plotly.graph_objects as go
from openai import OpenAI
import json
import os
import pickle
from datetime import datetime, timedelta
from dotenv import load_dotenv
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/connorgag/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## New York Times Podcasts Parameters

In [2]:
# num_videos = 582
# doc_size = 200
# number_of_topics = 50
# video_order = 'sequential'
# url_playlist = "https://www.youtube.com/playlist?list=PLdMrbgYfVl-s16D_iT2BJCJ90pWtTO1A4" # NYT
# channel_name = Playlist(url_playlist).owner

# transcript_location = f"data/transcript_list_{channel_name}.txt"
# timestamp_location = f"data/timestamp_list_{channel_name}.txt"
# bert_model_path = f"models/bertopic_model_{channel_name}_{number_of_topics}_topics"
# topic_prob_location = f"model_results/topic_probs_{channel_name}_{number_of_topics}_topics.pkl"

# print(f"Analyzing playlist from channel {channel_name}")


## PowerfulJRE Parameters

In [3]:
# Number of videos to take from the playlist (passed to read_or_create_transcripts)
num_videos = 2207
# Words per chunk when transcripts are split for BERTopic
doc_size = 200
# Passed to BERTopic as nr_topics when a new model is fitted
number_of_topics = 50
# 'sequential' walks the playlist in order; 'sample' takes every k-th video
video_order = 'sample'
url_playlist = "https://www.youtube.com/playlist?list=PLk1Sqn_f33KuWf3tW9BBe_4TP7x8l0m3T" # JRE
# The playlist owner's name is embedded in every cache/model file name below
channel_name = Playlist(url_playlist).owner

# Cache locations for the transcripts, their timestamps, the fitted model,
# and the pickled (topics, probs) pair
transcript_location = f"data/transcript_list_{channel_name}.txt"
timestamp_location = f"data/timestamp_list_{channel_name}.txt"
bert_model_path = f"models/bertopic_model_{channel_name}_{number_of_topics}_topics"
topic_prob_location = f"model_results/topic_probs_{channel_name}_{number_of_topics}_topics.pkl"

print(f"Analyzing playlist from channel {channel_name}")


Analyzing playlist from channel PowerfulJRE


## Pull (and clean) YouTube transcript data

In [4]:
def create_transcripts_from_playlist(playlist_url, num_videos=0, order='sequential'):
    """Download transcripts and publish dates for videos in a YouTube playlist.

    Args:
        playlist_url: URL of the playlist; must contain the substring 'playlist'.
        num_videos: How many videos to take. Values larger than the playlist
            are clamped to the playlist size; 0 or less selects no videos.
        order: 'sequential' takes the first ``num_videos`` videos in playlist
            order; 'sample' takes ``num_videos`` videos evenly spaced through
            the playlist.

    Returns:
        ``(transcripts, transcript_timestamps)``: two parallel lists holding
        the full transcript text and the publish date (possibly ``None``) of
        every video whose transcript could be fetched. Returns ``None`` when
        the URL is not a playlist URL or ``order`` is not recognised.
    """
    if 'playlist' not in playlist_url:
        print("Error: This is not a playlist URL")
        return None

    # Validate the order before doing any network work
    if order not in ('sequential', 'sample'):
        print("Invalid Order")
        return None

    # Retrieve URLs of videos from playlist
    playlist = Playlist(playlist_url)
    playlist_urls = list(playlist.video_urls)
    print('Going through the first ' + str(num_videos) + ' videos in the playlist with %s' % len(playlist_urls) + ' videos')

    # Never ask for more videos than exist (a 'sample' step of 0 would crash)
    wanted = max(0, min(num_videos, len(playlist_urls)))
    if wanted == 0:
        urls = []
    elif order == 'sequential':
        urls = playlist_urls[:wanted]
    else:
        every_k_videos = len(playlist_urls) // wanted
        urls = playlist_urls[0:every_k_videos * wanted:every_k_videos]

    transcripts = []
    transcript_timestamps = []

    start_time = time.time()
    for video_num, youtube_url in enumerate(urls, start=1):
        print(f"Starting {youtube_url}")
        if video_num % 100 == 0:
            print(f"On Video Number {video_num}")

        youtube_id = youtube_url[youtube_url.index('watch?v=') + len('watch?v='):]
        try:
            transcript_dict = YouTubeTranscriptApi.get_transcript(youtube_id)
            publish_date = YouTube(youtube_url).publish_date
        except (TranscriptsDisabled, NoTranscriptFound):
            print(f"Skipping this video: {youtube_url}")
            continue

        # Append to both lists together so they always stay aligned
        transcripts.append("".join(" " + entry['text'] for entry in transcript_dict))
        transcript_timestamps.append(publish_date)

    elapsed = time.time() - start_time
    print(f"Total time {elapsed} seconds")
    if urls:
        print(f"Time per video was {elapsed / len(urls)} seconds")

    return transcripts, transcript_timestamps


Some of the publish dates are missing, but we know the videos are in chronological order, so we can estimate each missing date from the known dates around it.

In [5]:
def date_midpoint(date1, date2):
    """Return the moment halfway between two datetimes, in either order.

    The result is truncated to whole seconds (microseconds are zeroed).
    """
    earlier, later = min(date1, date2), max(date1, date2)
    halfway = earlier + (later - earlier) / 2
    return halfway.replace(microsecond=0)

# Fill in Null timestamps
def clean_timestamps(transcript_timestamps):
    """Replace ``None`` entries with dates estimated from their neighbours.

    The list is modified in place and also returned.

    * A missing first entry takes the first known date.
    * A missing entry with a known date somewhere after it becomes the
      midpoint between the previous (already filled) entry and that next
      known date.
    * A missing entry with no known date after it repeats the previous entry.

    Args:
        transcript_timestamps: list of datetimes, possibly containing ``None``.

    Returns:
        The same list object, with every ``None`` replaced.

    Raises:
        ValueError: if the list is non-empty but contains no known date, so
            nothing can be estimated.
    """
    total = len(transcript_timestamps)
    if total == 0:
        return transcript_timestamps
    if all(stamp is None for stamp in transcript_timestamps):
        raise ValueError("Cannot estimate timestamps: every timestamp is missing")

    # next_known[pos] is the nearest originally-known date at or after pos
    # (None if there is none). Built in one backwards pass so filling the
    # gaps is linear rather than rescanning the list for every gap.
    next_known = [None] * total
    upcoming = None
    for pos in range(total - 1, -1, -1):
        if transcript_timestamps[pos] is not None:
            upcoming = transcript_timestamps[pos]
        next_known[pos] = upcoming

    for pos in range(total):
        if transcript_timestamps[pos] is not None:
            continue
        if pos == 0:
            # Nothing before the first element: use the first known date
            transcript_timestamps[pos] = next_known[pos]
        elif next_known[pos] is None:
            # Nothing known afterwards: carry the previous date forward
            transcript_timestamps[pos] = transcript_timestamps[pos - 1]
        else:
            transcript_timestamps[pos] = date_midpoint(transcript_timestamps[pos - 1], next_known[pos])

    return transcript_timestamps


In [6]:
# Either read from past files or create new transcripts and timestamps
def read_or_create_transcripts(url_playlist, num_videos, video_order, transcript_location, timestamp_location):
    """Load cached transcripts and timestamps, or download and cache them.

    If either cache file is missing, the transcripts are downloaded with
    ``create_transcripts_from_playlist``, missing dates are filled in with
    ``clean_timestamps``, and both lists are written to disk (one entry per
    line). Otherwise both lists are read back from the cache files.

    Args:
        url_playlist: playlist URL to download from when no cache exists.
        num_videos: number of videos to request.
        video_order: 'sequential' or 'sample'.
        transcript_location: path of the transcript cache file.
        timestamp_location: path of the timestamp cache file (ISO format).

    Returns:
        ``(transcript_list, transcript_timestamps)`` as parallel lists.

    Raises:
        ValueError: if the transcripts cannot be created, or if the cached
            files do not contain the same number of entries.
    """
    cache_is_complete = os.path.exists(transcript_location) and os.path.exists(timestamp_location)

    if not cache_is_complete:
        created = create_transcripts_from_playlist(url_playlist, num_videos=num_videos, order=video_order)
        if created is None:
            # The downloader reports bad input by returning None
            raise ValueError(f"Could not create transcripts from {url_playlist!r} with order {video_order!r}")
        transcript_list, transcript_timestamps = created
        transcript_timestamps = clean_timestamps(transcript_timestamps)

        # Captions can contain newlines; collapse all whitespace so each
        # transcript occupies exactly one line of the cache file and the
        # returned text matches what a later cached read would return.
        transcript_list = [" ".join(text.split()) for text in transcript_list]

        for path in (transcript_location, timestamp_location):
            os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

        # Write
        with open(transcript_location, "w", encoding="utf-8") as file:
            file.writelines(line + "\n" for line in transcript_list)

        print(f"Strings have been written to {transcript_location}")

        # Save the timestamps
        with open(timestamp_location, "w", encoding="utf-8") as file:
            file.writelines(ts.isoformat() + "\n" for ts in transcript_timestamps)

    else:
        # Read
        with open(transcript_location, "r", encoding="utf-8") as file:
            transcript_list = [line.strip() for line in file]

        with open(timestamp_location, "r", encoding="utf-8") as file:
            transcript_timestamps = [datetime.fromisoformat(line.strip()) for line in file]

        if len(transcript_list) != len(transcript_timestamps):
            raise ValueError(
                f"Cached files are out of sync: {len(transcript_list)} transcripts in "
                f"{transcript_location} but {len(transcript_timestamps)} timestamps in "
                f"{timestamp_location}. Delete both files to rebuild the cache."
            )

    return transcript_list, transcript_timestamps

# Load the transcripts and timestamps from the cache files, or download them if the files are missing
transcript_list, transcript_timestamps = read_or_create_transcripts(url_playlist, num_videos, video_order, transcript_location, timestamp_location)

## Split transcripts into batches

We now need to split each transcript into chunks for BERTopic, which works best on documents between a sentence and a paragraph long.
We have a lot of data, so we split each transcript into chunks of roughly 200 words (`doc_size`).

In [7]:
def split_doc(doc, words_per_doc):
    """Split a document into chunks of roughly ``words_per_doc`` words.

    The document is cut into ``len(words) // words_per_doc`` chunks and the
    leftover words are spread evenly over them, so every word of the input
    ends up in exactly one chunk and chunk sizes differ by at most one word.

    Args:
        doc: text to split; words are separated by whitespace.
        words_per_doc: target (minimum) number of words per chunk.

    Returns:
        A list of chunk strings. A document shorter than ``words_per_doc``
        is returned as a single chunk.

    Raises:
        ValueError: if ``words_per_doc`` is not positive.
    """
    if words_per_doc <= 0:
        raise ValueError("words_per_doc must be a positive integer")

    words = doc.split()
    k = len(words) // words_per_doc

    # If the video isn't long enough to be split into groups, just return all of the words
    if k == 0:
        return [' '.join(words)]

    # Each chunk gets base_size words; the first `extra` chunks get one more,
    # so the chunk sizes add up to len(words) and nothing is dropped.
    base_size, extra = divmod(len(words), k)

    documents = []
    start_index = 0
    for i in range(k):
        end_index = start_index + base_size + (1 if i < extra else 0)
        documents.append(' '.join(words[start_index:end_index]))
        start_index = end_index

    return documents

Each chunk also needs a timestamp, so we repeat every video's timestamp once for each chunk produced from its transcript.

In [8]:
def split_transcipts_and_timestamps(transcript_list, transcript_timestamps, words_per_doc):
    """Chunk every transcript and repeat its timestamp once per chunk.

    Args:
        transcript_list: list of transcript strings.
        transcript_timestamps: timestamps, indexed in parallel with
            ``transcript_list``.
        words_per_doc: chunk size passed to ``split_doc``.

    Returns:
        ``(all_transcripts, all_timestamps)``: the flattened list of chunks
        and a list of the same length holding each chunk's video timestamp.
    """
    all_transcripts = []
    all_timestamps = []
    for position, transcript in enumerate(transcript_list):
        chunks = split_doc(transcript, words_per_doc)
        # extend() appends in place; rebuilding the lists with `+` on every
        # iteration copies everything collected so far each time.
        all_transcripts.extend(chunks)
        all_timestamps.extend([transcript_timestamps[position]] * len(chunks))
    return all_transcripts, all_timestamps

In [9]:
# Chunk every transcript into doc_size-word pieces, keeping one timestamp per chunk
all_transcripts, all_timestamps = split_transcipts_and_timestamps(transcript_list, transcript_timestamps, doc_size)

## Fit BERTopic

### Removing Stop Words

We can attempt to remove stop words, but this does not improve the results. 
So we'll keep this commented out. 

In [10]:
# def remove_stopwords(text):
#     # We can get the stop words from NLTK and take them out
#     stop_words = set(stopwords.words('english'))
#     words = text.split()
#     return ' '.join([word for word in words if word.lower() not in stop_words])

# all_transcripts = [remove_stopwords(i) for i in all_transcripts]

# # The model does not work if the strings are too small
# all_transcripts = [i for i in all_transcripts if len(i.split()) > 100]


### Fitting model to data

In [11]:
def get_bert(bert_model_path, number_of_topics, all_transcripts, topic_prob_path=None):
    """Load a cached BERTopic model and its results, or fit and cache new ones.

    Args:
        bert_model_path: where the fitted model is (or will be) saved.
        number_of_topics: passed to BERTopic as ``nr_topics`` when fitting.
        all_transcripts: documents to fit the model on when no cache exists.
        topic_prob_path: where the pickled ``(topics, probs)`` pair is (or
            will be) stored. Defaults to the notebook-level
            ``topic_prob_location``.

    Returns:
        ``(topic_model, topics, probs)`` where ``topics`` holds one topic
        *name* (not id) per document and ``probs`` is whatever
        ``fit_transform`` returned alongside the topic ids.
    """
    if topic_prob_path is None:
        # Backward compatible: fall back to the notebook-level setting
        topic_prob_path = topic_prob_location

    # Only reuse the cache when BOTH the model and its results are present;
    # otherwise refit so the two can never come from different runs.
    if os.path.exists(bert_model_path) and os.path.exists(topic_prob_path):
        topic_model = BERTopic.load(bert_model_path)

        # Load topics and probs.
        # NOTE: pickle.load can execute arbitrary code; only load result
        # files that this notebook wrote itself.
        with open(topic_prob_path, "rb") as file:
            topics, probs = pickle.load(file)

    # No complete cache exists yet, so we'll create one
    else:
        if os.path.exists(bert_model_path):
            print(f"Found a model at {bert_model_path} but no results at {topic_prob_path}; refitting")

        # Create a BERTopic model
        topic_model = BERTopic(representation_model=KeyBERTInspired(), nr_topics=number_of_topics)

        # Fit the BERTopic model
        topics, probs = topic_model.fit_transform(all_transcripts)

        for path in (bert_model_path, topic_prob_path):
            os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        topic_model.save(bert_model_path)

        # Replace the number of the topic with the actual topic name.
        # Build the id -> name lookup once instead of filtering the
        # DataFrame for every document.
        topic_info = topic_model.get_topic_info()
        name_by_topic = dict(zip(topic_info['Topic'], topic_info['Name']))
        topics = [name_by_topic[topic_id] for topic_id in topics]

        # Save topics and probs
        with open(topic_prob_path, "wb") as file:
            pickle.dump((topics, probs), file)

    return topic_model, topics, probs

# Load the cached model if one exists at bert_model_path, otherwise fit a new one on the chunks
topic_model, topics, probs = get_bert(bert_model_path, number_of_topics, all_transcripts)

## Visualizations

In [12]:
# NOTE(review): custom_labels=True is requested, but no call that sets custom labels is
# visible before this cell — confirm which labels are actually shown.
topic_model.visualize_topics(custom_labels=True)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [13]:
# NOTE(review): as above, custom_labels=True is requested without a visible call that sets
# custom labels — confirm which labels are shown.
topic_model.visualize_hierarchy(custom_labels=True)

In [14]:
# Bar charts for the fitted topics, using the library's default arguments
topic_model.visualize_barchart() 

## Improve Topic Names

In [15]:
# Per-topic summary table; the cells below read its 'Topic', 'Name', 'Representation' and 'Count' columns
topic_df = topic_model.get_topic_info()

In [16]:
# Build the LLM prompt. Both lists are embedded as their Python string
# representation, and the model is asked to return a dictionary mapping each
# current topic name to a newly generated name, in a form json.loads can parse.
# The backslash at the end of each line continues the same string literal.
topic_names = str(topic_df['Name'].tolist())
topic_descriptions = str(topic_df['Representation'].tolist())
prompt = f"I will give you a list of lists called topic_descriptions. \
Each element in this list is a list of words that represent the topic. I will also given you a list of strings called topic_names. \
These are the current names of the topics. \
Take these topics and summarize them. Return a dictionary where the keys are the current names of the topics and the values \
are the new names of the topics that you generate.\n topic_names: {str(topic_names)} \n topic_descriptions: {str(topic_descriptions)} \
The result should be able to be put in json.loads(result)" 

An OpenAI API key is required for this part; it is read from the `OPENAIAPI_KEY` environment variable (for example, set in a `.env` file).

In [17]:
# Make variables from a local .env file available through os.getenv
load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAIAPI_KEY"))
GPT_MODEL = "gpt-4o-mini"
messages = [
        {"role": "user", "content": prompt},
    ]
# temperature=0 keeps the generated names as repeatable as the API allows
response = client.chat.completions.create(
        model=GPT_MODEL,
        messages=messages,
        temperature=0
    )
# content can be None (e.g. when the model returns no text), so normalise to a string
response_message = response.choices[0].message.content or ""
print(response_message)

# The model usually wraps the JSON in prose and a ```json fence, so parse only
# the span from the first "{" to the LAST "}". Stopping at the first "}" would
# truncate the object whenever a value itself contains a closing brace.
json_start = response_message.find("{")
json_end = response_message.rfind("}")
if json_start == -1 or json_end < json_start:
    raise ValueError("The model response does not contain a JSON object")
new_topic_names = json.loads(response_message[json_start:json_end + 1].replace("\n", ""))

Here is the summarized dictionary with new topic names based on the provided topic names and descriptions:

```json
{
    "-1_like_say_mean_something": "Communication and Meaning",
    "0_comedy_him_be_like": "Humor and Comedy",
    "1_ufc_fights_fight_like": "Mixed Martial Arts",
    "2_hunting_animals_animal_bear": "Hunting and Wildlife",
    "3_trump_thing_something_what": "Politics and Trump",
    "4_twitter_social_like_mean": "Social Media and Communication",
    "5_drugs_drug_marijuana_weed": "Substance Use and Drugs",
    "6_vaccine_vaccines_vaccinated_disease": "Vaccines and Health",
    "7_porsche_cars_car_tesla": "Automobiles and Performance Cars",
    "8_diet_eat_eating_protein": "Nutrition and Diet",
    "9_yoga_training_breathing_sauna": "Wellness and Yoga",
    "10_like_memory_dreams_think": "Memory and Cognition",
    "11_guns_cop_cops_police": "Law Enforcement and Firearms",
    "12_ancient_civilization_pyramids_egypt": "Ancient Civilizations",
    "13_hamas_gaza_cult_i

In [18]:
# Show the parsed mapping from BERTopic topic names to the generated names
new_topic_names

{'-1_like_say_mean_something': 'Communication and Meaning',
 '0_comedy_him_be_like': 'Humor and Comedy',
 '1_ufc_fights_fight_like': 'Mixed Martial Arts',
 '2_hunting_animals_animal_bear': 'Hunting and Wildlife',
 '3_trump_thing_something_what': 'Politics and Trump',
 '4_twitter_social_like_mean': 'Social Media and Communication',
 '5_drugs_drug_marijuana_weed': 'Substance Use and Drugs',
 '6_vaccine_vaccines_vaccinated_disease': 'Vaccines and Health',
 '7_porsche_cars_car_tesla': 'Automobiles and Performance Cars',
 '8_diet_eat_eating_protein': 'Nutrition and Diet',
 '9_yoga_training_breathing_sauna': 'Wellness and Yoga',
 '10_like_memory_dreams_think': 'Memory and Cognition',
 '11_guns_cop_cops_police': 'Law Enforcement and Firearms',
 '12_ancient_civilization_pyramids_egypt': 'Ancient Civilizations',
 '13_hamas_gaza_cult_israel': 'Middle Eastern Politics',
 '14_transgender_gender_trans_women': 'Gender Identity and Transgender Issues',
 '15_ukraine_putin_russia_russians': 'Geopolitic

In [19]:
# Translate each chunk's BERTopic topic name into its generated name. If the
# model omitted or altered a key, keep the original BERTopic name for that
# topic instead of failing with a KeyError.
new_topics = [new_topic_names.get(topic_name, topic_name) for topic_name in topics]

Removing an outlier date (2024-06-27), which has thousands of topic assignments associated with it.

Also removing topic -1, which BERTopic uses as a bucket for documents that don't fit any topic.

In [20]:
# Find the display name of topic -1. Select the row by its Topic value and
# take it positionally, so the lookup does not depend on the row's index
# label, and tolerate a model that produced no -1 topic at all.
unassigned_names = topic_df.loc[topic_df['Topic'] == -1, 'Name']
if unassigned_names.empty:
    meaningless_topic = None
else:
    unassigned_name = unassigned_names.iloc[0]
    meaningless_topic = new_topic_names.get(unassigned_name, unassigned_name)

# Timestamps and topics are matched by position, so they must line up exactly
if len(all_timestamps) != len(new_topics):
    raise ValueError(
        f"Got {len(all_timestamps)} timestamps but {len(new_topics)} topics; "
        "the cached model results may belong to a different set of transcripts"
    )

new_topics_no_outliers = []
new_timestamps_no_outliers = []
# Publish date whose chunks are excluded from the time series
outlier = datetime(2024, 6, 27, 0, 0)
for timestamp, topic_name in zip(all_timestamps, new_topics):
    if timestamp != outlier and topic_name != meaningless_topic:
        new_timestamps_no_outliers.append(timestamp)
        new_topics_no_outliers.append(topic_name)


## Evaluate BERTopic Performance

We can evaluate the performance by looking at how many batches in the dataset were classified within a topic. This does not evaluate the performance of BERTopic on grouping topics, but rather analyzes how well it was able to understand the data.

In [21]:
# Count the chunks assigned to topic -1. Summing the matching rows works
# whatever the row's index label is, and gives 0 when there is no -1 topic.
num_unclassified = topic_df.loc[topic_df['Topic'] == -1, 'Count'].sum()
total_batches = topic_df['Count'].sum()
# Share of chunks that were assigned to a real topic
classification_rate = ((total_batches - num_unclassified) / total_batches)
print(f"The classification rate of {channel_name} is {classification_rate*100:.2f}%.")

The classification rate of PowerfulJRE is 37.98%.


## Time Series Analysis

In [None]:
def plot_topics_interactive_ranked(all_timestamps, topics, granularity):
    """Show an interactive line chart of topic counts per time period.

    One trace is drawn per topic, ordered by the topic's total count. Only
    the most frequent topic is visible initially; the rest start as
    legend-only entries. The chart title reads the notebook-level
    ``channel_name``.

    Args:
        all_timestamps: one timestamp per chunk.
        topics: one topic name per chunk, parallel to ``all_timestamps``.
        granularity: 'year', 'month', or 'day'.

    Raises:
        ValueError: if ``granularity`` is not one of the supported values.
    """
    frame = pd.DataFrame(data={"Date": all_timestamps, "Topic": topics})
    frame["Date"] = pd.to_datetime(frame["Date"])

    # How to turn a date column into a period label for each granularity
    bucketers = {
        'year': lambda dates: dates.dt.year,
        'month': lambda dates: dates.dt.to_period('M').astype(str),
        'day': lambda dates: dates.dt.to_period('D').astype(str),
    }
    if granularity not in bucketers:
        raise ValueError("Granularity must be 'year', 'month', or 'day'")
    frame['Time'] = bucketers[granularity](frame['Date'])

    # Rows are periods, columns are topics, cells are chunk counts
    counts = frame.groupby(['Time', 'Topic']).size().reset_index(name='Count')
    counts_by_time = counts.pivot(index='Time', columns='Topic', values='Count').fillna(0)

    # Topics from most to least frequent overall
    ranked_totals = counts_by_time.sum().sort_values(ascending=False)
    period_label = granularity.capitalize()

    fig = go.Figure()
    for rank, topic in enumerate(ranked_totals.index):
        trace = go.Scatter(
            x=counts_by_time.index,
            y=counts_by_time[topic],
            mode='lines+markers',
            name=topic,
            visible=True if rank == 0 else "legendonly"
        )
        fig.add_trace(trace)

    fig.update_layout(
        title=f"Occurrences of Topics by {period_label} in {channel_name}",
        xaxis_title=period_label,
        yaxis_title="Number of Occurrences",
        legend_title="Topic (Ranked by Total Occurrences)",
        hovermode="x unified",
        template="plotly_white"
    )

    fig.show()

# Adjust granularity as needed ('year', 'month', or 'day'). JRE goes back to 2014;
# NYT doesn't go back very far, so we should stick with 'month' there.
plot_topics_interactive_ranked(new_timestamps_no_outliers, new_topics_no_outliers, granularity='month')