# Topic Modeling for Extracting YouTube Topics





In [1]:
# Widen the notebook's main container to 90% of the window so long outputs are easier to read.
from IPython.display import HTML, display

display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# !pip install langchain python-dotenv
# !pip install tiktoken

In [3]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import create_extraction_chain

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma, Pinecone

import os
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
# Chat model used by the summarize, extraction and timestamp chains below.
# The API key is read from the OPENAI_KEY environment variable (loaded from .env above).
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-0613",
    temperature=0,
    request_timeout=180,
    openai_api_key=os.getenv('OPENAI_KEY'),
)

In [5]:
# Three prepared transcripts are available; only the first one is processed below.
transcript_paths = [
    'Transcripts/MFMPod/mfm_pod_steph.txt',
    'Transcripts/MFMPod/mfm_pod_alex.txt',
    'Transcripts/MFMPod/mfm_pod_rob.txt'
]

# Read through the list above instead of repeating the literal path, so that
# switching episodes only means changing this index. The explicit encoding keeps
# the read independent of the platform's default text encoding
# (assumes the transcripts are UTF-8 -- TODO confirm).
with open(transcript_paths[0], encoding="utf-8") as file:
    transcript = file.read()

In [6]:
# Sanity check: show the first 280 characters of the loaded transcript
print(transcript[:280])

Shaan Puri (0:00:00-0:00:03): D to see hearing AIDS. I think that's actually going to be a big deal. 

Sam Parr (0:00:03-0:00:05): And they're profitable. 

Shaan Puri (0:00:05-0:00:08): I mean, I'm just turning you on. Yeah, they were. 

Sam Parr (0:00:12-0:00:13): They Mormon. 


Next we split the text into chunks. We do this because a transcript that is too long will not fit in the prompt.

In [11]:
# Splitter for the topic-extraction pass: large, overlapping chunks, with the
# separators tried in the order listed.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=10000,
    chunk_overlap=2200,
)

# Only the first `transcript_subsection_characters` characters of the transcript are processed.
transcript_subsection_characters = 23250
docs = text_splitter.create_documents(
    [transcript[:transcript_subsection_characters]]
)
print(f"You have {len(docs)} docs. First doc is {llm.get_num_tokens(docs[0].page_content)} tokens")

You have 3 docs. First doc is 2801 tokens


## Step 1: Extract Topic Titles & Short Description

First we will try to obtain a topic title and a short description from the LLM. I found that asking the LLM for a long description in one go was too much, so we will get a short description first and then expand on it.


In [7]:
# System instructions for the "map" step of the summarize chain; the prompt
# built here is handed to load_summarize_chain as `map_prompt`.
template = """
You are a helpful assistant that helps retrieve topics talked about in a podcast transcript
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Topics include:
  - Themes
  - Business Ideas
  - Interesting Stories
  - Money making businesses
  - Quick stories about people
  - Mental Frameworks
  - Stories about an industry
  - Analogies mentioned
  - Advice or words of caution
  - Pieces of news or current events
- Provide a brief description of the topics after the topic name. Example: 'Topic: Brief Description'
- Use the same words and terminology that is said in the podcast
- Do not respond with anything outside of the podcast. If you don't see any topics, say, 'No Topics'
- Do not respond with numbers, just bullet points
- Do not include anything about 'Marketing Against the Grain'
- Only pull topics from the transcript. Do not use the examples
- Make your titles descriptive but concise. Example: 'Shaan's Experience at Twitch' should be 'Shaan's Interesting Projects At Twitch'
- A topic should be substantial, more than just a one-off comment

% START OF EXAMPLES
 - Sam’s Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
 - Shaan’s Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
 - Revenge Against The Spam Calls: A couple of businesses focused on protecting consumers: RoboCall, TrueCaller, DoNotPay, FitIt
 - Wildcard CEOs vs. Prudent CEOs: However, Munger likes to surround himself with prudent CEO’s and says he would never hire Musk.
 - Chess Business: Priyav, a college student, expressed his doubts on the MFM Facebook group about his Chess training business, mychesstutor.com, making $12.5K MRR with 90 enrolled.
 - Restaurant Refiller: An MFM Facebook group member commented on how they pay AirMark $1,000/month for toilet paper and toilet cover refills for their restaurant. Shaan sees an opportunity here for anyone wanting to compete against AirMark.
 - Collecting: Shaan shared an idea to build a mobile only marketplace for a collectors’ category; similar to what StockX does for premium sneakers.
% END OF EXAMPLES
"""

# The transcript text itself is passed as the human message.
human_template = "Transcript: {text}"

system_message_prompt_map = SystemMessagePromptTemplate.from_template(template)
human_message_prompt_map = HumanMessagePromptTemplate.from_template(human_template)

# System + human message pair used as the map prompt.
chat_prompt_map = ChatPromptTemplate.from_messages(
    messages=[
        system_message_prompt_map,
        human_message_prompt_map,
    ]
)

Then we have our combine prompt which will run once over the results of the map prompt above

In [8]:
# System instructions for the "combine" step, which receives the bullet lists
# produced by the map prompt and merges them into one de-duplicated list.
# (Typos in the instruction text sent to the model have been corrected.)
template="""
You are a helpful assistant that helps retrieve topics talked about in a podcast transcript
- You will be given a series of bullet points of topics found
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Deduplicate any bullet points you see
- Only pull topics from the transcript. Do not use the examples

% START OF EXAMPLES
 - Sam’s Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
 - Shaan’s Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
% END OF EXAMPLES
"""
# Use combine-specific names so the map-step prompt objects
# (system_message_prompt_map / human_message_prompt_map) are not overwritten
# with combine-step content.
system_message_prompt_combine = SystemMessagePromptTemplate.from_template(template)

human_template="Transcript: {text}" # Just pass the text as a human message
human_message_prompt_combine = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt_combine = ChatPromptTemplate.from_messages(messages=[system_message_prompt_combine, human_message_prompt_combine])

### Extract the topics according to our custom prompts

In [15]:
# Map-reduce summarize chain wired to the two custom prompts defined above.
# Pass verbose=True as well when debugging.
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    map_prompt=chat_prompt_map,
    combine_prompt=chat_prompt_combine,
)

In [17]:
# Run the summarize chain over the transcript chunks; the result is the combined topic list.
topics_found = chain.run(input_documents=docs)

In [10]:
# Show the raw bullet list returned by the summarize chain
print(topics_found)

- Hearing Aids Business: Shaan and Sam explore the potential profitability of the hearing aids industry.
- Children's Play Space Business: Shaan revisits a business idea about a membership-based children's play space.
- Steph Smith's Career: The hosts discuss Steph Smith's career progression, including her current role at Andreessen Horowitz.
- Working at Andreessen Horowitz: Steph shares insights about her experience at Andreessen Horowitz, a leading VC firm.
- Office Culture: The trio discuss the differences between working in an office environment and working remotely.
- Sam's Master Plan at Facebook: Sam shares advice he gave to his wife Sarah about making an impact at Facebook.
- Shaan's Strategy at Twitch: Shaan recounts his networking strategy during his time at Twitch.
- Commercial Real Estate Crisis: The hosts discuss the high vacancy rates in commercial real estate, particularly in cities like San Francisco.
- Opportunity in Fractional Real Estate: Steph suggests that the com

### Structured Data - Turn the LLM output into structured data


In [11]:
# Schema describing one extracted topic; it is passed to create_extraction_chain
# to turn the bullet list into structured records.
schema = {
    "properties": {
        # The title of the topic
        "topic_name": {
            "type": "string",
            "description" : "The title of the topic listed"
        },
        # The description
        "description": {
            "type": "string",
            "description" : "The description of the topic listed"
        },
        # Coarse content category, restricted to the listed values
        "tag": {
            "type": "string",
            "description" : "The type of content being described",
            "enum" : ['Business Models', 'Life Advice', 'Health & Wellness', 'Stories']
        }
    },
    # Entries here must name keys of "properties": the title property is
    # "topic_name" (there is no "topic" property). "tag" stays optional.
    "required": ["topic_name", "description"],
}

In [12]:
# Extraction chain for the topic schema. Note: this rebinds `chain`, which
# previously held the summarize chain.
chain = create_extraction_chain(schema=schema, llm=llm)

In [13]:
# Run the extraction chain on the bullet list; the output shown below is a list
# of dicts with 'topic_name', 'description' and 'tag' keys.
topics_structured = chain.run(topics_found)

In [14]:
# Bare last expression: the notebook renders the structured topics as the cell output
topics_structured

[{'topic_name': 'Hearing Aids Business',
  'description': 'Shaan and Sam explore the potential profitability of the hearing aids industry.',
  'tag': 'Business Models'},
 {'topic_name': "Children's Play Space Business",
  'description': "Shaan revisits a business idea about a membership-based children's play space.",
  'tag': 'Business Models'},
 {'topic_name': "Steph Smith's Career",
  'description': "The hosts discuss Steph Smith's career progression, including her current role at Andreessen Horowitz.",
  'tag': 'Business Models'},
 {'topic_name': 'Working at Andreessen Horowitz',
  'description': 'Steph shares insights about her experience at Andreessen Horowitz, a leading VC firm.',
  'tag': 'Business Models'},
 {'topic_name': 'Office Culture',
  'description': 'The trio discuss the differences between working in an office environment and working remotely.',
  'tag': 'Business Models'},
 {'topic_name': "Sam's Master Plan at Facebook",
  'description': 'Sam shares advice he gave to 

Great, now we have our structured topics. Let's *expand* on those topics even more.

## Step 2: Expand on the topics you found



In [15]:
# Re-split the same transcript excerpt into smaller chunks for retrieval.
# Note: this rebinds `text_splitter` and `docs` from the topic-extraction step.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=800)

docs = text_splitter.create_documents([transcript[:transcript_subsection_characters]])

# `llm3` is not defined anywhere in the visible notebook, so this line raised a
# NameError on a fresh kernel; count tokens with the `llm` chat model instead.
print (f"You have {len(docs)} docs. First doc is {llm.get_num_tokens(docs[0].page_content)} tokens")

You have 8 docs. First doc is 776 tokens


Because I want to do Question & Answer Retrieval, we need to get embeddings for our documents so we can pull out the docs which are similar for context later.

In [16]:
# Embedding model for indexing the transcript chunks; the key is read from the
# OPENAI_KEY environment variable, as for the chat model.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.getenv('OPENAI_KEY'),
)

In [22]:
# Build a Chroma vector store from the chunks so similar passages can be retrieved later
docsearch = Chroma.from_documents(documents=docs, embedding=embeddings)

In [23]:
# System prompt for the topic-expansion step. The {context} placeholder is where
# the retrieved transcript chunks are inserted; {question} receives the topic query.
# (Fixed the "You goal" typo in the instruction text sent to the model.)
system_template = """
You will be given text from a podcast transcript which contains many topics.
Your goal is to write a summary (5 sentences or less) about a topic the user chooses
Do not respond with information that isn't relevant to the topic that the user gives you
----------------
{context}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]

# This will pull the two messages together and get them ready to be sent to the LLM through the retriever
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

In [24]:
# I'm using gpt4 for the increased reasoning power.
# `llm4` is not defined anywhere else in the visible notebook (NameError on a
# fresh kernel), so it is created here with the same settings as `llm`, changing
# only the model name.
llm4 = ChatOpenAI(temperature=0,
                  openai_api_key=os.getenv('OPENAI_KEY'),
                  model_name="gpt-4",
                  request_timeout = 180
                )

# I'm also setting k=4 so the number of relevant docs we get back is 4. This parameter should be tuned to your use case.
# `k` has to go through `search_kwargs`; passed as a bare keyword to
# as_retriever() it is not forwarded to the similarity search.
qa = RetrievalQA.from_chain_type(llm=llm4,
                                 chain_type="stuff",
                                 retriever=docsearch.as_retriever(search_kwargs={"k": 4}),
                                 chain_type_kwargs = {
#                                      'verbose': True,
                                     'prompt': CHAT_PROMPT
                                 })

Then let's iterate through the topics that we found and run our QA query on them.

This will print out our expanded topics.

In [25]:
# Only doing the first 5 for conciseness
for topic in topics_structured[:5]:
    # The query is the topic title plus its one-sentence description
    query = f"""
        {topic['topic_name']}: {topic['description']}
    """

    expanded_topic = qa.run(query)

    # Print the original one-liner followed by the expanded summary
    print(f"{topic['topic_name']}: {topic['description']}")
    print(expanded_topic)
    print ("\n\n")

Hearing Aids Business: Shaan and Sam explore the potential profitability of the hearing aids industry.
Shaan Puri and Sam Parr discussed the potential of the hearing aids industry, noting that it could be a profitable venture. They did not provide specific details about the industry but expressed optimism about its potential.



Children's Play Space Business: Shaan revisits a business idea about a membership-based children's play space.
Shaan Puri discussed a business idea he had previously mentioned about a children's play space. This business operates on a membership basis where parents pay a fee for their children to play with various toys inside the facility. However, Shaan clarified that he does not endorse this business idea, as he only had a single experience with it and does not know if it is profitable or not. He expressed concern that listeners might have taken his discussion as a recommendation and invested in similar franchises.



Steph Smith's Career: The hosts discuss S

## Extracting timestamps of chapters

In [26]:
# Prompt asking only for the first timestamp at which a given topic comes up.
# This rebinds `system_template`, `messages` and `CHAT_PROMPT` from the
# topic-expansion step.
system_template = """
What is the first timestamp when the speakers started talking about a topic the user gives?
Only respond with the timestamp, nothing else. Example: 0:18:24
----------------
{context}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]

CHAT_PROMPT = ChatPromptTemplate.from_messages(messages=messages)

In [27]:
# Retrieval QA chain for the timestamp prompt, using the `llm` chat model.
# This rebinds `qa` from the topic-expansion step.
# `k` has to go through `search_kwargs`; passed as a bare keyword to
# as_retriever() it is not forwarded to the similarity search.
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type="stuff",
                                 retriever=docsearch.as_retriever(search_kwargs={"k": 4}),
                                 chain_type_kwargs = {
#                                      'verbose': True,
                                     'prompt': CHAT_PROMPT
                                 })

In [28]:
# Ask the timestamp QA chain once per topic and collect
# "<timestamp> - <topic title>" strings.
topic_timestamps = []

for entry in topics_structured:
    title = entry['topic_name']
    first_mention = qa.run(f"{title} - {entry['description']}")
    topic_timestamps.append(f"{first_mention} - {title}")

In [29]:
def _timestamp_sort_key(entry):
    """Sort key that orders "<timestamp> - <title>" strings chronologically.

    The text before the first " - " is read as colon-separated integer fields
    (e.g. "0:18:24" or "18:24") and converted to a number of seconds, so that
    "10:00:00" sorts after "2:00:00" and a missing hour field is tolerated.
    A plain string sort gets both of those cases wrong.

    Entries whose leading text is not made of integer fields are placed after
    all parseable entries, in plain string order among themselves.
    """
    stamp = entry.split(" - ", 1)[0].strip()
    try:
        fields = [int(field) for field in stamp.split(":")]
    except ValueError:
        return (1, 0, entry)
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + field
    # The entry itself breaks ties, matching the previous string ordering
    # for equal timestamps.
    return (0, total_seconds, entry)

print ("\n".join(sorted(topic_timestamps, key=_timestamp_sort_key)))

0:00:00 - Hearing Aids Business
0:00:40 - Children's Play Space Business
0:04:24 - Office Culture
0:04:26 - Steph Smith's Career
0:05:27 - Working at Andreessen Horowitz
0:06:37 - Sam's Master Plan at Facebook
0:09:21 - Shaan's Strategy at Twitch
0:12:32 - Commercial Real Estate Crisis
0:12:32 - Opportunity in Fractional Real Estate
0:13:10 - Temple Immersive
0:14:56 - Rage Rooms
0:16:43 - Escape Room Business Success


In [None]:
# 0:00:00 - Hearing Aids Business
# 0:00:40 - Children's Play Space Business
# 0:04:24 - Office Culture
# 0:04:26 - Steph Smith's Career
# 0:05:27 - Working at Andreessen Horowitz
# 0:06:37 - Sam's Master Plan at Facebook
# 0:09:21 - Shaan's Strategy at Twitch
# 0:12:32 - Commercial Real Estate Crisis
# 0:12:32 - Opportunity in Fractional Real Estate
# 0:13:10 - Temple Immersive
# 0:14:56 - Rage Rooms
# 0:16:43 - Escape Room Business Success