# Summarise podcast transcripts

Prepare raw caption data (manual copy-past + data clean)

In [4]:
import pandas as pd

# Read the text file into a DataFrame
df = pd.read_csv('caption.txt', names=['Lines'])

# Split odd and even lines into separate columns
df_new = pd.DataFrame({
    'text': df[df.index % 2 != 0]['Lines'].reset_index(drop=True),
    'timestamp': df[df.index % 2 == 0]['Lines'].reset_index(drop=True)
})

print(df_new)

                                                   text timestamp
0     two one boom all right we're live thank you ve...      0:00
1     information and listening to you talk for uh q...      0:06
2     having me my pleasure my pleasure you are one ...      0:12
3     you are um you're deep in the tech world but y...      0:20
4     perspective in terms of how to live life as op...      0:26
...                                                 ...       ...
1310  actually okay just at naval then i have a webs...   2:11:25
1311  channel neval and i have a podcast in the worl...   2:11:30
1312                    thank you bye everybody [Music]   2:11:35
1313                                 [Applause] [Music]   2:11:41
1314                                          [Music] i   2:11:51

[1315 rows x 2 columns]


## SET UP

In [8]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.getenv('OPENAI_API_KEY')

from langchain.llms import OpenAI
llm = OpenAI(temperature=0, openai_api_key=openai.api_key)

### Quick start: 

In [26]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [16]:
captions_text = df_new['text'].str.cat(sep=' ')
llm.get_num_tokens(captions_text)

29533

In [27]:
# text_splitter = RecursiveCharacterTextSplitter()
text_splitter = CharacterTextSplitter()
texts = text_splitter.split_text(captions_text)


In [28]:
from langchain.docstore.document import Document

docs = [Document(page_content=t) for t in texts[:3]]

In [None]:
chain = load_summarize_chain(llm, chain_type="map_reduce")
chain.run(docs)

In [29]:
num_docs = len(docs)

num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)

print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")


Now we have 1 documents and the first one has 29533 tokens
