In [1]:
import os
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
from google.cloud import bigquery
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load settings from a .env file into the environment, then build the OpenAI
# client from the OPENAI_API_KEY variable (None is passed if it is unset).
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [3]:
# Read the entire transcript file into a single string.
# The encoding is pinned so decoding does not depend on the platform's default
# locale encoding (assumes the file was written as UTF-8 -- TODO confirm).
with open("hl_podcast_wmemory_stage_2.txt", "r", encoding="utf-8") as transcript_file:
    transcript = transcript_file.read()

In [4]:
# Configure the splitter with a chunk size of 5000 and an overlap of 500
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)

# Split the transcript into documents; each one is sent to the model separately later on.
documents = text_splitter.create_documents([transcript])

In [5]:
def chat_gpt_query(prompt, model="gpt-3.5-turbo"):
    """Send one user prompt to the OpenAI chat API and return the reply text.

    Relies on the module-level ``client``, which must be the OpenAI client
    when this function is called.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.
        model: Name of the chat model to query. Defaults to the model this
            notebook originally hard-coded.

    Returns:
        The content of the first choice with surrounding whitespace removed,
        or ``""`` when the response carries no text content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # The message content may be None; normalise it to an empty string so
    # callers always receive a str instead of hitting AttributeError here.
    content = response.choices[0].message.content
    return (content or "").strip()

In [6]:
def safe_split(result):
    """Parse a '-'-bulleted model reply into a list of advice strings.

    A bullet is a line that, after leading whitespace, starts with "- ".
    Parsing is line-based, so a dash inside an advice (e.g. "7 - 9 hours")
    no longer splits that advice into fragments. Lines that follow a bullet
    without starting a new one are treated as wrapped text and joined to the
    current advice with a single space. Text before the first bullet is
    ignored.

    Args:
        result: Raw reply text from the model; may be None or empty.

    Returns:
        A list of stripped, non-empty advice strings. Returns [] for empty or
        non-string input, for replies without bullets (such as the bare
        'No Advice' sentinel), and drops bullets that only contain that
        sentinel.
    """
    if not result:
        return []
    if not isinstance(result, str):
        # Keep the best-effort contract: report the problem, never raise.
        print(f"An error occurred: expected str, got {type(result).__name__}")
        return []

    items = []
    for raw_line in result.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line == "-" or line.startswith("- "):
            # New bullet: drop the marker and surrounding whitespace.
            items.append(line[1:].strip())
        elif items:
            # Wrapped continuation of the current bullet.
            items[-1] = f"{items[-1]} {line}".strip()
        # Otherwise: preamble before the first bullet -> ignored.

    # The prompt asks for 'No Advice' when nothing is actionable; the model
    # may still emit it as a bullet, which must not be stored as advice.
    return [
        item for item in items
        if item and item.strip(" .'\"").lower() != "no advice"
    ]

In [7]:
class Prompts:
    """Namespace of static builders for the prompts sent to the chat model."""

    @staticmethod
    def advice_prompt(content):
        """Build the prompt asking for '-'-enumerated actionable advice.

        Args:
            content: Transcript excerpt placed in the TRANSCRIPT section.

        Returns:
            The complete prompt string, ending with the ADVICE marker.
        """
        # The instruction is a single paragraph (one line in the final prompt).
        instruction = (
            "In the following, I will provide you a part of a "
            "podcast-transcript. This podcast covers health-related topics. "
            "I need you to extract actionable advice from each transcript. "
            "Each advice should be formulated as an instruction. "
            "Advices should be enumerated with an '-'. "
            "If there is no actionable advice in the transcript, "
            "you should return 'No Advice'."
        )
        sections = [
            "----- INSTRUCTION -----",
            instruction,
            "----- TRANSCRIPT ----",
            f"{content}",
            "----- ADVICE -----",
        ]
        return "\n".join(sections)

    @staticmethod
    def summary_prompt(content):
        """Build the prompt asking for a one- or two-word summary of a text.

        Args:
            content: Text placed in the TEXT section of the prompt.

        Returns:
            The complete prompt string, ending with the SUMMARY marker.
        """
        # Note: the instruction line ends with a space before its newline.
        return (
            "----- INSTRUCTION -----\n"
            "Summarize the following text in one word, or two words at max. \n"
            "----- TEXT ----\n"
            f"Here's the transcript: {content}\n"
            "----- SUMMARY -----"
        )

In [8]:
# Ask the model for advice on every transcript chunk and pool the parsed
# bullet items from all chunks into one flat list.
instructions = []

for doc in tqdm(documents):
    advice_reply = chat_gpt_query(Prompts.advice_prompt(doc.page_content))
    instructions += safe_split(advice_reply)

100%|██████████| 22/22 [01:54<00:00,  5.21s/it]


In [9]:
# Every advice gets a one/two-word summary from the model; each record is
# stored as a (summary, advice, video_id) tuple.
video_id = "CQlTmOFM4Qs"
raw_data = []

for advice_text in tqdm(instructions):
    topic = chat_gpt_query(Prompts.summary_prompt(advice_text))
    raw_data.append((topic, advice_text, video_id))

100%|██████████| 152/152 [01:18<00:00,  1.94it/s]


In [10]:
# Turn the (summary, advice, video_id) tuples into dict rows; the model's
# short summary is stored under the "index" key.
rows_to_insert = [
    {"index": topic, "advice": advice_text, "video_id": source_video}
    for topic, advice_text, source_video in raw_data
]

In [11]:
# Display the first two rows as a quick sanity check of the row layout.
rows_to_insert[:2]

[{'index': 'Working memory',
  'advice': 'Understand what working memory is and how it is related to attention.',
  'video_id': 'CQlTmOFM4Qs'},
 {'index': 'Memory improvement',
  'advice': 'Learn about tools and strategies to improve working memory.',
  'video_id': 'CQlTmOFM4Qs'}]

In [12]:
client = bigquery.Client()
table_id = "steam-378309.huberman.advice"

In [13]:
errors = client.insert_rows_json(table_id, rows_to_insert)

if errors == []:
    print("New rows have been added.")
else:
    print("Encountered errors while inserting rows: {}".format(errors))

New rows have been added.


In [19]:
job = client.query("select video_id, count(*) from `steam-378309.huberman.advice` group by video_id")

for result in job.result():
    print(*result)

E7W4OQfJWdw 147
CQlTmOFM4Qs 152
