In [1]:
import os
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
from google.cloud import bigquery
from langchain.text_splitter import RecursiveCharacterTextSplitter

ModuleNotFoundError: No module named 'openai'

In [11]:
load_dotenv()
client = OpenAI(api_key = os.getenv("OPENAI_API_KEY"))

In [12]:
with open("hl_podcast_42.txt", "r") as f:
    transcript = f.read()

In [13]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=500
)

documents = text_splitter.create_documents([transcript])

In [15]:
def chat_gpt_query(prompt):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content.strip()

In [34]:
def safe_split(result):
    if result is None or result == "":
        return []
    
    try:
        result_array = result.replace("\n", "").split("- ")
        if len(result_array) > 1:
            return result_array[1:]
        else:
            return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [49]:
class Prompts:
    @staticmethod
    def advice_prompt(content):
        prompt = f"""----- INSTRUCTION -----
In the following, I will provide you a part of a \
podcast-transcript. This podcast covers health-related topics. \
I need you to extract actionable advice from each transcript. \
Each advice should be formulated as an instruction. \
Advices should be enumerated with an '-'. \
If there is no actionable advice in the transcript, you should return 'No Advice'.
----- TRANSCRIPT ----
{content}
----- ADVICE -----"""
        return prompt
    
    @staticmethod 
    def summary_prompt(content):
        prompt = f"""----- INSTRUCTION -----
Summarize the following text in one word, or two words at max. 
----- TEXT ----
Here's the transcript: {content}
----- SUMMARY -----"""
        return prompt

In [48]:
print(Prompts.summary_prompt("lol"))

----- INSTRUCTION -----
Summarize the following text in one word, or two words at max. 
----- TEXT ----
Here's the transcript: lol
----- SUMMARY -----


In [50]:
instructions = []

for doc in tqdm(documents):
    content = doc.page_content
    prompt = Prompts.advice_prompt(content)
    
    result = chat_gpt_query(prompt)
    result_array = safe_split(result)
    instructions.extend(result_array)

100%|██████████| 22/22 [02:46<00:00,  7.57s/it]


In [53]:
raw_data = []

for ins in tqdm(instructions):
    prompt = Prompts.summary_prompt(ins)
    summary = chat_gpt_query(prompt)
    raw_data.append((summary, ins, "E7W4OQfJWdw"))

100%|██████████| 147/147 [01:29<00:00,  1.63it/s]


In [63]:
rows_to_insert = [{"index": index, "advice": advice, "video_id": video_id} for index, advice, video_id in raw_data]

In [64]:
rows_to_insert[:2]

[{'index': 'Brain foods',
  'advice': 'Pay attention to the foods that are good for your brain in terms of focus and brain health.',
  'video_id': 'E7W4OQfJWdw'},
 {'index': 'Food signals.',
  'advice': 'Be aware of the three major signals that drive food choices: subconscious signals from your gut, metabolically accessibility of the food for your brain, and the belief signal.',
  'video_id': 'E7W4OQfJWdw'}]

In [67]:
client = bigquery.Client()
table_id = ""

In [66]:
errors = client.insert_rows_json(table_id, rows_to_insert)

if errors == []:
    print("New rows have been added.")
else:
    print("Encountered errors while inserting rows: {}".format(errors))

New rows have been added.
