In [None]:
# Import necessary libraries
import datetime
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import pandas as pd
from IPython.display import Markdown
from tenacity import retry, stop_after_attempt, wait_exponential
from tqdm.notebook import tqdm
from vertexai.generative_models import GenerativeModel
import vertexai
import extraction_prompts as prompts

In [None]:
# Run configuration
# Vertex AI project and region used to initialise the SDK
PROJECT_ID = "sharp-airway-408502"
LOCATION = "us-central1"
# Participant names: substituted for <guest>/<host> in prompts; GUEST also selects the turns to chunk around
GUEST = "Leopold Aschenbrenner"
HOST = "Dwarkesh Patel"
# Episode air date (substituted for <air_date>) and today's local date as YYYY-MM-DD (substituted for <date>)
AIR_DATE = "2024-06-04"
DATE = datetime.date.today().isoformat()

In [None]:
# Initialize Vertex AI
# Point the SDK at the configured project/region, then create the Gemini model handle.
# `model` is the module-level object process_row calls for every generation request.
vertexai.init(project=PROJECT_ID, location=LOCATION)
model = GenerativeModel(model_name="gemini-1.5-flash-001")

# Load transcript
def load_transcript(file_path, encoding="utf-8"):
    """Read a transcript file and return its entire contents as one string.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The full file contents as a str.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in `encoding`.
    """
    with open(file_path, "r", encoding=encoding) as f:
        return f.read()

# Read the raw transcript text; the relative path is resolved against the kernel's working directory
transcript = load_transcript("test.txt")

In [None]:
# Extract transcript entries
def extract_transcript_entries(transcript):
    """Split a raw transcript into one record per speaker turn.

    A turn is a header line made of two words followed by an HH:MM:SS
    timestamp, then the text that runs until the next such header or the end
    of the input.

    Args:
        transcript: The full transcript as a single string.

    Returns:
        A list of dicts with keys 'speaker', 'start_time' and 'text' (text is
        stripped of surrounding whitespace), in order of appearance. Empty
        when no turn header is found.
    """
    turn_regex = re.compile(r'(\w+\s\w+)\s(\d{2}:\d{2}:\d{2})\n([\s\S]+?)(?=\n\w+\s\w+\s\d{2}:\d{2}:\d{2}|$)')
    entries = []
    for hit in turn_regex.finditer(transcript):
        speaker, start_time, body = hit.groups()
        entries.append({'speaker': speaker, 'start_time': start_time, 'text': body.strip()})
    return entries

# Parse the transcript into one record per speaker turn and tabulate them;
# each record carries the keys 'speaker', 'start_time' and 'text'
transcript_entries = extract_transcript_entries(transcript)
df = pd.DataFrame(transcript_entries)

# Identify speaker chunks
def identify_speaker_chunks(df, second_speaker):
    """Group transcript turns into context chunks centred on one speaker.

    For every consecutive run of turns by `second_speaker`, a chunk is built
    from (a) the other speakers' turns immediately before the run, back to the
    previous `second_speaker` turn, (b) the run itself, and (c) the other
    speakers' turns that follow, up to the next `second_speaker` turn. Turns
    between two runs therefore appear in both neighbouring chunks.

    Rows are read by position, so the frame may carry any index (for example
    after filtering) rather than only a default 0..n-1 index.

    Args:
        df: Frame with 'speaker' and 'text' columns, one row per turn in
            transcript order. An empty frame (even one without columns) is
            accepted.
        second_speaker: Speaker name the chunks are built around.

    Returns:
        A list of strings, one per run. Each turn is rendered as
        "<speaker>\\n<text>" and the turns of a chunk are joined with "\\n".
        Empty when `second_speaker` never appears.
    """
    if len(df) == 0:
        return []

    speakers = df['speaker'].tolist()
    turns = [f"{speaker}\n{text}" for speaker, text in zip(speakers, df['text'].tolist())]
    total = len(speakers)

    chunks = []
    pos = 0
    while pos < total:
        if speakers[pos] != second_speaker:
            pos += 1
            continue

        # Walk back over the other speakers' turns that precede this run.
        start = pos
        while start > 0 and speakers[start - 1] != second_speaker:
            start -= 1

        # Advance through the run itself, then through the turns that follow it.
        end = pos
        while end < total and speakers[end] == second_speaker:
            end += 1
        while end < total and speakers[end] != second_speaker:
            end += 1

        chunks.append("\n".join(turns[start:end]))
        # Resume at the next second_speaker turn (or the end of the frame).
        pos = end
    return chunks

# Build one context chunk per run of GUEST turns and store them in a single-column frame
chunks = identify_speaker_chunks(df, GUEST)
extraction_df = pd.DataFrame(chunks, columns=['chunk'])

In [None]:
# Create meta chunks
def create_meta_chunks(extraction_df, window_size=5):
    """Build, for every chunk, a wider context made of its neighbouring chunks.

    Each meta chunk is a window of `window_size` consecutive chunks joined
    with a blank line. The window is centred on the chunk where possible and
    clamped to the start or end of the frame near the edges, so every meta
    chunk holds `window_size` chunks whenever the frame has that many (and all
    chunks otherwise). With the default of 5 this is the two chunks before and
    after, falling back to the first or last five near the edges.

    Args:
        extraction_df: Frame with a 'chunk' column of strings.
        window_size: Number of chunks per meta chunk; must be at least 1.

    Returns:
        A list of strings, one per row of `extraction_df`, in row order.

    Raises:
        ValueError: If `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    total = len(extraction_df)
    half = window_size // 2
    # Latest position at which a full window still fits inside the frame.
    last_start = max(total - window_size, 0)

    meta_chunks = []
    for position in range(total):
        start = min(max(position - half, 0), last_start)
        window = extraction_df['chunk'].iloc[start:start + window_size]
        meta_chunks.append("\n\n".join(window))
    return meta_chunks

# Attach to every chunk its surrounding window of chunks as extra context for the model
extraction_df['meta_chunk'] = create_meta_chunks(extraction_df)
# Move the row index into an explicit 'chunk_id' column.
# NOTE(review): extraction_df is reassigned from itself here, so re-running this cell without
# rebuilding extraction_df looks like it would add a second 'chunk_id' column — confirm
extraction_df = extraction_df.reset_index().rename(columns={'index': 'chunk_id'})

In [None]:
# Updated process_row function with enhanced error handling and debugging details
def _prompt_text(value):
    """Coerce an optional row field to text for placeholder substitution.

    None and float NaN (how pandas marks a missing cell) become ''; strings
    pass through unchanged; anything else is converted with str().
    """
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return value if isinstance(value, str) else str(value)


@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def process_row(row, prompt_template, system_message):
    """Fill the prompt for one row, call the model, and return the parsed JSON.

    The prompt is `system_message` followed by `prompt_template` with its
    placeholders (<chunk>, <meta_chunk>, <belief>, <type>, <context>,
    <justification>, <verification_focus>, <guest>, <host>) replaced. The
    optional fields are read with row.get and normalised through
    _prompt_text, so a missing or NaN value is substituted as an empty string
    instead of raising TypeError from str.replace.

    Retry behaviour: the tenacity decorator calls the function again (up to 5
    attempts, exponential wait between 4 and 60 seconds) whenever an exception
    escapes it. Only the JSON-decoding failure is re-raised for that purpose;
    the other handlers report the problem and return None, so those failures
    are not retried.

    Args:
        row: Mapping with at least 'chunk' and 'meta_chunk' string entries.
        prompt_template: Prompt text containing the placeholders above.
        system_message: Text prepended to the filled template.

    Returns:
        The decoded JSON value of the model response, or None when a
        ValueError or another unexpected exception was caught and reported.

    Raises:
        ValueError: When the response text is not valid JSON (triggers a
            retry; tenacity raises its own error once attempts run out).
    """
    prompt = (
        system_message
        + prompt_template.replace("<chunk>", row['chunk'])
        .replace("<meta_chunk>", row['meta_chunk'])
        .replace("<belief>", _prompt_text(row.get('belief', '')))
        .replace("<type>", _prompt_text(row.get('type', '')))
        .replace("<context>", _prompt_text(row.get('context', '')))
        .replace("<justification>", _prompt_text(row.get('justification', '')))
        .replace("<verification_focus>", _prompt_text(row.get('verification_focus', '')))
        .replace("<guest>", GUEST)
        .replace("<host>", HOST)
    )
    # Bound before the try so the handlers can tell "no response" from "bad response".
    response = None
    try:
        response = model.generate_content(contents=prompt, generation_config={"response_mime_type": "application/json"})
        return json.loads(response.text)
    except json.JSONDecodeError:
        display(Markdown(f"JSON Decoding Error: Unable to decode the response as JSON. Response: {response.text}\n\nMeta Chunk:  \n{row['meta_chunk']}"))
        # Raised from inside the handler so it escapes the function and tenacity retries.
        raise ValueError("JSON decoding error, triggering retry")
    except ValueError as e:
        # The ValueError may come from generate_content itself, before any response exists.
        details = response.to_dict() if response is not None else "no response received"
        display(Markdown(f"Value Error: {e}\n\nResponse: {details}\n\nMeta Chunk:  \n{row['meta_chunk']}"))
        return None
    except Exception as e:
        display(Markdown(f"Unexpected Error: {e}\n\nMeta Chunk:  \n{row['meta_chunk']}"))
        return None

# Updated generate_responses function
def generate_responses(df, prompt_template, system_message, max_workers=5):
    """Run process_row over every row of `df` using a thread pool.

    Args:
        df: Frame whose rows are passed to process_row as dicts
            (df.to_dict('records')).
        prompt_template: Prompt template forwarded to process_row.
        system_message: System message forwarded to process_row.
        max_workers: Number of worker threads, i.e. the maximum number of
            model calls in flight at once. Defaults to 5.

    Returns:
        A list with one process_row result per row. Executor.map yields
        results in submission order, so the list lines up positionally with
        the rows of `df`.
    """
    rows = df.to_dict('records')

    def run_one(row):
        return process_row(row, prompt_template, system_message)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # tqdm wraps the lazy result iterator, so the bar advances as results are consumed in order.
        results = list(tqdm(executor.map(run_one, rows), total=len(rows)))
    return results

In [None]:
# Format system message
# Fill the <date>, <air_date>, <guest> and <host> placeholders of the system prompt template once;
# the resulting text is passed to generate_responses for every stage below.
system_message = prompts.system_message.replace("<date>", DATE).replace("<air_date>", AIR_DATE).replace("<guest>", GUEST).replace("<host>", HOST)

In [None]:
# Extract beliefs
# Run the belief extraction prompt on every chunk; results are stored positionally, one per row
# (presumably a list of belief objects per chunk — TODO confirm against prompts.belief_extraction)
extraction_df['extracted_beliefs'] = generate_responses(extraction_df, prompts.belief_extraction, system_message)
# Explode the list of extracted beliefs into separate rows, then renumber the rows 0..n-1 and expose
# that numbering as 'belief_id' so each belief has a unique id
# NOTE(review): process_row returns None on handled errors; those entries stay as missing values
# after explode — confirm the installed pandas' json_normalize tolerates them
extraction_df = extraction_df.explode('extracted_beliefs').reset_index(drop=True).reset_index().rename(columns={'index': 'belief_id'})
# Normalize the JSON structure of the extracted beliefs into a flat table (one row per belief)
extracted_df = pd.json_normalize(extraction_df['extracted_beliefs'])
# Merge the normalized fields back onto the main DataFrame: belief_id on the left is matched
# against the row index of extracted_df on the right
extraction_df = pd.merge(extraction_df, extracted_df, left_on='belief_id', right_index=True)

In [None]:
# Preview the last rows of the belief table as a quick check of the extraction output
display(extraction_df.tail())

In [10]:
# Process verification
# Drop rows where 'belief' is missing and create a copy for verification processing.
# After this, belief_id (and the row index) can have gaps.
temp_df = extraction_df.dropna(subset=['belief']).copy()
# Process each belief to evaluate whether it needs verification using the verification evaluation prompt
temp_df['verification_output'] = generate_responses(temp_df, prompts.verification_evaluation, system_message)
# process_row returns None when it handled an error; substitute an empty record so every belief
# still yields exactly one (all-missing) row in the normalized table
verification_records = [
    output if isinstance(output, dict) else {}
    for output in temp_df['verification_output']
]
# Normalize the verification output into a flat table and drop the redundant 'belief' column
# (errors='ignore' because the model may not have echoed it back)
verification_df = pd.json_normalize(verification_records).drop(columns=['belief'], errors='ignore')
# Label each verification row with the belief_id of the belief it was generated for. Without this
# the table is indexed 0..m-1 by position, which no longer matches belief_id once rows were dropped,
# and the merge below would pair verdicts with the wrong beliefs.
verification_df.index = temp_df['belief_id'].to_numpy()
# Merge the verification results back into the main DataFrame on belief_id
extraction_df = pd.merge(temp_df, verification_df, left_on='belief_id', right_index=True)
# Filter the DataFrame to keep only the beliefs that require verification and reset the index
research_df = extraction_df[extraction_df['verify'] == True].copy().reset_index(drop=True)

In [None]:
# Generate hypotheses
# Run the hypothesis generation prompt on each belief that requires verification; results are stored
# positionally, one per row (presumably a list of hypothesis objects per belief — TODO confirm
# against prompts.hypothesis_generation)
research_df['hypotheses_list'] = generate_responses(research_df, prompts.hypothesis_generation, system_message)
# Explode the list of generated hypotheses into separate rows, then renumber the rows 0..n-1 and
# expose that numbering as 'hypothesis_id' so each hypothesis has a unique id
research_df = research_df.explode('hypotheses_list').reset_index(drop=True).reset_index().rename(columns={'index': 'hypothesis_id'})
# Normalize the JSON structure of the generated hypotheses into a flat table (one row per hypothesis)
hypotheses_df = pd.json_normalize(research_df['hypotheses_list'])
# Merge the hypotheses back onto the main DataFrame: hypothesis_id on the left is matched against
# the row index of hypotheses_df on the right
# NOTE(review): any column name present in both frames gets merge suffixes (_x/_y) — confirm the
# hypothesis fields do not collide with existing belief columns
research_df = pd.merge(research_df, hypotheses_df, left_on='hypothesis_id', right_index=True)