In [None]:
from vertexai.generative_models import GenerativeModel
import vertexai

import extraction_prompts as prompts

from tenacity import retry, stop_after_attempt, wait_exponential
from concurrent.futures import ThreadPoolExecutor
from IPython.display import Markdown 
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import datetime
import json
import time
import re

In [None]:
# Vertex AI project and region passed to vertexai.init().
PROJECT_ID = "sharp-airway-408502"
LOCATION = "us-central1"

# Episode participants; substituted for <host> / <guest> in the prompt templates.
HOST = "Dwarkesh Patel"
GUEST = "Leopold Aschenbrenner"

# Episode air date and the date of this run, both as YYYY-MM-DD strings.
AIR_DATE = "2024-06-04"
DATE = datetime.date.today().isoformat()

In [None]:
# Point the Vertex AI SDK at the configured project and region.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Single model handle shared by every generate_content call in this notebook.
model = GenerativeModel(model_name="gemini-1.5-flash-001")

# Load the raw transcript text. The encoding is pinned to UTF-8 so the file is
# decoded the same way on every platform instead of using the locale default.
with open("test.txt", "r", encoding="utf-8") as f:
    transcript = f.read()

In [None]:
# Fill the run-specific placeholders of the shared system prompt.
# Substitutions are applied one after another, in the order listed.
system_message = prompts.system_message
for placeholder, value in (
    ("<date>", DATE),
    ("<air_date>", AIR_DATE),
    ("<guest>", GUEST),
    ("<host>", HOST),
):
    system_message = system_message.replace(placeholder, value)

In [None]:
# A transcript turn is a header line "<word> <word> HH:MM:SS" followed by its text.
# The text is matched lazily up to the next header line or the end of the input.
pattern = re.compile(r'(\w+\s\w+)\s(\d{2}:\d{2}:\d{2})\n([\s\S]+?)(?=\n\w+\s\w+\s\d{2}:\d{2}:\d{2}|$)')

# findall returns one (speaker, start_time, text) tuple per turn.
matches = pattern.findall(transcript)

# One record per turn, with surrounding whitespace trimmed from the text.
transcript_entries = [
    {'speaker': speaker, 'start_time': start_time, 'text': text.strip()}
    for speaker, start_time, text in matches
]

df = pd.DataFrame(transcript_entries)

# Chunks are built around consecutive turns of this speaker (the guest).
second_speaker = GUEST

# Parallel lists: who spoke each turn, and the turn rendered as "<speaker>\n<text>".
speakers = [entry['speaker'] for entry in transcript_entries]
rendered_turns = [f"{entry['speaker']}\n{entry['text']}" for entry in transcript_entries]
turn_count = len(rendered_turns)

chunks = []

cursor = 0
while cursor < turn_count:
    # Skip ahead until the guest starts speaking.
    if speakers[cursor] != second_speaker:
        cursor += 1
        continue

    # Walk backwards over the contiguous non-guest turns just before the guest.
    lead_start = cursor
    while lead_start > 0 and speakers[lead_start - 1] != second_speaker:
        lead_start -= 1

    # Extend forward over the guest's consecutive turns.
    guest_end = cursor
    while guest_end < turn_count and speakers[guest_end] == second_speaker:
        guest_end += 1

    # Then over the non-guest turns that follow, up to the guest's next turn.
    trail_end = guest_end
    while trail_end < turn_count and speakers[trail_end] != second_speaker:
        trail_end += 1

    # Preceding + guest + following turns form one contiguous slice. The trailing
    # non-guest turns are picked up again as the lead-in of the next chunk.
    chunks.append("\n".join(rendered_turns[lead_start:trail_end]))
    cursor = trail_end

In [None]:
# One row per chunk.
extraction_df = pd.DataFrame(chunks, columns=['chunk'])

# For every chunk build a "meta chunk": a window of up to five consecutive chunks
# joined by blank lines. The window is centred on the chunk (two on either side)
# and clamped so it never runs past the first or last chunk.
chunk_texts = extraction_df['chunk'].tolist()
last_window_start = max(len(chunk_texts) - 5, 0)

meta_chunks = []
for index in range(len(chunk_texts)):
    window_start = min(max(index - 2, 0), last_window_start)
    meta_chunks.append("\n\n".join(chunk_texts[window_start:window_start + 5]))

extraction_df['meta_chunk'] = meta_chunks

# Expose the positional index as an explicit chunk_id column.
extraction_df = extraction_df.reset_index().rename(columns={'index': 'chunk_id'})

In [None]:
# Belief extraction for one chunk, retried by tenacity with exponential backoff.
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def process_row(row):
    """Ask the model to extract beliefs from one chunk record.

    Args:
        row: mapping with at least the keys 'chunk' and 'meta_chunk' (strings).

    Returns:
        The parsed JSON response on success. If the call raises ValueError the
        string "Error: <message>" is returned; for any other exception the
        string "Unexpected Error: <message>" is returned.

    Raises:
        ValueError: when the response text is not valid JSON. Raising makes the
            surrounding @retry decorator attempt the call again.
    """
    task_prompt = prompts.belief_extraction
    for placeholder, value in (
        ("<chunk>", row['chunk']),
        ("<guest>", GUEST),
        ("<host>", HOST),
        ("<meta_chunk>", row['meta_chunk']),
    ):
        task_prompt = task_prompt.replace(placeholder, value)
    prompt = system_message + task_prompt

    try:
        response = model.generate_content(
            contents=prompt,
            generation_config={"response_mime_type": "application/json"},
        )
        return json.loads(response.text)
    except json.JSONDecodeError as e:
        # Malformed JSON: re-raise so the retry decorator tries again.
        raise ValueError("JSON decoding error, triggering retry") from e
    except ValueError as e:
        # Per the original note, presumably raised when the content is blocked;
        # reported as a string instead of failing the whole batch.
        return f"Error: {str(e)}"
    except Exception as e:
        # Any other failure is also reported as a string (best effort).
        return f"Unexpected Error: {str(e)}"

# Run belief extraction for every chunk on a thread pool, with a tqdm progress bar.
# process_row is the extraction function defined just above; process_chunk (the
# verification step) is only defined in a later cell and expects columns that do
# not exist yet, so it must not be used here.
# executor.map yields results in input order, so the list lines up with the rows.
with ThreadPoolExecutor(max_workers=50) as executor:
    extracted_beliefs = list(tqdm(executor.map(process_row, extraction_df.to_dict('records')), total=len(extraction_df)))

# Attach the raw extraction output (one entry per chunk) to the DataFrame.
extraction_df['extracted_beliefs'] = extracted_beliefs

In [None]:
# Give every element of a list-valued 'extracted_beliefs' cell its own row, then
# renumber the rows from zero.
exploded_df = extraction_df.explode('extracted_beliefs').reset_index(drop=True)

# Expose the new row number as an explicit belief_id column.
extraction_df = exploded_df.reset_index().rename(columns={'index': 'belief_id'})

In [None]:
# Flatten each extracted belief object into its own set of columns.
extracted_df = pd.json_normalize(extraction_df['extracted_beliefs'])

# Join the flattened columns back on: belief_id on the left is matched against
# the row index of extracted_df on the right.
extraction_df = extraction_df.merge(extracted_df, left_on='belief_id', right_index=True)

In [None]:
# Verification of one extracted belief, retried by tenacity with exponential backoff.
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def process_chunk(row):
    """Ask the model to evaluate one extracted belief for verification.

    Args:
        row: mapping with at least the keys 'chunk', 'meta_chunk' (strings) and
            'extracted_beliefs' (JSON-serialisable; embedded via json.dumps).

    Returns:
        The parsed JSON response on success, or None if the call raised
        ValueError or any other exception (the exception is printed).

    Raises:
        ValueError: when the response text is not valid JSON. Raising makes the
            surrounding @retry decorator attempt the call again.
    """
    task_prompt = prompts.verification_evaluation
    for placeholder, value in (
        ("<chunk>", row['chunk']),
        ("<belief>", json.dumps(row['extracted_beliefs'], indent=4)),
        ("<guest>", GUEST),
        ("<host>", HOST),
        ("<meta_chunk>", row['meta_chunk']),
    ):
        task_prompt = task_prompt.replace(placeholder, value)
    prompt = system_message + task_prompt

    try:
        response = model.generate_content(
            contents=prompt,
            generation_config={"response_mime_type": "application/json"},
        )
        return json.loads(response.text)
    except json.JSONDecodeError as e:
        print(e)
        # Malformed JSON: re-raise so the retry decorator tries again.
        raise ValueError("JSON decoding error, triggering retry") from e
    except ValueError as e:
        print(e)
        # Per the original note, presumably raised when the content is blocked.
        return None
    except Exception as e:
        print(e)
        # Any other failure: give up on this row without failing the batch.
        return None

# Only rows that actually carry a belief are sent for verification.
temp_df = extraction_df.dropna(subset=['belief']).copy()

# Run verification for every remaining belief on a thread pool, with a tqdm
# progress bar. The bar's total is the number of rows actually submitted
# (temp_df), not the pre-filter extraction_df, so it can reach 100%.
with ThreadPoolExecutor(max_workers=50) as executor:
    verification_list = list(tqdm(executor.map(process_chunk, temp_df.to_dict('records')), total=len(temp_df)))

# Attach the verification output (one entry per belief, in row order) to the DataFrame.
temp_df['verification_output'] = verification_list

In [None]:
# Flatten each verification response into columns. A plain list is passed so the
# result is positionally indexed (0..n-1); the index is then relabelled with the
# matching belief_id. Without this, gaps left in belief_id by the earlier dropna
# could pair a response with the wrong belief or drop rows in the merge below.
verification_df = pd.json_normalize(temp_df['verification_output'].tolist())
verification_df.index = temp_df['belief_id'].to_numpy()

# temp_df already has a 'belief' column; drop the echoed copy if the model
# returned one (errors='ignore' tolerates responses that did not include it).
verification_df = verification_df.drop(columns=['belief'], errors='ignore')

# Join on belief_id (left column) against the relabelled index (right).
extraction_df = pd.merge(temp_df, verification_df, left_on='belief_id', right_index=True)

In [None]:
# Keep only the beliefs whose 'verify' value compares equal to True, as an
# independent copy renumbered from zero.
needs_verification = extraction_df['verify'] == True
research_df = extraction_df[needs_verification].copy().reset_index(drop=True)

In [None]:
@retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=4, max=60))
def process_row(row):
    """Ask the model to generate hypotheses for one belief flagged for verification.

    Note: this definition reuses the name process_row and therefore replaces the
    belief-extraction function defined in an earlier cell.

    Args:
        row: mapping with the keys 'chunk', 'meta_chunk', 'belief', 'context',
            'justification' and 'verification_focus'. All values except
            'meta_chunk' are converted with str() before substitution.

    Returns:
        The parsed JSON response on success. If the call raises ValueError the
        string "Error: <message>" is returned; for any other exception the
        string "Unexpected Error: <message>" is returned.

    Raises:
        ValueError: when the response text is not valid JSON. Raising makes the
            surrounding @retry decorator attempt the call again.
    """
    # Placeholder substitutions, applied to the task template in this order.
    substitutions = (
        ("<chunk>", str(row['chunk'])),
        ("<meta_chunk>", row['meta_chunk']),
        ("<belief>", str(row['belief'])),
        ("<context>", str(row['context'])),
        ("<justification>", str(row['justification'])),
        ("<verification_focus>", str(row['verification_focus'])),
        ("<guest>", GUEST),
        ("<host>", HOST),
    )
    task_prompt = prompts.hypothesis_generation
    for placeholder, value in substitutions:
        task_prompt = task_prompt.replace(placeholder, value)
    prompt = system_message + task_prompt

    try:
        response = model.generate_content(
            contents=prompt,
            generation_config={"response_mime_type": "application/json"},
        )
        return json.loads(response.text)
    except json.JSONDecodeError as e:
        # Malformed JSON: re-raise so the retry decorator tries again.
        raise ValueError("JSON decoding error, triggering retry") from e
    except ValueError as e:
        # Per the original note, presumably raised when the content is blocked;
        # reported as a string instead of failing the whole batch.
        return f"Error: {str(e)}"
    except Exception as e:
        # Any other failure is also reported as a string (best effort).
        return f"Unexpected Error: {str(e)}"

# Generate hypotheses for every row of research_df on a thread pool, showing a
# tqdm progress bar while the results are collected in row order.
research_records = research_df.to_dict('records')
with ThreadPoolExecutor(max_workers=50) as executor:
    hypothesis_results = executor.map(process_row, research_records)
    hypotheses_list = list(tqdm(hypothesis_results, total=len(research_records)))

# Attach the generated hypotheses (one entry per belief) to the DataFrame.
research_df['hypotheses_list'] = hypotheses_list

In [None]:
# Give every element of a list-valued 'hypotheses_list' cell its own row, then
# renumber the rows from zero.
exploded_research_df = research_df.explode('hypotheses_list').reset_index(drop=True)

# Expose the new row number as an explicit hypothesis_id column.
research_df = exploded_research_df.reset_index().rename(columns={'index': 'hypothesis_id'})

In [None]:
# Flatten each hypothesis object into its own set of columns.
hypotheses_df = pd.json_normalize(research_df['hypotheses_list'])

# Join the flattened columns back on: hypothesis_id on the left is matched
# against the row index of hypotheses_df on the right.
research_df = research_df.merge(hypotheses_df, left_on='hypothesis_id', right_index=True)