# Setup

In [28]:
# Import everything and setup things
import sys
import os
file_folder = os.getcwd()
if "testing_folder" in file_folder:
    file_folder = os.path.dirname(file_folder)
    sys.path.append(file_folder)
    os.chdir(file_folder)

import datetime
import math
import time
import json
from youtube_transcript_api import YouTubeTranscriptApi
import tiktoken
import faiss
import numpy as np
import asyncio
enc=tiktoken.get_encoding("cl100k_base")


keys=json.load(open("working_folder/keys.json"))

# Setup anthropic
import anthropic
client = anthropic.Anthropic(api_key=keys["anthropic"])

# Setup openai
from openai import OpenAI
os.environ["OPENAI_API_KEY"]=keys["openai"]
client = OpenAI(api_key=keys["openai"])

# Get transcript, Define video

In [50]:
# Get raw transcript
video_id="FC3giY7YDAQ"
if os.path.isdir("working_folder")==False:
    os.mkdir("working_folder")
if os.path.isdir("working_folder/"+video_id)==False:
    os.mkdir("working_folder/"+video_id)

raw_transcript=YouTubeTranscriptApi.get_transcript(video_id)
transcript=""
nearest_times={}
for m in raw_transcript:
    print(m['text'], m["start"])
    transcript+=str(m['text'])+"\n"
    nearest_times[len(transcript)]=m["start"]
    if len(transcript)>50000:
        break

# save as transcript.txt
with open("working_folder/transcript.txt","w") as f:
    f.write(transcript)

e 28.96
e 58.92
e 88.92
e 118.84
e 148.84
oh 164.56
hello 178.84
why is short uh leaving for 192.319
nebka uh so packing stuff and doing 196.0
[ __ ] The Simple Solution traffic seven 203.36
years old why'd you link this wait hello 205.76
hello okay 208.04
for how long uh just a day I'll be back 210.64
on the 212.319
12th here I have a video please watch 229.04
unfold the fundamental problem of 232.2
traffic on green the first car 234.239
accelerates and then the next and then 236.159
the next and then the next next and then 238.36
you only to catch the red had the cars 240.599
accelerated simultaneously you would 243.439
have made it through coordination not 245.28
cars is the problem because we are 248.0
monkey drivers with slow reaction times 250.239
and short attention spans even if we 252.56
tried getting everyone to press the 254.92
pedal on 3 2 1 now would be challenging 256.759
this discoordination limits how many 260.72
cars can get through an intersection and 262.479
when one

# Intialize Variables and Functions

In [44]:
# Cost calculation
input_cost=0
output_cost=0
total_cost=input_cost+output_cost
def get_cost(input_text,output_text):
    global input_cost
    global output_cost
    global total_cost
    input_cost+=len(enc.encode(input_text))*(3/1000000.0)
    output_cost+=len(enc.encode(output_text))*(15/1000000.0)
    total_cost=input_cost+output_cost

# make vector db
def make_vector_db(text_document):
    # Chunk text
    chunk_size=1000
    text_chunks=[text_document[i:i+chunk_size] for i in range(0, len(text_document), chunk_size)]
    print("Number of chunks: ",len(text_chunks))

    # Generate embeddings
    model="text-embedding-3-large"
    embeddings = []
    for chunk in text_chunks:
        response = client.embeddings.create(input=chunk, model=model)
        embeddings.append(response.data[0].embedding)
    embeddings=np.array(embeddings)
    print("Finished generating embeddings")

    # Make vector db
    vector_db=faiss.IndexFlatL2(embeddings.shape[1])
    vector_db.add(np.array(embeddings))

    # return text chunks and vector db
    return(text_chunks, embeddings, vector_db)

# Async function to fetch embeddings
async def fetch_embeddings_async(text_chunks, model):
    model="text-embedding-3-large"
    async def fetch_embedding(chunk):
        # Simulate an async call to the embeddings API
        return await asyncio.to_thread(client.embeddings.create, input=chunk, model=model)

    responses = await asyncio.gather(*(fetch_embedding(chunk) for chunk in text_chunks))
    embeddings = [response.data[0].embedding for response in responses]
    return np.array(embeddings)

# make vector db
async def make_vector_db_fast(text_document):
    # Chunk text
    chunk_size=1000
    text_chunks=[text_document[i:i+chunk_size] for i in range(0, len(text_document), chunk_size)]
    print("Number of chunks: ",len(text_chunks))

    # Generate embeddings
    model="text-embedding-3-large"
    embeddings=await fetch_embeddings_async(text_chunks, model)
    print("Finished generating embeddings")

    # Make vector db
    vector_db=faiss.IndexFlatL2(embeddings.shape[1])
    vector_db.add(np.array(embeddings))

    # return text chunks and vector db
    return(text_chunks, embeddings, vector_db)


# vector search
def search_vector_db(query, k, vector_db):
    # Generate query embedding
    query_embedding = client.embeddings.create(input=query,model="text-embedding-3-large").data[0].embedding
    query_embedding_np = np.array(query_embedding).astype('float32').reshape(1, -1)

    D, I = vector_db.search(query_embedding_np, k)
    return (D,I)

# Get time of transcript
def get_time_at_length_transcript(length):
    i=0
    while nearest_times.get(length-i,None)==None:
        i+=1
        if (length-i)<=0:
            return list(nearest_times.values())[0]

    return(nearest_times[length-i])

# Produce embedding search index

In [51]:
# Produce text chunks and vector db
text_chunks, embeddings, vector_db= await make_vector_db_fast(transcript)#[:10000])

# Save json text chunks
text_chunks_dict={}
index=0
for t_chunk in text_chunks:
    text_chunks_dict[str(index)]={"text":t_chunk, "start":get_time_at_length_transcript(transcript.find(t_chunk)), "end":get_time_at_length_transcript(transcript.find(t_chunk)+len(t_chunk))}
    index+=1
json.dump(text_chunks_dict,open("working_folder/"+video_id+"/text_chunks_dict.json","w"))

# Save vector db
faiss.write_index(vector_db, "working_folder/"+video_id+"/vector_db.index")

Number of chunks:  51
Finished generating embeddings


## Search index

In [None]:
#load from text_chunks_dict
text_chunks_dict=json.load(open("working_folder/"+video_id+"/text_chunks_dict.json"))
D, I=search_vector_db("ADHD",5,vector_db)
print(text_chunks_dict[str(I[0][0])]["text"])

# Incremental Summarization


In [None]:
increment_chars=10000
char_start_index=0
segments=3
model_responses=[]

model_prompt="Your purpose is to take a transcript from a youtube streamer named Destiny and give a synopsis of the content and the sentiment/takes of the speaker. Include all of the topics even if they are covered briefly instead of just covering the main topic."

get_response=False


# get a certain number of segments
while (len(model_responses)<segments) and ((char_start_index+increment_chars)<=len(transcript)):
    input_transcript=transcript[char_start_index:char_start_index+increment_chars]

    conv_messages=[{"role": "user", "content": "Transcript: "+input_transcript}]
    bot_response=""

    # display start and endtime
    start_second_raw=get_time_at_length_transcript(char_start_index)
    hours = math.floor(start_second_raw / 3600)
    minutes = math.floor((start_second_raw % 3600) / 60)
    seconds = start_second_raw % 60

    # calculate end time
    end_second_raw=get_time_at_length_transcript(char_start_index+increment_chars)
    hours_end = math.floor(end_second_raw / 3600)
    minutes_end = math.floor((end_second_raw % 3600) / 60)
    seconds_end = end_second_raw % 60

    print(f"Start time {int(hours):02d}:{int(minutes):02d}:{seconds:06.3f}  End time {int(hours_end):02d}:{int(minutes_end):02d}:{seconds_end:06.3f}")

    

    if get_response:
        with client.messages.stream(
            max_tokens=2024,
            system=model_prompt,
            messages=conv_messages,
            #model="claude-3-opus-20240229",
            model="claude-3-sonnet-20240229",
        ) as stream:
            for text in stream.text_stream:
                bot_response+=text
                print(text, end="", flush=True)
        print()

        get_cost(input_transcript+model_prompt,bot_response)
        print("Cost: ",total_cost)

        model_responses.append({"bot": bot_response,"transcript": input_transcript})
    
    char_start_index+=increment_chars-300
        
