# Setup

In [20]:
# Import everything and setup things
import sys
import os
file_folder = os.getcwd()
if "testing_folder" in file_folder:
    file_folder = os.path.dirname(file_folder)
    sys.path.append(file_folder)
    os.chdir(file_folder)

import datetime
import math
import time
import json
from youtube_transcript_api import YouTubeTranscriptApi
import tiktoken
import faiss
import numpy as np
import asyncio
import ipywidgets
from IPython.display import display, HTML
enc=tiktoken.get_encoding("cl100k_base")


keys=json.load(open("working_folder/keys.json"))

# Setup anthropic
import anthropic
client = anthropic.Anthropic(api_key=keys["anthropic"])

# Setup openai
from openai import OpenAI
os.environ["OPENAI_API_KEY"]=keys["openai"]
openai_client = OpenAI(api_key=keys["openai"])

# Get transcript, Define video

In [2]:
# Get raw transcript
video_id="FC3giY7YDAQ"
if os.path.isdir("working_folder")==False:
    os.mkdir("working_folder")
if os.path.isdir("working_folder/"+video_id)==False:
    os.mkdir("working_folder/"+video_id)

raw_transcript=YouTubeTranscriptApi.get_transcript(video_id)
transcript=""
nearest_times={}
for m in raw_transcript:
    print(m['text'], m["start"])
    transcript+=str(m['text'])+"\n"
    nearest_times[len(transcript)]=m["start"]
    # if len(transcript)>50000:
    #     break

# save as transcript.txt
with open("working_folder/transcript.txt","w") as f:
    f.write(transcript)

e 28.96
e 58.92
e 88.92
e 118.84
e 148.84
oh 164.56
hello 178.84
why is short uh leaving for 192.319
nebka uh so packing stuff and doing 196.0
[ __ ] The Simple Solution traffic seven 203.36
years old why'd you link this wait hello 205.76
hello okay 208.04
for how long uh just a day I'll be back 210.64
on the 212.319
12th here I have a video please watch 229.04
unfold the fundamental problem of 232.2
traffic on green the first car 234.239
accelerates and then the next and then 236.159
the next and then the next next and then 238.36
you only to catch the red had the cars 240.599
accelerated simultaneously you would 243.439
have made it through coordination not 245.28
cars is the problem because we are 248.0
monkey drivers with slow reaction times 250.239
and short attention spans even if we 252.56
tried getting everyone to press the 254.92
pedal on 3 2 1 now would be challenging 256.759
this discoordination limits how many 260.72
cars can get through an intersection and 262.479
when one

# Intialize Variables and Functions

In [42]:
# Cost calculation
input_cost=0
output_cost=0
total_cost=input_cost+output_cost
def get_cost(input_text,output_text):
    global input_cost
    global output_cost
    global total_cost
    input_cost+=len(enc.encode(input_text))*(3/1000000.0)
    output_cost+=len(enc.encode(output_text))*(15/1000000.0)
    total_cost=input_cost+output_cost

# make vector db
def make_vector_db(text_document):
    # Chunk text
    chunk_size=1000
    text_chunks=[text_document[i:i+chunk_size] for i in range(0, len(text_document), chunk_size)]
    print("Number of chunks: ",len(text_chunks))

    # Generate embeddings
    model="text-embedding-3-large"
    embeddings = []
    for chunk in text_chunks:
        response = client.embeddings.create(input=chunk, model=model)
        embeddings.append(response.data[0].embedding)
    embeddings=np.array(embeddings)
    print("Finished generating embeddings")

    # Make vector db
    vector_db=faiss.IndexFlatL2(embeddings.shape[1])
    vector_db.add(np.array(embeddings))

    # return text chunks and vector db
    return(text_chunks, embeddings, vector_db)

# Async function to fetch embeddings
async def fetch_embeddings_async(text_chunks, model):
    model="text-embedding-3-large"
    async def fetch_embedding(chunk):
        # Simulate an async call to the embeddings API
        return await asyncio.to_thread(openai_client.embeddings.create, input=chunk, model=model)

    responses = await asyncio.gather(*(fetch_embedding(chunk) for chunk in text_chunks))
    embeddings = [response.data[0].embedding for response in responses]
    return np.array(embeddings)

# make vector db
async def make_vector_db_fast(text_document):
    # Chunk text
    chunk_size=1000
    text_chunks=[text_document[i:i+chunk_size] for i in range(0, len(text_document), chunk_size)]
    print("Number of chunks: ",len(text_chunks))

    # Generate embeddings
    model="text-embedding-3-large"
    embeddings=await fetch_embeddings_async(text_chunks, model)
    print("Finished generating embeddings")

    # Make vector db
    vector_db=faiss.IndexFlatL2(embeddings.shape[1])
    vector_db.add(np.array(embeddings))

    # return text chunks and vector db
    return(text_chunks, embeddings, vector_db)


# vector search
def search_vector_db(query, k, vector_db):
    # Generate query embedding
    query_embedding = openai_client.embeddings.create(input=query,model="text-embedding-3-large").data[0].embedding
    query_embedding_np = np.array(query_embedding).astype('float32').reshape(1, -1)

    D, I = vector_db.search(query_embedding_np, k)
    return (D,I)

# Get time of transcript
def get_time_at_length_transcript(length):
    i=0
    while nearest_times.get(length-i,None)==None:
        i+=1
        if (length-i)<=0:
            return list(nearest_times.values())[0]

    return(nearest_times[length-i])

# Put hyperlink to time in text
def convert_to_html(text,start_second, index):
    lines = text.split('\n')
    len_so_far=0
    html_lines = []
    for line in lines:
        line = line.strip()
        if line:
            time_at_hyperlink=get_time_at_length_transcript(index*1000+len_so_far)+3
            hyperlinked_line = f'<a href="https://www.youtube.com/watch?v={video_id}#t={time_at_hyperlink}s"c">{line}</a>'
            html_lines.append(hyperlinked_line)
        len_so_far+=len(line)

    html_text = '<br>'.join(html_lines)
    return html_text

# Produce embedding search index

In [9]:
# Produce text chunks and vector db
text_chunks, embeddings, vector_db= await make_vector_db_fast(transcript)#[:10000])

# Save json text chunks
text_chunks_dict={}
index=0
for t_chunk in text_chunks:
    text_chunks_dict[str(index)]={"text":t_chunk, "start":get_time_at_length_transcript(transcript.find(t_chunk)), "end":get_time_at_length_transcript(transcript.find(t_chunk)+len(t_chunk))}
    index+=1
json.dump(text_chunks_dict,open("working_folder/"+video_id+"/text_chunks_dict.json","w"))

# Save vector db
faiss.write_index(vector_db, "working_folder/"+video_id+"/vector_db.index")

Number of chunks:  143
Finished generating embeddings


## Search index

In [34]:
text_chunks_dict["0"].keys()

dict_keys(['text', 'start', 'end'])

In [43]:
#load from text_chunks_dict
text_chunks_dict=json.load(open("working_folder/"+video_id+"/text_chunks_dict.json"))
D, I=search_vector_db("Remove moles",5,vector_db)

thtml=convert_to_html(text_chunks_dict[str(I[0][0])]["text"],text_chunks_dict[str(I[0][0])]["start"],I[0][0])
display(HTML(thtml))

# # print results
# print("https://www.youtube.com/watch?v="+video_id+"#t="+str(text_chunks_dict[str(I[0][0])]["start"])+"s")
# print(text_chunks_dict[str(I[0][0])]["text"])

# # produce html of the text so it can be displayed
# html_text=text_chunks_dict[str(I[0][0])]["text"]


# # get all indexes of "/n" in the text
# new_line_indexes = [i for i in range(len(html_text)) if html_text.startswith('\n', i)]
# print(new_line_indexes)


# #for i in range(5):
# #    html_text+=text_chunks_dict[str(I[0][i])]["text"]#+"<br>"
# #print(html_text)

# # save html
# with open("working_folder/"+video_id+"/search_results.html","w") as f:
#     f.write(html_text)


# # display text in html below

# display(HTML(html_text))

# Incremental Summarization


In [None]:
increment_chars=10000
char_start_index=0
segments=3
model_responses=[]

model_prompt="Your purpose is to take a transcript from a youtube streamer named Destiny and give a synopsis of the content and the sentiment/takes of the speaker. Include all of the topics even if they are covered briefly instead of just covering the main topic."

get_response=False


# get a certain number of segments
while (len(model_responses)<segments) and ((char_start_index+increment_chars)<=len(transcript)):
    input_transcript=transcript[char_start_index:char_start_index+increment_chars]

    conv_messages=[{"role": "user", "content": "Transcript: "+input_transcript}]
    bot_response=""

    # display start and endtime
    start_second_raw=get_time_at_length_transcript(char_start_index)
    hours = math.floor(start_second_raw / 3600)
    minutes = math.floor((start_second_raw % 3600) / 60)
    seconds = start_second_raw % 60

    # calculate end time
    end_second_raw=get_time_at_length_transcript(char_start_index+increment_chars)
    hours_end = math.floor(end_second_raw / 3600)
    minutes_end = math.floor((end_second_raw % 3600) / 60)
    seconds_end = end_second_raw % 60

    print(f"Start time {int(hours):02d}:{int(minutes):02d}:{seconds:06.3f}  End time {int(hours_end):02d}:{int(minutes_end):02d}:{seconds_end:06.3f}")

    

    if get_response:
        with client.messages.stream(
            max_tokens=2024,
            system=model_prompt,
            messages=conv_messages,
            #model="claude-3-opus-20240229",
            model="claude-3-sonnet-20240229",
        ) as stream:
            for text in stream.text_stream:
                bot_response+=text
                print(text, end="", flush=True)
        print()

        get_cost(input_transcript+model_prompt,bot_response)
        print("Cost: ",total_cost)

        model_responses.append({"bot": bot_response,"transcript": input_transcript})
    
    char_start_index+=increment_chars-300
        


# Async Summarization

In [30]:
# Produce summaries for each transcript segment asynchroneously
async def get_claude_responses(input_data):
    input_transcripts=[]
    input_indexes=[]
    for input_ in input_data:
        input_transcripts.append(input_[0])
        input_indexes.append(input_[1])
    

    model_prompt="Your purpose is to take a transcript from a youtube streamer named Destiny and give a synopsis of the content and the sentiment/takes of the speaker. Include all of the topics even if they are covered briefly instead of just covering the main topic."

    # Def synchronous api call
    def fetch_response(transcript,index):
        conv_messages=[{"role": "user", "content": "Transcript: "+transcript}]
        bot_response=""
        try:
            print("Sleeping for ",index)
            #time.sleep(index)
            with client.messages.stream(
                    max_tokens=2024,
                    system=model_prompt,
                    messages=conv_messages,
                    model="claude-3-sonnet-20240229",
                ) as stream:
                    for text in stream.text_stream:
                        bot_response+=text
        except:
            print("Error in api call ",index)
            time.sleep(10)
            with client.messages.stream(
                    max_tokens=2024,
                    system=model_prompt,
                    messages=conv_messages,
                    model="claude-3-sonnet-20240229",
                ) as stream:
                    for text in stream.text_stream:
                        bot_response+=text
            print("Retrying api call ",index)

        return [bot_response,index]
    
    # Create thread to run api call
    async def thread_fetch(transcript,index):
        thread = await asyncio.to_thread(fetch_response, transcript,index)
        return(thread)

    # Gather all the responses
    responses = await asyncio.gather(*(thread_fetch(in_data[0],in_data[1]) for in_data in input_data))
    return(responses)


In [41]:
# Setup Variables
increment_chars=10000
char_start_index=0
#segments=3
segments=100
model_responses=[]
tasks=[]
index=0

# get a certain number of segments
while (len(model_responses)<segments) and ((char_start_index+increment_chars)<=len(transcript)):
    input_transcript=transcript[char_start_index:char_start_index+increment_chars]
    conv_messages=[{"role": "user", "content": "Transcript: "+input_transcript}]
    bot_response=""

    # display start and endtime
    start_second_raw=get_time_at_length_transcript(char_start_index)
    hours = math.floor(start_second_raw / 3600)
    minutes = math.floor((start_second_raw % 3600) / 60)
    seconds = start_second_raw % 60

    # calculate end time
    end_second_raw=get_time_at_length_transcript(char_start_index+increment_chars)
    hours_end = math.floor(end_second_raw / 3600)
    minutes_end = math.floor((end_second_raw % 3600) / 60)
    seconds_end = end_second_raw % 60

    sf_str=f"Start time {int(hours):02d}:{int(minutes):02d}:{seconds:06.3f}  End time {int(hours_end):02d}:{int(minutes_end):02d}:{seconds_end:06.3f}"
    
    model_responses.append({"bot": "","transcript": input_transcript,"time_string":sf_str,"char_start_finsih_indexes":[char_start_index,char_start_index+increment_chars], "index":index})

    index+=1
    char_start_index+=increment_chars-300



# get approximate cost of run
prev_cost=total_cost
for m in model_responses:
    get_cost(m["transcript"],"a b c"*200)
print("Approximate cost: ",total_cost-prev_cost)

# Get user decision to proceed
proceed=input("Proceed with run? (y/n): ")
if proceed.lower()!="y":
    print("Run cancelled")
else:
    bot_responses=await get_claude_responses([[m["transcript"],m["index"]] for m in model_responses])
    for i in range(len(bot_responses)):
        model_responses[i]["bot"]=bot_responses[i][0]
        get_cost(model_responses[i]["transcript"],model_responses[i]["bot"])
    print(total_cost)

# save model responses to json
json.dump(model_responses,open("working_folder/"+video_id+"/model_responses.json","w"))


Approximate cost:  0.18804900000000013
Sleeping for Sleeping for  1
Sleeping for  2
 0
Sleeping for  3
Sleeping for  4
Sleeping for  5
Sleeping for  6
Sleeping for  7
Sleeping for  8
Sleeping for  9
Sleeping for  10
Sleeping for  11
Sleeping for  12
Sleeping for  13
2.0838569999999996


# Cost description
0.003 per kt input 
0.015 per kt output

typical input is a round 2.5kt, typical output is around 0.35kt
input $0.0075 and output $0.005 for 1 summary
input $0.11 and output $0.075 for 14 summaries totalling 3 hrs with about 15 minutes per segment
total $0.18 for 3 hrs

In [5]:
mrsl=json.load(open("working_folder/"+video_id+"/model_responses.json"))

In [6]:
all_summaries=""
for mr in mrsl:
    print(mr["time_string"])
    print(mr["bot"],"\n")
    all_summaries+=mr["time_string"]+"\n"+mr["bot"]+"\n\n"


Start time 00:00:28.960  End time 00:14:25.639
Here is a synopsis of the topics and sentiments expressed in the transcript:

Traffic and Solutions
- The video explains the "fundamental problem of traffic" at intersections - cars accelerating one after the other instead of simultaneously, leading to discoordination and traffic backups.
- It proposes some potential solutions:
1) People driving better by not tailgating and maintaining equal distance front and back. But admits getting humans to change behavior is difficult.
2) Self-driving cars that can accelerate/brake simultaneously and communicate with each other, eliminating need for intersections and traffic lights.
- The speaker seems to favor the self-driving car solution as a "systematized" fix rather than relying on changing human behavior.

Reaction/Debunking Video
- A separate video debunks and criticizes the "simple traffic solution" video, arguing it is misinformed and harmful.
- Main critique is that removing intersections, a

In [65]:
print(model_prompt)

NameError: name 'model_prompt' is not defined

In [None]:
model_prompt="Your purpose is to take a transcript from a youtube streamer named Destiny and give a synopsis of the content and the sentiment/takes of the speaker. Include all of the topics even if they are covered briefly instead of just covering the main topic."

In [66]:
meta_model_prompt="Your purpose is to take a conglomerate of summaries and compile it into one summary which provides a quick and effective way of knowing what things were talked about in the collection of summaries. The summaries are off of a youtube video transcript of a youtube streamer named Destiny."

In [68]:
# Meta summary
bot_response=""
with client.messages.stream(
        max_tokens=2024,
        system=meta_model_prompt,
        messages=[{"role":"user", "content": "Collection of summaries for the video/transcript: "+all_summaries}],
        model="claude-3-sonnet-20240229",
    ) as stream:
        for text in stream.text_stream:
            bot_response+=text

In [72]:
print("Before:",len(enc.encode(transcript)),"  After:",len(enc.encode(bot_response)))
print(bot_response)


Before: 35242   After: 362
Here is a summary compiling the key points from the collection of summaries:

This video/transcript covered a wide range of topics with Destiny sharing his thoughts and opinions. Some of the main topics included:

Traffic and Transportation Solutions
Destiny discussed the "fundamental problem of traffic" at intersections and entertained potential solutions like self-driving cars that could communicate and accelerate/brake in coordination to alleviate traffic issues. However, he acknowledged critiques that removing intersections could create major problems for pedestrians.

Candace Owens' Background and College Experience 
Destiny expressed significant skepticism about the details and truthfulness of Candace Owens' claims regarding her upbringing, college costs/debt, and ability to pay it off quickly after dropping out. He suggested her story seemed implausible or exceptionally atypical compared to normal college/debt experiences in the U.S.

Value of College 