# TOC at E-M  

## This is the master prompt engineering notebook for DestinyRecaps

In [3]:
# DIRECTORY SET
import os
import sys
from pathlib import Path
# Move to the project root, i.e. the directory that contains "destinyapp"
# (the prompt yaml path and the destinyapp imports are relative to it).
# The check makes the cell safe to re-run: taking `.parent` unconditionally
# climbs one directory higher on every execution.
base_dir=Path(os.getcwd())
if not (base_dir/"destinyapp").is_dir():
    base_dir=base_dir.parent
os.chdir(base_dir)

import yaml
# data path
prompt_yaml_path="destinyapp/prompter_.yaml"


In [4]:
# DJANGO SETUP
import django
# Make the current working directory importable (abspath('') resolves to the cwd).
sys.path.append(os.path.abspath(''))
# Point Django at the settings module unless the environment already names one.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "serverproject.settings")
# Initialise Django; this runs before the destinyapp imports that follow,
# which include a Django model (TranscriptData).
django.setup()

# Import custom modules
from destinyapp import views
from destinyapp.models import TranscriptData
from destinyapp.api import ServerAiFunctions as saf

# Import async modules
import asyncio
from asgiref.sync import sync_to_async

# Import display modules
from IPython.display import display, Markdown

# Import other modules
import faiss

# import reloading
from importlib import reload

Keys loaded


# Useful reusables

In [6]:
# Force a fresh import of ServerAiFunctions so edits to the module are picked up.
# pop() with a default avoids the KeyError that `del` raises when the module
# is not currently loaded.
sys.modules.pop('destinyapp.api.ServerAiFunctions', None)
import destinyapp.api.ServerAiFunctions as saf
# NOTE(review): the import above already executes the module from scratch, so this
# reload runs it a second time; kept to preserve the existing behaviour.
reload(saf)

async def run_systems(system_functions, input_contexts):
    """Run every system function against every input context concurrently.

    Args:
        system_functions: sequence of async callables; each one is invoked as
            ``system_function(**input_context)``.
        input_contexts: sequence of dicts, each holding the keyword arguments
            for one call.

    Returns:
        A list with one entry per system function (in the given order); each
        entry is the list of that function's results, one per input context
        (also in the given order).
    """
    async def _fan_out(system_function):
        # One awaitable per context; gather keeps the order of input_contexts.
        return await asyncio.gather(
            *(system_function(**context) for context in input_contexts)
        )

    # The outer gather runs all systems side by side and keeps their order.
    return await asyncio.gather(*(_fan_out(fn) for fn in system_functions))

# save to prompt yaml
def save_to_prompt_yaml(input_prompt_data):
    """Store ``input_prompt_data`` under the ``"working"`` key of the prompt yaml.

    The file at ``prompt_yaml_path`` is read, its ``"working"`` entry is
    replaced, and the whole document is written back; other top-level keys
    are kept.

    Args:
        input_prompt_data: value to store; it must be serialisable by ``yaml.dump``.
    """
    with open(prompt_yaml_path, 'r') as file:
        # yaml.load returns None for an empty document; fall back to a fresh
        # mapping so the assignment below does not fail with a TypeError.
        prompt_data=yaml.load(file, Loader=yaml.FullLoader) or {}
    prompt_data["working"]=input_prompt_data
    with open(prompt_yaml_path, 'w') as file:
        yaml.dump(prompt_data, file)

# TOC id=E-M
### Items
- Summary Chunks id=9xq
- Recap id=71.
    - Discord
    - Website
- Hook id=1m3
- Chat id=v1@

In [5]:
from moviepy.editor import AudioFileClip, concatenate_audioclips
import asyncio
import os

# delete previous audio file with 'raw' in the name
# audio_dir_files=os.listdir("workingaudio")
# for file_name in audio_dir_files:
#     if 'raw' in file_name:
#         os.remove("workingaudio/"+file_name)
#         break

# locate the inputs: the reference speaker clip and the downloaded audio
destiny_speech_path="workingaudio/destinyspeaking.mp3"
audio_dir_files=os.listdir("workingaudio")

# find the audio file with 'raw' in the name; fail with a clear message
# instead of a NameError further down when no download is present
youtube_audio_path=next(
    ("workingaudio/"+file_name for file_name in audio_dir_files if 'raw' in file_name),
    None,
)
if youtube_audio_path is None:
    raise FileNotFoundError("no file with 'raw' in its name found in workingaudio/")

# Concatenate the two audio files (speaker clip first, then the download)
destiny_speech = AudioFileClip(destiny_speech_path)
try:
    youtube_video = AudioFileClip(youtube_audio_path)
    try:
        merged_audio = concatenate_audioclips([destiny_speech, youtube_video])
        merged_audio.write_audiofile("workingaudio/merged_audio.mp3")
    finally:
        # release the reader so the downloaded file is not left open
        youtube_video.close()
finally:
    destiny_speech.close()
print("download thread finished")


MoviePy - Writing audio in workingaudio/merged_audio.mp3


                                                                          

MoviePy - Done.
download thread finished


In [7]:
# Write a shortened test fixture: the first 2000 seconds of the merged audio
merged_audio = AudioFileClip("workingaudio/merged_audio.mp3").subclip(0, 2000)
merged_audio.write_audiofile("workingaudio/testing_merged_audio.mp3")

MoviePy - Writing audio in workingaudio/testing_merged_audio.mp3


                                                                        

MoviePy - Done.


In [17]:
# Transcribe the full merged audio file for this video id.
yt_id="29ixeHeiLZI"
raw_transcript_data=await saf.assembly_transcript_generation(yt_id, "workingaudio/merged_audio.mp3")#os.path.join(output_folder,audio_file_name))
# Keep the raw result in a dict; the processing cell reads it back by this key.
save_raw_transcript_data={"raw_transcript_data": raw_transcript_data}
print("Raw Transcript Finished")


Starting assembly transcription thread
Finished assembly transcription thread
Raw Transcript Finished


In [18]:
# transcript_model_data=await saf.grab_transcript_data(yt_id)
# Re-use the raw transcript kept by the transcription cell.
raw_transcript_data=save_raw_transcript_data["raw_transcript_data"]
# Post-process the raw transcript; the result is treated as a dict below.
save_processed_transcripts=await saf.process_raw_transcript(raw_transcript_data, yt_id)
# Store the raw data alongside the processed output so both are saved together.
save_processed_transcripts["raw_transcript_data"]=raw_transcript_data

# save transcript data
await saf.save_data(yt_id, save_processed_transcripts)

{'text': 'looks', 'start': 186890, 'end': 187242, 'confidence': 0.93248, 'speaker': 'A', 'channel': None}
578
680
Destiny starts
Finished diarization cutoff
updating transcript data


In [8]:
# Transcribe the shortened test audio with the "testing" variant of the helper.
# NOTE(review): this cell appears twice in the notebook with identical code.
yt_id="29ixeHeiLZI"
raw_transcript_data=await saf.testing_assembly_transcript_generation(yt_id, "workingaudio/testing_merged_audio.mp3")#os.path.join(output_folder,audio_file_name))
# Keep the raw result in a dict; the processing cell reads it back by this key.
save_raw_transcript_data={"raw_transcript_data": raw_transcript_data}
print("Raw Transcript Finished")


Starting assembly transcription thread
Finished assembly transcription thread
Raw Transcript Finished


AttributeError: 'NoneType' object has no attribute 'raw_transcript_data'

In [None]:
# Transcribe the shortened test audio with the "testing" variant of the helper.
# NOTE(review): identical to the previous cell; one of the two can be removed.
yt_id="29ixeHeiLZI"
raw_transcript_data=await saf.testing_assembly_transcript_generation(yt_id, "workingaudio/testing_merged_audio.mp3")#os.path.join(output_folder,audio_file_name))
# Keep the raw result in a dict; the processing cell reads it back by this key.
save_raw_transcript_data={"raw_transcript_data": raw_transcript_data}
print("Raw Transcript Finished")


In [9]:
# transcript_model_data=await saf.grab_transcript_data(yt_id)
# NOTE(review): same code as the earlier processing cell; it operates on whatever
# the most recently run transcription cell left in save_raw_transcript_data.
raw_transcript_data=save_raw_transcript_data["raw_transcript_data"]
# Post-process the raw transcript; the result is treated as a dict below.
save_processed_transcripts=await saf.process_raw_transcript(raw_transcript_data, yt_id)
# Store the raw data alongside the processed output so both are saved together.
save_processed_transcripts["raw_transcript_data"]=raw_transcript_data

# save transcript data
await saf.save_data(yt_id, save_processed_transcripts)

{'text': 'looks', 'start': 186955, 'end': 187331, 'confidence': 0.97028, 'speaker': 'A', 'channel': None}
587
668
Destiny starts
Finished diarization cutoff
creating new transcript data


In [10]:
from copy import deepcopy

In [11]:
save_processed_transcripts_nano=deepcopy(save_processed_transcripts)

In [2]:
# Video Download
import yt_dlp
from moviepy.editor import AudioFileClip, concatenate_audioclips
import asyncio
import os

async def video_download(video_id):#, output_folder, output_name):
    """Download a YouTube video's audio and prepend the reference speaker clip.

    The audio is saved as ``workingaudio/raw.<ext>``, concatenated after
    ``workingaudio/destinyspeaking.mp3`` and written to
    ``workingaudio/merged_audio.mp3``. The blocking work runs in a worker
    thread via ``asyncio.to_thread``.

    This allows targeted diarization of the audio file.

    Args:
        video_id: YouTube video id to download.

    Raises:
        FileNotFoundError: if no file with 'raw' in its name exists in
            ``workingaudio`` after the download.
    """

    # Download video
    def download_video_thread(video_id):
        # Set download parameters
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': 'workingaudio/raw'+'.%(ext)s',#os.path.join(output_folder,output_name)+'.%(ext)s',
            'age_limit': 21,
        }

        # delete previous audio file with 'raw' in the name
        audio_dir_files=os.listdir("workingaudio")
        for file_name in audio_dir_files:
            if 'raw' in file_name:
                os.remove("workingaudio/"+file_name)
                break

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Build the URL from the argument so the function downloads the
            # video it was asked for rather than one fixed id.
            ydl.download(['https://www.youtube.com/watch?v='+video_id])

        # locate the reference speaker clip and the freshly downloaded audio
        destiny_speech_path="workingaudio/destinyspeaking.mp3"
        audio_dir_files=os.listdir("workingaudio")

        # find the audio file with 'raw' in the name; fail clearly if absent
        youtube_audio_path=next(
            ("workingaudio/"+file_name for file_name in audio_dir_files if 'raw' in file_name),
            None,
        )
        if youtube_audio_path is None:
            raise FileNotFoundError("no file with 'raw' in its name found in workingaudio/")

        # Concatenate the two audio files (speaker clip first, then the download)
        destiny_speech = AudioFileClip(destiny_speech_path)
        try:
            youtube_video = AudioFileClip(youtube_audio_path)
            try:
                merged_audio = concatenate_audioclips([destiny_speech, youtube_video])
                merged_audio.write_audiofile("workingaudio/merged_audio.mp3")
            finally:
                # release the reader so the raw file can be deleted on the next run
                youtube_video.close()
        finally:
            destiny_speech.close()
        print("download thread finished")
        return

    await asyncio.to_thread(download_video_thread, video_id)
    print("download thread closed")
await video_download("29ixeHeiLZI")


[youtube] Extracting URL: https://www.youtube.com/watch?v=29ixeHeiLZI
[youtube] 29ixeHeiLZI: Downloading webpage
[youtube] 29ixeHeiLZI: Downloading ios player API JSON
[youtube] 29ixeHeiLZI: Downloading player 81a0fcab
[youtube] 29ixeHeiLZI: Downloading m3u8 information
[info] 29ixeHeiLZI: Downloading 1 format(s): 251
[download] Destination: workingaudio\raw.webm
[download] 100% of  454.23MiB in 00:05:48 at 1.30MiB/s      
download thread closed


# Summary Chunks 9xq

In [None]:
# Reset saf 
# pop() with a default avoids the KeyError that `del` raises when the module
# is not currently loaded.
sys.modules.pop('destinyapp.api.ServerAiFunctions', None)
import destinyapp.api.ServerAiFunctions as saf
# NOTE(review): the import above already executes the module from scratch, so this
# reload runs it a second time; kept to preserve the existing behaviour.
reload(saf)

In [None]:
# Get data from database: one stored transcript record per video id.
video_ids=["3kJr7ODrwNw"]
all_transcript_model_data=[]
for video_id in video_ids:
    # NOTE(review): "trancript" is misspelled, but later cells read this exact
    # global (and `video_id`) after the loop, so the names are left unchanged.
    trancript_model_data=await saf.grab_transcript_data(video_id)
    all_transcript_model_data.append(trancript_model_data)

In [None]:
len(trancript_model_data.transcript)

In [None]:
# make request to get the data
keys=views.keys
import requests
request_video_id="ZBcQEnCDgDg"
url="https://destinyrecaps.com/api/view_raw_transcripts"
query_params={"mra":keys["req_pass"], "video_id":request_video_id}
response=requests.get(url, params=query_params)

request_raw_transcript_data=response.json()
processed_transcript_data=await saf.process_raw_transcript(request_raw_transcript_data["response"], request_video_id)

In [None]:
summarization_prompt="""Your purpose is to take a transcript from a youtube streamer named Destiny and give a synopsis of the content and the sentiment/takes of the speaker. Include all of the topics even if they are covered briefly instead of just covering the main topic although you should do that as well. The main topic or seeming focus of the segment and all of the things said or discussed. This should be quite long.
        
FYI: The transcript is diarized, Destiny should be annotated 'Destiny' with other speaker being a default from the transcription engine like b, c, d ... etc. You may have to use some intuition to figure out what is happening."""

In [None]:
print(processed_transcript_data["transcript"][0:29000])

In [None]:
systems=[saf.summarized_segment_generator.generate_summarized_segments]

# set config parameters
config_parameters=[
    {"model_company": saf.ModelCompanyEnum.anthropic, "model_name": saf.ModelNameEnum.claude_3_5_sonnet},
    {"model_company": saf.ModelCompanyEnum.openai, "model_name": saf.ModelNameEnum.gpt_4o, "summarization_prompt": saf.summarized_segment_generator.long_summarization_prompt},
]
# set inputs
input_variants=[
    {"transcript": processed_transcript_data["transcript"][0:29000]}
]

# set input contexts
input_contexts=[]
for input_variant in input_variants:
    for config_parameter in config_parameters:
        input_contexts.append({**config_parameter, **input_variant})

# run systems
system_responses=await run_systems(systems, input_contexts)

# save to prompt yaml
save_to_prompt_yaml(system_responses)

In [None]:
# TBD
summarized_segments=await saf.summarized_segment_generator.generate_summarized_segments(processed_transcript_data["transcript"][0:29000])

In [None]:
for summarized_segment in summarized_segments:
    print(len(summarized_segment["summary"]), len(summarized_segment["transcript"]))

In [None]:
print(summarized_segments[2]["summary"])

In [None]:
print(summarized_segments[2]["summary"])

In [None]:
# Tally token counts and cost over all summarized segments.
model_name=saf.ModelNameEnum.claude_3_5_sonnet
cost=0
input_token_count=0
output_token_count=0
# Kept as a plain loop: later cells read `summarized_segment` after it finishes.
for summarized_segment in summarized_segments:
    input_token_count+=len(saf.enc.encode(summarized_segment["transcript"]))
    output_token_count+=len(saf.enc.encode(summarized_segment["summary"]))
    cost+=saf.calculate_cost(model_name, summarized_segment["transcript"], summarized_segment["summary"])
print(cost, input_token_count, output_token_count)
# Guard the averages: an empty segment list would divide by zero.
if summarized_segments:
    print("Average input/output per segment: ", input_token_count/len(summarized_segments), output_token_count/len(summarized_segments))
else:
    print("Average input/output per segment: n/a (no segments)")

In [None]:
len(saf.enc.encode("a c"*2290))

In [None]:
num_segments=len(summarized_segments)
cost=saf.calculate_cost(model_name, "a c"*2290*num_segments, "a c"*640*num_segments)
print(cost)

In [None]:
len(saf.enc.encode(summarized_segment["summary"]))

In [None]:
(47*5000/5)*3/(1000*1000)

In [None]:
(47*2000/5)*15/(1000*1000)

In [None]:
print(len(summarized_segment["transcript"]))

In [None]:
# import deepcopy
from copy import deepcopy

In [None]:
old_summarized_segments=deepcopy(summarized_segments)

In [None]:
print(old_summarized_segments[-3]["summary"])

In [None]:
print(summarized_segment["summary"])

# Recap 71.

In [None]:
# production test
discord_recaps_to_send=[{"meta":"Test","yt_id":"3kJr7ODrwNw"}]
# Fetch the stored transcript record for the first (only) entry.
trancript_model_data=await saf.grab_transcript_data(discord_recaps_to_send[0]["yt_id"])
# Build the recap from the record's summarized chunks.
# NOTE(review): this cell spells the attribute `meta_summary_generator`; other
# cells use `meta_summary_geneator` — confirm which name ServerAiFunctions exposes.
recap = await saf.meta_summary_generator.generate_meta_summary(trancript_model_data.summarized_chunks)

In [None]:
# Reset saf 
# pop() with a default avoids the KeyError that `del` raises when the module
# is not currently loaded.
sys.modules.pop('destinyapp.api.ServerAiFunctions', None)
import destinyapp.api.ServerAiFunctions as saf
# NOTE(review): the import above already executes the module from scratch, so this
# reload runs it a second time; kept to preserve the existing behaviour.
reload(saf)

In [None]:
# load transcript data for a video id

video_ids=["3kJr7ODrwNw"]

for video_id in video_ids:
    trancript_model_data=await saf.grab_transcript_data(video_id)
    saf.meta_summary_geneator.generate_meta_summary
    recap=await saf.generate_meta_summary(trancript_model_data.summarized_chunks, video_id)

    save_to_prompt_yaml({"recap":recap})

    



In [None]:
# NOTE(review): `meta_summary_geneator` is spelled `meta_summary_generator` in the
# "production test" cell — confirm which attribute ServerAiFunctions defines.
recap = await saf.meta_summary_geneator.generate_meta_summary(trancript_model_data.summarized_chunks)

In [None]:
discord_recaps_to_send=[{"meta":"Test","yt_id":"3kJr7ODrwNw"}]
print("Getting data")
trancript_model_data=await saf.grab_transcript_data(discord_recaps_to_send[0]["yt_id"])

In [None]:
video_ids=["3kJr7ODrwNw"]
all_transcript_model_data=[]
for video_id in video_ids:
    trancript_model_data=await saf.grab_transcript_data(video_id)
    all_transcript_model_data.append(trancript_model_data)

systems=[saf.meta_summary_geneator.generate_meta_summary]

# set config parameters
config_parameters=[
    {"model_company": saf.ModelCompanyEnum.anthropic, "model_name": saf.ModelNameEnum.claude_3_5_sonnet, "meta_model_prompt": saf.meta_summary_geneator.html_sytem},
    {"model_company": saf.ModelCompanyEnum.openai, "model_name": saf.ModelNameEnum.gpt_4o, "meta_model_prompt": saf.meta_summary_geneator.html_sytem},
]
# set inputs
input_variants=[
    {"summarized_chunks": trancript_model_data.summarized_chunks}
]

# set input contexts
input_contexts=[]
for input_variant in input_variants:
    for config_parameter in config_parameters:
        input_contexts.append({**config_parameter, **input_variant})

# run systems
system_responses=await run_systems(systems, input_contexts)

# save to prompt yaml
save_to_prompt_yaml(system_responses)

In [None]:
system_responses[0][0]

# Zoom in tags

In [None]:
trancript_model_data=await saf.grab_transcript_data(video_id)

In [None]:
trancript_model_data.summarized_chunks

In [None]:
recap="""Here's a comprehensive summary of the topics discussed in the collection of
summaries, organized into main topics and smaller details:

# Main Topics

### 1\. Content Creation and Streaming

  * **Twitch and streaming:** Destiny expresses frustration with Twitch, criticizing the current state of political content on the platform
  * **Relationships with other streamers:** Mentions being on good terms with several Twitch streamers
  * **Kit (streaming platform):** Discusses Kit's potential to compete with Twitch
  * **Collaboration with other creators:** Expresses reluctance due to differences in approach and audience expectations

### 2\. Political Commentary and Debates

  * **Criticism of political commentators:** Particularly those who made incorrect predictions about COVID-19
  * **Discussion of political extremes:** Potential for extremes to work together, drawing parallels to historical events
  * **Science and politics:** Criticizes misrepresentation of scientific facts for political purposes
  * **Israel-Palestine conflict:** References to debates and discussions on this topic

### 3\. Technical Issues and Discussions

  * **Video file management:** Issues with large video files and potential solutions
  * **Russian bot operation analysis:** In-depth discussion of a potential Russian bot operation on Twitter, focusing on error messages and technical aspects

### 4\. Personal Life and Relationships

  * **Work-life balance:** Discusses the relationship between career success and personal life
  * **Relationship discussions:** Comments on breakups and relationship dynamics

# Smaller Details and Brief Mentions

  * Upcoming events and travel plans
  * Typing speed and tests
  * Current events (e.g., protest group damaging Stonehenge)
  * Personal stance on issues and desire for fair treatment in online discourse
  * Mental state and need for emotional organization
  * Brief mentions of historical atrocities (Holocaust, Holodomor, Great Leap Forward)
  * Music and cultural gatekeeping
  * January 6th event
  * Merchandise promotion
  * YouTube drama
  * News story about a 12-year-old Jewish girl in Paris
  * Cocaine-related incident involving a 9-year-old
  * Qatar's relationship with Hamas and Al Jazeera
  * Douglas Murray's comments on Israel-Hamas conflict
  * Age of consent laws and pedophilia (mentioned in confusion)
  * Illegal firearm modifications

Overall, Destiny's tone throughout these discussions is often critical,
skeptical, and analytical. He frequently expresses frustration with
misinformation, personal attacks, and what he perceives as inconsistencies in
others' arguments or behaviors. He emphasizes the importance of substantive
debates and accurate information in public discourse.
"""
recap="""Here's a comprehensive summary of the topics discussed in the collection of summaries, organized into main topics and smaller details:

<h2>Main Topics</h2>

<h3>1. Content Creation and Streaming</h3>
<ul>
  <li><strong>Twitch and streaming:</strong> Destiny expresses frustration with Twitch, criticizing the current state of political content on the platform</li>
  <li><strong>Relationships with other streamers:</strong> Mentions being on good terms with several Twitch streamers</li>
  <li><strong>Kit (streaming platform):</strong> Discusses Kit's potential to compete with Twitch</li>
  <li><strong>Collaboration with other creators:</strong> Expresses reluctance due to differences in approach and audience expectations</li>
</ul>

<h3>2. Political Commentary and Debates</h3>
<ul>
  <li><strong>Criticism of political commentators:</strong> Particularly those who made incorrect predictions about COVID-19</li>
  <li><strong>Discussion of political extremes:</strong> Potential for extremes to work together, drawing parallels to historical events</li>
  <li><strong>Science and politics:</strong> Criticizes misrepresentation of scientific facts for political purposes</li>
  <li><strong>Israel-Palestine conflict:</strong> References to debates and discussions on this topic</li>
</ul>

<h3>3. Technical Issues and Discussions</h3>
<ul>
  <li><strong>Video file management:</strong> Issues with large video files and potential solutions</li>
  <li><strong>Russian bot operation analysis:</strong> In-depth discussion of a potential Russian bot operation on Twitter, focusing on error messages and technical aspects</li>
</ul>

<h3>4. Personal Life and Relationships</h3>
<ul>
  <li><strong>Work-life balance:</strong> Discusses the relationship between career success and personal life</li>
  <li><strong>Relationship discussions:</strong> Comments on breakups and relationship dynamics</li>
</ul>

<h2>Smaller Details and Brief Mentions</h2>

<ul>
  <li>Upcoming events and travel plans</li>
  <li>Typing speed and tests</li>
  <li>Current events (e.g., protest group damaging Stonehenge)</li>
  <li>Personal stance on issues and desire for fair treatment in online discourse</li>
  <li>Mental state and need for emotional organization</li>
  <li>Brief mentions of historical atrocities (Holocaust, Holodomor, Great Leap Forward)</li>
  <li>Music and cultural gatekeeping</li>
  <li>January 6th event</li>
  <li>Merchandise promotion</li>
  <li>YouTube drama</li>
  <li>News story about a 12-year-old Jewish girl in Paris</li>
  <li>Cocaine-related incident involving a 9-year-old</li>
  <li>Qatar's relationship with Hamas and Al Jazeera</li>
  <li>Douglas Murray's comments on Israel-Hamas conflict</li>
  <li>Age of consent laws and pedophilia (mentioned in confusion)</li>
  <li>Illegal firearm modifications</li>
</ul>

<p>Overall, Destiny's tone throughout these discussions is often critical, skeptical, and analytical. He frequently expresses frustration with misinformation, personal attacks, and what he perceives as inconsistencies in others' arguments or behaviors. He emphasizes the importance of substantive debates and accurate information in public discourse.</p>
"""

# grab which data is associated with each title
import re
def extract_titles_and_list_items(recap):
    """Pull the <h3> titles and <li> items out of a recap HTML string.

    Matching is non-greedy and ``.`` does not cross newlines, so each match
    is the text between an opening tag and the nearest closing tag on the
    same line.

    Returns:
        (titles, list_items): two flat lists of captured inner text in
        document order. Items are not grouped by title.
    """
    def _inner_texts(tag_regex):
        # group(1) is the text captured between the opening and closing tag
        return [match.group(1) for match in re.finditer(tag_regex, recap)]

    return _inner_texts(r'<h3>(.*?)</h3>'), _inner_texts(r'<li>(.*?)</li>')

titles, list_items=extract_titles_and_list_items(recap)

In [None]:
test_response=await saf.recap_zoomed_in_generator.annotate_zoom_chunks(summaized_chunks=trancript_model_data.summarized_chunks, recap=recap)

In [None]:
print(test_response)

In [None]:
print(trancript_model_data.summarized_chunks[8]["summary"])

In [None]:
print(all_transcript_model_data[0].meta)

In [None]:
annotated_chunks="""Based on the recap and summaries provided, here are recommendations for summary chunks that could be used to create more detailed panes for each of the main topics and smaller details:

Large topics:

{{{"Twitch and Streaming": [0, 5], "content": ["<strong>Twitch and streaming</strong> Destiny discusses his relationship with Twitch, expressing frustration about being unable to respond to slander and threats on the platform. He criticizes the current state of political content on Twitch, saying it has become less collaborative and more focused on individual streamers' clout.", "<strong>Twitch and streaming</strong> Destiny mentions being on good terms with several Twitch streamers, including Mizkif, Dr. K, and Asmongold. He emphasizes that he doesn't want blind defense from others, just fair treatment and acknowledgment when he's correct."]}}}

{{{"Collaborations and Relationships with Other Creators": [6, 8], "content": ["<strong>Collaborations and Relationships with Other Creators</strong> Destiny criticizes Hasan's approach to engaging with other content creators, suggesting Hasan only engages when he thinks he can gain clout or handle the situation.", "<strong>Collaborations and Relationships with Other Creators</strong> The speaker (Destiny) discusses collaborating with other content creators, expressing reluctance due to differences in approach and audience expectations."]}}}

{{{"Personal Stance and Approach": [0, 1], "content": ["<strong>Personal Stance and Approach</strong> Destiny emphasizes that he speaks out on issues he believes are correct, not to defend specific groups or individuals. He expresses discomfort with being seen as a spokesperson for any particular cause.", "<strong>Personal Stance and Approach</strong> Destiny expresses frustration with personal attacks that aren't accompanied by substantive arguments. He's open to criticism of his ideas but dislikes baseless personal attacks."]}}}

{{{"Current Events and Debates": [0, 1, 5], "content": ["<strong>Current Events and Debates</strong> He mentions a protest group damaging Stonehenge with orange spray paint.", "<strong>Current Events and Debates</strong> Destiny discusses leveraging his mainstream credibility from large debates. He expresses frustration with a particular individual who initially claimed willingness to debate but later backtracked.", "<strong>Current Events and Debates</strong> Reference to a debate or discussion about the Israel-Palestine conflict, with someone challenging Destiny's knowledge on the topic."]}}}

{{{"Media and Misinformation": [1, 4, 5], "content": ["<strong>Media and Misinformation</strong> He's triggered by ignorant people spreading misinformation in large spaces.", "<strong>Media and Misinformation</strong> This transcript covers several topics and sentiments expressed by the speaker, Destiny: An alleged exposure of Russian bots on Twitter supporting Trump, which Destiny is skeptical about.", "<strong>Media and Misinformation</strong> Criticism of media reporting on unsubstantiated rumors and its potential to cause division."]}}}

{{{"Israel-Palestine Conflict": [5, 6], "content": ["<strong>Israel-Palestine Conflict</strong> Reference to a debate or discussion about the Israel-Palestine conflict, with someone challenging Destiny's knowledge on the topic.", "<strong>Israel-Palestine Conflict</strong> Discussion about Hassan Piker's stance on Israel and Palestine, with Destiny questioning Hassan's consistency and motives."]}}}

{{{"Video File Management": [0], "content": ["<strong>Video File Management</strong> Destiny discusses issues with large video files from his cameras and explores options for compressing or managing these files for his editor in Australia. He goes into technical details about file sizes, upload speeds, and potential solutions."]}}}

{{{"Russian Bot Analysis": [3, 4, 5], "content": ["<strong>Russian Bot Analysis</strong> An in-depth analysis of the alleged Russian bot exposure, focusing on the technical aspects of how such a bot might function and why the reported error seems suspicious.", "<strong>Russian Bot Analysis</strong> Destiny expresses skepticism about its authenticity.", "<strong>Russian Bot Analysis</strong> Discussion about a Twitter bot that was allegedly exposed as a Russian disinformation campaign. The speaker (Destiny) is skeptical of the claims and suggests that the account suspension could be for various reasons."]}}}

Smaller topics:

{{{"Schedule and Travel": 0}}}
{{{"Typing Speed": 0}}}
{{{"Mental State": 1}}}
{{{"Work-Life Balance": 2}}}
{{{"Music and Cultural Gatekeeping": 2}}}
{{{"Relationship Discussions": 2}}}
{{{"January 6th Event": 2}}}
{{{"Merchandise Promotion": 8}}}
{{{"Basketball Game Scene": 8}}}
{{{"Courtroom and Prison Scenario": 8}}}
{{{"Illegal Firearm Modifications": 8}}}
{{{"Personal Interactions": 8}}}
"""

In [None]:
# parse the annotated_chunks
import re
def parse_annotated_chunks(annotated_chunks):
    """Extract every ``{{{...}}}`` span from the text as a single-braced string.

    Spans are matched non-greedily and cannot cross a newline. Inside each
    span every ``{{{`` becomes ``{`` and every ``}}}`` becomes ``}``.

    Returns:
        List of the rewritten spans, in order of appearance.
    """
    def _unwrap(span):
        # replace the triple curly braces with single curly braces
        return span.replace("{{{", "{").replace("}}}", "}")

    return [
        _unwrap(match.group(0))
        for match in re.finditer(r'\{\{\{.*?\}\}\}', annotated_chunks)
    ]

In [None]:
test_annotated_chunks=parse_annotated_chunks(annotated_chunks)

In [None]:
print(test_annotated_chunks[0])

In [None]:
# json is not imported anywhere else in the notebook, so import it here.
import json

# Build the topic prompt and the transcript prompt for the first annotation.
topic_prompt_str=""
annotated_chunk_dict=json.loads(test_annotated_chunks[0])
chunk_indexes=[]
for key, value in annotated_chunk_dict.items():
    if key=="content":
        topic_prompt_str+="\n"+", ".join(value)
    else:
        # Small-topic annotations carry one integer index instead of a list;
        # wrap it so the loop below can iterate either form.
        chunk_indexes=[value] if isinstance(value, int) else value
        topic_prompt_str+=key

# Concatenate the transcripts of the referenced summary chunks.
transcript_chunks_str=""
for chunk_index in chunk_indexes:
    transcript_chunks_str+=trancript_model_data.summarized_chunks[chunk_index]["transcript"]+"\n\n"
    

In [None]:
# json is not imported anywhere else in the notebook, so import it here.
import json

chunk_annotations_str=test_response
summarized_chunks=trancript_model_data.summarized_chunks

# Pull out every {{{...}}} span, collapse the braces and parse it as JSON.
pattern=re.compile(r'\{\{\{.*?\}\}\}')
chunk_annotations_temp=pattern.findall(chunk_annotations_str)
chunk_annotations_temp=[chunk_annotation.replace("{{{", "{").replace("}}}", "}") for chunk_annotation in chunk_annotations_temp]
chunk_annotations=[]
for chunk_annotation in chunk_annotations_temp:
    chunk_annotations.append(json.loads(chunk_annotation))

# turn summarized chunks into prompt contexts
topic_prompts=[]
# One transcript prompt per annotation, parallel to topic_prompts; the string
# was previously rebuilt on every iteration and then thrown away.
transcript_chunks_prompts=[]
for chunk_annotation in chunk_annotations:
    # get topic prompt and chunk indexes for transcript prompt
    temp_prompt_str=""
    chunk_indexes=[]
    for key, value in chunk_annotation.items():
        if key=="content":
            temp_prompt_str+="\n"+", ".join(value)
        else:
            # Small-topic annotations carry one integer index instead of a
            # list; wrap it so the loop below can iterate either form.
            chunk_indexes=[value] if isinstance(value, int) else value
            temp_prompt_str+=key
    topic_prompts.append(temp_prompt_str)

    # get transcript prompt from the indexes
    transcript_chunks_str=""
    for chunk_index in chunk_indexes:
        transcript_chunks_str+=summarized_chunks[chunk_index]["transcript"]+"\n\n"
    transcript_chunks_prompts.append(transcript_chunks_str)

In [None]:
trancript_model_data=await saf.grab_transcript_data(video_id)
chunk_annotations_str=await saf.recap_zoomed_in_generator.annotate_zoom_chunks(summaized_chunks=trancript_model_data.summarized_chunks, recap=recap)
chunk_annotations, topic_prompts, transcript_chunks_prompts=saf.recap_zoomed_in_generator.prepare_zoom_inputs(trancript_model_data.summarized_chunks, chunk_annotations_str)
zooms=await saf.recap_zoomed_in_generator.prepare_zoom_inputs(topic_prompts, transcript_chunks_prompts)

In [None]:
print(recap)

In [None]:
# Reset saf 
# pop() with a default avoids the KeyError that `del` raises when the module
# is not currently loaded.
sys.modules.pop('destinyapp.api.ServerAiFunctions', None)
import destinyapp.api.ServerAiFunctions as saf
# NOTE(review): the import above already executes the module from scratch, so this
# reload runs it a second time; kept to preserve the existing behaviour.
reload(saf)

In [None]:
zoom_response=await saf.recap_zoomed_in_generator.generate_zoom(transcript_chunks_str=transcript_chunks_str, topic_prompt_str=topic_prompt_str)

In [None]:
print(zoom_response)

In [None]:
transcript_chunks_str

In [None]:
print(zoom_response)

In [None]:
print(zoom_response)

# all zoom gen

In [None]:
# Reset saf 
# pop() with a default avoids the KeyError that `del` raises when the module
# is not currently loaded.
sys.modules.pop('destinyapp.api.ServerAiFunctions', None)
import destinyapp.api.ServerAiFunctions as saf
# NOTE(review): the import above already executes the module from scratch, so this
# reload runs it a second time; kept to preserve the existing behaviour.
reload(saf)

In [None]:
# Full zoom pipeline for `video_id` (a global left by an earlier cell).
trancript_model_data=await saf.grab_transcript_data(video_id)
# Ask the model which summary chunks belong to each recap topic.
# NOTE(review): the keyword is spelled `summaized_chunks`; presumably it matches
# the parameter name in ServerAiFunctions — confirm.
chunk_annotations_str=await saf.recap_zoomed_in_generator.annotate_zoom_chunks(summaized_chunks=trancript_model_data.summarized_chunks, recap=recap)
# Turn the annotation text into parallel topic / transcript prompt collections.
chunk_annotations, topic_prompts, transcript_chunks_prompts=saf.recap_zoomed_in_generator.prepare_zoom_inputs(trancript_model_data.summarized_chunks, chunk_annotations_str)
# Generate the zooms from the prepared prompts.
zooms=await saf.recap_zoomed_in_generator.generate_all_zooms(topic_prompts, transcript_chunks_prompts)

In [None]:
zooms=await saf.recap_zoomed_in_generator.generate_all_zooms(topic_prompts, transcript_chunks_prompts)

In [None]:
print(topic_prompts[0])
print("\n")
print(zooms[0])

In [None]:
reformt_dict=await saf.recap_zoomed_in_generator.reformat_recap(recap, topic_prompts, zooms)

In [None]:
print(recap)

In [None]:
# parse the recap to get the [{"title": list_title, "list_items": list_items}] for each title and subsequent list items
# basically find the stuff between headed tags, that is the title, continue until you see a <ul> tag which is the start of the list items


# title_pattern=re.compile(r'<h3>(.*?)</h3>')
# list_item_pattern=re.compile(r'<li>(.*?)</li>')
# titles=title_pattern.findall(recap)
# list_items=list_item_pattern.findall(recap)
    
from bs4 import BeautifulSoup

soup = BeautifulSoup(recap, 'html.parser')

# Pair every heading with the <li> texts of the first <ul> sibling after it.
# All <h3> headings are collected first, then all <h2> headings.
data = []
for heading_tag in ('h3', 'h2'):
    for title in soup.find_all(heading_tag):
        ul_tag = title.find_next_sibling('ul')
        # headings without a following <ul> sibling get an empty item list
        list_items = [li.text for li in ul_tag.find_all('li')] if ul_tag else []
        data.append({"title": title.text, "list_items": list_items})

print(data)

    # return the titles and list items
    # return [{"title": title, "list_items": list_items} for title, list_items in zip(titles, list_items)]

In [None]:
list(chunk_annotation.keys())[0]

In [None]:
chunk_annotations

In [None]:
def _topic_name(annotation):
    # first key of the annotation dict (presumably the topic name)
    return list(annotation.keys())[0]

filled_data=[]

for piece in data:
    piece_title=piece["title"]
    # The list markup depends only on the piece, so build it once per piece.
    html_list="<ul>"+"".join(f"<li>{list_item}</li>" for list_item in piece["list_items"])+"</ul>"

    # Large topics: attach the zoom of every annotation whose name occurs in the title.
    for i, chunk_annotation in enumerate(chunk_annotations):
        if _topic_name(chunk_annotation) not in piece_title:
            continue
        filled_data.append({"title":piece_title, "content":html_list})
        filled_data[-1]["zoom"]=zooms[i]

    if piece_title!='Smaller Details and Brief Mentions':
        continue

    # Small topics: each list item becomes its own entry when an annotation names it.
    for small_piece in piece["list_items"]:
        for i, chunk_annotation in enumerate(chunk_annotations):
            if _topic_name(chunk_annotation) not in small_piece:
                continue
            filled_data.append({"title":small_piece})
            filled_data[-1]["zoom"]=zooms[i]






In [None]:
# import html
from IPython.display import display, Markdown, HTML

In [None]:
display(HTML(recap))

In [None]:
filled_data

In [None]:
print(data)

In [None]:
# Total length of all zoom entries combined.
len_zooms=sum(len(zoom) for zoom in zooms)
print(len_zooms)

In [None]:
# grab which data is associated with each title
import re
def extract_titles_and_list_items(recap):
    """Pull the <h3> titles and <li> items out of a recap HTML string.

    Args:
        recap: HTML text of the recap.

    Returns:
        A tuple ``(titles, list_items)`` of two flat lists of strings, each in
        document order. The items are not grouped by title.
    """
    # re.DOTALL lets "." span line breaks; without it any heading or bullet whose
    # text wraps onto a second line is silently skipped.
    title_pattern=re.compile(r'<h3>(.*?)</h3>', re.DOTALL)
    list_item_pattern=re.compile(r'<li>(.*?)</li>', re.DOTALL)
    titles=title_pattern.findall(recap)
    list_items=list_item_pattern.findall(recap)
    return titles, list_items

titles, list_items=extract_titles_and_list_items(recap)

In [None]:
titles

In [None]:
list_items

# Hook 1m3

# Chat v1@

In [None]:
# reload
# Drop the cached module if it is loaded so the import below re-executes it.
# pop(..., None) instead of `del` keeps this cell runnable when the module is
# not in sys.modules (e.g. after a previous import attempt failed).
sys.modules.pop('destinyapp.api.ServerAiFunctions', None)
import destinyapp.api.ServerAiFunctions as saf
reload(saf)

In [None]:
yt_id="3kJr7ODrwNw"

import datetime
from chat_downloader import ChatDownloader
import math

In [None]:
video_meta_data = ChatDownloader().get_chat('https://www.youtube.com/watch?v='+yt_id)

# Drain the chat iterator into a list so the messages can be indexed and re-read.
all_chat_messages=list(video_meta_data.chat)

# Keep only the fields the rest of the notebook reads: author name, text and timestamp.
simplified_messsages=[
    {"name": chat_item["author"]["name"], "message": chat_item["message"], "time": chat_item["time_text"]}
    for chat_item in all_chat_messages
]

In [None]:
# One numbered line per chat message (numbering starts at 1), each ending in a newline.
all_messages="".join(
    f"{position}. {entry['message']}\n"
    for position, entry in enumerate(simplified_messsages, start=1)
)

In [None]:
len(saf.enc.encode(all_messages))

In [None]:
# get the transcript
transcript_data=await saf.grab_transcript_data(yt_id)
transcript=transcript_data.transcript
start_end_char=transcript_data.summarized_chunks[0]["char_start_finish_indexes"]
transcript=transcript[start_end_char[0]:start_end_char[1]]
linked_transcript=transcript_data.linked_transcript

# get the messages in the segment
start_time=saf.get_time_at_char_count(start_end_char[0], linked_transcript)
finish_time=saf.get_time_at_char_count(start_end_char[1], linked_transcript)
segment_messages=saf.get_chats_in_start_end(simplified_messsages,  int(start_time), int(finish_time))

In [None]:
len(simplified_messsages)

In [None]:
# propose a split size and then get the indexes to split the chat into batches
set_batch_size=150
# One batch per full `set_batch_size` messages, but never fewer than one batch;
# the messages are then spread evenly across that many batches.
batch_count=max(1, len(segment_messages)//set_batch_size)
# max(1, ...) keeps the range step valid when the segment has no chat at all
# (a step of 0 would raise ValueError); an empty segment simply yields no batches.
batch_size=max(1, math.ceil(len(segment_messages)/batch_count))
# Make set of indexes to split the chat into batches
batch_indexes=list(range(0, len(segment_messages), batch_size))
chat_processing_batches=[segment_messages[index:index+batch_size] for index in batch_indexes]

# Create chat str batches
# Each batch becomes "<position>: <message>" lines; positions restart at 0 per batch.
chat_segment_str_batches=[]
for batch in chat_processing_batches:
    print("Len batch", len(batch))
    chat_segment_str="".join(f"{j}: {message['message']}\n" for j, message in enumerate(batch))
    chat_segment_str_batches.append(chat_segment_str)

In [None]:
# bot_analysis=await saf.analyze_chat(transcript_data.summarized_chunks[0]["summary"], chat_segment_str_batches[0])
bot_analysis=await saf.analyze_chat(transcript_data.summarized_chunks[0]["summary"], chat_segment_str_batches[3])

In [None]:
print(bot_analysis.split("\n")[-1])

In [None]:
# total cost
0.0025700000000000002*(len(simplified_messsages)/150)

In [None]:
# Parse the model's "<number>: <verdict and reason>" lines into a dict keyed by
# the number (kept as a string).
chat_annotations={}
for line in bot_analysis.split("\n"):
    # partition splits on the first colon only, so a reason that itself contains
    # a colon is kept whole, and a line with no colon cannot raise IndexError.
    number, separator, remainder=line.partition(":")
    if not (separator and number.isdigit()):
        continue
    full_reason=remainder.strip()
    # A message counts as serious when the verdict text starts with "yes".
    serious=full_reason.startswith("yes")
    chat_annotations[number]={"full": full_reason, "serious": serious}

In [None]:
print(chat_annotations['0'])

In [None]:
# Count how many annotated messages were flagged as serious.
stats={"serious": sum(1 for annotation in chat_annotations.values() if annotation["serious"])}

print(stats)

In [None]:
simplified_messsages[587]

In [None]:
# Print a y/n flag next to the i-th line of chat batch 3 for every annotation,
# in the order the annotations were parsed.
for i, annotation in enumerate(chat_annotations.values()):
    y_n="y" if annotation["serious"] else "n"
    print(y_n, chat_segment_str_batches[3].split("\n")[i])#, "                              ANALYSIS", annotation["full"])

In [None]:
start_time

In [None]:
simplified_messsages[20]

In [None]:
print(chat_segment_str_batches[0])

# process all chat messages

In [None]:
# reload
# Drop the cached module if it is loaded so the import below re-executes it.
# pop(..., None) instead of `del` keeps this cell runnable when the module is
# not in sys.modules (e.g. after a previous import attempt failed).
sys.modules.pop('destinyapp.api.ServerAiFunctions', None)
import destinyapp.api.ServerAiFunctions as saf
reload(saf)

In [None]:
yt_id="3kJr7ODrwNw"

import datetime
from chat_downloader import ChatDownloader
import math

In [None]:
# GET TRANSCRIPT
transcript_data=await saf.grab_transcript_data(yt_id)
transcript=transcript_data.transcript
linked_transcript=transcript_data.linked_transcript

In [None]:
# GET BASE CHAT DATA
video_meta_data = ChatDownloader().get_chat('https://www.youtube.com/watch?v='+yt_id)

# Drain the chat iterator into a list so the messages can be indexed and re-read.
all_chat_messages=list(video_meta_data.chat)

# Keep only the fields the rest of the notebook reads: author name, text and timestamp.
simplified_messsages=[
    {"name": chat_item["author"]["name"], "message": chat_item["message"], "time": chat_item["time_text"]}
    for chat_item in all_chat_messages
]

In [None]:
# make tasks and reference data
# `tasks[k]` is the un-awaited analysis coroutine for one chat batch and
# `input_data[k]` holds the transcript slice and messages that batch refers to.
tasks=[]
input_data=[]
set_batch_size=150

for summarized_chunk in transcript_data.summarized_chunks:
    # get the transcript
    # This notebook spells the index key two ways ("char_start_finsih_indexes"
    # here, "char_start_finish_indexes" in the single-segment cell). Accept
    # either so the loop works whichever spelling the stored data uses.
    if "char_start_finsih_indexes" in summarized_chunk:
        start_end_char=summarized_chunk["char_start_finsih_indexes"]
    else:
        start_end_char=summarized_chunk["char_start_finish_indexes"]
    transcript=transcript_data.transcript[start_end_char[0]:start_end_char[1]]

    # get the messages in the segment
    start_time=saf.get_time_at_char_count(start_end_char[0], linked_transcript)
    finish_time=saf.get_time_at_char_count(start_end_char[1], linked_transcript)
    segment_messages=saf.get_chats_in_start_end(simplified_messsages,  int(start_time), int(finish_time))

    # # CREATE CHAT BATCHES
    # One batch per full `set_batch_size` messages (at least one), with the
    # messages spread evenly across them.
    batch_count=max(1, len(segment_messages)//set_batch_size)
    # max(1, ...) keeps the range step valid for a segment without any chat
    # (a step of 0 would raise ValueError); such a segment yields no batches.
    batch_size=max(1, math.ceil(len(segment_messages)/batch_count))
    batch_indexes=list(range(0, len(segment_messages), batch_size))
    chat_processing_batches=[segment_messages[index:index+batch_size] for index in batch_indexes]

    # Create chat str batches
    # Each batch becomes "<position>: <message>" lines; positions restart at 0
    # per batch so they can index back into that batch's message list.
    chat_segment_str_batches=[]
    for batch in chat_processing_batches:
        print("Len batch", len(batch))
        chat_segment_str_batches.append("".join(f"{j}: {message['message']}\n" for j, message in enumerate(batch)))

    for i, chat_batch in enumerate(chat_segment_str_batches):
        input_data.append({"transcript": transcript, "chat_batch": chat_batch, "segment_messages": chat_processing_batches[i]})
        tasks.append(saf.analyze_chat(transcript, chat_batch))


In [None]:
# # RUN TASKS
# split the tasks into batches of about 20 but make them as even as possible with math
split_up_tasks=[]
proposed_batch_size=20
batch_size=math.ceil(len(tasks)/round(len(tasks)//proposed_batch_size))
batch_indexes=[i for i in range(0, len(tasks), batch_size)]
for i, index in enumerate(batch_indexes):
    if index==batch_indexes[-1]:
        split_up_tasks.append(tasks[index:])
    else:
        split_up_tasks.append(tasks[index:batch_indexes[i+1]])

# run the tasks
all_chat_analysis=[]
for tasks_batch in split_up_tasks:
    all_chat_analysis+=await asyncio.gather(*tasks_batch)

In [None]:
# Organize Analysis responses
# For every model response, parse its "<number>: <verdict and reason>" lines and
# attach the chat message and transcript slice that the number refers to.
all_chat_annotations=[]
error_count=0
for i, chat_analysis in enumerate(all_chat_analysis):
    chat_annotations={}
    for line in chat_analysis.split("\n"):
        # partition splits on the first colon only, so a reason that itself
        # contains a colon is kept whole and a colon-less line cannot raise.
        number, separator, remainder=line.partition(":")
        if not (separator and number.isdigit()):
            continue
        full_reason=remainder.strip()
        serious=full_reason.startswith("yes")
        try:
            batch_input=input_data[i]
            chat_data=batch_input["segment_messages"][int(number)]
            transcript_segment=batch_input["transcript"]
        except (IndexError, KeyError, ValueError):
            # The model returned a number with no matching message in this batch
            # (or the reference data is missing); count it and move on.
            error_count+=1
            continue
        chat_annotations[number]={"full": full_reason, "serious": serious, "chat_data": chat_data, "transcript": transcript_segment}
    all_chat_annotations.append(chat_annotations)
print(error_count)

# Re-key every annotation by the position of its message in simplified_messsages.
# NOTE(review): list.index returns the first equal entry, so two identical
# messages (same name, text and time) map to the same key and the later
# annotation overwrites the earlier one; it is also a linear scan per message.
ordered_chat_annotations={}
message_count=0
stats={}
stats["serious"]=0
for chat_annotations in all_chat_annotations:
    for chat_annotation in list(chat_annotations.values()):
        ordered_chat_annotations[simplified_messsages.index(chat_annotation["chat_data"])]=chat_annotation
        if chat_annotation["serious"]:
            stats["serious"]+=1
        message_count+=1

print(message_count)
print(stats)

In [None]:
def print_chat_annotation(chat_annotation):
    """Print an annotation's full reason, serious flag and chat message on one line."""
    print(*(chat_annotation[field] for field in ("full", "serious", "chat_data")))

# Spot-check annotation '1' in each of the first four analysed batches.
for batch_number in range(4):
    chat_annotations=all_chat_annotations[batch_number]
    print_chat_annotation(chat_annotations['1'])

In [None]:
list_chat_annotations=list(ordered_chat_annotations.values())

# Every serious annotation becomes a one-entry dict {message index: annotation}.
serious_messages=[
    {key: chat_annotation}
    for key, chat_annotation in ordered_chat_annotations.items()
    if chat_annotation["serious"]
]
stats={"serious": len(serious_messages)}

print(stats)
print(len(list_chat_annotations))


In [None]:
# Print each serious message three ways: the entry looked up by its index in
# simplified_messsages, the chat data stored on the annotation, and the model's reason.
# The loop variables (serious_message, key, value) are left bound to the last
# item afterwards; the following cells inspect them.
for serious_message in serious_messages:
    for key, value in serious_message.items():
        print(simplified_messsages[key], value["chat_data"], value["full"])

In [None]:
serious_message

In [None]:
# Count the entries in `serious_message` (the variable left over from the
# previous loop). Also rebinds `key` and `value` to its last entry.
item_c=0
for key, value in serious_message.items():
    # print(key, value)
    item_c+=1
print(item_c)

In [None]:
value

In [None]:
# Group consecutive serious messages that share the same transcript slice.
same_segment_serious_messages=[]
temp_segment=[]
previous_transcript=None
for serious_message in serious_messages:
    for key, value in serious_message.items():
        # A different transcript means a new segment starts: close the current group.
        if previous_transcript is not None and value["transcript"]!=previous_transcript:
            same_segment_serious_messages.append(temp_segment)
            temp_segment=[]
        # Always keep the message; appending only in the "same segment" branch
        # lost the first message of every new segment.
        temp_segment.append(value)

        previous_transcript=value["transcript"]

# Flush the group that was still open when the loop ended.
if temp_segment:
    same_segment_serious_messages.append(temp_segment)


In [None]:
for segment_serious_messages in same_segment_serious_messages:
    print(len(segment_serious_messages))

In [None]:
same_segment_serious_messages[-1]

# REDO RECAPS

In [None]:
from pytube import YouTube
import discord
import html2text
import traceback
all_transcript_data=await saf.get_all_data()
keys=views.keys

In [None]:
import datetime

video_id="5raed64fL0Y"

In [None]:
video_id="I3FuM7myMrI"
video_id="N2lCEccQNvY"
video_id="anDEECKsCfc"
video_id="ys64pMzpDUs"

In [None]:
# Look the video up with pytube and show its publish date as MM/DD/YYYY.
url = 'https://www.youtube.com/watch?v='+video_id
yt = YouTube(url)

# The publish date is turned into text and parsed back before formatting.
raw_date=str(yt.publish_date)
date_obj=datetime.datetime.strptime(raw_date, "%Y-%m-%d %H:%M:%S")
date_str=format(date_obj, "%m/%d/%Y")

print(date_str)

In [None]:
# Get video
# Re-create the pytube handle for `url` (set in an earlier cell).
yt = YouTube(url)

# Fill attributes
# Read every attribute of `yt` once via getattr.
# NOTE(review): presumably this forces pytube's lazily loaded properties to
# populate - confirm. Because the try wraps the whole loop, the first attribute
# that raises ends the loop early and the error is discarded.
try:
    for attr in dir(yt):
        value=getattr(yt, attr)
except Exception as e:
    pass

# Get Live status
# Read the "isLive" flag from the private `_vid_info` payload; any failure
# (missing attribute or key) is treated as "not live".
try:
    vid_info=getattr(yt, "_vid_info")
    live_bool=vid_info["videoDetails"]["isLive"]
except Exception as e:
    live_bool=False


In [None]:
# print every attribute of the video
for attr in dir(yt):
    try:
        print(f"yt.{attr}")# = {getattr(yt, attr)}")
    except Exception as e:
        pass

In [None]:
import youtube_dl
# Fetch the video's metadata without downloading the media.
# extract_info needs the video URL; calling it with no arguments raises TypeError.
ydl_opts = {}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    info_dict = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)

In [None]:
import yt_dlp as youtube_dl

# Build "<title>\nStream Date: MM/DD/YYYY" from the video's metadata (no media download).
ydl_opts = {}
full_title=""
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    info_dict = ydl.extract_info("https://www.youtube.com/watch?v="+video_id, download=False)
    # Keep a second reference to the raw metadata for inspection in later cells.
    info_dict_g=info_dict
    # upload_date is a YYYYMMDD string.
    upload_date = info_dict['upload_date']
    date_obj=datetime.datetime.strptime(upload_date, "%Y%m%d")
    date_str=format(date_obj, "%m/%d/%Y")
    title=info_dict["title"]
    full_title=f"{title}\nStream Date: {date_str}"

print(full_title)

In [None]:
# get some video metadata
async def get_video_metadata(video_id):
    """Return "<title>\\nStream Date~ MM/DD/YYYY" for a YouTube video.

    The metadata is fetched with the `youtube_dl` module alias (no media is
    downloaded). This is best effort: on any failure the error is printed and an
    empty string is returned instead of raising.

    Args:
        video_id: YouTube video id (the value after ``watch?v=``).

    Returns:
        The title and formatted upload date, or "" when the lookup failed.
    """
    try:
        with youtube_dl.YoutubeDL({}) as ydl:
            info_dict = ydl.extract_info(f"https://www.youtube.com/watch?v={video_id}", download=False)
            # upload_date is a YYYYMMDD string.
            date_obj=datetime.datetime.strptime(info_dict['upload_date'], "%Y%m%d")
            date_str=date_obj.strftime("%m/%d/%Y")
            # NOTE(review): other cells use "Stream Date: " (colon); the "~" here
            # is kept as written - confirm which separator is intended.
            return info_dict["title"]+"\nStream Date~ "+date_str
    except Exception as e:
        # Report why the lookup failed instead of hiding it, but stay best effort.
        print("ERROR: could not get video metadata for", video_id, "-", e)
        return ""

await get_video_metadata("ZBcQEnCDgDg")

In [None]:
info_dict["title"]

In [None]:
info_dict.keys()

In [None]:
upload_date
date_obj=datetime.datetime.strptime(upload_date, "%Y%m%d")
date_str=date_obj.strftime("%m/%d/%Y")
title=info_dict["title"]
full_title=yt.title+"\nStream Date: "+date_str

In [None]:
yt._vid_info["videoDetails"]

In [None]:
def _split_discord_message(message_str, increment_size=1500, hard_limit=2000):
    """Split a long message into chunks that each fit in one Discord message.

    Every chunk except the last is at least `increment_size` characters long and
    is extended to the next newline so lines are not cut in half, but it always
    stays shorter than `hard_limit` characters. `hard_limit` must be larger than
    `increment_size`. Concatenating the chunks reproduces `message_str`.
    """
    segments=[]
    start_index=0
    while start_index<len(message_str):
        remaining=len(message_str)-start_index
        if remaining<=increment_size:
            segments.append(message_str[start_index:])
            break
        # Look for a newline between the minimum chunk size and the hard limit.
        finish_index=message_str.find("\n", start_index+increment_size, start_index+hard_limit)
        if finish_index==-1:
            if remaining<hard_limit:
                # No newline left, but the rest fits in a single message.
                segments.append(message_str[start_index:])
                break
            print("Didn't find a newline")
            # Cut mid-line rather than exceed the message size limit.
            finish_index=start_index+hard_limit-1
        segments.append(message_str[start_index:finish_index])
        start_index=finish_index
    return segments


async def _send_discord_recap(channel, recap):
    """Send one recap to `channel`: a header with the links, then the recap text in chunks."""
    # Send discord message header
    destinyrecaps_url="https://destinyrecaps.com"+"/details?video_id="+recap["yt_id"]
    destinyrecaps_msg="Full transcript and embedding search at "+destinyrecaps_url
    if recap.get("title",None)!=None:
        youtube_msg=recap["title"]+": "+"https://www.youtube.com/watch?v="+recap["yt_id"]
    else:
        youtube_msg="https://www.youtube.com/watch?v="+recap["yt_id"]

    header_message=f"{youtube_msg}\n{destinyrecaps_msg}"
    await channel.send(header_message)

    # The recap is stored as HTML; convert it to text and prefix the mention.
    tag_message="@everyone \n"
    message_str=tag_message+html2text.html2text(recap["meta"])
    recap_segments=_split_discord_message(message_str)

    print("Sending c-hunks: ",len(recap_segments))
    for recap_chunk in recap_segments:
        # A chunk of only whitespace would be rejected as an empty message.
        if recap_chunk.strip():
            await channel.send(recap_chunk)


async def redo_recap(transcript_model_datas, vector_embeedding_bool=True, summary_segments_bool=True, meta_summary_bool=True, video_metadata_bool=True):
    """Regenerate the stored recap data for each transcript and post the recaps to Discord.

    For every item the raw transcript is re-processed and saved; the remaining
    steps are controlled by the flags.

    Args:
        transcript_model_datas: iterable of transcript records; each must expose
            `video_id` and `raw_transcript_data`.
        vector_embeedding_bool: rebuild and save the text chunks / vector db.
        summary_segments_bool: regenerate and save the summarized segments.
        meta_summary_bool: regenerate and save the meta summary (with hook and
            disclaimer). When the segments are not regenerated in this run, the
            record's stored `summarized_chunks` are used; if there are none the
            meta summary is skipped.
        video_metadata_bool: fetch the title and publish date with pytube and save them.

    Returns:
        None. Discord failures are printed, not raised.
    """
    discord_recaps_to_send=[]
    for transcript_model_data in transcript_model_datas:
        yt_id=transcript_model_data.video_id
        print("REDOING RECAP FOR: ", yt_id)

        # Process to make regular and linked_transcript
        save_processed_transcripts=await saf.process_raw_transcript(transcript_model_data.raw_transcript_data, yt_id)
        transcript=save_processed_transcripts["transcript"]
        print("Transcript finished")#, transcript)

        # Make vector db
        if vector_embeedding_bool:
            vectordb_and_textchunks=await saf.assembly_generate_vectordb_and_chunks(yt_id, transcript)
            print("Text Chunks Finished")# ", text_chunks)

        # Generate summarized segments
        model_responses=None
        if summary_segments_bool:
            model_responses=await saf.generate_summarized_segments(transcript)#,[], 10)
            print("Summarized Chunks Finished")#, model_responses)

        # Make meta summary
        meta_summary=None
        if meta_summary_bool:
            if model_responses is None:
                # Segments were not regenerated in this run: fall back to the ones
                # already stored on the record instead of failing with a NameError.
                # NOTE(review): assumes the record exposes `summarized_chunks` in
                # the same shape that is saved below - confirm.
                model_responses=getattr(transcript_model_data, "summarized_chunks", None)
            if model_responses is not None:
                meta_summary=await saf.meta_summary_generator.generate_meta_summary(model_responses)
                print("Meta Summary Finished: ", meta_summary)

                # add the hook to the meta summary
                recap_hook=await saf.generate_recap_hook(meta_summary)
                meta_summary=recap_hook+"\n"+meta_summary+"\n\nDISCLAIMER: This is all AI generated and there are frequent errors."
            else:
                print("Skipping meta summary, no summarized chunks available for: ", yt_id)

        # get some video metadata
        if video_metadata_bool:
            yt = YouTube('https://www.youtube.com/watch?v='+yt_id)
            date_obj=datetime.datetime.strptime(str(yt.publish_date), "%Y-%m-%d %H:%M:%S")
            full_title=yt.title+"\nStream Date: "+date_obj.strftime("%m/%d/%Y")

        # Save everything
        await saf.save_data(yt_id, save_processed_transcripts)
        if vector_embeedding_bool:
            await saf.save_data(yt_id, {"text_chunks":vectordb_and_textchunks["text_chunks"]})
        if summary_segments_bool:
            await saf.save_data(yt_id, {"summarized_chunks":model_responses})
        if meta_summary is not None:
            await saf.save_data(yt_id, {"meta":meta_summary})
        if video_metadata_bool:
            await saf.save_data(yt_id, {"video_characteristics":{"title":full_title}})

        # Setup discord to send message
        # Re-read the record so the message reflects what is actually stored.
        new_transcript_model_data=await saf.grab_transcript_data(yt_id)
        stored_title=new_transcript_model_data.video_characteristics.get("title", None)
        discord_recaps_to_send.append({"meta":new_transcript_model_data.meta,"yt_id":yt_id, "title":stored_title})

    # Using discord_recaps_to_send to send recaps to discord
    async def send_discord_recaps():
        class MessageSendingClient(discord.Client):
            async def on_ready(self):
                # Close the client even if something below raises; otherwise
                # client.start() would never return.
                try:
                    print(f'Discord logged in as {self.user}')
                    for channel in self.get_all_channels():
                        print(channel.name)
                        if channel.name!="recaps":
                            continue
                        for recap in discord_recaps_to_send:
                            print("Sending Recap")
                            try:
                                await _send_discord_recap(channel, recap)
                            except Exception as e:
                                # print as much as possible
                                print("ERROR: ",e)
                                print(traceback.format_exc())
                finally:
                    await self.close()
                    print("Send and client closed")

        intents = discord.Intents.default()
        intents.messages = True
        client = MessageSendingClient(intents=intents)

        await client.start(keys["discord"])

    # start the discord recaps sending
    try:
        await send_discord_recaps()
    except Exception as e:
        print("ERROR: ",e)
        print(traceback.format_exc())

In [None]:
async def redo_recap_controller(item_count=3):
    """Redo the recap for the last `item_count` transcripts returned by saf.get_all_data().

    The list is walked in reverse order. All selected items are printed first,
    then each one is redone (and sent) on its own.

    Args:
        item_count: how many transcripts to redo; capped at the number available.
    """
    all_transcript_data = await saf.get_all_data()
    item_count=min(item_count, len(all_transcript_data))
    print(f"REDOING RECAPS FOR {item_count} ITEMS")
    selected_transcripts=all_transcript_data[::-1][:item_count]

    for i, transcript_model_data in enumerate(selected_transcripts):
        print(i, transcript_model_data.video_id, transcript_model_data.video_characteristics)

    for i, transcript_model_data in enumerate(selected_transcripts):
        print("Redoing:",i, transcript_model_data.video_id, transcript_model_data.video_characteristics)
        # Only the list of records is passed: redo_recap's second positional
        # parameter is the vector-embedding flag, not a video id.
        await redo_recap([transcript_model_data])

In [None]:
await redo_recap_controller()

In [None]:
for i, transcript_model_data in enumerate(all_transcript_data[::-1]):
    if i==3:
        print(i, transcript_model_data.video_id, transcript_model_data.video_characteristics)
        await redo_recap([transcript_model_data], transcript_model_data.video_id)
        break

In [None]:
for t_d_i in all_transcript_data:
    print(t_d_i.video_characteristics)

In [None]:
print(len(all_transcript_data))

# MISC TESTING

In [None]:
# get the data from the database
video_id="hAf0iOS-2V4"
meta_data=await sync_to_async(TranscriptData.objects.get)(video_id=video_id)

# Get the raw_transcript_data
raw_transcript_data = meta_data.raw_transcript_data
# Get the size in bytes of raw_transcript_data
raw_transcript_size = sys.getsizeof(raw_transcript_data)

# Get the linked_transcript
linked_transcript = meta_data.linked_transcript
# Get the size in bytes of linked_transcript
linked_transcript_size = sys.getsizeof(linked_transcript)

print(f"Size of raw_transcript_data: {raw_transcript_size} bytes")
print(f"Size of linked_transcript: {linked_transcript_size} bytes")

In [None]:
len(raw_transcript_data)

In [None]:
len(meta_data.linked_transcript)

In [None]:
len(str(raw_transcript_data))

In [None]:
raw_transcript_data