In [2]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Install the third-party dependencies: `cohere` (client for the xlarge
# generation model) and `youtube_transcript_api` (transcript download).
# Versions are pinned to the ones this notebook was last run with, and %pip is
# used instead of !pip so the install targets the running kernel's environment.
%pip install cohere==3.3.3 youtube_transcript_api==0.5.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cohere
  Downloading cohere-3.3.3.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting youtube_transcript_api
  Downloading youtube_transcript_api-0.5.0-py3-none-any.whl (23 kB)
Collecting urllib3~=1.26
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: cohere
  Building wheel for cohere (setup.py) ... [?25l[?25hdone
  Created wheel for cohere: filename=cohere-3.3.3-cp38-cp38-linux_x86_64.whl size=15682 sha256=672aed398b0d83d168b4a55dd8eb09e4f4d591aeebecb3c2d6513aed5dcfc431
  Stored in directory: /root/.cache/pip/wheels/da/e7/ac/033673c03849f03bf424822e247487cd9b89dcb4e5ef609901
Successfully built cohere
Installing collected packages: urllib3, youtube_transcript_api, cohere
  Attempt

In [3]:
# Copy the Cohere API key file from the mounted Drive into /content/.
!cp /content/drive/MyDrive/weblm/WebLM_interactive_src/cohereapikey.txt  /content/

In [4]:
# Work from /content and create the directory that receives the prompt/response logs.
%cd /content/
# -p keeps this cell re-runnable: plain mkdir fails once the directory exists.
!mkdir -p /content/response_logs

/content


# Summarize a YouTube video using Cohere's xlarge model

In [None]:
import cohere
import sys
from youtube_transcript_api import YouTubeTranscriptApi
from time import time,sleep
import re

# Runtime flags; main() switches them on from command-line options.
#   diagnostics      -- when truthy, prompts and model responses are echoed to stdout.
#   include_mentions -- set by the --mentions option.
diagnostics, include_mentions = 0, 0


def open_file(filepath):
    """Return the full text of the file at ``filepath``, decoded as UTF-8."""
    with open(filepath, encoding='utf-8', mode='r') as handle:
        contents = handle.read()
    return contents

def save_file(filepath, content):
    """Write the string ``content`` to ``filepath`` as UTF-8, replacing any existing file."""
    with open(filepath, encoding='utf-8', mode='w') as handle:
        handle.write(content)

# Shared Cohere client used by the summarize_* functions. The API key is read
# from /content/cohereapikey.txt; surrounding whitespace (such as a trailing
# newline left by an editor) is stripped so it is not sent as part of the key.
co = cohere.Client(open_file('/content/cohereapikey.txt').strip())

def get_video_id_from_video_id_or_url(video_id_or_url):
    """Return the YouTube video id for a bare id or a video URL.

    Args:
        video_id_or_url: Either a video id (11 characters or fewer, returned
            unchanged) or a URL that contains one.

    Returns:
        The 11-character id found after ``v=``, ``youtu.be/``, ``/embed/``,
        ``/shorts/`` or ``/live/``. When none of those markers is present the
        last 11 characters of the input are returned, which is what this
        function always did for long inputs.
    """
    if len(video_id_or_url) <= 11:
        return video_id_or_url

    # Slicing the tail alone breaks on URLs that carry extra query parameters
    # (for example "...watch?v=<id>&t=42s"), so look for the id explicitly first.
    match = re.search(
        r'(?:[?&]v=|youtu\.be/|/embed/|/shorts/|/live/)([0-9A-Za-z_-]{11})',
        video_id_or_url,
    )
    if match:
        return match.group(1)

    # Unrecognized shape: keep the historical "last 11 characters" behavior.
    return video_id_or_url[-11:]

def get_chunks_from_youtube(video_id):
    """Download a video's transcript and group its captions into 5 minute chunks.

    Args:
        video_id: Video id passed to ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        A list of chunks in transcript order. Each chunk is a non-empty list of
        caption text strings. A new chunk begins once a caption starts more
        than five minutes after the current chunk's window began; the first
        window begins at 0:00.

    Side effects:
        Prints the number of chunks found.
    """
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    chunk_length_mins = 5
    chunks = []
    current_chunk = []
    # Start (in minutes) of the window the current chunk belongs to.
    chunk_start_mins = 0.0

    for entry in transcript:
        # entry['start'] is divided by 60 to compare against the minute-based window.
        entry_start_mins = entry['start'] / 60.0

        if entry_start_mins - chunk_start_mins > chunk_length_mins:
            # Close the running chunk. It is still empty here when the very
            # first caption starts more than five minutes in; skip it so
            # callers never receive an empty chunk to summarize.
            if current_chunk:
                chunks.append(current_chunk)
            chunk_start_mins = entry_start_mins
            current_chunk = []

        current_chunk.append(entry['text'])

    # Whatever is left over forms the last chunk of the video.
    if current_chunk:
        chunks.append(current_chunk)

    print(f"Found {len(chunks)} chunks")

    return chunks

def summarize_chunk(index, chunk):
    """Ask the Cohere model for a short summary of one transcript chunk.

    Args:
        index: Zero-based position of the chunk; shown 1-based in the prompt.
        chunk: List of caption text strings belonging to this chunk.

    Returns:
        The text of the model's first generation, stripped, with every run of
        whitespace collapsed to a single space.

    Side effects:
        Writes the prompt and response to a timestamped file under
        ``response_logs/`` (relative to the working directory; the directory
        must already exist) and overwrites ``response.txt`` with the response.
        Echoes the prompt and response to stdout when ``diagnostics`` is truthy.
    """
    chunk_str = "\n".join(chunk)
    prompt = f"""The following is a section of the transcript of a youtube video. It is section #{index+1}:
    {chunk_str}
    Briefly summarize this section of the transcript in 100 characters or less."""

    if diagnostics:
        for line in prompt.split('\n'):
            print(f"# {line}")

    response = co.generate(
        # The original call lacked the comma after this argument, which made
        # the whole cell a SyntaxError. Alternative model noted by the author:
        # 'command-beta'.
        model='xlarge',
        prompt=prompt,
        max_tokens=500,
        temperature=1.8,
        k=0,
        p=0.65,
        frequency_penalty=0.15,
        presence_penalty=0.15,
        stop_sequences=[],
        return_likelihoods='NONE')
    text_response = response.generations[0].text.strip()
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text_response = re.sub(r'\s+', ' ', text_response)

    # Explicit UTF-8 so non-ASCII transcript text cannot fail on a non-UTF-8 locale.
    filename = '%s_logs.txt' % time()
    with open('response_logs/%s' % filename, 'w', encoding='utf-8') as outfile:
        outfile.write('PROMPT:\n\n' + prompt + '\n\n==========\n\nRESPONSE:\n\n' + text_response)
    with open('response.txt', 'w', encoding='utf-8') as f:
        f.write(text_response)

    if diagnostics:
        print(f"# Response: {text_response}")

    return text_response

def summarize_the_summaries(summaries):
    """Ask the Cohere model to condense the per-chunk summaries into one summary.

    Args:
        summaries: List of summary strings, one per chunk, in video order.

    Returns:
        The text of the model's first generation, stripped, with every run of
        whitespace collapsed to a single space. If the request (or writing the
        log files) fails ``max_retry`` times in a row, the string
        ``"error: <exception>"`` is returned instead of raising.

    Side effects:
        Writes the prompt and response to a timestamped file under
        ``response_logs/`` (relative to the working directory) and overwrites
        ``response.txt``. Echoes the prompt and response to stdout when
        ``diagnostics`` is truthy. Sleeps one second between failed attempts.
    """
    max_retry = 5
    retry = 0

    summaries_str = "".join(
        f"Summary of chunk {index+1}:\n{summary}\n\n"
        for index, summary in enumerate(summaries)
    )

    # NOTE(review): the first prompt line ends with a stray '"'. It is left
    # untouched because changing the prompt changes what the model is sent.
    prompt = f"""The following are summaries of a youtube video in 5 minute chunks:"
    {summaries_str}
    Summarize the summaries."""

    if diagnostics:
        for line in prompt.split('\n'):
            print(f"# {line}")

    while True:
        try:
            response = co.generate(
                # The original call lacked the comma after this argument,
                # which made the whole cell a SyntaxError. Alternative model
                # noted by the author: 'command-beta'.
                model='xlarge',
                prompt=prompt,
                max_tokens=500,
                temperature=1.8,
                k=0,
                p=0.65,
                frequency_penalty=0.15,
                presence_penalty=0.15,
                stop_sequences=[],
                return_likelihoods='NONE')
            text_response = response.generations[0].text.strip()
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            text_response = re.sub(r'\s+', ' ', text_response)
            filename = '%s_log.txt' % time()
            with open('response_logs/%s' % filename, 'w', encoding='utf-8') as outfile:
                outfile.write('PROMPT:\n\n' + prompt + '\n\n==========\n\nRESPONSE:\n\n' + text_response)
            with open('response.txt', 'w', encoding='utf-8') as f:
                f.write(text_response)
            # Leave the retry loop instead of returning here, so the
            # diagnostics echo below is reachable (it was dead code before).
            break
        except Exception as oops:
            retry += 1
            if retry >= max_retry:
                return "error: %s" % oops
            print('Error communicating with Cohere:', oops)
            sleep(1)

    if diagnostics:
        print(f"# Response: {text_response}")

    return text_response

def main(argv=None):
    """Command-line entry point: summarize the YouTube video named in ``argv``.

    Args:
        argv: Argument list laid out like ``sys.argv``: program name first,
            then the video id or URL, then optional flags. Defaults to
            ``sys.argv``; pass a list explicitly to call this from a notebook
            or a test.

    Recognized flags:
        --diagnostics  sets the module-level ``diagnostics`` flag to True.
        --mentions     sets the module-level ``include_mentions`` flag to True.

    Exits with status 1 (after printing a usage line) when no video id or URL
    is supplied. Otherwise prints the per-chunk summaries and, when the video
    has more than one chunk, a summary of those summaries.
    """
    # Declared once at the top rather than inside the flag-parsing branches.
    global diagnostics, include_mentions

    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        print("Usage: python3 sumvid.py <video id or url>")
        sys.exit(1)

    video_id_or_url = argv[1]

    # if the video id or url is a url, extract the video id
    video_id = get_video_id_from_video_id_or_url(video_id_or_url)

    for arg in argv[2:]:
        if arg == "--diagnostics":
            diagnostics = True

        if arg == "--mentions":
            include_mentions = True

    chunks = get_chunks_from_youtube(video_id)

    if len(chunks) == 0:
        print("No chunks found")
    elif len(chunks) == 1:
        summary = summarize_chunk(0, chunks[0])
        print(f"\nSummary: {summary}")

    else:
        # summarize each chunk
        summaries = []
        for index, chunk in enumerate(chunks):
            summary = summarize_chunk(index, chunk)
            summaries.append(summary)
            print(f"\nSummary of chunk {index+1}: {summary}")

        # compile the summaries
        summary_of_summaries = summarize_the_summaries(summaries)

        print(f"\nSummary of summaries: {summary_of_summaries}")

# Run the summarizer only when this code is executed as the main program.
# NOTE(review): inside a notebook kernel __name__ is also "__main__", so running
# this cell directly calls main() with the kernel's own sys.argv. The later
# cells invoke /content/summarize_youtube.py, so this code is presumably meant
# to be saved to that path first — confirm.
if __name__ == "__main__":
    main()

In [5]:
# Summarize a video given its full watch URL.
# NOTE(review): this cell was labelled "with mentions of people and places",
# but --mentions is not passed to the script — confirm the intent.
%cd /content
# The URL is quoted so the shell never treats '?' as a glob character.
!python3 /content/summarize_youtube.py "https://www.youtube.com/watch?v=3m2Cpbpr1zM"

/content
Found 1 chunks

Summary: In this section of the transcript, David's team is discussing a mug that they think belongs to Lee. They are trying to determine if the mug is really Lee's mug or not.


In [7]:
# The URL is quoted so the shell never treats '?' as a glob character.
!python3 /content/summarize_youtube.py "https://www.youtube.com/watch?v=JsLH0SeqAEc"

Found 1 chunks
Traceback (most recent call last):
  File "/content/summarize_youtube.py", line 193, in <module>
    main()
  File "/content/summarize_youtube.py", line 176, in main
    summary = summarize_chunk(0, chunks[0])
  File "/content/summarize_youtube.py", line 86, in summarize_chunk
    text_response = response.generations[0].text.strip()
  File "/usr/local/lib/python3.8/dist-packages/cohere/response.py", line 54, in __getattribute__
    return attr.resolve()
  File "/usr/local/lib/python3.8/dist-packages/cohere/response.py", line 41, in resolve
    self._result = self._getter(self._request.result())
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/usr/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/usr/local/lib/python3.8/dist-packag

In [11]:
# Short-link form of the URL; quoted to match how URLs with '?' should be passed.
!python3 /content/summarize_youtube.py "https://youtu.be/_psRCXRyWq8"

Found 1 chunks

Summary: Section #1 of the transcript describes a patient's knee surgery to repair a torn meniscus. The meniscus helps spread weight evenly across the joint and provides stability. The surgery involves using an arthroscopic punch and shaver to remove the torn fragment, and a shaver to remove any debris or loose fragments. In young patients, a torn meniscus can occur due to a forceful action such as a deep squat or pivot, while in older patients less force is required. The patient normally goes home the same day and can return to driving and sports within four to six weeks.


In [13]:
# The URL is quoted so the shell never treats '?' as a glob character.
!python3 /content/summarize_youtube.py "https://www.youtube.com/watch?v=cTxBLn-DoEQ"

Found 10 chunks

Summary of chunk 1: The section is a brief introduction of the speakers for the video. They include: Annie Kreitzer, principal designer for the experiment and the campaign and also team lead for integrated modeling of these experiments; Jean-Michel de Nicola, chief engineer for nif laser systems; and Alex Zylstra, the head of the NIF. They describe their roles in the project and how it relates to the overall goal of creating a design that can reach the extreme conditions required for Fusion ignition on a NIF (National Ignition Facility).

Summary of chunk 2: Section #2 of the transcript discusses the continued support of the Department of Energy and the National Security Agency for the Mission. It also talks about the knife laser, the largest laser in the world, and its delivery of energy. The transcript then moves on to discuss the efforts to achieve ignition and the role of targets in this process. It also mentions the importance of diagnostics in understanding the c