In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# install the following libraries: cohere to use the x-large model. 
!pip install cohere youtube_transcript_api

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cohere
  Downloading cohere-3.3.2.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting youtube_transcript_api
  Downloading youtube_transcript_api-0.5.0-py3-none-any.whl (23 kB)
Collecting urllib3~=1.26
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: cohere
  Building wheel for cohere (setup.py) ... [?25l[?25hdone
  Created wheel for cohere: filename=cohere-3.3.2-cp38-cp38-linux_x86_64.whl size=15691 sha256=58717a256202ab1308a76f159372f08f5540adba3823a359cd5c92f99ef8e41a
  Stored in directory: /root/.cache/pip/wheels/b5/e9/5f/3bb45881f18598535b297e45e50bab371e8f4b2078572668c2
Successfully built cohere
Installing collected packages: urllib3, youtube_transcript_api, cohere
  Attempt

In [3]:
!cp /content/drive/MyDrive/weblm/WebLM_interactive_src/cohereapikey.txt  /content/

In [4]:
%cd /content/
!mkdir /content/response_logs

/content


# summarize a youtube video using cohere's 

In [11]:
import cohere
import sys
from youtube_transcript_api import YouTubeTranscriptApi
from time import time,sleep
import re

diagnostics = 0
include_mentions = 0


def open_file(filepath):
    with open(filepath, 'r', encoding='utf-8') as infile:
        return infile.read()

def save_file(filepath, content):
    with open(filepath, 'w', encoding='utf-8') as outfile:
        outfile.write(content)

co= cohere.Client(open_file('/content/cohereapikey.txt'))

def get_video_id_from_video_id_or_url(video_id_or_url):
    # fetch the video ID from the URL. if it's more that 11 characters long, crop it to make it 11. 
    if len(video_id_or_url) > 11:
        return video_id_or_url[-11:]
    else:
        return video_id_or_url

def get_chunks_from_youtube(video_id):
    # fetch video's transcript
    # and chunk it into several 10min intervals
    transcript = YouTubeTranscriptApi.get_transcript(video_id)

    chunks = []

    start_timestamp = 0.0
    current_timestamp_mins = 0.0

    current_chunk = []

    for entry in transcript:
        current_timestamp_mins = entry['start'] / 60.0

        # chunk at 10 minutes intervals
        if current_timestamp_mins - start_timestamp > 10:
            # add current chunk to a list of chunks
            chunks.append(current_chunk)
            # then reset the start timestamp
            start_timestamp = current_timestamp_mins
            # reset current chunk
            current_chunk = []

        # append the chunk's text
        current_chunk.append(entry['text'])

    # the last chunk of the video
    if len(current_chunk) > 0:
        chunks.append(current_chunk)

    print(f"Found {len(chunks)} chunks")

    return chunks

def summarize_chunk(index, chunk):
    chunk_str = "\n".join(chunk)
    prompt = f"""The following is a section of the transcript of a youtube video. It is section #{index+1}:
    {chunk_str}
    Summarize this section of the transcript."""

    if diagnostics:
        for line in prompt.split('\n'):
            print(f"# {line}")

    
    response = co.generate(
                # model='xlarge'
                model='command-beta',
                prompt= prompt,
                max_tokens=500,
                temperature=1.8,
                k=0,
                p=0.65,
                frequency_penalty=0.15,
                presence_penalty=0.15,
                stop_sequences=[],
                return_likelihoods='NONE')
    text_response = response.generations[0].text.strip()
    text_response = re.sub('\s+', ' ', text_response)
    filename = '%s_logs.txt' % time()
    with open('response_logs/%s' % filename, 'w') as outfile:
        outfile.write('PROMPT:\n\n' + prompt + '\n\n==========\n\nRESPONSE:\n\n' + text_response)
    with open('response.txt', 'w') as f:
        f.write(text_response)
    
    if diagnostics:
        print(f"# Response: {text_response}")
    
    return text_response

def summarize_the_summaries(summaries):
    max_retry = 5
    retry = 0
    summaries_str = ""
    for index, summary in enumerate(summaries):
        summaries_str += f"Summary of chunk {index+1}:\n{summary}\n\n"

    prompt = f"""The following are summaries of a youtube video in 10 minute chunks:"
    {summaries_str}
    Summarize the summaries."""

    # prompt = prompt.encode(encoding='ASCII',errors='ignore').decode()

    if diagnostics:
        for line in prompt.split('\n'):
            print(f"# {line}")

    while True:
        try:
            response = co.generate(
                # model='xlarge'
                model='command-beta',
                prompt= prompt,
                max_tokens=500,
                temperature=1.8,
                k=0,
                p=0.65,
                frequency_penalty=0.15,
                presence_penalty=0.15,
                stop_sequences=[],
                return_likelihoods='NONE')
            text_response = response.generations[0].text.strip()
            text_response = re.sub('\s+', ' ', text_response)
            filename = '%s_log.txt' % time()
            with open('response_logs/%s' % filename, 'w') as outfile:
                outfile.write('PROMPT:\n\n' + prompt + '\n\n==========\n\nRESPONSE:\n\n' + text_response)
            with open('response.txt', 'w') as f:
                f.write(text_response)
            return text_response
        except Exception as oops:
            retry += 1
            if retry >= max_retry:
                return "error: %s" % oops
            print('Error communicating with Cohere:', oops)
            sleep(1)

    if diagnostics:
        print(f"# Response: {text_response}")

    return text_response

def main():
    # the video transcript
    if len(sys.argv) < 2:
        print("Usage: python3 sumvid.py <video id or url>")
        sys.exit(1)

    video_id_or_url = sys.argv[1]

    # if the video id or url is a url, extract the video id
    video_id = get_video_id_from_video_id_or_url(video_id_or_url)

    if len(sys.argv) > 2:
        for arg in sys.argv[2:]:
            if arg == "--diagnostics":
                global diagnostics
                diagnostics = True

            if arg == "--mentions":
                global include_mentions
                include_mentions = True

    chunks = get_chunks_from_youtube(video_id)

    if len(chunks) == 0:
        print("No chunks found")
    elif len(chunks) == 1:
        summary = summarize_chunk(0, chunks[0])
        print(f"\nSummary: {summary}")

    else:
        # summarize each chunk
        summaries = []
        for index, chunk in enumerate(chunks):
            summary = summarize_chunk(index, chunk)
            summaries.append(summary)
            print(f"\nSummary of chunk {index+1}: {summary}")

        # compile the summaries
        summary_of_summaries = summarize_the_summaries(summaries)

        print(f"\nSummary of summaries: {summary_of_summaries}")

if __name__ == "__main__":
    main()

In [10]:
# with mentions of people and places
%cd /content
!python3 /content/summarize_youtube.py https://www.youtube.com/watch?v=3m2Cpbpr1zM

/content
Found 1 chunks

Summary: Section #1: David is explaining how to play a game where you silently signal to your partner if you trust a tradesman or not. He pretends that Bob is a tradesman and Alice is his wife. Bob comes to repair their broadband and David tells Alice to get the door. Alice introduces herself and David to Bob. David asks Bob how much it will cost to fix the broadband, and Bob says it will cost 2000 pounds. David is shocked and asks if he is telling the truth, and Alice says no.


In [35]:
!python3 /content/summarize_youtube.py https://www.youtube.com/watch?v=E_sMa3N44u4

Found 4 chunks
Traceback (most recent call last):
  File "/content/summarize_youtube.py", line 212, in <module>
    main()
  File "/content/summarize_youtube.py", line 202, in main
    summary = summarize_chunk(index, chunk)
  File "/content/summarize_youtube.py", line 101, in summarize_chunk
    text_response = response.generations[0].text.strip()
  File "/usr/local/lib/python3.8/dist-packages/cohere/response.py", line 54, in __getattribute__
    return attr.resolve()
  File "/usr/local/lib/python3.8/dist-packages/cohere/response.py", line 41, in resolve
    self._result = self._getter(self._request.result())
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/usr/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/usr/local/lib/python3.8/dist-packa

In [37]:
!python3 /content/summarize_youtube.py https://www.youtube.com/watch?v=JsLH0SeqAEc

Found 1 chunks
Traceback (most recent call last):
  File "/content/summarize_youtube.py", line 212, in <module>
    main()
  File "/content/summarize_youtube.py", line 195, in main
    summary = summarize_chunk(0, chunks[0])
  File "/content/summarize_youtube.py", line 101, in summarize_chunk
    text_response = response.generations[0].text.strip()
  File "/usr/local/lib/python3.8/dist-packages/cohere/response.py", line 54, in __getattribute__
    return attr.resolve()
  File "/usr/local/lib/python3.8/dist-packages/cohere/response.py", line 41, in resolve
    self._result = self._getter(self._request.result())
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/usr/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/usr/local/lib/python3.8/dist-packa

In [38]:
!python3 /content/summarize_youtube.py https://youtu.be/_psRCXRyWq8

Found 1 chunks

Summary: Section #1 of the transcript is a description of the meniscus and how it can become injured. The meniscus is a type of cartilage that has two main functions: helps spread a person's weight evenly across the joint whilst also providing some stability. Injuries to the meniscus are often described as a cartilage tear. Arthroscopic punch and shaver are used to remove the torn fragment.


In [39]:
!python3 /content/summarize_youtube.py https://www.youtube.com/watch?v=cTxBLn-DoEQ

Found 6 chunks
Traceback (most recent call last):
  File "/content/summarize_youtube.py", line 212, in <module>
    main()
  File "/content/summarize_youtube.py", line 202, in main
    summary = summarize_chunk(index, chunk)
  File "/content/summarize_youtube.py", line 101, in summarize_chunk
    text_response = response.generations[0].text.strip()
  File "/usr/local/lib/python3.8/dist-packages/cohere/response.py", line 54, in __getattribute__
    return attr.resolve()
  File "/usr/local/lib/python3.8/dist-packages/cohere/response.py", line 41, in resolve
    self._result = self._getter(self._request.result())
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/usr/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/usr/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/usr/local/lib/python3.8/dist-packa