In [None]:

# install all dependencies
!pip install pytube
!pip install python-dotenv

!pip install torch torchvision torchaudio
!pip install transformers
!pip install sentencepiece

!pip install openai

!pip install --ignore-installed PyYAML
!pip install langchain



In [None]:
# import secrets
from dotenv import load_dotenv
import os

# Read key=value pairs from a local .env file into the process environment.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ASSEMBLYAI_API_SECRET = os.getenv("ASSEMBLYAI_API_SECRET")

# Report only whether each secret was found. The values themselves must never
# be printed: cell output is saved inside the notebook and is easily shared
# or committed along with it.
print("OPENAI_API_KEY: ", "set" if OPENAI_API_KEY else "MISSING")
print("ASSEMBLYAI_API_SECRET: ", "set" if ASSEMBLYAI_API_SECRET else "MISSING")


**AssemblyAI Helper functions**

In [None]:
# AssemblyAI integration
import requests
import json
import time

# Root URL that every AssemblyAI endpoint path below is appended to.
assemblyAI_base_url = "https://api.assemblyai.com/v2"

# Headers sent with every AssemblyAI request; the API secret loaded from the
# environment is passed as the "authorization" value.
assemblyAI_headers = {
  "authorization": ASSEMBLYAI_API_SECRET,
}

# upload file to assemblyAI
def upload_file(filename):
  """Upload a local file to AssemblyAI and return the URL it was stored at.

  Args:
    filename: Path of the file to upload; it is streamed in binary mode.

  Returns:
    The "upload_url" value from the JSON response.

  Raises:
    requests.HTTPError: if the upload endpoint answers with an error status.
  """
  upload_url = assemblyAI_base_url + "/upload"
  with open(filename, "rb") as f:
    response = requests.post(upload_url, headers=assemblyAI_headers, data=f)

  # Surface HTTP failures (bad key, oversized file, ...) as a clear error
  # instead of an opaque KeyError on the missing "upload_url" field.
  response.raise_for_status()
  return response.json()["upload_url"]

# transcribe the file
def create_transcript(upload_url, poll_interval=3):
  """Request a transcript for an uploaded audio file and wait for the result.

  Args:
    upload_url: URL of the audio to transcribe, as returned by upload_file.
    poll_interval: Seconds to sleep between two status checks.

  Returns:
    The 'text' field of the transcript once its status is 'completed'.

  Raises:
    requests.HTTPError: if creating or polling the transcript returns an
      HTTP error status.
    RuntimeError: if the service reports the transcript status as 'error'.
  """
  transcribe_url = assemblyAI_base_url + "/transcript"
  data = {
    "audio_url": upload_url
  }
  response = requests.post(transcribe_url, json=data, headers=assemblyAI_headers)
  # Fail here with the HTTP error rather than with a KeyError on 'id' below.
  response.raise_for_status()

  transcript_id = response.json()['id']
  polling_url = transcribe_url + "/" + transcript_id

  # Poll until the job reaches a terminal state ('completed' or 'error').
  while True:
    polling_response = requests.get(polling_url, headers=assemblyAI_headers)
    polling_response.raise_for_status()
    transcription_result = polling_response.json()

    status = transcription_result['status']
    if status == 'completed':
      return transcription_result['text']

    if status == 'error':
      raise RuntimeError(f"Transcription failed: {transcription_result['error']}")

    # Any other status means the job is not finished yet.
    time.sleep(poll_interval)

**Step 0: Process the input file**

In [None]:
import csv

# Load videos.csv into a list of dicts, one per data row, keyed by the column
# names. DictReader consumes the header row itself, so no manual header
# skipping or line counting is needed. newline='' is how the csv module asks
# for files to be opened.
with open('videos.csv', mode='r', newline='') as csv_file:
  videos = list(csv.DictReader(csv_file))
print(videos)

**Step 1: Download and transcribe videos**

In [None]:

import os
from pytube import YouTube

def transcribe_video(video_url):
  """Download a video's audio track and save its AssemblyAI transcript.

  The audio is stored as ./videos/<video id>.mp3 and the transcript as
  ./transcripts/<video id>.txt. Each step is skipped when its output file
  already exists, so calling this again for the same video is cheap.

  Args:
    video_url: URL of the YouTube video to process.

  Raises:
    RuntimeError: if the video offers no audio-only stream.
  """
  yt = YouTube(video_url)
  file_id = yt.video_id
  video_file = './videos/' + file_id + '.mp3'
  text_file = './transcripts/' + file_id + '.txt'

  if not os.path.exists(video_file):
    print(f'Downloading video id {file_id}')
    # Only look up the streams when a download is actually needed.
    audio = yt.streams.filter(only_audio=True).first()
    if audio is None:
      raise RuntimeError(f'No audio-only stream available for video id {file_id}')
    # The target directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(video_file), exist_ok=True)
    out_file = audio.download(filename=file_id)
    os.rename(out_file, video_file)

  if not os.path.exists(text_file):
    print(f'Uploading video id {file_id} to assemblyAI')
    # upload the file to assemblyAI
    upload_url = upload_file(video_file)

    # transcribe the file
    print(f'Generating transcript of video id {file_id}')
    transcript = create_transcript(upload_url)

    # The file is only created once the transcript is in hand, so a failed
    # transcription never leaves an empty file that would block a retry.
    os.makedirs(os.path.dirname(text_file), exist_ok=True)
    with open(text_file, 'w') as f:
      f.write(transcript)

# Download and transcribe every video loaded from videos.csv; the 'link'
# column of each row is passed as the video URL.
for video in videos:
  transcribe_video(video['link'])


**Read the video transcript**

In [None]:
# NOTE(review): `video_id` is not assigned by any cell visible above this one;
# it is only set inside the loops of later cells. On a fresh kernel this cell
# raises NameError unless `video_id` is defined first — confirm which video
# the PEGASUS summary is meant for.
with open('./transcripts/' + video_id + '.txt', 'r') as transcript_file:
  # The context manager closes the file as soon as the text has been read.
  video_transcript = transcript_file.read()

**Step 2a: Create an abstractive summary of the transcript using Google’s PEGASUS model**

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch


In [None]:
# Load the tokenizer that belongs to the 'google/pegasus-xsum' checkpoint;
# the same checkpoint name is used for the model itself.
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

In [None]:
# Load the PEGASUS summarization model from the 'google/pegasus-xsum'
# checkpoint (the same checkpoint the tokenizer is loaded from).
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

In [None]:

# Tokenize the transcript as a PyTorch batch; truncation=True cuts input that
# is longer than the model accepts.
tokens = tokenizer(video_transcript, truncation=True, padding='longest', return_tensors='pt')
summary_tokens = model.generate(**tokens)

# skip_special_tokens drops markers such as <pad> and </s>, which would
# otherwise end up verbatim in the saved summary.
summary = tokenizer.decode(summary_tokens[0], skip_special_tokens=True)

# The context manager closes the file even if the write fails.
with open('./summaries/pegasus-' + video_id + '.txt', 'w') as transcript_file:
  transcript_file.write(summary)

**Step 2b: Create an abstractive summary of the transcript using OpenAI's GPT-3 model**


In [None]:
import openai

# Authenticate the openai client with the key read from the environment.
openai.api_key = OPENAI_API_KEY

# split the video transcript into chunks of 2048 characters
def split_text(text, max_chunk_size=2048):
  """Split text into chunks of whole sentences of at most max_chunk_size characters.

  Sentences are delimited by '.'. Consecutive sentences are packed into one
  chunk for as long as they fit. A single sentence longer than
  max_chunk_size is kept intact as its own (oversized) chunk rather than cut
  mid-sentence.

  Args:
    text: The text to split.
    max_chunk_size: Upper bound on the length of a packed chunk, in characters.

  Returns:
    A list of non-empty chunks with surrounding whitespace stripped. Empty or
    whitespace-only text yields an empty list.
  """
  chunks = []
  current_chunk = ''
  pieces = text.split('.')
  last_index = len(pieces) - 1

  for index, piece in enumerate(pieces):
    # Every piece except the last was followed by a '.' in the original text;
    # restoring it only there avoids inventing a trailing period.
    sentence = piece + '.' if index < last_index else piece
    if not sentence.strip():
      continue

    if len(current_chunk) + len(sentence) <= max_chunk_size:
      current_chunk += sentence
    else:
      # Flush what has accumulated, but never emit an empty chunk (this
      # happens when the very first sentence is already too long).
      if current_chunk.strip():
        chunks.append(current_chunk.strip())
      current_chunk = sentence

  if current_chunk.strip():
    chunks.append(current_chunk.strip())
  return chunks

# generate summary using openai
def generate_summary(text):
  """Summarize text with the OpenAI completion API, one chunk at a time.

  The text is cut into chunks with split_text, each chunk is summarized by
  its own completion request, and the partial summaries are joined.

  Args:
    text: The text to summarize.

  Returns:
    The per-chunk summaries joined by single spaces.
  """
  output_chunks = []
  for chunk in split_text(text):
    # An empty chunk has nothing to summarize; do not spend a request on it.
    if not chunk:
      continue
    response = openai.Completion.create(
      engine="davinci",
      prompt=(f"Please summarize the following text: {chunk}\n"),
      temperature=0.5,
      max_tokens=1024,
    )
    summary = response.choices[0].text.strip()
    print('chunk: ', chunk)
    print('summary: ', summary)
    output_chunks.append(summary)

  # Join with a space so the last word of one partial summary does not fuse
  # with the first word of the next.
  return ' '.join(part for part in output_chunks if part)


# Create an OpenAI summary for every video that does not have one yet.
for video in videos:
  # The id is whatever follows the first '=' in the link.
  # NOTE(review): transcribe_video names its files after pytube's video_id;
  # the two agree only for links shaped like ...watch?v=<id> — confirm.
  video_id = video['link'].split('=')[1]

  # read the video transcript (the context manager closes the file)
  with open('./transcripts/' + video_id + '.txt', 'r') as transcript_file:
    video_transcript = transcript_file.read()

  summary_file = './summaries/openai-' + video_id + '.txt'
  if not os.path.exists(summary_file):
    print(f'Generating summary of video id {video_id}')

    # run the summary generation workflow
    summary = generate_summary(video_transcript)

    # save summary to file; the directory may not exist on a fresh checkout
    os.makedirs(os.path.dirname(summary_file), exist_ok=True)
    with open(summary_file, 'w') as f:
      f.write(summary)


**Step 3: Create embeddings of the transcript summary**

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
import csv

# Authenticate the openai client; repeated here so this cell does not rely on
# the Step 2b cell having been run.
openai.api_key = OPENAI_API_KEY

def create_embeddings(data_file, video_id):
  """Embed a video's OpenAI summary and save the result as a CSV file.

  The summary is read from ./summaries/openai-<video_id>.txt, split into
  pieces of up to 2048 characters, and each piece is embedded with the
  "text-embedding-ada-002" model.

  Args:
    data_file: Path of the CSV file to write. It gets a header row followed
      by one row per piece: video_id, text, embedding.
    video_id: Id used to locate the summary file and stored in each row.
  """
  with open('./summaries/openai-' + video_id + '.txt') as text_file:
    summary = text_file.read()

  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2048,
    chunk_overlap  = 20,
  )
  documents = text_splitter.create_documents([summary])

  # Collect every row before touching the output file. If the summary is
  # missing or an API call fails, no partial CSV is left behind for the
  # caller's "file already exists" check to mistake for a finished result.
  rows = []
  for text in documents:
    embedding = openai.Embedding.create(model="text-embedding-ada-002", input=text.page_content)
    query_result = embedding['data'][0]['embedding']
    rows.append([video_id, text.page_content, query_result])

  # The target directory may not exist on a fresh checkout.
  os.makedirs(os.path.dirname(data_file) or '.', exist_ok=True)

  # newline='' is how the csv module asks for output files to be opened.
  with open(data_file, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['video_id', 'text', 'embedding'])
    csv_writer.writerows(rows)


# Create the embeddings CSV for every video that does not have one yet.
for video in videos:
  # The id is whatever follows the first '=' in the link.
  video_id = video['link'].split('=')[1]
  data_file = './data/' + video_id + '.csv'
  # An existing data file is taken to mean the embeddings were already made.
  if not os.path.exists(data_file):
    print(f'Generating embeddings for video id {video_id}')
    create_embeddings(data_file, video_id) 

**Step 4: Clustering**

In [None]:
import numpy as np
import pandas as pd

# Load every per-video embeddings CSV into one matrix (input for clustering)
# and one DataFrame (to look up the video and text behind each matrix row).
import ast

embeddings = np.empty((0, 1536), float) # todo: change hardcoded number
embedding_frames = []

for video in videos:
  video_id = video['link'].split('=')[1]
  datafile_path = './data/' + video_id + '.csv'
  df = pd.read_csv(datafile_path)
  # The embedding column holds the text form of the list written by
  # create_embeddings. literal_eval parses it without executing arbitrary
  # code, which eval would do if the file were ever tampered with.
  df["embedding"] = df.embedding.apply(ast.literal_eval).apply(np.array)
  matrix = np.vstack(df.embedding.values)

  # stack this video's rows under the ones collected so far
  embeddings = np.vstack((embeddings, matrix))
  embedding_frames.append(df)

# DataFrame.append was removed in pandas 2.0; a single concat is also linear
# rather than quadratic. ignore_index gives every row a unique position that
# lines up with its row in `embeddings`.
embeddings_df = pd.concat(embedding_frames, ignore_index=True) if embedding_frames else pd.DataFrame()

from sklearn.cluster import KMeans

# Number of clusters to split the embeddings into.
n_clusters = 4

# Fit k-means on the embedding matrix; random_state is fixed so repeated runs
# use the same seed.
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
kmeans.fit(embeddings)
labels = kmeans.labels_
# Store each row's cluster label next to its text. `labels` is in the same
# row order as `embeddings`, which was built alongside `embeddings_df`.
embeddings_df['cluster'] = labels

from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt

# Project the embeddings down to two dimensions for plotting.
tsne = TSNE(n_components=2, perplexity=3, random_state=42, init="random", learning_rate=200)
vis_dims2 = tsne.fit_transform(embeddings)

x = [x for x, y in vis_dims2]
y = [y for x, y in vis_dims2]

x_values = np.array(x)
y_values = np.array(y)
cluster_ids = embeddings_df['cluster'].to_numpy()

# Colours are taken from the labels actually present and cycled when there
# are more clusters than colours, so changing the number of clusters can
# never silently leave a cluster off the plot.
palette = ["purple", "green", "red", "black"]
for category in np.unique(cluster_ids):
  color = palette[int(category) % len(palette)]
  # one boolean mask per cluster selects both coordinates
  in_cluster = cluster_ids == category
  plt.scatter(x_values[in_cluster], y_values[in_cluster], color=color, alpha=0.3)

plt.title("Clusters identified visualized in 2d using t-SNE")
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.show()
