In [None]:

# install all dependencies
!pip install pytube
!pip install python-dotenv

!pip install torch torchvision torchaudio
!pip install transformers
!pip install sentencepiece

!pip install openai

!pip install --ignore-installed PyYAML
!pip install langchain



In [151]:
# import secrets
from dotenv import load_dotenv
import os

# load variables defined in a .env file into the process environment
load_dotenv()

# pull both API credentials from the environment; a variable that is not set yields None
OPENAI_API_KEY, ASSEMBLYAI_API_SECRET = (
  os.environ.get(name) for name in ("OPENAI_API_KEY", "ASSEMBLYAI_API_SECRET")
)


**AssemblyAI Helper functions**

In [7]:
# AssemblyAI integration
import requests
import json
import time

# root of the AssemblyAI v2 REST API; the helpers below append endpoint paths to it
assemblyAI_base_url = "https://api.assemblyai.com/v2"

# headers sent with every AssemblyAI request; the API secret goes in "authorization"
assemblyAI_headers = dict(authorization=ASSEMBLYAI_API_SECRET)

# upload file to assemblyAI
def upload_file(filename):
  """Upload a local file to AssemblyAI and return the URL it was stored under.

  Args:
    filename: path of the file to upload. It is opened in binary mode and
      passed to requests as the request body.

  Returns:
    The "upload_url" field of the JSON response.

  Raises:
    requests.HTTPError: if the upload request is answered with a 4xx/5xx status.
  """
  endpoint = assemblyAI_base_url + "/upload"
  with open(filename, "rb") as f:
    response = requests.post(endpoint, headers=assemblyAI_headers, data=f)

  # surface the HTTP failure itself instead of a KeyError on the missing "upload_url" field
  response.raise_for_status()
  return response.json()["upload_url"]

# transcribe the file
def create_transcript(upload_url, poll_interval=3, timeout=None):
  """Request a transcript for an uploaded audio file and wait for the result.

  Args:
    upload_url: URL of the audio to transcribe (as returned by upload_file).
    poll_interval: seconds to sleep between two status checks (default 3).
    timeout: maximum number of seconds to wait for the transcript; None
      (the default) waits indefinitely.

  Returns:
    The "text" field of the transcript once its status is "completed".

  Raises:
    requests.HTTPError: if any request is answered with a 4xx/5xx status.
    RuntimeError: if the transcript status becomes "error".
    TimeoutError: if `timeout` seconds pass before the transcript is ready.
  """
  transcribe_url = assemblyAI_base_url + "/transcript"
  response = requests.post(transcribe_url, json={"audio_url": upload_url}, headers=assemblyAI_headers)
  # fail on the HTTP status rather than on a missing 'id' field below
  response.raise_for_status()

  polling_url = transcribe_url + "/" + response.json()['id']
  deadline = None if timeout is None else time.monotonic() + timeout

  while True:
    poll_response = requests.get(polling_url, headers=assemblyAI_headers)
    poll_response.raise_for_status()
    transcription_result = poll_response.json()
    status = transcription_result['status']

    if status == 'completed':
      return transcription_result['text']

    if status == 'error':
      raise RuntimeError(f"Transcription failed: {transcription_result['error']}")

    # any other status means the job is not finished yet: wait, unless out of time
    if deadline is not None and time.monotonic() >= deadline:
      raise TimeoutError(f"Transcription not finished after {timeout} seconds (last status: {status})")
    time.sleep(poll_interval)

**Step 0: Process the input file**

In [163]:
import csv

# Load the list of videos. csv.DictReader consumes the header row itself and
# yields one dict per data row keyed by the column names, so no manual
# header/line counting is needed.
# newline='' is the mode the csv module asks for when opening files.
with open('videos.csv', mode='r', newline='') as csv_file:
  videos = list(csv.DictReader(csv_file))
print(videos)

[{'link': 'https://www.youtube.com/watch?v=uFg3tDxmqiE', 'category': 'finance'}, {'link': 'https://www.youtube.com/watch?v=45awieDmjVQ', 'category': 'food'}, {'link': 'https://www.youtube.com/watch?v=nqXSTwKw54A', 'category': 'travel'}, {'link': 'https://www.youtube.com/watch?v=mexOYMrM0sE', 'category': 'career'}, {'link': 'https://www.youtube.com/watch?v=hXP5fUfBGQQ', 'category': 'productivity'}, {'link': 'https://www.youtube.com/watch?v=AKYaqQQRBmI', 'category': 'finance'}, {'link': 'https://www.youtube.com/watch?v=DrYP9V4f8Ng', 'category': 'food'}, {'link': 'https://www.youtube.com/watch?v=VmWTHELjmtk', 'category': 'travel'}, {'link': 'https://www.youtube.com/watch?v=RK9SjbVti4Q', 'category': 'career'}, {'link': 'https://www.youtube.com/watch?v=FbSNfj2S6Pw', 'category': 'productivity'}, {'link': 'https://www.youtube.com/watch?v=dLaET6wBga0', 'category': 'finance'}, {'link': 'https://www.youtube.com/watch?v=g27Xh7RVRU4', 'category': 'food'}, {'link': 'https://www.youtube.com/watch?v=

**Step 1: Download and transcribe videos**

In [None]:

import os
from pytube import YouTube

def transcribe_video(video_url):
  """Download the audio of a YouTube video and store its AssemblyAI transcript.

  Both steps are cached on disk: the audio is saved as ./videos/<id>.mp3 and
  the transcript as ./transcripts/<id>.txt, and a step is skipped when its
  file already exists.

  Args:
    video_url: URL of the YouTube video.

  Raises:
    RuntimeError: if the video offers no audio-only stream.
  """
  yt = YouTube(video_url)
  file_id = yt.video_id
  video_file = './videos/' + file_id + '.mp3'
  text_file = './transcripts/' + file_id + '.txt'

  if not os.path.exists(video_file):
    print(f'Downloading video id {file_id}')
    os.makedirs('./videos', exist_ok=True)
    # the stream list is only queried when a download is actually needed
    audio = yt.streams.filter(only_audio=True).first()
    if audio is None:
      raise RuntimeError(f'No audio-only stream available for video id {file_id}')
    out_file = audio.download(filename=file_id)
    # move the downloaded file to its cache location
    os.rename(out_file, video_file)

  if not os.path.exists(text_file):
    print(f'Uploading video id {file_id} to assemblyAI')
    # upload the file to assemblyAI
    upload_url = upload_file(video_file)

    # transcribe the file
    print(f'Generating transcript of video id {file_id}')
    transcript = create_transcript(upload_url)

    # the file is only created once the transcript is available, and is closed even on error
    os.makedirs('./transcripts', exist_ok=True)
    with open(text_file, 'w') as f:
      f.write(transcript)

for video in videos:
  transcribe_video(video['link'])


**Step 2a: Create an abstractive summary of the transcript using Google’s PEGASUS model**

In [11]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [12]:
# load tokenizer
tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')

In [13]:
# load model
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

In [None]:

def generate_summary(text):
  """Summarize `text` with the PEGASUS tokenizer and model loaded in the cells above.

  NOTE: a later cell defines another `generate_summary` (OpenAI based) that
  replaces this one in the notebook namespace once it has been run.

  Args:
    text: the text to summarize.

  Returns:
    The decoded summary string, without tokenizer special tokens.
  """
  # truncation=True lets the tokenizer cut inputs that exceed its maximum length
  tokens = tokenizer(text, truncation=True, padding='longest', return_tensors='pt')
  summary_tokens = model.generate(**tokens)
  # skip_special_tokens keeps markers such as <pad> and </s> out of the summary;
  # the saved pegasus-<id>.txt files are what the embedding step reads later on
  summary = tokenizer.decode(summary_tokens[0], skip_special_tokens=True)
  return summary

# Generate a Pegasus summary for every video that does not have one yet.
for video in videos:
  video_id = video['link'].split('=')[1]

  summary_file = './summaries/pegasus-' + video_id + '.txt'
  if not os.path.exists(summary_file):
    print(f'Pegasus: Generating summary of video id {video_id}')

    # read the video transcript (only needed when a summary has to be generated);
    # the with-block closes the file again
    with open('./transcripts/' + video_id + '.txt', 'r') as transcript_file:
      video_transcript = transcript_file.read()

    # run the summary generation workflow
    summary = generate_summary(video_transcript)

    # save summary to file
    os.makedirs('./summaries', exist_ok=True)
    with open(summary_file, 'w') as f:
      f.write(summary)

In [17]:
# clean up the summary data generated by Pegasus
import re
# matches a tag-like span: "<", the shortest possible run of characters, then ">"
CLEANR = re.compile('<.*?>')

def cleanhtml(raw_html):
  """Return `raw_html` with every span matched by CLEANR removed."""
  return CLEANR.sub('', raw_html)

# Write a cleaned copy (tag-like markers removed) of every Pegasus summary.
for video in videos:
  video_id = video['link'].split('=')[1]

  # with-blocks make sure both the input and the output file are closed
  with open('./summaries/pegasus-' + video_id + '.txt', 'r') as summary_file:
    summary = cleanhtml(summary_file.read())

  with open('./summaries/pegasus-' + video_id + '-clean.txt', 'w') as clean_summary_file:
    clean_summary_file.write(summary)

**Step 2b: Create an abstractive summary of the transcript using OpenAI's GPT-3 model**


In [None]:
import openai

# configure the openai client with the key read from the environment in the secrets cell
openai.api_key = OPENAI_API_KEY

# split the video transcript into chunks of at most max_chunk_size characters
def split_text(text, max_chunk_size=2048):
  """Split `text` into chunks, breaking only after a '.'.

  Sentences are accumulated until adding the next one would push the chunk
  past `max_chunk_size` characters. A single sentence longer than the limit
  is kept whole in a chunk of its own.

  Args:
    text: the text to split.
    max_chunk_size: maximum chunk length in characters (default 2048).

  Returns:
    A list of non-empty, whitespace-stripped chunks. Empty or whitespace-only
    input gives an empty list.
  """
  pieces = text.split('.')
  # every piece except the last one was followed by a '.' in the original text,
  # so only those get the period back
  sentences = [piece + '.' for piece in pieces[:-1]]
  if pieces[-1].strip():
    sentences.append(pieces[-1])

  chunks = []
  current_chunk = ''
  for sentence in sentences:
    if len(current_chunk) + len(sentence) <= max_chunk_size:
      current_chunk += sentence
    else:
      # current_chunk is still empty when the very first sentence is over the limit
      if current_chunk.strip():
        chunks.append(current_chunk.strip())
      current_chunk = sentence
  if current_chunk.strip():
    chunks.append(current_chunk.strip())
  return chunks

# generate summary using openai
def generate_summary(text):
  """Summarize `text` chunk by chunk with the OpenAI completion API.

  NOTE: this reuses the name of the Pegasus `generate_summary` defined in an
  earlier cell and replaces it in the notebook namespace.

  Args:
    text: the text to summarize; it is cut into chunks with split_text.

  Returns:
    The per-chunk summaries joined by single spaces.
  """
  output_chunks = []
  for chunk in split_text(text):
    response = openai.Completion.create(
      engine="davinci",
      prompt=(f"Please summarize the following text: {chunk}\n"),
      temperature=0.5,
      max_tokens=1024,
    )
    output_chunks.append(response.choices[0].text.strip())
  # each summary was stripped, so a separator is needed to keep the last word of
  # one chunk from running into the first word of the next
  return ' '.join(output_chunks)


# Generate an OpenAI summary for every video that does not have one yet.
for video in videos:
  video_id = video['link'].split('=')[1]

  summary_file = './summaries/openai-' + video_id + '.txt'
  if not os.path.exists(summary_file):
    print(f'OpenAI: Generating summary of video id {video_id}')

    # read the video transcript (only needed when a summary has to be generated);
    # the with-block closes the file again
    with open('./transcripts/' + video_id + '.txt', 'r') as transcript_file:
      video_transcript = transcript_file.read()

    # run the summary generation workflow
    summary = generate_summary(video_transcript)

    # save summary to file
    os.makedirs('./summaries', exist_ok=True)
    with open(summary_file, 'w') as f:
      f.write(summary)


**Step 3: Create embeddings of the transcript summary**

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
import csv

openai.api_key = OPENAI_API_KEY

def create_embeddings(video_id, summary_path):
  """Embed the summary of one video and cache the result as a CSV file.

  Reads ./summaries/<summary_path><video_id>.txt, splits it into chunks,
  requests an embedding for every chunk and writes one row per chunk
  (video_id, text, embedding) to ./data/<summary_path><video_id>.csv.
  Nothing is done when that CSV file already exists.

  Args:
    video_id: id of the video, used in the file names.
    summary_path: file-name prefix selecting the summary type
      (the notebook uses 'openai-' and 'pegasus-').
  """
  data_path = './data/' + summary_path + video_id + '.csv'
  if os.path.exists(data_path):
    return

  with open('./summaries/' + summary_path + video_id + '.txt') as text_file:
    summary = text_file.read()

  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2048,
    chunk_overlap  = 20,
  )
  documents = text_splitter.create_documents([summary])

  # Fetch every embedding before the CSV is created: the existence of the file
  # is the "already done" marker, so a failure halfway through must not leave
  # a partial file behind.
  rows = []
  for document in documents:
    embedding = openai.Embedding.create(model="text-embedding-ada-002", input=document.page_content)
    rows.append([video_id, document.page_content, embedding['data'][0]['embedding']])

  os.makedirs('./data', exist_ok=True)
  # newline='' is the mode the csv module asks for when writing
  with open(data_path, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    # header first, then one row per summary chunk
    csv_writer.writerow(['video_id', 'text', 'embedding'])
    csv_writer.writerows(rows)

# generate embeddings for the openai summaries first, then for the pegasus summaries
for model_label, summary_path in (('OpenAI', 'openai-'), ('Pegasus', 'pegasus-')):
  for video in videos:
    video_id = video['link'].split('=')[1]
    print(f'{model_label}: Generating embeddings for video id {video_id}')
    create_embeddings(video_id, summary_path)
 

**Step 4: Clustering**

In [138]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib
import matplotlib.pyplot as plt

def create_dataset(path):
  """Load the cached embeddings of every video for one summary type.

  Args:
    path: file-name prefix selecting the summary type ('openai-' or
      'pegasus-' in this notebook); files are read from ./data/<path><id>.csv.

  Returns:
    A tuple (embeddings, embeddings_df): the embedding vectors stacked into
    one 2-D array, and the concatenated rows of all CSV files in the same
    order (the "embedding" column holds numpy arrays).
  """
  import ast  # local import: only needed to parse the stored embedding strings

  matrices = []
  frames = []

  for video in videos:
    video_id = video['link'].split('=')[1]
    datafile_path = './data/' + path + video_id + '.csv'
    df = pd.read_csv(datafile_path)
    if df.empty:
      # a header-only file has nothing to stack
      continue
    # the embedding column holds the text of a Python list; literal_eval parses
    # it without executing file contents as code the way eval would
    df["embedding"] = df.embedding.apply(ast.literal_eval).apply(np.array)
    matrices.append(np.vstack(df.embedding.values))
    frames.append(df)

  if not frames:
    # same empty result the function produced before for an empty video list
    return np.empty((0, 1536), float), pd.DataFrame()

  # stack/concatenate once at the end; DataFrame.append no longer exists in pandas 2
  embeddings = np.vstack(matrices)
  embeddings_df = pd.concat(frames)
  return embeddings, embeddings_df

def create_clusters(embeddings, embeddings_df, n_clusters=5):
  """Cluster the embeddings with k-means and label every row with its cluster.

  Args:
    embeddings: 2-D array with one embedding per row.
    embeddings_df: DataFrame with one row per embedding, in the same order.
      It is modified in place: a 'cluster' column is added or overwritten.
    n_clusters: number of clusters to form (default 5). plot_clusters in this
      notebook only colours clusters 0-4.

  Returns:
    The tuple (embeddings, embeddings_df) that was passed in.
  """
  # n_init is given explicitly because its default differs between scikit-learn
  # releases, which would change the clustering result from one install to another
  kmeans = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10, random_state=42)
  kmeans.fit(embeddings)
  embeddings_df['cluster'] = kmeans.labels_

  return embeddings, embeddings_df

def plot_clusters(path, embeddings, embeddings_df):
  """Project the embeddings to 2-D with t-SNE and save a scatter plot of the clusters.

  Each cluster (0-4) gets its own colour, and its legend entry lists the
  categories of the videos that have at least one point in that cluster.
  The figure is saved to ./images/<path>clustering.png.

  Args:
    path: file-name prefix used for the saved image.
    embeddings: 2-D array with one embedding per row.
    embeddings_df: DataFrame aligned with `embeddings` that has the columns
      'video_id' and 'cluster'.
  """
  tsne = TSNE(n_components=2, perplexity=3, random_state=42, init="random", learning_rate=200)
  vis_dims2 = tsne.fit_transform(embeddings)
  x_coords = vis_dims2[:, 0]
  y_coords = vis_dims2[:, 1]

  cluster_labels = embeddings_df['cluster'].to_numpy()

  # a fresh figure per call, so successive calls never draw on top of each other
  fig, ax = plt.subplots()

  for category, color in enumerate(["purple", "green", "red", "blue", "yellow"]):
    in_cluster = cluster_labels == category

    # get the video ids for the cluster of points
    video_ids = set(embeddings_df.loc[in_cluster, 'video_id'])

    # categories of the videos that belong to this cluster, deduplicated and sorted
    cluster_categories = sorted({
      v['category'] for v in videos if v['link'].split('=')[1] in video_ids
    })

    # plot the points; the label is a readable comma-separated list for the legend
    ax.scatter(
      x_coords[in_cluster],
      y_coords[in_cluster],
      color=color,
      alpha=0.3,
      label=', '.join(cluster_categories),
    )

  ax.legend(loc="upper right", scatterpoints=1, frameon=False, labelspacing=0.5)
  ax.set_title("Clusters identified visualized in 2d using t-SNE")
  os.makedirs('./images', exist_ok=True)
  fig.savefig('./images/' + path + 'clustering.png')



In [None]:
path='openai-'

# create dataset
embeddings, embeddings_df = create_dataset(path)

# create clusters
embeddings, embeddings_df = create_clusters(embeddings, embeddings_df)

# create clusters dataframe: one row per distinct (video_id, cluster) pair
clusters_df = embeddings_df[['video_id', 'cluster']]
clusters_df_unique = clusters_df.drop_duplicates()

# save clusters to file; handing pandas the path lets it open and close the file itself
os.makedirs('./clustering', exist_ok=True)
clusters_file = './clustering/' + path + 'clusters.csv'
clusters_df_unique.to_csv(clusters_file, index=False)

# plot clusters
plot_clusters(path, embeddings, embeddings_df)

In [None]:
path='pegasus-'

# create dataset
embeddings, embeddings_df = create_dataset(path)

# create clusters
embeddings, embeddings_df = create_clusters(embeddings, embeddings_df)

# create clusters dataframe: one row per distinct (video_id, cluster) pair
clusters_df = embeddings_df[['video_id', 'cluster']]
clusters_df_unique = clusters_df.drop_duplicates()

# save clusters to file; handing pandas the path lets it open and close the file itself
os.makedirs('./clustering', exist_ok=True)
clusters_file = './clustering/' + path + 'clusters.csv'
clusters_df_unique.to_csv(clusters_file, index=False)

# plot clusters
plot_clusters(path, embeddings, embeddings_df)

**Step 5: Classification**

In [None]:
import openai
import csv

openai.api_key = OPENAI_API_KEY

categories = ['finance', 'food', 'travel', 'career', 'productivity']

# obtain embeddings of the categories (cached in a CSV file)
categories_file = './data/categories.csv'
if not os.path.exists(categories_file):
  # Fetch every embedding before the file is created: the existence of the file
  # is the "already done" marker, so a failed request must not leave a partial file.
  category_rows = []
  for category in categories:
    print(f'Generating embeddings for category {category}')
    embedding = openai.Embedding.create(model="text-embedding-ada-002", input=category)
    category_rows.append([category, embedding['data'][0]['embedding']])

  os.makedirs('./data', exist_ok=True)
  # newline='' is the mode the csv module asks for when writing
  with open(categories_file, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['category', 'embedding'])
    csv_writer.writerows(category_rows)
    

In [None]:

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy import spatial

# classify videos into categories

# read category embeddings
category_df = pd.read_csv('./data/categories.csv')
# each stored embedding is the text of a list; turn it back into a numpy vector
# NOTE(review): eval executes whatever the CSV contains — only safe while the file
# is produced by this notebook; ast.literal_eval would be the safer parser.
category_df["embedding"] = category_df["embedding"].map(lambda raw: np.array(eval(raw)))
category_matrix = np.vstack(category_df["embedding"].to_numpy())
print('category matrix shape: ', category_matrix.shape)

# nearest neighbors fit: one row per category, compared by cosine distance
neighbors = NearestNeighbors(n_neighbors=1, metric=spatial.distance.cosine).fit(category_matrix)

# file-name prefixes of the two summary types to classify
paths = ['openai-', 'pegasus-']

import ast  # used to parse the stored embedding strings without eval

# Classify every video, once per summary type: the centroid of its summary
# embeddings is assigned the category whose embedding is nearest.
for path in paths:
  results = []

  for video in videos:
    # get video id
    video_id = video['link'].split('=')[1]

    # read data embeddings
    datafile_path = './data/' + path + video_id + '.csv'
    df = pd.read_csv(datafile_path)
    # the embedding column holds the text of a Python list; literal_eval parses
    # it without executing file contents as code
    df["embedding"] = df.embedding.apply(ast.literal_eval).apply(np.array)
    embeddings = np.vstack(df.embedding.values)

    # compute the centroid of summary embeddings
    centroid = embeddings.mean(axis=0)

    # classify using KNN; the returned index is a row number of category_matrix
    index = neighbors.kneighbors([centroid], return_distance=False)[0, 0]
    # look the label up in category_df, whose rows the matrix was built from;
    # the cached CSV is not guaranteed to be in the order of the `categories` list
    category = category_df['category'].iloc[index]

    results.append([video_id, category])

  # write the results for this summary type
  os.makedirs('./classification', exist_ok=True)
  classification_file = './classification/' + path + 'classification.csv'
  # newline='' is the mode the csv module asks for when writing
  with open(classification_file, 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['video_id', 'category'])
    csv_writer.writerows(results)

In [None]:
# evaluation
from sklearn import metrics

# ground-truth category of every video, in the order of the `videos` list
true_labels = [entry['category'] for entry in videos]

for path in paths:
  # load the predictions written for this summary type
  classification_file = f'./classification/{path}classification.csv'
  df = pd.read_csv(classification_file)
  print(path)

  # predicted labels; compared position by position with true_labels
  pred_labels = df["category"].values

  # build and print the classification report, restricted to the known categories
  report = metrics.classification_report(y_true=true_labels, y_pred=pred_labels, labels=categories)
  print(report)