### Install packages

In [2]:
pip install pytube

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0


In [3]:
pip install git+https://github.com/openai/whisper.git -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone


### Import packages

In [4]:
import whisper
import pytube

### Input and Prepare YouTube URL

In [5]:
url = "https://www.youtube.com/watch?v=-1KdsqonGkM"
video = pytube.YouTube(url)
video.streams.get_highest_resolution().filesize

363707344

In [6]:
audio = video.streams.get_audio_only()
fn = audio.download(output_path="tmp.mp3") # Downlods only audio from youtube video

### Create model

In [8]:
model = whisper.load_model("base")

100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 78.3MiB/s]


In [9]:
transcription = model.transcribe('/content/tmp.mp3/Jocko Podcast 384 Always Finding Ways to Get Stronger Faster and Fitter With Dave Castro.mp4')

### Store text and segments

In [10]:
res = transcription['text']

In [12]:
print(res)



In [29]:
from datetime import datetime

def store_segments(segments):
  texts = []
  start_times = []

  for segment in segments:
    text = segment['text']
    start = segment['start']

    # Convert the starting time to a datetime object
    start_datetime = datetime.fromtimestamp(start)

    # Format the starting time as a string in the format "00:00:00"
    formatted_start_time = start_datetime.strftime('%H:%M:%S')

    texts.append("".join(text))
    start_times.append(formatted_start_time)

  return texts, start_times

In [31]:
segments = [{'text': segment, 'start': 0} for segment in res.split()] # manually putting in start time for now and will debug later

In [32]:
texts, start_times = store_segments(segments)

### Install LangChain, OpenAI, and FAISS

In [35]:
pip install -qqq langchain

In [37]:
pip install -qqq openai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/72.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [41]:
pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


### Import libraries and connect to OpenAI

In [43]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain.chains import VectorDBQAWithSourcesChain
from langchain import OpenAI
import openai
import faiss



In [46]:
import os

api_key = os.environ.get('OPENAI_API_KEY')

### Split document and build the index

In [47]:
text_splitter = CharacterTextSplitter(chunk_size=1500, separator="\n")
docs = []
metadatas = []
for i, d in enumerate(texts):
    splits = text_splitter.split_text(d)
    docs.extend(splits)
    metadatas.extend([{"source": start_times[i]}] * len(splits))
embeddings = OpenAIEmbeddings()

In [48]:
store = FAISS.from_texts(docs, embeddings, metadatas=metadatas)
faiss.write_index(store.index, "docs.index")

### Create the chain

In [51]:
chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(temperature=0), vectorstore=store)

In [52]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [55]:
result = chain({"question": "What exercises does Dave Castro recommend to stay strong?"})

In [56]:
print(f"Answer: {result['answer']}  Sources: {result['sources']}")

Answer:  Dave Castro recommends a variety of exercises to stay strong, including squats, deadlifts, and pull-ups. He also recommends incorporating Olympic lifts such as the clean and jerk and the snatch into your routine. Additionally, Castro suggests adding in accessory exercises such as bent-over rows, overhead presses, and farmer's carries.
  Sources: 00:00:00


In [61]:
transcript = texts  # Replace <your_transcript_here> with the actual transcript

# Specify the file path where you want to save the transcript
file_path = "/content/transcript.txt"  # Replace with your desired file path

with open(file_path, "w") as file:
    file.write('\n'.join(transcript))

print("Transcript saved successfully.")

Transcript saved successfully.
