<a target="_blank" href="https://colab.research.google.com/github/gox6/colab-demos/blob/main/use-cases/getting-youtube-transcripts.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
# Installs
!pip install --quiet \
  polars \
  pytube \
  youtube_transcript_api

In [None]:
# Imports for fetching YouTube transcripts and reviewing them as a dataframe
from collections import Counter, defaultdict
from langchain_community.document_loaders import YoutubeLoader
from langchain_core.documents.base import Document
import polars as pl

In [None]:
# BBC News clips whose transcripts are collected in this notebook
# (30 YouTube short links, one per line).
youtube_bbc_news_videos = [
  "https://youtu.be/YEHA-u8b43A",
  "https://youtu.be/TlFfHjOMSXQ",
  "https://youtu.be/-kZolk9EoMA",
  "https://youtu.be/JQMZkrz6X08",
  "https://youtu.be/D1iMZaLjBU4",
  "https://youtu.be/Sqcv9lCADxE",
  "https://youtu.be/Sshl0SFO4ZI",
  "https://youtu.be/mpOyMOZWEcU",
  "https://youtu.be/Cg3YMWcjLI4",
  "https://youtu.be/yQIMSv9Luw4",
  "https://youtu.be/5LYGnqoCLGk",
  "https://youtu.be/KCepbsLLUMY",
  "https://youtu.be/lHggWT2iLdo",
  "https://youtu.be/TTUsxD62398",
  "https://youtu.be/3YMleRGjeqE",
  "https://youtu.be/h-5dqQMZTZQ",
  "https://youtu.be/4GBcZJpw8yI",
  "https://youtu.be/QOUGlWEpwL4",
  "https://youtu.be/T05I-SBhXoI",
  "https://youtu.be/Iz-XY6XfXjk",
  "https://youtu.be/yyLFQrb--pw",
  "https://youtu.be/wMqJbMPNM6A",
  "https://youtu.be/kOcuwLBPBP8",
  "https://youtu.be/7goRcrFKs3U",
  "https://youtu.be/FoBOSLofM3E",
  "https://youtu.be/97nEBjiQI1M",
  "https://youtu.be/bjmK4lGKNqY",
  "https://youtu.be/XWLA5A6bpwE",
  "https://youtu.be/DkmrhVpCmac",
  "https://youtu.be/Dxar1d1aTUo",
]

# Sanity check: tallying the links must give 30 distinct keys, i.e. no URL
# was pasted twice.
count = Counter(youtube_bbc_news_videos)
assert len(count) == 30

# Collecting transcripts of YouTube videos
# Long-running cell: around 60 s

def get_youtube_transcripts(urls: list[str]) -> list[Document]:
  """Load one transcript ``Document`` per YouTube URL.

  Every URL is loaded with
  ``YoutubeLoader.from_youtube_url(url, add_video_info=True).load()`` and the
  first document of the result is kept. URLs are processed one after another,
  in the order given.

  Args:
    urls: YouTube video URLs.

  Returns:
    The first loaded document for each URL that produced one, in input order.
    A URL for which the loader returns no documents is skipped and reported
    through a ``UserWarning`` instead of aborting the whole run.
  """
  import warnings  # stdlib; imported locally so the block stays self-contained

  docs = []
  for url in urls:
    loaded = YoutubeLoader.from_youtube_url(url, add_video_info=True).load()
    if not loaded:
      # An empty result (e.g. a video without an available transcript) used to
      # surface as a bare IndexError from `[0]`, losing every transcript
      # already downloaded. Skip the video and say which one it was.
      warnings.warn(f"No transcript returned for {url}; skipping it.", stacklevel=2)
      continue
    docs.append(loaded[0])

  return docs

# Download the transcripts for the BBC News URLs listed above; the result is a
# list of LangChain documents that the cells below turn into a dataframe.
bbc_news = get_youtube_transcripts(youtube_bbc_news_videos)

# Transforming list of LangChain documents into dataframe to review data conveniently

def to_pl(list_of_docs: list[Document]) -> pl.DataFrame:
  """Flatten LangChain documents into a Polars dataframe, one row per document.

  Each document contributes its ``page_content`` plus every key of its
  ``metadata`` as a column. The frame is then trimmed to a fixed set of
  review columns.

  Args:
    list_of_docs: Documents whose metadata contains at least ``source``,
      ``title``, ``author``, ``publish_date``, ``view_count`` and ``length``
      (these are the keys the column expressions below read). All documents
      are expected to share the same metadata keys so that the columns have
      equal length.

  Returns:
    A dataframe with the columns ``publish_date``, ``author``, ``title``,
    ``video_url``, ``view_count``, ``length_in_seconds``, ``length_in_words``
    and ``page_content``, in that order.
  """
  data = defaultdict(list)

  for doc in list_of_docs:
    # Read the two fields directly rather than through `doc.dict()`: that
    # method is the Pydantic v1 API (deprecated under Pydantic v2) and also
    # emits extra model fields (e.g. `type`) that could collide with metadata
    # keys of the same name. None of those extra fields are selected below.
    data['page_content'].append(doc.page_content)
    for key, value in doc.metadata.items():
      data[key].append(value)

  df_pl = pl.DataFrame(data).rename({"length": "length_in_seconds"})
  df_pl = df_pl.with_columns([
    # `source` is prefixed with the short-link host, so it is presumably the
    # bare video id -- TODO confirm against the loader's metadata.
    (pl.lit('https://youtu.be/') + pl.col('source')).alias('video_url'),
    # Rough word count: number of pieces after splitting on single spaces.
    (pl.col('page_content').str.split(by=' ').list.len()).alias("length_in_words"),
  ])

  df_pl = df_pl.select([ 'publish_date', 'author', 'title',  'video_url', 'view_count', 'length_in_seconds', 'length_in_words', 'page_content' ])
  return df_pl


# Build the review table (one row per transcript) from the downloaded documents.
df_pl = to_pl(bbc_news)
df_pd = df_pl.to_pandas()   # Pandas copy of the Polars frame; kept because it is displayed better in Colab

# Displaying data with Colab formatter, or just in pandas.
# NOTE: `display` is not imported in this file; it is presumably the helper
# that the notebook (IPython/Colab) environment injects -- confirm before
# running this as a plain script.
display(df_pd)