# Extract Automatic Transcripts from YouTube Videos

### imports

In [None]:
import polars as pl
from youtube_transcript_api import YouTubeTranscriptApi

### functions

In [None]:
def extract_text(transcript: list) -> str:
    """
        Function to join the 'text' field of every transcript entry into one string

        Parameters:
            transcript: iterable of entries that each support entry['text']
                (in this notebook, the value returned by
                YouTubeTranscriptApi.get_transcript). Any iterable works,
                not only indexable sequences.

        Returns:
            The 'text' values in their original order, separated by single
            spaces. An empty transcript yields an empty string.

        Raises:
            KeyError: if a dict entry has no 'text' key.
    """

    # iterate over the entries directly instead of indexing by position, so
    # generators and other non-indexable iterables are accepted as well
    return ' '.join(entry['text'] for entry in transcript)

### get transcripts

In [None]:
# load the table of video ids and preview its first rows
video_ids_path = 'data/video-ids.parquet'
df = pl.read_parquet(video_ids_path)
preview = df.head()
print(preview)

shape: (5, 3)
┌─────────────┬──────────────────────┬───────────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                             │
│ ---         ┆ ---                  ┆ ---                               │
│ str         ┆ str                  ┆ str                               │
╞═════════════╪══════════════════════╪═══════════════════════════════════╡
│ eayzAZltV9U ┆ 2024-04-29T13:54:55Z ┆ 4 Lessons from AI Consulting #fr… │
│ 03x2oYg9oME ┆ 2024-04-25T15:16:00Z ┆ How to Manage Data Science Proje… │
│ O5i_mMUM94c ┆ 2024-04-19T14:05:54Z ┆ How I’d learned #datascience (if… │
│ xm9devSQEqU ┆ 2024-04-18T15:59:02Z ┆ 4 Skills You Need to Be a Full-S… │
│ Z6CmuVEi7QY ┆ 2024-04-11T10:00:27Z ┆ How I&#39;d Learn Data Science (… │
└─────────────┴──────────────────────┴───────────────────────────────────┘


In [None]:
%%time
# one transcript string per row of df, in row order
transcript_text_list = []

for video_id in df['video_id']:

    # try to extract captions
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = extract_text(transcript)
    # if not available set as n/a
    # (catch Exception rather than using a bare except, so that interrupting
    # the kernel or exiting still stops this long-running loop instead of
    # being recorded as a missing transcript)
    except Exception:
        transcript_text = "n/a"

    transcript_text_list.append(transcript_text)

CPU times: user 9.5 s, sys: 608 ms, total: 10.1 s
Wall time: 1min 20s


In [None]:
# attach the collected transcript strings to the dataframe as a "transcript" column
transcript_column = pl.Series(name="transcript", values=transcript_text_list)
df = df.with_columns(transcript_column)
print(df.head())

shape: (5, 4)
┌─────────────┬──────────────────────┬──────────────────────────────┬──────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                        ┆ transcript                   │
│ ---         ┆ ---                  ┆ ---                          ┆ ---                          │
│ str         ┆ str                  ┆ str                          ┆ str                          │
╞═════════════╪══════════════════════╪══════════════════════════════╪══════════════════════════════╡
│ eayzAZltV9U ┆ 2024-04-29T13:54:55Z ┆ 4 Lessons from AI Consulting ┆ are four things I've learned │
│             ┆                      ┆ #fr…                         ┆ fro…                         │
│ 03x2oYg9oME ┆ 2024-04-25T15:16:00Z ┆ How to Manage Data Science   ┆ this video is part of a      │
│             ┆                      ┆ Proje…                       ┆ larger s…                    │
│ O5i_mMUM94c ┆ 2024-04-19T14:05:54Z ┆ How I’d learned #datascience ┆ here's 

### write data to file

In [None]:
# save the dataframe (now including transcripts) as both parquet and csv
parquet_path = 'data/video-transcripts.parquet'
csv_path = 'data/video-transcripts.csv'
df.write_parquet(parquet_path)
df.write_csv(csv_path)