# Extract Automatic Transcripts from YouTube Videos

Code authored by: Shaw Talebi<br>

Blog link: https://medium.com/towards-data-science/how-to-build-data-pipelines-for-machine-learning-b97bbef050a5 <br>
Video link: https://youtu.be/OnIQrDiTtRM

### imports

In [3]:
import polars as pl
from youtube_transcript_api import YouTubeTranscriptApi

### functions

In [4]:
def extract_text(transcript: list) -> str:
    """
        Join the caption text of every transcript entry into a single string.

        Args:
            transcript: entries of a transcript, in order. Each entry must
                support entry['text'] and that value must be a string.
                Any iterable of such entries is accepted (list, tuple, generator).

        Returns:
            The 'text' values joined by single spaces, in the original order.
            An empty transcript yields ''.

        Raises:
            KeyError: if a dict entry has no 'text' key.
            TypeError: if a 'text' value is not a string.
    """

    # iterate over the entries directly: no reliance on len() or integer indexing,
    # so generators and other iterables work as well as lists
    return ' '.join(entry['text'] for entry in transcript)

### get transcripts

In [5]:
# load the table of video IDs and preview its first rows
video_ids_path = 'data/video-ids.parquet'
df = pl.read_parquet(video_ids_path)
print(df.head())

shape: (5, 3)
┌─────────────┬──────────────────────┬─────────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                           │
│ ---         ┆ ---                  ┆ ---                             │
│ str         ┆ str                  ┆ str                             │
╞═════════════╪══════════════════════╪═════════════════════════════════╡
│ Ot2c5MKN_-w ┆ 2024-11-20T13:31:14Z ┆ Multimodal AI: LLMs that can s… │
│ gUJJB235DVs ┆ 2024-11-18T15:11:07Z ┆ 5 AI Projects You Can Build Th… │
│ bAe4qwQGxlI ┆ 2024-10-25T13:18:08Z ┆ I Built an AI App in 4 days...… │
│ 4QHg8Ix8WWQ ┆ 2024-10-17T12:50:12Z ┆ Fine-Tuning BERT for Text Clas… │
│ tMiQIxSX64c ┆ 2024-10-10T13:50:57Z ┆ 5 AI Projects You Can Build Th… │
└─────────────┴──────────────────────┴─────────────────────────────────┘


In [6]:
%%time
# one transcript string per row of df, in row order
transcript_text_list = []

for video_id in df['video_id']:

    # try to extract captions
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = extract_text(transcript)
    # if not available set as n/a
    # (catch Exception rather than using a bare except, so that interrupting
    #  the kernel (KeyboardInterrupt) or SystemExit still stops this slow loop)
    # NOTE(review): this still hides every other error, including programming
    # errors or library API changes - confirm the result is not all "n/a"
    except Exception:
        transcript_text = "n/a"

    transcript_text_list.append(transcript_text)

CPU times: total: 3.94 s
Wall time: 1min 38s


In [7]:
# attach the collected transcripts to the dataframe as a new "transcript" column
transcript_column = pl.Series(name="transcript", values=transcript_text_list)
df = df.with_columns(transcript_column)
print(df.head())

shape: (5, 4)
┌─────────────┬──────────────────────┬──────────────────────────────┬──────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                        ┆ transcript                   │
│ ---         ┆ ---                  ┆ ---                          ┆ ---                          │
│ str         ┆ str                  ┆ str                          ┆ str                          │
╞═════════════╪══════════════════════╪══════════════════════════════╪══════════════════════════════╡
│ Ot2c5MKN_-w ┆ 2024-11-20T13:31:14Z ┆ Multimodal AI: LLMs that can ┆ multimodal models are        │
│             ┆                      ┆ s…                           ┆ capable …                    │
│ gUJJB235DVs ┆ 2024-11-18T15:11:07Z ┆ 5 AI Projects You Can Build  ┆ five AI project ideas that   │
│             ┆                      ┆ Th…                          ┆ you…                         │
│ bAe4qwQGxlI ┆ 2024-10-25T13:18:08Z ┆ I Built an AI App in 4       ┆ I built

### write data to file

In [8]:
# persist the dataframe (video metadata + transcripts) as both parquet and csv
parquet_path = 'data/video-transcripts.parquet'
csv_path = 'data/video-transcripts.csv'
df.write_parquet(parquet_path)
df.write_csv(csv_path)