# OpenAI's Whisper

Whisper is an **A**utomatic **S**peech **R**ecognition (ASR) model from OpenAI. We use it to extract highly accurate text from YouTube videos.

In [7]:
!pip install git+https://github.com/openai/whisper.git
!sudo apt update -y && sudo apt install ffmpeg -y

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-3jtxlabq
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-3jtxlabq
  Resolved https://github.com/openai/whisper.git to commit eff383b27b783e280c089475852ba83f20f64998
  Preparing metadata (setup.py) ... [?25ldone
Collecting torch
  Using cached torch-1.13.0-cp38-cp38-manylinux1_x86_64.whl (890.2 MB)
Collecting transformers>=4.19.0
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting ffmpeg-python==0.2.0
  Using cached ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting future
  Using cached future-0.18.2-py3-none-any.whl
Collecting regex!=2019.12.17
  Using cached regex-2022.10.31-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [8]:
import whisper
import torch  # pytorch install steps: pytorch.org

# Pick the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Load the "large" checkpoint, then move it onto the selected device.
model = whisper.load_model("large")
model = model.to(device)

cpu


100%|█████████████████████████████████████| 2.87G/2.87G [01:11<00:00, 43.2MiB/s]


In [6]:
from datasets import load_dataset

# Fetch the "train" split of the jamescalam/channel-metadata dataset.
# NOTE(review): a later cell rebinds `videos_meta` to a pandas DataFrame.
videos_meta = load_dataset("jamescalam/channel-metadata", split="train")
videos_meta

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration jamescalam--channel-metadata-872f7e2f9a088c57


Downloading and preparing dataset json/jamescalam--channel-metadata to /home/jupyter/.cache/huggingface/datasets/jamescalam___json/jamescalam--channel-metadata-872f7e2f9a088c57/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|          | 0.00/362k [00:00<?, ?B/s][A
Downloading data: 100%|██████████| 362k/362k [00:00<00:00, 3.29MB/s][A
Downloading data files: 100%|██████████| 1/1 [00:00<00:00,  2.48it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 950.01it/s]
                            

Dataset json downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/jamescalam___json/jamescalam--channel-metadata-872f7e2f9a088c57/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.




Dataset({
    features: ['Video ID', 'Channel ID', 'Title', 'Time Created', 'Time Published', 'Duration', 'Description', 'Category', 'Like Count', 'Dislike Count'],
    num_rows: 222
})

Create videos metadata dictionary...

In [14]:
# Import pandas library
import pandas as pd

# Earlier multi-video example values, kept for reference:
#data = ['fTj6z6Cok0Q','ApxYFGGgbgk','cyyO7dSJajo','3kTSDUXTJpU']
#title = ['esportes','games','musica','filmes']
#time = [1234567,7654321,4567123,1236547]

# Column values for the single video described in this run
data = ['k2vU3_yv-68']
title = ['games']
time = ['1234567']

# Build the frame in one constructor call; the dict's key order sets the
# column order ('Video ID', 'Title', 'Time Published').
videos_meta = pd.DataFrame({
    'Video ID': data,
    'Title': title,
    'Time Published': time,
})
# display the dataframe
videos_meta

Unnamed: 0,Video ID,Title,Time Published
0,k2vU3_yv-68,games,1234567


In [16]:
# Print every video ID in the metadata table, one per line.
for video_id in videos_meta['Video ID']:
    print(video_id)

k2vU3_yv-68


In [17]:
# Map each video ID to its title, publish time and short youtu.be URL.
videos_dict = {
    row['Video ID']: {
        'title': row['Title'],
        'published': row['Time Published'],
        'url': f"https://youtu.be/{row['Video ID']}"
    }
    for _, row in videos_meta.iterrows()
}

In [18]:
from pathlib import Path

# Collect every *.mp3 file directly under ./mp3 as a plain string path.
paths = list(map(str, Path('./mp3').glob('*.mp3')))
print(len(paths))
print(paths[:5])

1
['mp3/3kTSDUXTJpU.mp3']


In [19]:
# the video ID is the file name with its 4-character ".mp3" suffix removed
paths[0].rsplit('/', 1)[-1][:-4]

'3kTSDUXTJpU'

In [13]:
from tqdm.auto import tqdm  # !pip install tqdm

# Transcribe every audio file and pool the raw segments from all of them
# into one flat list. The loop index and the per-file video ID were computed
# but never used here, so they have been dropped.
data = []
for path in tqdm(paths):
    # transcribe to get speech-to-text data
    result = model.transcribe(path)
    # add results to data list
    data.extend(result['segments'])

  0%|          | 0/1 [16:58<?, ?it/s]


KeyboardInterrupt: 

In [7]:
from pathlib import Path
from tqdm.auto import tqdm
import json

# set window (number of segments joined into one text chunk) and stride
window = 1
stride = 1  # smaller stride creates overlap

# every row written to transcription.jsonl is also collected here
data = []

results = []  # not used in this cell; kept in case other cells reference it
with open("transcription.jsonl", "w", encoding="utf-8") as fp:
    for path in tqdm(paths):
        # video ID = file name without the ".mp3" suffix
        _id = path.split('/')[-1][:-4]
        # get the video metadata first, so an ID missing from videos_dict
        # raises before the expensive transcription instead of after it
        video_meta = videos_dict[_id]
        # transcribe to get speech-to-text data
        result = model.transcribe(path)
        segments = result['segments']
        for j in range(0, len(segments), stride):
            # Slicing clamps at the end of the list, so the final chunk may be
            # shorter than `window` but is never empty. (Capping the end index
            # at len(segments)-1 dropped the text of the last segment and took
            # `end` from the segment *after* the chunk.)
            chunk = segments[j:j + window]
            text = ''.join(x["text"] for x in chunk)
            start = chunk[0]['start']
            end = chunk[-1]['end']
            row_id = f"{_id}-t{start}"
            # row-specific fields are layered on top of the video metadata
            meta = {
                **video_meta,
                "id": row_id,
                "text": text.strip(),
                "start": start,
                "end": end
            }
            data.append(meta)
            # one JSON object per line
            json.dump(meta, fp)
            fp.write('\n')

len(data)

  0%|          | 0/108 [00:00<?, ?it/s]2022-10-13 10:56:36.068291: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
100%|██████████| 108/108 [7:10:39<00:00, 239.26s/it] 


27214

---

## Append more to dataset without overwriting/redoing

First, check which videos have already been transcribed:

In [None]:
import json

# Video IDs that already have rows in transcription.jsonl. Each line is one
# JSON record; the ID is the last path component of its 'url' field.
existing_ids = set()

with open("transcription.jsonl", 'r', encoding='utf-8') as fp:
    for line in fp:
        record = json.loads(line)
        existing_ids.add(record['url'].rsplit('/', 1)[-1])

len(existing_ids)

108

In [None]:
# show a handful of the collected IDs (set order is arbitrary)
[*existing_ids][:5]

['gVAJ_l_S7uQ', '1gN1snKBLP0', 'YvVQgvAz9dY', '3Wqh4iUupbM', 'jjQetJtQDS4']

Get paths to videos not already in `existing_ids`...

In [None]:
from pathlib import Path

# Re-scan ./mp3 so newly added audio files are picked up.
paths = []
for mp3_file in Path('./mp3').glob('*.mp3'):
    paths.append(str(mp3_file))
print(len(paths))
print(paths[:5])

108
['mp3/35Pdoyi6ZoQ.mp3', 'mp3/B7wmo_NImgM.mp3', 'mp3/x1lAcT3xl5M.mp3', 'mp3/r-zQQ16wTCA.mp3', 'mp3/DFtP1THE8fE.mp3']


In [None]:
# Keep only the files whose video ID (file name minus ".mp3") is not yet
# present in existing_ids.
paths = [p for p in paths if p.rsplit('/', 1)[-1][:-4] not in existing_ids]
print(len(paths))
print(paths[:5])

0
[]


In [None]:
from pathlib import Path
from tqdm.auto import tqdm
import json

# set window (number of segments joined into one text chunk) and stride
window = 1
stride = 1  # smaller stride creates overlap

results = []  # not used in this cell; kept in case other cells reference it
# rows appended by this cell only, so the final count is meaningful even on a
# fresh kernel where no earlier `data` list exists
new_data = []
with open("transcription.jsonl", "a", encoding="utf-8") as fp:
    for path in tqdm(paths):
        # video ID = file name without the ".mp3" suffix
        _id = path.split('/')[-1][:-4]
        # get the video metadata first, so an ID missing from videos_dict
        # raises before the expensive transcription instead of after it
        video_meta = videos_dict[_id]
        # transcribe to get speech-to-text data
        result = model.transcribe(path)
        segments = result['segments']
        for j in range(0, len(segments), stride):
            # Slicing clamps at the end of the list, so the final chunk may be
            # shorter than `window` but is never empty. (Capping the end index
            # at len(segments)-1 dropped the text of the last segment and took
            # `end` from the segment *after* the chunk.)
            chunk = segments[j:j + window]
            text = ''.join(x["text"] for x in chunk)
            start = chunk[0]['start']
            end = chunk[-1]['end']
            # Use a separate name for the row id: rebinding `_id` here made
            # every later row id accumulate the previous "-t..." suffixes.
            row_id = f"{_id}-t{start}"
            # row-specific fields are layered on top of the video metadata
            meta = {
                **video_meta,
                "id": row_id,
                "text": text.strip(),
                "start": start,
                "end": end
            }
            new_data.append(meta)
            # one JSON object per line
            json.dump(meta, fp)
            fp.write('\n')

len(new_data)