https://github.com/Kabanosk/whisper-website
https://github.com/openai/whisper/discussions/264

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import yt_dlp
import unzip

# increase column width
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', None)

# Download Audio and Transcribe

In [2]:
# Settings shared by the download / transcription cells of this section.
VIDEO_URL = "https://youtu.be/DgTjSrrf6GQ"  # passed to yt-dlp in the subtitle download cell
AUDIO_FILE_NAME = "./data/Lex_Podcast.mp3"
AUDIO_QUALITY = 5 # 0 best - 10 worst (default 5)
AUDIO_FORMAT = "mp3"
# NOTE(review): presumably the bin folder of the ffmpeg archive that the download cell extracts into './' — confirm.
FFMPEG_LOCATION = "ffmpeg-master-latest-win64-gpl/bin"
SUBTITLE_LANGUAGE = "en.*"  # value for yt-dlp's --sub-lang option
TRANSCRIPT_FILE_NAME = "./data/transcript.txt"  # yt-dlp --output target; reassigned later in the data-cleaning cell
SUBTITLE_FORMAT = "srt"  # used for yt-dlp --sub-format and as the Whisper response_format

In [19]:
import wget
import os
import zipfile

FFMPEG_URL = 'https://github.com/yt-dlp/FFmpeg-Builds/releases/download/latest/ffmpeg-master-latest-win64-gpl.zip'
ZIP_PATH = './ffmpeg.zip'
EXTRACT_DIR = './'
# Folder the archive unpacks into: the archive name without ".zip".
# This matches the "ffmpeg-master-latest-win64-gpl/bin" location used by the yt-dlp cells.
FFMPEG_DIR = os.path.join(EXTRACT_DIR, os.path.splitext(os.path.basename(FFMPEG_URL))[0])

# Guard on the extracted folder, not on the zip: the zip is deleted at the end
# of a successful run, so checking for it would trigger a fresh download on
# every re-run and would mistake a leftover archive for a finished install.
if os.path.isdir(FFMPEG_DIR):
    print('Already downloaded.')
else:
    if not os.path.exists(ZIP_PATH):
        print('Downloading ffmpeg...')
        wget.download(FFMPEG_URL, ZIP_PATH)

    print('Unzipping...')
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)

    print('Removing zip file...')
    os.remove(ZIP_PATH)

Downloading ffmpeg...
Unzipping...
Removing zip file...


In [None]:
# Download the audio track with yt-dlp and convert it to mp3 using the ffmpeg binaries from the extracted folder.
# NOTE(review): the active command hard-codes the ffmpeg location, format, output name and a URL that differs
# from VIDEO_URL; the commented-out variant takes location, format and output from the config constants — confirm which is intended.
!yt-dlp -xv --ffmpeg-location ffmpeg-master-latest-win64-gpl/bin --audio-format mp3  -o data/Lex_Podcast -- {"https://youtu.be/DEu24V8vfb8"}
#!yt-dlp -xv --ffmpeg-location {FFMPEG_LOCATION} --audio-format {AUDIO_FORMAT}  -o {AUDIO_FILE_NAME} -- {"https://youtu.be/DEu24V8vfb8"}

In [None]:
import os

import openai

# Never commit API keys to a notebook: read the key from the environment.
# Set OPENAI_API_KEY before starting the kernel. A key that was ever stored in
# this file must be considered leaked and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Open the audio inside a context manager so the handle is closed once the
# upload finishes, even when the request fails.
with open("audio.mp3", "rb") as audio_file:
    transcript = openai.Audio.transcribe("whisper-1", audio_file, response_format=SUBTITLE_FORMAT)

## Longer Inputs
By default, the Whisper API only supports files that are less than 25 MB. If you have an audio file that is larger than that, you will need to break it up into chunks of 25 MB or less, or use a compressed audio format. To get the best performance, we suggest that you avoid breaking the audio up mid-sentence, as this may cause some context to be lost.

One way to handle this is to use the [PyDub open source Python package](https://github.com/jiaaro/pydub) to split the audio:

In [None]:
from pydub import AudioSegment

# Load the mp3 file as a PyDub AudioSegment
song = AudioSegment.from_mp3("good_morning.mp3")

# PyDub handles time in milliseconds
ten_minutes = 10 * 60 * 1000

# Slice from the start of the segment up to the ten-minute mark
first_10_minutes = song[:ten_minutes]

# Write the slice to a new mp3 file
first_10_minutes.export("good_morning_10.mp3", format="mp3")

### Prompting
* Check out [OpenAI](https://platform.openai.com/docs/guides/speech-to-text/prompting)

# Download Transcript

In [13]:
# Download the transcript with yt-dlp
# Only the automatic subtitles are requested (--write-auto-sub) and the media download is skipped (--skip-download).
# NOTE(review): the captured output below shows ".vtt" files being written even though SUBTITLE_FORMAT is "srt" — confirm the requested format is honoured.
!yt-dlp --write-auto-sub --skip-download --sub-format {SUBTITLE_FORMAT} --sub-lang {SUBTITLE_LANGUAGE} --output {TRANSCRIPT_FILE_NAME} -- {VIDEO_URL}

[youtube] Extracting URL: https://youtu.be/DgTjSrrf6GQ
[youtube] DgTjSrrf6GQ: Downloading webpage
[youtube] DgTjSrrf6GQ: Downloading android player API JSON
[info] DgTjSrrf6GQ: Downloading subtitles: en-orig, en, en-en-ehkg1hFWq8A
[info] DgTjSrrf6GQ: Downloading 1 format(s): 22
[info] Writing video subtitles to: transcript.txt.en-orig.vtt
[download] Destination: transcript.txt.en-orig.vtt

[download]    1.00KiB at  909.63KiB/s (00:00:00)
[download]    3.00KiB at    1.45MiB/s (00:00:00)
[download]    7.00KiB at    1.32MiB/s (00:00:00)
[download]   15.00KiB at    1.17MiB/s (00:00:00)
[download]   31.00KiB at  729.35KiB/s (00:00:00)
[download]   63.00KiB at  978.15KiB/s (00:00:00)
[download]  127.00KiB at    1.33MiB/s (00:00:00)
[download]  255.00KiB at    2.07MiB/s (00:00:00)
[download]  511.00KiB at    3.35MiB/s (00:00:00)
[download]  816.60KiB at    4.29MiB/s (00:00:00)
[download] 100% of  816.60KiB in 00:00:00 at 1.88MiB/s
[info] Writing video subtitles to: transcript.txt.en.vtt
[down



## Data Preparation
Let's change the name of the raw caption files:

In [79]:
# Get a clean list of podcast titles
import re 

def clean_titles(title):
    """Turn one line of the podcast list into a file-name friendly slug.

    Steps, in order:
      1. markdown links ``[label](url)`` are reduced to ``label``;
      2. everything from a ``|`` up to the end of the next run of digits is removed;
      3. trailing whitespace is stripped, spaces and colons become ``_`` and
         ``&`` becomes ``and``;
      4. the result is lower-cased and every character other than ASCII
         letters, digits and ``_`` is dropped.
    """
    link_label_only = re.sub(r'\[(.*?)\]\((.*?)\)', r'\1', title)
    without_pipe_part = re.sub(r'\|.*?\d+', '', link_label_only)

    slug = without_pipe_part.rstrip()
    for old, new in ((' ', '_'), (':', '_'), ('&', 'and')):
        slug = slug.replace(old, new)

    return re.sub(r'[^a-zA-Z0-9_]', '', slug.lower())

# Read the podcast list and build one cleaned title per non-empty line.
with open('./data/Lexicap.md', 'r') as f:
    text = f.read()

titles = [clean_titles(raw_title) for raw_title in text.split('\n') if raw_title]

In [None]:
# Rename the transcript files
import os

TRANSCRIPT_PATH = "./data/transcripts/"
FILE_EXTENSION = '.vtt'

# Select the caption files BEFORE pairing them with titles. Zipping the raw
# directory listing would let every non-caption entry silently consume a title
# and shift all following names by one.
# os.listdir returns entries in arbitrary order, so sort for a deterministic pairing.
# NOTE(review): assumes the sorted file names follow the same order as the
# titles in Lexicap.md — confirm before running on new data.
transcript_files = sorted(
    name for name in os.listdir(path=TRANSCRIPT_PATH) if name.endswith(FILE_EXTENSION)
)

for org_filename, title in zip(transcript_files, titles):
    print(org_filename)
    # rename file
    new_filename = f"{title}{FILE_EXTENSION}"
    os.rename(
        os.path.join(TRANSCRIPT_PATH, org_filename),
        os.path.join(TRANSCRIPT_PATH, new_filename),
    )

### Data Cleaning

In [3]:
# Folder containing the caption files and the extension they carry
# (same values as in the rename cell, repeated so this section can run on its own).
TRANSCRIPT_PATH = "./data/transcripts/"
FILE_EXTENSION = '.vtt'

In [4]:
# Create new transcript file with timestamp and text
import csv

TRANSCRIPT_FILE_NAME = "45_michio_kaku__future_of_humans_aliens_space_travel_and_physics.vtt"
NEW_TRANSCRIPT_FILE_NAME = "45_michio_kaku__future_of_humans_aliens_space_travel_and_physics.csv"


def _iter_vtt_cues(lines):
    """Yield ``(start_timestamp, text)`` for every cue found in ``lines``.

    A cue starts at a line containing ``-->`` and ends at the next blank line,
    the next timing line, or the end of the input. Lines outside a cue (the
    ``WEBVTT`` header and any other header lines) are ignored, and the lines of
    a multi-line cue are joined with a single space. Cues without text are
    skipped. Recognising cues by their timing line, instead of assuming that
    timestamp and text lines strictly alternate, keeps every text attached to
    its own timestamp.
    """
    start, parts = None, []
    for raw_line in lines:
        if '-->' in raw_line:
            if start is not None and parts:
                yield start, ' '.join(parts)
            start, parts = raw_line.split('-->')[0].strip(), []
        elif not raw_line.strip():
            # A blank line terminates the current cue.
            if start is not None and parts:
                yield start, ' '.join(parts)
            start, parts = None, []
        elif start is not None:
            parts.append(raw_line.rstrip())
    if start is not None and parts:
        yield start, ' '.join(parts)


# Explicit UTF-8 on both sides so the result does not depend on the platform's
# default encoding; newline='' is what the csv module expects for output files.
with open(f"{TRANSCRIPT_PATH}{TRANSCRIPT_FILE_NAME}", encoding='utf-8') as oldfile, \
        open(NEW_TRANSCRIPT_FILE_NAME, 'w', encoding='utf-8', newline='') as newfile:
    # csv.writer quotes any text containing ';' or '"', which a plain
    # f"{timestamp};{text}" join would turn into a corrupted row.
    writer = csv.writer(newfile, delimiter=';', lineterminator='\n')
    writer.writerows(_iter_vtt_cues(oldfile.read().split('\n')))


In [121]:
# Load the ';'-separated transcript into a two-column frame.
# The timestamps are kept as plain strings: the file mixes "MM:SS.mmm" stamps
# (see the rows displayed below) with the '%H:%M:%S.%f' layout, so a single
# date_format cannot parse the column and the parse_dates request had no effect.
# keep_default_na=False keeps cue texts such as "NA" or "null" as text instead
# of turning them into NaN.
transcript_df = pd.read_csv(
    NEW_TRANSCRIPT_FILE_NAME,
    sep=';',
    header=None,
    names=['timestamps', 'text'],
    dtype=str,
    keep_default_na=False,
)

transcript_df.head(25)

Unnamed: 0,timestamps,text
0,00:00.000,The following is a conversation with Michio Kaku.
1,00:02.800,"He's a theoretical physicist, futurist,"
2,00:05.120,and professor at the City College of New York.
3,00:08.360,He's the author of many fascinating books
4,00:10.760,that explore the nature of our reality
5,00:12.840,and the future of our civilization.
6,00:15.520,"They include Einstein's Cosmos, Physics of the Impossible,"
7,00:19.200,"Future of the Mind, Parallel Worlds,"
8,00:21.600,"and his latest, The Future of Humanity,"
9,00:24.240,"Terraforming Mars Interstellar Travel,"


* Ideas
* Summarization of main topics in the audio
* Go to the mentions of the topics
* translation to arabic
* The app asks me questions about the text (for language learning) and creates a discussion
* Overall sentiment in the text
* Webapp or Mobile app

* Since Whisper keeps the punctuation in the transcript, we can reconstruct full sentences by joining consecutive lines until the text ends with a period `.`

In [122]:
# Recreate the dataframe with full sentences
# Each sentence starts at a cue and absorbs the following cues until the
# accumulated text ends with a period. The cues that were absorbed are skipped,
# so no containment filter is needed afterwards. A filter that compares each
# row with the previous one would also drop genuine sentences that happen to
# repeat text from the preceding row.
cue_texts = transcript_df['text'].fillna('').astype(str).tolist()
cue_timestamps = transcript_df['timestamps'].tolist()

sentence_rows = []       # one record per reconstructed sentence
sentence_positions = []  # row position of the cue that starts each sentence

pos = 0
while pos < len(cue_texts):
    sentence = cue_texts[pos]
    next_pos = pos + 1
    # The bounds check stops cleanly when the transcript ends without a final
    # period instead of indexing past the last row.
    while not sentence.endswith('.') and next_pos < len(cue_texts):
        sentence += cue_texts[next_pos]
        next_pos += 1
    if sentence:
        sentence_rows.append({'timestamps': cue_timestamps[pos], 'text': sentence})
        sentence_positions.append(pos)
    pos = next_pos

# Build the frame once from the collected records (no concat inside the loop).
# The index holds the position of each sentence's first cue in transcript_df.
full_transcript_df = pd.DataFrame(
    sentence_rows, index=sentence_positions, columns=transcript_df.columns
)


# 1. NER
* Add columns for persons, organizations, books, companies, countries, places.

## Mentioned Books
Why can identifying book titles be difficult?
* The book may contain persons names which are not authors.
* The book titles are difficult to identify as such in general. For example "the Republic" might or might not be about the book, and if the only indication the model can use is the capitalization it's probably going to make some errors.

To be clear, I think it could work to some extent but it would probably make quite a lot of errors.

* On the other hand you could obtain a database of books, for instance from Wikipedia (there might be better resources), and you could use this in two ways:

1. Directly identify the books/authors in the documents by simple string matching. I would imagine that even if the coverage of the resource is not perfect, this method would easily catch a majority of occurrences.
2. In case the above method is not sufficient, it provides you with some good training data from which you could train a NER model in order to collect titles which don't exist in the database. Note that there might be issues due to the unknown books being labelled as negative in the training data, so ideally you would have to go manually through the training data and annotate the remaining cases.

In [40]:
%%capture
# %pip install -U spaCy
# %pip install 'spacy[transformers]'
#!python -m spacy download en_core_web_trf # download best-matching version of specific model

In [7]:
import spacy
# load a pipeline package by name and return nlp object
# NOTE(review): "tok2vec" is passed to `disable`, but the pipeline listing printed
# below this cell contains no component of that name — confirm the argument is needed.
nlp = spacy.load("en_core_web_trf", disable=["tok2vec"]) #disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"]

# check processing pipeline components of nlp object
nlp.pipeline


[('transformer',
  <spacy_transformers.pipeline_component.Transformer at 0x2b0023d32e0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2b0023d3340>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2b00231bed0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2b07e81e140>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2b001d89d40>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2b0035ac0b0>)]

In [58]:
# create a Doc by processing a string of text with the nlp object
# (the backslash continuations sit inside the string literal, so the leading
#  spaces of the continued lines are part of the processed text)
doc = nlp("So I'm with Jared Diamond, you know, in the book Collapse, \
          where he points out studying the collapse of major civilizations, \
          that it often happens right after things appear to never have been better. Hmm.")


# print the text of every entity in the Doc that is labelled WORK_OF_ART
print("Entities: ", [e.text for e in doc.ents if e.label_ == 'WORK_OF_ART'])


Entities:  ['Collapse']


In [123]:
# Find book related sentences in the transcript
book_related_phrases = [
    "book",
    "books",
    "i read",
    "everyone should read",
    "you should read",
    "he wrote a novel",
    "i recommend",
    "highly recommend",
    "you must read",
    "shouldn't miss",
    "top books",
    "best books",
    "favorite book",
    "my favorite books",
    "book you need to read",
    "books to read before",
    "essential books",
    "great book for",
    "worthy read",
    "book of the year",
    "award winning book",
]


def contains_book_phrase(text):
    """Tell whether ``text`` contains any of ``book_related_phrases``.

    The comparison is a case-insensitive substring test: the text is
    lower-cased and each phrase is looked up inside it.
    """
    lowered = text.lower()
    for phrase in book_related_phrases:
        if phrase in lowered:
            return True
    return False

full_transcript_df["is_book_related"] = full_transcript_df["text"].apply(contains_book_phrase)

In [124]:
def get_book_titles_candidates(text):
    """Run ``text`` through the ``nlp`` pipeline and collect title candidates.

    Returns a list with the text of every recognised entity whose label is
    ``WORK_OF_ART``, in the order the entities appear in the Doc.
    """
    candidates = []
    for entity in nlp(text).ents:
        if entity.label_ == 'WORK_OF_ART':
            candidates.append(entity.text)
    return candidates

In [125]:

full_transcript_df.query("is_book_related == True")["text"].apply(get_book_titles_candidates)

3                               []
182                             []
341                             []
478       [The Future of the Mind]
638                             []
859                             []
940                             []
954     [The Theory of Everything]
957                             []
1149          [Future of Humanity]
Name: text, dtype: object

In [126]:
# add book candidates to the dataframe
# Rows flagged by is_book_related are passed to get_book_titles_candidates;
# every other row gets an empty list.
full_transcript_df["book_candidates"] = full_transcript_df.apply(lambda x: get_book_titles_candidates(x["text"]) \
                                                                 if x["is_book_related"] else [], axis=1)

In [127]:
full_transcript_df.query("is_book_related == True")

Unnamed: 0,timestamps,text,is_book_related,book_candidates
3,00:08.360,He's the author of many fascinating books that explore the nature of our reality and the future of our civilization.,True,[]
182,08:30.920,"And Stephen Hawking, for example, even in his last book, even said that this is an argument against the existence of God.",True,[]
341,16:04.760,"If you read the book, the aliens did not have evil intentions toward homo sapiens.",True,[]
478,22:38.600,"I have a book, The Future of the Mind, where I detail some of these breakthroughs.",True,[The Future of the Mind]
638,29:54.080,"Our ancestors were lucky if they had one line, just one line in a church book, saying the date they were baptized and the date they died.",True,[]
859,40:07.240,"For Isidor Rabi, it was a book about the planets.",True,[]
940,44:06.200,"That desk had a book on it, which was opened.",True,[]
954,44:43.160,"And then over the years, I found out the guy had a name, Albert Einstein, and that book was The Theory of Everything.",True,[The Theory of Everything]
957,44:53.080,"Well, today I can read that book.",True,[]
1149,53:36.600,"And in my book, Future of Humanity, I even speculate beyond that, that by the end of this century, we'll probably have the first starships.",True,[Future of Humanity]


In [15]:
## Get book info from Google Books API 
import requests

def get_book_info(title, timeout=10):
    """Look a title up in the Google Books API and return the first match.

    Parameters
    ----------
    title : str
        Search text; sent as the ``q`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict or None
        The ``volumeInfo`` of the first result, or ``None`` when the response
        contains no results.
    """
    # Let requests build the query string so that spaces, '&', '#', ... in the
    # title are encoded instead of corrupting the URL.
    response = requests.get(
        "https://www.googleapis.com/books/v1/volumes",
        params={"q": title},
        timeout=timeout,  # without a timeout a stalled connection blocks the cell forever
    )
    data = response.json()

    # 'items' can be missing or empty when nothing matches.
    items = data.get('items') or []
    if not items:
        return None

    # Just return the first book found
    return items[0].get('volumeInfo')

get_book_info("Future of Humanity")


{'title': 'The Future of Humanity',
 'subtitle': 'Terraforming Mars, Interstellar Travel, Immortality, and Our Destiny Beyond',
 'authors': ['Michio Kaku'],
 'publisher': 'Penguin UK',
 'publishedDate': '2018-02-28',
 'description': 'Human civilization is on the verge of spreading beyond Earth. More than a possibility, it is becoming a necessity: whether our hand is forced by climate change and resource depletion or whether future catastrophes compel us to abandon Earth, one day we will make our homes among the stars. World-renowned physicist and futurist Michio Kaku explores in rich, accessible detail how humanity might gradually develop a sustainable civilization in outer space. With his trademark storytelling verve, Kaku shows us how science fiction is becoming reality: mind-boggling developments in robotics, nanotechnology, and biotechnology could enable us to build habitable cities on Mars; nearby stars might be reached by microscopic spaceships sailing through space on laser beam

## Identify Other Entities

In [19]:
# %%capture
# %pip install textacy

In [128]:
import textacy

entity_types = [
    'PERSON', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LAW', 'DATE', 'TIME', 'MONEY',
]


def extract_entities(doc, include_types=entity_types, sep='_'):
    """List the named entities of ``doc`` as ``"<lemmas>/<LABEL>"`` strings.

    Entities are collected with ``textacy.extract.entities`` restricted to
    ``include_types``, with determiners dropped. For every entity the lemmas of
    its tokens are joined with ``sep`` and the entity label is appended after
    a ``/``.
    """
    found = textacy.extract.entities(
        doc,
        include_types=include_types,
        exclude_types=None,
        drop_determiners=True,
        min_freq=1,
    )

    tagged = []
    for ent in found:
        lemmas = sep.join([token.lemma_ for token in ent])
        tagged.append(lemmas + '/' + ent.label_)
    return tagged

* When processing large volumes of text, it is recommended to use spaCy's batch processing for a significant performance gain. The function `nlp.pipe` takes an iterable of texts, processes them internally as a batch, and yields the processed Doc objects in the same order as the input data.
* To use `nlp.pipe`, we first have to define a batch size. Then we can loop over the batches and call `nlp.pipe`. In the inner loop we extract the features from the processed doc and write the values back into a list:

In [129]:
# Extract entities from the transcript
import numpy as np
from tqdm import tqdm

batch_size = 50
n_rows = len(full_transcript_df)
batches = np.ceil(n_rows / batch_size).astype(int)

named_entities = []

# Walk through the text column in slices of batch_size rows; every slice is
# handed to nlp.pipe and the entities of each resulting Doc are collected.
for start in tqdm(range(0, n_rows, batch_size), total=batches):
    chunk = full_transcript_df['text'][start:start + batch_size]
    named_entities.extend(extract_entities(doc) for doc in nlp.pipe(chunk))

full_transcript_df['named_entities'] = named_entities

100%|██████████| 13/13 [00:56<00:00,  4.33s/it]


In [130]:
# Create one column per entity type holding the entity names of that type.
# Entries of "named_entities" look like "<name>/<LABEL>". The name itself may
# contain '/', e.g. "9/11/EVENT", so split on the LAST '/' only. Splitting on
# every '/' reads the wrong piece as the label and silently drops the entity.
for ent_type in entity_types:
    full_transcript_df[ent_type.lower()] = full_transcript_df["named_entities"].apply(
        lambda tagged, wanted=ent_type: [
            name
            for name, _, label in (entry.rpartition('/') for entry in tagged)
            if label == wanted
        ]
    )


In [133]:
# Remove empty lists from the dataframe

# Check if a column contains empty lists
def contains_empty_list(col_name):
    """Return True when at least one cell of ``full_transcript_df[col_name]`` equals ``[]``."""
    is_empty = full_transcript_df[col_name].apply(lambda cell: cell == [])
    return is_empty.any()


def _none_if_empty(cell):
    """Map a zero-length value to None and return anything else unchanged."""
    return None if len(cell) == 0 else cell


# Columns that hold at least one empty list
cols_to_clean = [name for name in full_transcript_df.columns if contains_empty_list(name)]

# In those columns, swap every empty value for None
for ent_type in cols_to_clean:
    full_transcript_df[ent_type] = full_transcript_df[ent_type].apply(_none_if_empty)

In [141]:
from collections import Counter 

def count_words(df, col_name, preprocess=None, min_freq=2):
    """Count how often each item occurs across the cells of ``df[col_name]``.

    Every cell is fed to ``Counter.update`` — directly, or after being passed
    through ``preprocess`` when one is given. Items seen fewer than
    ``min_freq`` times are discarded.

    Returns a DataFrame indexed by item (the index is named ``col_name``) with
    a single ``freq`` column, sorted by descending frequency.
    """
    tally = Counter()

    for cell in df[col_name]:
        tally.update(cell if preprocess is None else preprocess(cell))

    all_counts = pd.DataFrame.from_dict(tally, orient='index', columns=['freq'])
    frequent = all_counts.loc[all_counts['freq'] >= min_freq]

    return frequent.rename_axis(col_name).sort_values('freq', ascending=False)

In [153]:
count_words(full_transcript_df, 'person', min_freq=2)

Unnamed: 0_level_0,freq
person,Unnamed: 1_level_1
Einstein,7
Tithonus,5
Albert_Einstein,3
Aurora,3
Michio_Kaku,2
Kardashev,2
Stephen_Hawking,2
Steven_Weinberg,2
Galileo,2
Shakespeare,2
