In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [2]:
has_mps = torch.backends.mps.is_available()
has_cuda = torch.cuda.is_available()
device = "mps" if has_mps else "cuda" if has_cuda else "cpu"
torch_dtype = torch.float16 if has_mps else torch.float32
device, torch_dtype

('cuda', torch.float32)

In [3]:
# Initialize Pipeline
model_id = "openai/whisper-large-v3"

hf_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, 
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True, 
    cache_dir="whisper/models"
).to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    task="automatic-speech-recognition", 
    model=hf_model, 
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128, 
    chunk_length_s=64, 
    batch_size=24,
    return_timestamps=True, 
    torch_dtype=torch_dtype, 
    device=device, 
    ignore_warning=True
)

In [4]:
from pydub import AudioSegment

audio_file = "The Nature of Space and Time ｜ Brian Greene.mp3"
sound = AudioSegment.from_file(audio_file)
print(f"Length of this audio: {round(len(sound)/1000/60, 2)} minutes")

expected_tran_time = len(sound)/1000/20
expected_tran_time

Length of this audio: 58.48 minutes


175.4285

In [5]:
%%time
def transcribe():
    result = pipe(
        audio_file, 
        generate_kwargs={"language": "English"}, 
        return_timestamps=True
    )
    return result

result = transcribe()

del hf_model
del processor
del pipe
torch.cuda.empty_cache()

You have passed language=English, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=English.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


CPU times: user 1min 13s, sys: 847 ms, total: 1min 14s
Wall time: 1min 15s


In [6]:
result['text']

" So, thank you. It's a great pleasure to join you at this gathering. And I'm going to focus my attention on an area of physics that I think fits in well with the themes that you have been hearing about in other arenas, other areas. Because I'm going to be talking about the science of black holes. and observationally supported, but in the next age can migrate to canonical form, to everything that everybody always thought was true, everything everybody thought was right. So in a way, if I was going to give one takeaway message, if I can use that unfortunate language for what I'll be talking about here today, to be true and the things that you do not consider to be true, you can inadvertently cut out a whole collection of ideas that you simply at that moment in time don't have the intellectual architecture to judge. And that certainly is the case with the science of black holes. Now, I recognize that people come to to give everybody a little bit of something, I'm going to start fairly ba

In [7]:
import pandas as pd
df_transcribe = pd.DataFrame(result['chunks'])
df_transcribe

Unnamed: 0,timestamp,text
0,"(0.0, 5.0)","So, thank you. It's a great pleasure to join ..."
1,"(5.86, 13.18)",And I'm going to focus my attention on an are...
2,"(13.18, 20.78)",that I think fits in well with the themes tha...
3,"(21.2, 54.01)",Because I'm going to be talking about the sci...
4,"(54.01, 60.45)","thought was right. So in a way, if I was goin..."
...,...,...
249,"(3435.61, 3438.89)",You stitch it together with the threads of qu...
250,"(3439.89, 3468.88)",So the fabric of space itself... entanglement...
251,"(3468.88, 3473.38)",necessary that all math tells us the true wor...
252,"(3473.38, 3478.38)",is worth our focus to try to determine whethe...


In [8]:
# Parse timestamp function
def parse_audio_slice_timestamp(time_tuple):
    time_list = list(time_tuple)
    return time_list[0], time_list[1]

In [9]:
transcribe_filename = "the_nature_of_space_and_time_brian_greene.csv"

df_transcribe.loc[:, 'start'] = df_transcribe['timestamp'].apply(lambda x: list(x)[0])
df_transcribe.loc[:, 'end'] = df_transcribe['timestamp'].apply(lambda x: list(x)[1])
df_transcribe.to_csv(transcribe_filename, index=False)
df_transcribe.head()

Unnamed: 0,timestamp,text,start,end
0,"(0.0, 5.0)","So, thank you. It's a great pleasure to join ...",0.0,5.0
1,"(5.86, 13.18)",And I'm going to focus my attention on an are...,5.86,13.18
2,"(13.18, 20.78)",that I think fits in well with the themes tha...,13.18,20.78
3,"(21.2, 54.01)",Because I'm going to be talking about the sci...,21.2,54.01
4,"(54.01, 60.45)","thought was right. So in a way, if I was goin...",54.01,60.45


In [7]:
from langchain.embeddings import OllamaEmbeddings, SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-large-zh-v1.5", 
    cache_folder="embedding_models",
)

  warn_deprecated(


In [11]:
# Lambda function to embed audio text
add_embed = lambda x: embeddings.embed_query(x['text'])

In [12]:
df_embed = df_transcribe.copy()

In [13]:
df_embed.loc[:, 'text_embed'] = df_embed.apply(add_embed, axis=1)
df_embed.head()

Unnamed: 0,timestamp,text,start,end,text_embed
0,"(0.0, 5.0)","So, thank you. It's a great pleasure to join ...",0.0,5.0,"[0.08198381215333939, -0.004050110466778278, -..."
1,"(5.86, 13.18)",And I'm going to focus my attention on an are...,5.86,13.18,"[0.03211100399494171, -0.014226330444216728, 0..."
2,"(13.18, 20.78)",that I think fits in well with the themes tha...,13.18,20.78,"[0.04614894464612007, -0.02412410080432892, -0..."
3,"(21.2, 54.01)",Because I'm going to be talking about the sci...,21.2,54.01,"[0.04168698936700821, 0.05509776249527931, -0...."
4,"(54.01, 60.45)","thought was right. So in a way, if I was goin...",54.01,60.45,"[0.031016165390610695, -0.01409019622951746, 0..."


In [14]:
# Check embed vector length
len(df_embed['text_embed'].iloc[0])

1024

In [15]:
# Provide a search query
query = "black hole"
search_term_embed = embeddings.embed_query(query)

In [16]:
# Similarity Search function
import numpy as np
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [17]:
# Perform Cosine Similarity
df_embed.loc[:, 'cosine_similarity'] = df_embed['text_embed'].apply(lambda x: cosine_similarity(x, search_term_embed))
df_sorted = df_embed.sort_values(by='cosine_similarity', ascending=False)
df_sorted.head()

Unnamed: 0,timestamp,text,start,end,text_embed,cosine_similarity
88,"(1212.67, 1215.67)","and therefore is black, a black hole.",1212.67,1215.67,"[-0.013117685914039612, 0.03600084036588669, 0...",0.67742
233,"(3255.82, 3259.44)",that this black hole has an alternative descr...,3255.82,3259.44,"[0.007341235410422087, 0.049132268875837326, -...",0.672928
171,"(2448.64, 2454.04)",photograph was released a far sharper image o...,2448.64,2454.04,"[0.04561213403940201, 0.05503472313284874, -0....",0.594769
166,"(2404.81, 2409.21)",The first direct image of a black hole.,2404.81,2409.21,"[0.004232809878885746, 0.03027334436774254, -0...",0.593661
164,"(2360.54, 2365.76)",black holes collided with each other some tim...,2360.54,2365.76,"[0.025313926860690117, 0.07076461613178253, -0...",0.57868


In [18]:
# Playback function
from pydub.playback import play
def playback_by_query(query, k=3, show_text=False):
    search_term = query
    search_term_embed = embeddings.embed_query(search_term)

    df_embed.loc[:, 'cosine_similarity'] = df_embed['text_embed'].apply(lambda x: cosine_similarity(x, search_term_embed))
    df_sorted = df_embed.sort_values(by='cosine_similarity', ascending=False)

    if show_text:
        display(df_sorted.iloc[:k]['text'])

    for index, row in df_sorted.iloc[:k].iterrows():
        play(sound[row.start*1000: row.end*1000])

In [19]:
playback_by_query("What is black hole?")

Input #0, wav, from '/tmp/tmp2vfqnolj.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:03.62, bitrate: 1536 kb/s
    Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, 2 channels, s16, 1536 kb/s
ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default
SDL_OpenAudio (2 channels, 48000 Hz): ALSA: Couldn't open audio device: No such file or directory
ALSA lib confmisc.c:






Input #0, wav, from '/tmp/tmpc7hzbtgu.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:04.40, bitrate: 1536 kb/s
    Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 48000 Hz, 2 channels, s16, 1536 kb/s
ALSA lib confmisc.c:767:(parse_card) cannot find card '0'
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_card_driver returned error: No such file or directory
ALSA lib confmisc.c:392:(snd_func_concat) error evaluating strings
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1246:(snd_func_refer) error evaluating name
ALSA lib conf.c:4732:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5220:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2642:(snd_pcm_open_noupdate) Unknown PCM default
SDL_OpenAudio (2 channels, 48000 Hz): ALSA: Couldn't open audio device: No such file or directory
ALSA lib confmisc.c:

In [20]:
# Save transcribe text into txt file
with open("transcribed_text.txt", "w") as text_file:
    text_file.write(result['text'])

## RAG Search

In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import TextLoader

loader = TextLoader("transcribed_text.txt")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(documents)

In [2]:
len(docs), docs[-2]

(60,
 Document(metadata={'source': 'transcribed_text.txt'}, page_content='to stitch together the individual plaquettes. The individual pixels. How do you stitch them together? You stitch it together with the threads of quantum entanglement. So the fabric of space itself... entanglement. Again a wild, strange, crazy sounding idea, but naturally comes out of the mathematics and we have sort of grown to have a degree of confidence. Not necessary that all math tells us the true workings of the world, but certainly is worth our focus to try to determine whether these'))

In [28]:
from dotenv import load_dotenv
load_dotenv()    

True

In [4]:
import time
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone()

In [5]:
import time
index_name = "nature-of-space-time-asr-index"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [8]:
from langchain_pinecone import PineconeVectorStore
speech_vector = PineconeVectorStore.from_documents(docs, embeddings, index_name=index_name)

In [42]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o")

In [10]:
# Setup Project
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate, 
    SystemMessagePromptTemplate, 
    AIMessagePromptTemplate, 
    HumanMessagePromptTemplate
)

In [11]:
# Create RAG prompt
rag_prompt = ChatPromptTemplate(
    input_variables=['context', 'question'], 
    messages=[
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=['context', 'question'], 
                template="""You answer questions about the contents of a transcribed audio file.
                Use only the provided audio file transcription as context to answer the question.
                Do not use any additional information.
                If you don't know the answer, just say that you don't know. Do not use the external knowledge.
                Use three sentences maximum and keep the answer concise.
                Make sure to reference your sources with quotes of the provided context as citations.
                \nQuestion: {question} \nContext: {context} \nAnswer:
                """
            )
        )
    ]
)

In [12]:
# Create Retriever
retriever = speech_vector.as_retriever(
    search_type='similarity', 
    search_kwargs={'k': 4}
)

# Check retriever
query = "What is black hole"
docs = retriever.invoke(query)
assert docs == speech_vector.similarity_search(query, )

In [21]:
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

# Chain
chain = (
    {"context": retriever, "question": RunnablePassthrough()} |
    rag_prompt |
    llm |
    StrOutputParser()
)

In [22]:
print(chain.invoke(query))

According to the transcription, a black hole is an object that "will not allow light to escape and therefore is black" (Document 2). It is created when a spherical body is crushed down to a sufficiently small size, causing a warp in the fabric of space (Document 2). The size required to create a black hole is extremely small, such as squeezing the entire sun to a couple of kilometers across (Document 4).


In [23]:
# Summarize the audio content
prompt_test = """
You are commentator. Your task is to write a report on a meeting transcription.
When presented with the meeting minutes, come up with interesting questions to ask,
and answer each question.
Afterwards, combine all the information and write a report in the markdown format.

# Meeting keynotes:
"{text}"

# Instructions:
## Summarize:
In clear and concise language, use only the context information, to summarize the key points.

## Interesting questions:
Generate three distinct and thought-provoking questions that can be
asked about the content of the meeting. For each questions:
- After "Q: ", describe the problem
- After "A: ", provide a detailed explaination of the problem addressed in the question.
- Enclose the ultimate answer in <>.

## Write analysis report
Using the summary and the answers to the interesting questions,
create a comprehensive report in Markdown format.
"""
summary_prompt = PromptTemplate.from_template(prompt_test)

In [24]:
with open('transcribed_text.txt', 'r') as file:
    transcribe_text = file.read().rstrip()

In [43]:
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

docs = [Document(page_content=transcribe_text, metadata={'source': 'local'})]

llm_chain = LLMChain(llm=llm, prompt=summary_prompt)

# Define StuffDocumentsChain
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="text")

print(stuff_chain.invoke(docs)['output_text'])

## Summary

The meeting focused on the science of black holes, tracing the history from Newton's law of gravity to Einstein's theory of general relativity and the modern understanding of black holes. The speaker discussed how Newton's laws laid the foundation for understanding gravitational forces and escape velocity. Moving forward, Einstein's revolutionary idea that gravity is the warping of space-time by massive objects was explained. The concept of black holes was further elaborated, including their formation through stellar collapse and the historical skepticism by Einstein himself. The meeting also covered the significant observational evidence of black holes, including gravitational waves detected by LIGO and the first direct image of a black hole. Lastly, the talk ventured into the holographic principle and quantum entanglement, suggesting that our three-dimensional reality might be a projection of data on a two-dimensional surface, and how these ideas could stitch together the