In [25]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=BblV6AQsd2s&ab_channel=RiseAgainstVEVO"

### Setting up the model
Define the LLM model to use

In [26]:
from langchain_openai.chat_models import ChatOpenAI

model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

model.invoke("Are we in a simulation?")

AIMessage(content='There is currently no definitive evidence to prove or disprove the idea that we are living in a simulation. Some scientists and philosophers have proposed the simulation hypothesis, which suggests that the reality we perceive may actually be a computer-generated simulation. However, this idea is purely speculative and remains a topic of debate and discussion within the scientific community. Ultimately, whether or not we are in a simulation is still an open question that requires further investigation and exploration.', response_metadata={'finish_reason': 'stop', 'logprobs': None})

### Using the magic of chains :)

In [27]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

# Chain example
chain = model | parser

chain.invoke("Are we in a simulation?")

'There is currently no definitive evidence to prove or disprove the concept that we are living in a simulation. Some scientists and philosophers have proposed the idea based on various theories, but it remains a topic of speculation and debate. Ultimately, the answer to whether or not we are in a simulation may never be known for certain.'

### Using prompt templates

In [28]:
from langchain_core.prompts import ChatPromptTemplate

template = """
  Answer the question based on the context below. If you can't
  answer the question, just say "I don't know".

  Context: {context}

  Question: {question}
  """

prompt = ChatPromptTemplate.from_template(template)
print(prompt.format(context="The sky is blue.", question="What color is the sky?"))

print("Did it work?")

chain = prompt | model | parser
chain.invoke({
  "context": "I have an appointment tomorrow at 3pm.", 
  "question": "Can I go for lunch tomorrow at 3:00PM?"
})

Human: 
  Answer the question based on the context below. If you can't
  answer the question, just say "I don't know".

  Context: The sky is blue.

  Question: What color is the sky?
  
Did it work?


'No, you cannot go for lunch at 3:00PM tomorrow because you have an appointment.'

### Combining chains

In [29]:
from operator import itemgetter

translation_template = """
  Translate the 

  answer: {answer}

  to

  language: {language}
  """

translation_prompt = ChatPromptTemplate.from_template(translation_template)

translation_chain = (
  {"answer": chain, "language": itemgetter("language") } | translation_prompt | model | parser
)

translation_chain.invoke({
  "context": "I have an appointment tomorrow at 3pm.", 
  "question": "Can I go for lunch tomorrow at 3:00PM?",
  "language": "es"
})


'Respuesta: No, tienes una cita a las 3:00 pm mañana.'

### Transcribing the YouTube video

In [30]:
import tempfile
import whisper
from pytube import YouTube

if not os.path.exists("transcription.txt"):
  youtube = YouTube(YOUTUBE_VIDEO)
  audio = youtube.streams.filter(only_audio=True).first()

  whisper_model = whisper.load_model("base")

  script_dir = os.path.expanduser('~/Documents/tmp')

  with tempfile.TemporaryDirectory() as temp_dir:
    print(f"** Downloading {audio.title} to {temp_dir}")
    # Download the audio file with this name transcription_audio.mp4
    file = audio.download(temp_dir, filename="transcription_audio.mp4")
    
    # Print the file path
    print(file)
    # Validate if the file exists
    print(os.path.exists(file))
    # Print the file name
    print(os.path.basename(file))

    transcription = whisper_model.transcribe(file, fp16=False)["text"].strip()
    
    with open("transcription.txt", "w") as f:
     f.write(transcription)

    transcription

### Splitting into documents

In [31]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("transcription.txt")
text_documents = loader.load()
text_documents


[Document(page_content="This video is only under Stunde Oh Am I loud and clear I'm not breaking up, am I still your charm? I'm not just that luck, are we getting closer? Are we just getting more lost? I'll show you mine if you show me yours first Let's compare scars, I'll tell you who's It is worse, let's unlight these pages and replace them with the wrong ones We live on front porches and swim like a boy We get by just fine here, I've been in the way The flow is a labor, I'll sleep till the end I'll cross these streets until you hold my hand I've been here so long, I think that it's time to move The winter's so cold, summer's over too soon Let's pack our bags and settle down where palm trees grow I've got some friends, some that I hardly know But we've had some times, I wouldn't trade for the world We chase these days down with talks of the places that we will go We live on front porches and swim like a boy We get by just fine here, I've been in the way The flow is a labor, I'll sleep

### Setting up the splitter

In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
text_splitter.split_documents(text_documents)[:5]
documents = text_splitter.split_documents(text_documents)

### Generating the embeddings

In [33]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
embeddings_query = embeddings.embed_query("What is the meaning of life?")

print(f"Embeddings query: {embeddings_query}")
print(f"Embeddings length: {len(embeddings_query)}")
print(embeddings_query[:10])

sentence1 = embeddings.embed_query("Life is good")
sentence2 = embeddings.embed_query("Simulations are not real")

Embeddings query: [0.0044326543196640065, -0.029703416759168908, -0.008148670844263245, -0.003370052784742357, -0.026120225921199978, -0.018743795621738665, -0.019163892868785566, 0.011453855384126693, -0.021214961267285667, -0.001400070099794504, 0.0018749973316991735, 0.016915132634251966, 0.014036223735180394, -0.006517700624132583, 0.014715794893065471, -0.0031229363598860387, 0.04052712627272525, -0.007883020402325172, 0.0036171694424293114, -0.012516457151878976, 0.003119847395844186, 0.00377470637573317, -0.0023259852669024607, -0.00847609963571583, -0.012485567045799178, 0.004963955203540149, 0.018039512751518764, -0.030370632992209115, 0.023451366576974397, -0.024365698070717746, 0.02654032503089196, -0.01591431107865928, -0.020498323472220897, -0.006295295678780451, -0.013010689535930344, -0.0004973222212081025, 0.011725683288487198, 0.0011344197742717507, -0.006418854240454564, -0.0051492925803900465, 0.02535416470146556, 0.020201782924203028, 0.0007664476603817698, 0.019299

### Validating embeddings similarity

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

query_sentence1_similarity = cosine_similarity([embeddings_query], [sentence1])[0][0]
query_sentence2_similarity = cosine_similarity([embeddings_query], [sentence2])[0][0]

query_sentence1_similarity, query_sentence2_similarity

(0.8280397726527704, 0.7217470703214922)

### Using a embeddings store

In [35]:
from langchain_community.vectorstores import DocArrayInMemorySearch

vector_store1 = DocArrayInMemorySearch.from_texts(
    [
        "Life is good", 
        "Simulations are not real",
        "You create your own reality",
        "You can create other realities",
        "Live your life"
    ],
    embedding=embeddings
)

vector_store1.similarity_search_with_score(query="What is the meaning of life?", top_k=3)

[(Document(page_content='Live your life'), 0.8332245944687862),
 (Document(page_content='Life is good'), 0.828018439583259),
 (Document(page_content='You create your own reality'), 0.7679660996110271),
 (Document(page_content='You can create other realities'), 0.7586368231628016)]

### Adding the retrievers

In [36]:
retriever1 = vector_store1.as_retriever()
retriever1.invoke("What is the meaning of life?")

[Document(page_content='Live your life'),
 Document(page_content='Life is good'),
 Document(page_content='You create your own reality'),
 Document(page_content='You can create other realities')]

### Implementing the runnable

In [37]:
from langchain_core.runnables import  RunnableParallel, RunnablePassthrough

setup = RunnableParallel(context=retriever1, question=RunnablePassthrough())
setup.invoke("Do we live in a simulation?")

chain = setup | prompt | model | parser

chain.invoke("Who creates reality?")

'You create your own reality.'

## Loading transcription into the vector store

In [43]:
vectorestore2 = DocArrayInMemorySearch.from_documents(documents, embeddings)

len(documents), documents[0]

setup2 = RunnableParallel(context=vectorestore2.as_retriever(), question=RunnablePassthrough())

chain = setup2 | prompt | model | parser

chain.invoke("What happens when you living from porches?")

'When living from porches, the individual in the text swims like a boy and gets by just fine.'