In [1]:
# # !pip install pytube pydub
# !pip install -U youtube_dl
# !pip install -U yt-dlp==2024.8.6


In [1]:
import tempfile
import pathlib
import shutil
import whisper
import torch
import yt_dlp as youtube_dl
from pydub import AudioSegment
from IPython.display import Audio
from tqdm import tqdm
from IPython.display import Markdown
from textwrap import dedent

from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate

device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
print(whisper.available_models())

['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large', 'large-v3-turbo', 'turbo']


In [3]:
model = whisper.load_model('large-v3-turbo').to(device)

In [4]:
prompt_template = """ Your output should use the following template:

### Summary

### Analogy

### Notes

- [Emoji] Bulletpoint

### Keywords

- Explanation

You have been tasked with creating a concise summary of a YouTube video using its transcription.

Make a summary of the transcript.

Additionally make a short complex analogy to give context and/or analogy from day-to-day life from the transcript.

Create 10 bullet points (each with an appropriate emoji) that summarize the key points or important moments from the video's transcription.

In addition to the bullet points, extract the most important keywords and any complex words not known to the average reader aswell as any acronyms mentioned. For each keyword and complex word, provide an explanation and definition based on its occurrence in the transcription.

Please ensure that the summary, bullet points, and explanations fit within the 330-word limit, while still offering a comprehensive and clear understanding of the video's content. Use the text above:

Please, I need you to translate the answer into Portuguese!

```
{title}

{transcription}
```
"""


In [5]:
def cut_audio(path, start_time, end_time, output_path):
    audio = AudioSegment.from_file(path)
    start_time_ms = start_time * 1000
    end_time_ms = end_time * 1000
    trimmed_audio = audio[start_time_ms:end_time_ms]
    trimmed_audio.export(output_path, format="mp3")

class YoutubeTranscripter:
    def __init__(self, video_url, whisper_model):
        self.video_url = video_url
        self.whisper_model = whisper_model
        self.metadata = None
        self.title = None
        self.label = None
        self.language = None
        self.chapters = None
        self._temp_dir = tempfile.mkdtemp()
        self._raw_video_path = None
        self._video_parts_folderpath = None
        self._audios_to_transcribe = []
        self._transcripted_parts = None

    def get_metadata(self):
        ydl_opts = {
            'skip_download': True,
            'extract_flat': True,
        }
        
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            self.metadata = ydl.extract_info(self.video_url, download=False)

        self.title = self.metadata["title"]
        self.label = self.metadata["title"].lower().replace(" ","_")
        self.language = self.metadata["language"]
        self.chapters = self.metadata["chapters"]


    def download(self):
        output_filename = f"{self._temp_dir}/{self.label}"
        
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'outtmpl': f'{output_filename}.%(ext)s',
        }

        self._raw_video_path = f'{output_filename}.mp3'
        
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([self.video_url])

    def prepare(self):
        if self.chapters:
            self._video_parts_folderpath = self._raw_video_path.replace(".mp3", "")
            pathlib.Path(self._video_parts_folderpath).mkdir(parents=True, exist_ok=True)
        
            for i, c in enumerate(self.chapters):
                print(f"[splitting] - Part {i+1} - {c['title']}")
                output_path = f"{self._video_parts_folderpath}/{str(i+1).zfill(4)}__{c['title']}__.mp3"
                cut_audio(self._raw_video_path, c["start_time"], c["end_time"], output_path)
                c.update({"audio_path": output_path})
                self._audios_to_transcribe.append(c)
        else:
            self._audios_to_transcribe.append(
                {
                    "title": self.title,
                    "audio_path": self._raw_video_path,
                }
            )

    def transcript(self):
        response = []
        for i, a in enumerate(self._audios_to_transcribe):
            print(f"[transcripting] - Part {i+1} - {a['audio_path']}")
            result = self.whisper_model.transcribe(a["audio_path"], language=self.language)
            a.update({"transcription": result})
            response.append(a)
        if not self._transcripted_parts:
            self._transcripted_parts = response
        return response
            
    def cleanup(self):
        if self._temp_dir:
            shutil.rmtree(self._temp_dir)
            self._temp_dir = None
            self._raw_video_path = None
            self._video_parts_folderpath = None
            self._audios_to_transcribe = []
            print("Diretório temporário removido.")

    def summarize(self):
        if not self._transcripted_parts:
            self._transcripted_parts = self.transcript()

        formatted_text = []
        for i,r in enumerate(self._transcripted_parts):
            chapter_text = dedent(f"""
            ## {i+1}. {r['title']}
            {r['transcription']['text']}
            """)
        
            formatted_text.append(chapter_text)
        
        formatted_text = "\n\n".join(formatted_text)
        
        chain = self.set_chain()

        response = chain.invoke({"title": self.title, "transcription": formatted_text})

        return response

    def set_chain(self):

        prompt = PromptTemplate(
            input_variables=["title", "transcription"], template=prompt_template
        )
        
        
        llm = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0,
            max_tokens=None,
            timeout=None,
            max_retries=2,
        )
        
        return prompt | llm

    
    def run(self):
        self.get_metadata()
        self.download()
        self.prepare()
        response = self.transcript()
        self.cleanup()
        return response

# Transcrever

In [7]:
%%time

transcripter = YoutubeTranscripter(video_url="https://www.youtube.com/watch?v=xGUHzRo81OM", whisper_model=model)
response = transcripter.run()
summary = transcripter.summarize()
Markdown(summary.content)

[youtube] Extracting URL: https://www.youtube.com/watch?v=xGUHzRo81OM
[youtube] xGUHzRo81OM: Downloading webpage
[youtube] xGUHzRo81OM: Downloading ios player API JSON
[youtube] xGUHzRo81OM: Downloading web creator player API JSON
[youtube] xGUHzRo81OM: Downloading m3u8 information
[youtube] Extracting URL: https://www.youtube.com/watch?v=xGUHzRo81OM
[youtube] xGUHzRo81OM: Downloading webpage
[youtube] xGUHzRo81OM: Downloading ios player API JSON
[youtube] xGUHzRo81OM: Downloading web creator player API JSON
[youtube] xGUHzRo81OM: Downloading m3u8 information
[info] xGUHzRo81OM: Downloading 1 format(s): 251
[download] Destination: /tmp/tmpuqmd9co7/top_10_atores_que_calaram_a_sua_boca_|_gaveta.webm
[download] 100% of   18.69MiB in 00:00:13 at 1.38MiB/s   
[ExtractAudio] Destination: /tmp/tmpuqmd9co7/top_10_atores_que_calaram_a_sua_boca_|_gaveta.mp3
Deleting original file /tmp/tmpuqmd9co7/top_10_atores_que_calaram_a_sua_boca_|_gaveta.webm (pass -k to keep)
[transcripting] - Part 1 - /tmp

### Resumo

O vídeo "Top 10 Atores que CALARAM a sua BOCA" de Gaveta discute escolhas polêmicas de atores em papéis icônicos que inicialmente geraram desconfiança, mas que acabaram se mostrando excelentes. O apresentador compartilha sua experiência pessoal e opiniões sobre cada escolha, destacando como alguns atores, como Michael Keaton, Jennifer Lawrence e Heath Ledger, surpreenderam o público com suas atuações.

### Analogia

Escolher um ator para um papel é como escolher um ingrediente para uma receita: às vezes, o que parece uma combinação estranha pode resultar em um prato delicioso. Assim como um chef pode surpreender com uma mistura inesperada, um diretor pode encontrar um talento oculto em um ator que, à primeira vista, não parece se encaixar.

### Notas

- 🎭 Michael Keaton como Batman: Inicialmente criticado, mas se tornou um ícone.
- 🌟 Jennifer Lawrence como Katniss: Polêmica pela aparência, mas provou seu talento.
- 🦸‍♂️ Chris Evans como Capitão América: Duvidado por seu passado cômico, mas se destacou.
- 🎤 Lady Gaga em Nasce Uma Estrela: Surpreendeu com atuação dramática.
- 🧛‍♂️ Tom Cruise como Lestat: Superou expectativas em Entrevista com o Vampiro.
- 🐺 Hugh Jackman como Wolverine: Inicialmente criticado, mas se tornou o Wolverine definitivo.
- 🔫 Daniel Craig como James Bond: Mudou a percepção do personagem para um tom mais realista.
- 🎬 Ryan Reynolds como Deadpool: Superou a má impressão de sua primeira interpretação.
- 🎥 Robert Pattinson como Batman: Provou ser um ator versátil após Crepúsculo.
- 🃏 Heath Ledger como Coringa: Transformou a visão do personagem e ganhou um Oscar.

### Palavras-chave

- **Polêmica**: Situação que gera controvérsia ou debate.
- **Atuação**: A arte de interpretar um personagem em teatro, cinema ou televisão.
- **Coringa**: Personagem icônico da DC Comics, conhecido por sua complexidade e vilania.
- **Reboot**: Reinterpretação ou reinício de uma franquia, geralmente com novos atores ou diretores.
- **Expectativa**: A crença ou esperança de que algo ocorrerá de uma certa maneira.

In [6]:
transcripter = YoutubeTranscripter(video_url="https://www.youtube.com/watch?v=MnBV8zLq-_Y", whisper_model=model)
response = transcripter.run()
summary = transcripter.summarize()
Markdown(summary.content)

[youtube] Extracting URL: https://www.youtube.com/watch?v=MnBV8zLq-_Y
[youtube] MnBV8zLq-_Y: Downloading webpage
[youtube] MnBV8zLq-_Y: Downloading ios player API JSON
[youtube] MnBV8zLq-_Y: Downloading web creator player API JSON
[youtube] MnBV8zLq-_Y: Downloading m3u8 information
[youtube] Extracting URL: https://www.youtube.com/watch?v=MnBV8zLq-_Y
[youtube] MnBV8zLq-_Y: Downloading webpage
[youtube] MnBV8zLq-_Y: Downloading ios player API JSON
[youtube] MnBV8zLq-_Y: Downloading web creator player API JSON
[youtube] MnBV8zLq-_Y: Downloading m3u8 information
[info] MnBV8zLq-_Y: Downloading 1 format(s): 251
[download] Destination: /tmp/tmpe_gg1glr/ansible_vault.webm
[download] 100% of    5.05MiB in 00:00:04 at 1.23MiB/s   
[ExtractAudio] Destination: /tmp/tmpe_gg1glr/ansible_vault.mp3
Deleting original file /tmp/tmpe_gg1glr/ansible_vault.webm (pass -k to keep)
[transcripting] - Part 1 - /tmp/tmpe_gg1glr/ansible_vault.mp3
Diretório temporário removido.


### Resumo

Ansible Vault é uma ferramenta de linha de comando que permite criptografar variáveis e arquivos sensíveis, garantindo que dados como credenciais de máquina não fiquem expostos em texto simples. O especialista Alex Dordjian explica como usar o Ansible Vault para proteger dados em repouso, criar e editar arquivos criptografados, e gerenciar diferentes cofres e senhas. Ele também destaca a importância de revisar playbooks para evitar a exposição acidental de dados sensíveis e fornece uma demonstração prática de como implementar o Ansible Vault.

### Analogia

Usar o Ansible Vault é como ter um cofre em casa: você pode guardar documentos importantes (dados sensíveis) dentro dele, mas ainda precisa ter cuidado ao abrir o cofre (executar playbooks) para não deixar informações à vista.

### Notas

- 🔒 Ansible Vault é uma ferramenta de criptografia.
- 🗝️ Permite proteger credenciais e arquivos sensíveis.
- 📂 Suporta a criação e edição de arquivos criptografados.
- 🔍 É importante revisar playbooks para evitar vazamentos de dados.
- 🔑 Vários cofres e senhas podem ser gerenciados.
- 🛠️ Funciona com Ansible Playbook e Ansible Navigator.
- 📜 Oferece opções para visualizar e editar arquivos criptografados.
- 🔄 Possui funcionalidade para reconfigurar senhas de criptografia.
- 📚 Documentação disponível para guias e FAQs.
- 💻 Demonstração prática de uso do Ansible Vault.

### Palavras-chave

- **Ansible Vault**: Ferramenta de linha de comando para criptografar dados sensíveis.
- **Criptografia**: Processo de codificar informações para proteger dados.
- **Playbook**: Conjunto de instruções que o Ansible executa.
- **Cofre**: Local seguro para armazenar informações sensíveis.
- **Variáveis**: Elementos que armazenam dados que podem ser utilizados em scripts.
- **Rekey**: Processo de alterar a senha de criptografia de um cofre.

Essas definições ajudam a entender melhor os conceitos discutidos no vídeo e a importância do Ansible Vault na segurança de dados.

### Resultado

In [7]:


formatted_text = []
for i,r in enumerate(response):
    chapter_text = dedent(f"""
    ## {i+1}. {r['title']}
    {r['transcription']['text']}
    """)

    formatted_text.append(chapter_text)

formatted_text = "\n\n".join(formatted_text)

Markdown(formatted_text)


## 1. Intro
 Hey Rilsen! Yeah? What is KnowledgeCraft? Do we need it to enhance our LLM performance? Oh, and also, do you know how to integrate it with Lanchain? Okay guys, hold on, take it easy. I will explain to you in detail step by step. Start from the theory, how to set up our Neo4j database, and then how to integrate it using Lanchain, and of course, all that we will use in open source LLM. So, without further ado, let's get started.



## 2. What is Knowledge Graph?
 What is Nullesquas? Once upon a time, way back in 1736, there was a Swiss madwish named Leonard Ehler who faced a mind-bending challenge. The Southern Bridge of Gunsberg Problem. Is there a way to walk across all bridges once, starting and ending at the same place? Ehler realized something more crucial. What mattered was how things were connected. So it turned the city's landmark into dots or nodes and its bridges into lies or ages, creating a neat little network known as the origin of the graph theory. How we tying this overgraph, sped cold-up, instead of just building about landmarks and bridges, now, that's staying picky. So, in ancient, nodes ask the adjectives such as people, buildings, schools, bands, and many more, and that the ages as the relationship between them. For example, there is Amy working at a band and chatting away in Mandarin. Now, house and Brian, also a band-a-band who loves fried fries. Oh, there is also Anne, Brian's friend who is vanatig for fried fries too, and also flown in Mandarin. Sounds a bit tangled, right? But look at the graph we've got. Suddenly, it all starts to make sense. Now, with the idea of nodes and ages, entities and relationship, we have this final graph. But hey, let's get real. Our world is not just about Anne, Amy, and Brian. It's not just about bands and fried fries too, right? So, it's a vast thrill of people, teams, entities, and all of those have this special relationship. And that's what we call as a knowledge graph. So, once again, knowledge graph is a network of real-world entities and illustrates the relationship between them. All then, we store the knowledge graph in a graph database such as Neo4j database and process it using CypherQuery language. Neo4j's CypherQuery language includes clauses like use, match, read, return, and more. You can delve into them further with this Neo4j cheat sheet. Neo4j. Neo4j. talented ale. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Kraj. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j. Neo4j.



## 3. Why is it important?
 Now, let's talk about how a knowledge graph empowers an ALANA. So using a knowledge graph, you can identify the links within a seemingly disconnected cut-set. What does it mean? So February 2024, researchers from Microsoft conducted a research to compare base retrieval augmented generation system, or RAC, and graph RAC. They used favorite incident information from a news article dataset and deposit equations. What has no furtherance yet done? The result, baseline RAC not able to answer it because sometimes using factor zero research is not always affected. Especially when the query does not provide enough context about its true intent, or when the context is fragment across large corpus of text. You probably have heard about plus in the middle things. And that's the problem. And they proved that graph RAC able to answer these questions because first, they were all search about the entities which has no forzea. This allows the LLM to grow itself and then provide a very detailed answer. In November 2023, there was also a research that showed how knowledge graph optimized the search in the SQL database. The researchers revealed that, when answering using GPT-4 with zero-shot prompts directly on SQL database achieves an accuracy of 16%. Notably, the accuracy increases 54% when questions are posed over a knowledge graph representation of the enterprise SQL database.



## 4. Workflow
 Okay, so that's the reason why I learned knowledge graph for our LLAM is very interesting and important. And before we go into code, it's better for us to understand how it works. So first thing here, we will get the user question. And then we'll pass this user question to our LLAM. In this tab, our LLAM will also receive a database schema. Database schema means like the entities and relationship that we already store to our new 4G database. And the whole process in here, we call it as the graph chain. And then from the graph chain, it will generate a cipher query. And then we will run the cipher query to our new 4G database to get a result. And the result, we will pass it to the LLAM. In here, the LLAM will pass the result, we will create the result into our final answer. And in this tab, we will get the final one. Okay, thumbs to code.



## 5. Code
 So actually all the code, the data I've already provided in this GitHub and I already put it in my description. So the first thing that we need to do is just to clone this. So just copy and then go to our VS code and do GitClone here. Okay guys, once done, our next step will be creating a virtual environment. So just go to our folder and then do pip install at this. It will take a few seconds, so I will skip this part. Now let's open the notebook that we have in here. So since we have already installed all the buttons in from the bit installable that we've just done before. So let's run this one. So in here, I'll go through, since we're going to use the open source LLAM, which is the Gemini. And then having Shays and also Neo4, so I'll go right now on how to get the ATI key for all this. But before that, we need to create a .eans as well to save the file variable and just... And then нужnen it. So go ahead. And see other



## 6. Neo4J, Google Gemini, Hugging Face set up
 to this the google gemini api key so i will just go to this url and then already create api if you have it and for your hugging face just follow this link and then use new token but since i already have these tokens you just copy and then save the rnv variable and then for neo4g actually you can use my neo4g desktop or for this i'll use neo4g error which is a little bit simpler so if you have it gone you should sign up in here okay so the incident is ready and now like click open okay so in here we have the connection url in here we have the database user and also password make sure to also copy the connection url and the database username to the environment variable the dot answer file so right now just copy paste the password and click connect now the database is connected and now the next test is how to connect the python in here to the our database in the neo4g so actually langchain already provide us with a very simple connector which is this one langchain community to graphs neo4g.crown here so just click run it's all done okay



## 7. Data Overview
 So we have done to set up everything. So now let's talk about the data itself. So the data that we will use in here, I will keep it simple. We won't process data from the unstructured format such as the PDF, the text file. Nope. But in here, we will use data from Kegel. So that's for Manus Kumar for providing this data set. So basically, it is a LinkedIn data set, professional information like the name, the CPM and current company that they work in the position and many more. And actually, here we did a little bit preprocessing to finally get this final data. The next.



## 8. Insert Data to Neo4J
 thing is how to insert this data to Neo4j database. So as you can see here, since our data in Neo4j is still empty to node 0 and the relationship 0, and you can also check it by run to cell. You can see in here, there's nothing in here. So how to input it? So the first thing that we hear is by using the cipher query to interact with our database. So I'll explain one by one, start one here. Load CSV with headers from blah, blah, blah, blah. So actually, it's loading the CSV file from this GitHub repository. And then as we read the CSV file for each row, and in here, there's a cipher query inside. So we know that, okay, in here, name is the entity of name is person. So we make it to define the entity that we have in our database. So in here, it's a little bit different than the previous one, right? So basically, we use for each. Because if you see in the languages here, sometimes one people picks many languages, such as Roberto Mirola. He can speak English, Italian, France, Dutch, and German. So in here, we separate with this symbol. And this line is talking about the relationship. If you still remember about the explanation about relationship entity. So this one is the relationship or the ageist. Just run this one. Okay. And let's check it. Okay. So now our data is already in the Neo4j database. Check once again. Let's reload this. Okay. See? Now we have the company. Now we have educated ad, country industrial limits. All the data in from CSV is already imported in our Neo4j. Okay. Thanks, guys.



## 9. Building a Graph Chain
 Hey guys, now let's talk about the most interesting part of this video. How to operate the large language model to interact with our knowledge graph in Neoprene. So in here, we are using chat Google generative AI, Gemini Pro is open source, and then the API ensures the parameter that we have saved before in .elv file, and in here, we set the temporary to . The question is, why are we not set it to 0.5 or 0.9? The reason is because in this task, we need the LLM not to do a creative writing, but we needed to translate the natural language to a cyber query language. And then here, the chain with the one chain with graph cyber QA chain. The line chain has provided it's very easy to use. So the graph is the graph that we have defined in here, and the LLM is the model that we want to use, and for both, I set it to be close to true. What does it mean? Because I want to understand what happened beyond LLM. So run this, and then in here, we have several questions. I've already created a table that contains the pair of the question and the correct answer, so it becomes easier for us to check whether the answer, resided from the LLM, is correct or not. We want to know how it's performed, so let's run it.



## 10. Evaluation 1
 Okay, that's cool. Okay, so now let's talk about the result. So the first question, this will companies and advertising services industry. In here, it generates cyber and full contacts is said, and the result is toolbox creative, big advertising service enable. That's perfect. And the second question is, a worker you graduated from Cyberfresor University is currently employed at. Okay, I'm not sure why, but the general cyber query in here looks a little bit messy, and that's why there's a null in the context area. The result is I do not have the information we filmed here. And then the third question, where is POW looks working? Okay, so in here, the general cyber query is correct, and in the full contacts in here, toolbox creative is correct, but I don't know sure why, but then it's changing here as a result. I don't know the answer. The fourth question is, we see open here, which actually, if you remember about the schema of our database, the relationship, there's no spoken in in our database, right? So that's why it's null, and I don't have that information. The last one is, okay, the problem is the same, is native of. Actually, it's kind of huge needing about relationship, about the properties, about the entities, right? Because we don't have relationship is native of in our schema. So that's why it's also wrong. The answer is zero. The correct answer is one. Now, let's do a quick recap on the result that we have got before. So in here, we have two correct answer and three false answer from the LLM, right? And from this result, we can identify some problems. The first one is, not being able to accurately translate text into a cyber query. And the second one is, hallucinates properties, relationship, and entities. This one is one of the biggest problems of the LLM in-



## 11. Prompting Strategies
 generating a cyber query. So, what is the solution? We call it as prompting strategies. So, in short, we provide examples to our model to help it understand the structure correctly, similar to how pairs guide us when we learn to work, right? More examples makes the models learn and smarter. Now, let's jump to a code. So, the first thing that we need to do is to create pairs of questions and cyber query. So, I won't do it manually. Just copy this and paste the objectivity. Okay, once we got a result, so just copy and paste it to our Facebook, for example, in here. I want to copy these questions. And the most important part is to create text once again. If the general query from the JetsBD is already correct or not, we can do this by go to our Neo4j copy, paste, and text. Okay, good. We have this name, Pakusatya. So, if the general cyber query is correct, just do all that and copy and paste it here. The last thing that you need to do is to add places in here, because if you don't do it, you will not get it. I've already have this one, so I won't use it. I'll use mine to just run this. And in here, actually, LangChain already provide us with a free shot prompt template. And so, prompt template is just the prompt that we want to use, that we will use as a parameter in here. And the example, this is a sample that we have already divided in here. So, it will take the top three examples from it. And then the perfect self-ex, and here is the input variables, which is question and schema. We'll use the question and this, and the schema is here. So, run this one. This is an example of the prompt. Let's create our second chain. And check this one. All right.



## 12. Evaluation 2
 For the first question, the answer is correct. Toolbox Creative, Big Advertising Suit Into People's Tool. And for the second question, we get the correct answer which is Elastic Pet. And in here, we got Toolbox Creative. For the fourth question, Degenerative Cypher Quares is also correct and we get here Vitaly New Hint. And then for the last one, okay, we got the correct answer though which is one. So using the problem strategies will enhance our model. Right now, we have all correct answer.



## 13. Bonus ( How to Create a Dynamic Prompt? )
 Okay guys, our job is done, but as I've told you before, that I have a bonus for you. So, if you're still a member, we have this question. Where do Michael work? And the problem that we got is, which workers are friends. What industries are workers named animal associated with and which workers live in Canada and speak German? So I have a choice, there's no correlation, right? Between which workers speak friends and where do Michael work? And the reason is this line. So in here, we are taking top three from this list. In fact, we have more than three questions. So what's the point to just only take this top three? And that's why we need a dynamic problem. So to create a dynamic problem, we need a semantic similarity example selector. So basically, if you calculate which one is the closest between the question and the list that we have. So the first one that you need is the hacking phase embeddings and then the neo-forged factor. So previously, we are taking the top three questions from the examples list. Now, we will use the example selector to create a dynamic problem. If we run this. So, where do Michael work? What companies do workers named John work in? It makes sense. Where do workers named Alice live? And what industries are workers named Emily associated? So right now, the generative problem is more dynamic and we're not just the top.



## 14. Outro
 to read from our list.


In [15]:
from langchain_core.prompts.prompt import PromptTemplate

prompt_template = """ Your output should use the following template:

### Summary

### Analogy

### Notes

- [Emoji] Bulletpoint

### Keywords

- Explanation

You have been tasked with creating a concise summary of a YouTube video using its transcription.

Make a summary of the transcript.

Additionally make a short complex analogy to give context and/or analogy from day-to-day life from the transcript.

Create 10 bullet points (each with an appropriate emoji) that summarize the key points or important moments from the video's transcription.

In addition to the bullet points, extract the most important keywords and any complex words not known to the average reader aswell as any acronyms mentioned. For each keyword and complex word, provide an explanation and definition based on its occurrence in the transcription.

Please ensure that the summary, bullet points, and explanations fit within the 330-word limit, while still offering a comprehensive and clear understanding of the video's content. Use the text above:

Please, I need you to translate the answer into Portuguese!

```
{title}

{transcription}
```
"""

prompt = PromptTemplate(
    input_variables=["title", "transcription"], template=prompt_template
)


llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

chain = prompt | llm

In [17]:
summary = chain.invoke({"title": transcripter.title, "transcription": formatted_text})

In [18]:
Markdown(summary.content)

### Resumo

O vídeo apresenta um tutorial sobre como integrar um gráfico de conhecimento com modelos de linguagem de grande porte (LLMs) usando Python. O apresentador explica a teoria por trás dos gráficos de conhecimento, como configurar um banco de dados Neo4j e integrá-lo com a biblioteca LangChain. O vídeo também discute a importância dos gráficos de conhecimento para melhorar a precisão das respostas dos LLMs, demonstrando como eles podem identificar conexões em conjuntos de dados aparentemente desconectados.

### Analogia

Integrar um gráfico de conhecimento a um LLM é como usar um mapa detalhado para navegar em uma cidade desconhecida. Sem o mapa, você pode se perder em ruas e avenidas, mas com ele, você pode encontrar o caminho mais eficiente e descobrir conexões que não seriam visíveis à primeira vista.

### Notas

- 🧠 O que é um gráfico de conhecimento? É uma rede de entidades do mundo real e suas relações.
- 📊 Importância: Gráficos de conhecimento ajudam LLMs a responder perguntas complexas com mais precisão.
- 🔗 Conexões: Eles identificam links em conjuntos de dados desconectados, melhorando a compreensão do contexto.
- 💻 Configuração: O vídeo ensina a configurar um banco de dados Neo4j e a usar a linguagem Cypher.
- 📈 Resultados: Pesquisas mostram que o uso de gráficos de conhecimento aumenta a precisão das respostas dos LLMs.
- 🔄 Workflow: O processo envolve receber perguntas, gerar consultas Cypher e retornar resultados.
- 📚 Exemplos: O apresentador usa um conjunto de dados do LinkedIn para demonstrar a inserção e consulta de dados.
- ⚙️ Estratégias de Prompt: Exemplos ajudam os modelos a entender melhor a estrutura das consultas.
- 🔍 Avaliação: O vídeo mostra como avaliar a precisão das respostas geradas pelos LLMs.
- 🎁 Bônus: O apresentador oferece dicas sobre como criar prompts dinâmicos para melhorar a interação.

### Palavras-chave

- **Gráfico de Conhecimento**: Uma representação de entidades e suas relações no mundo real.
- **Neo4j**: Um banco de dados orientado a grafos que armazena dados em forma de nós e arestas.
- **Cypher**: A linguagem de consulta usada para interagir com o banco de dados Neo4j.
- **LLM (Modelo de Linguagem de Grande Porte)**: Modelos de inteligência artificial que processam e geram texto em linguagem natural.
- **LangChain**: Uma biblioteca que facilita a integração de LLMs com bancos de dados e outras fontes de dados.

Essas definições ajudam a entender os conceitos discutidos no vídeo e sua aplicação prática.

In [8]:
formatted_text

NameError: name 'formatted_text' is not defined

In [1]:
from openai import OpenAI

client = OpenAI()

stream = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")

This is a test.

In [None]:
messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
ai_msg = llm.invoke(messages)
ai_msg

### [Extra] - Informações e Metadadaos sobre vídeos do Youtube

In [3]:
import yt_dlp

def get_youtube_video_metadata(video_url):
    ydl_opts = {
        'skip_download': True,  # Não baixa o vídeo
        'extract_flat': True,   # Não extrai streams de mídia, só metadados
    }
    
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)

    return info_dict

In [4]:
video_metadata = get_youtube_video_metadata(VIDEO_URL)

[youtube] Extracting URL: https://www.youtube.com/watch?v=MnBV8zLq-_Y
[youtube] MnBV8zLq-_Y: Downloading webpage
[youtube] MnBV8zLq-_Y: Downloading ios player API JSON
[youtube] MnBV8zLq-_Y: Downloading web creator player API JSON
[youtube] MnBV8zLq-_Y: Downloading player 96d06116
[youtube] MnBV8zLq-_Y: Downloading m3u8 information


In [5]:
video_metadata.keys()

dict_keys(['id', 'title', 'formats', 'thumbnails', 'thumbnail', 'description', 'channel_id', 'channel_url', 'duration', 'view_count', 'average_rating', 'age_limit', 'webpage_url', 'categories', 'tags', 'playable_in_embed', 'live_status', 'release_timestamp', '_format_sort_fields', 'automatic_captions', 'subtitles', 'comment_count', 'chapters', 'heatmap', 'like_count', 'channel', 'channel_follower_count', 'uploader', 'uploader_id', 'uploader_url', 'upload_date', 'timestamp', 'availability', 'original_url', 'webpage_url_basename', 'webpage_url_domain', 'extractor', 'extractor_key', 'playlist', 'playlist_index', 'display_id', 'fulltitle', 'duration_string', 'release_year', 'is_live', 'was_live', 'requested_subtitles', '_has_drm', 'epoch', 'requested_formats', 'format', 'format_id', 'ext', 'protocol', 'language', 'format_note', 'filesize_approx', 'tbr', 'width', 'height', 'resolution', 'fps', 'dynamic_range', 'vcodec', 'vbr', 'stretched_ratio', 'aspect_ratio', 'acodec', 'abr', 'asr', 'audi

In [9]:
title = video_metadata["title"].lower().replace(" ","_")
language = video_metadata["language"]

print(f"title = {title}")
print(f"language = {language}")

title = ansible_vault
language = en


# Download audio from Youtube

In [10]:
%%time

output_filename = f"data/{title}_raw"

ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
    'outtmpl': f'{output_filename}.%(ext)s',
}

with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([VIDEO_URL])

[youtube] Extracting URL: https://www.youtube.com/watch?v=MnBV8zLq-_Y
[youtube] MnBV8zLq-_Y: Downloading webpage
[youtube] MnBV8zLq-_Y: Downloading ios player API JSON
[youtube] MnBV8zLq-_Y: Downloading web creator player API JSON
[youtube] MnBV8zLq-_Y: Downloading m3u8 information
[info] MnBV8zLq-_Y: Downloading 1 format(s): 251
[download] Destination: data/ansible_vault_raw.webm
[download] 100% of    5.05MiB in 00:00:04 at 1.02MiB/s   
[ExtractAudio] Destination: data/ansible_vault_raw.mp3
Deleting original file data/ansible_vault_raw.webm (pass -k to keep)
CPU times: user 409 ms, sys: 66.7 ms, total: 476 ms
Wall time: 12.1 s


As informações disponíveis são:

# Cortar audio

In [11]:
raw_filename = "data/raw_audio.mp3"
output_filename = "data/output_audio.mp3"
start_time = 0
end_time = 10


audio = AudioSegment.from_file(output_filename)

start_time_ms = start_time * 1000
end_time_ms = end_time * 1000
trimmed_audio = audio[start_time_ms:end_time_ms]
trimmed_audio.export(output_filename, format="mp3")
print(f"Áudio recortado salvo em: {output_filename}")

Audio(output_filename, autoplay=False)


FileNotFoundError: [Errno 2] No such file or directory: 'data/output_audio.mp3'

# Speech-to-Text

In [7]:
print(whisper.available_models())

['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large']


In [8]:
model = whisper.load_model("large").to(device)

In [24]:
%%time
audio_filename = f"data/{title}_raw.mp3"

result = model.transcribe(audio_filename, language=language)
print(result["text"])

 My name is Alex Dorjson. I'm an Ansible Solutions Specialist and today I'm going to be talking about Ansible Vault and what it can be used for. So first let's talk about what Ansible Vault is. Ansible Vault is just a command line utility that's installed as part of Ansible that provides a way for you to encrypt different variables and files. So now I don't have to worry about my sensitive data being out there in plain text. Many cases I see this used to protect different machine credentials, some of your hosts and group files, especially if I'm connecting to other utilities or APIs and I can use this to encrypt just individual strings or entire files. I'll personally say I generally use entire files rather than strings just because it's easier to maintain and easier to rekey if I need to. So important reminder, this is only designed to protect data at rest. So obviously if I'm trying to use this in a standard variable and I have a debug statement in my playbook I can still print out t

In [19]:
%%time
audio_filename = "data/raw_audio.mp3"

model = whisper.load_model("small").to(device)
result = model.transcribe(audio_filename, language="pt")
print(result["text"])

 Vamos pro jogo das frases, jogam um ande-marcão. Nesse jogo, vocês escreveram frases lá fora e elas estão nessa caixa. Cada um vai pegar duas frases e colocar no bolso. Durante a cena, eles tiram essa frase e lêm a frase, mas não basta ler. Tem que justificar a frase dentro da cena já começada. E a cena leva o título de Eu Queria Voar. E o jogo das frases começa agora. Jinoveva, volta pro poleiro! Eu sei por que você tá aí fora! Por quê? Eu vi você ontem à noite acordado olhando as corujas de um lado pro outro. Até os morcegos você ficou olhando. Os morcegos, eles parecem ratos, não sabe o que eles fazem. Eles vão, Jinoveva, eu sei. Eles vão... Aceitam como você é, Jinoveva, e outra coisa. Acho que esqueci o ferro ligado. Só um minuto. Isso, vai lá! Vai que o CIRAM vai queimar! Vai queimar tudo que... Agora... É que agora é a chocadeira, eu deixo o ferrinho do lado da chocadeira. Mais fácil do que eu ficar. Tem mais o que fazer. Rebeca, você tem que ter mais ambição. Bicou caovos e fi

In [7]:
%%time

# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio("data/output_audio.mp3")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions(language="pt")
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)

Detected language: pt
candidato Pablo Maisal, se a cidade tá mal, relaxe e vote Pablo Maisal. Pegou a 100 installership, se é uma corrida eleitoral
CPU times: user 2.3 s, sys: 8.69 ms, total: 2.31 s
Wall time: 2.09 s


You are a network graph maker who extracts terms and their relations from a given context. You are provided with a context chunk (delimited by ```) Your task is to extract the ontology of terms mentioned in the given context. These terms should represent the key concepts as per the context. 
Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.
	Terms may include object, entity, location, organization, person, 
	condition, acronym, documents, service, concept, etc.
	Terms should be as atomistic as possible

Thought 2: Think about how these terms can have one on one relation with other terms.
	Terms that are mentioned in the same sentence or the same paragraph are typically related to each other.
	Terms can be related to many other terms

Thought 3: Find out the relation between each such related pair of terms. 

Format your output as a list of json. Each element of the list contains a pair of termsand the relation between them, like the follwing: