In [1]:
# # !pip install pytube pydub
# !pip install -U youtube_dl
# !pip install -U yt-dlp==2024.8.6


In [1]:
import tempfile
import pathlib
import shutil
import whisper
import torch
import yt_dlp as youtube_dl
from pydub import AudioSegment
from IPython.display import Audio
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

In [85]:
print(whisper.available_models())

['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large', 'large-v3-turbo', 'turbo']


In [86]:
model = whisper.load_model('large-v3-turbo').to(device)

100%|█████████████████████████████████████| 1.51G/1.51G [17:38<00:00, 1.53MiB/s]


In [91]:
def cut_audio(path, start_time, end_time, output_path):
    audio = AudioSegment.from_file(path)
    start_time_ms = start_time * 1000
    end_time_ms = end_time * 1000
    trimmed_audio = audio[start_time_ms:end_time_ms]
    trimmed_audio.export(output_path, format="mp3")

class YoutubeTranscripter:
    def __init__(self, video_url, whisper_model):
        self.video_url = video_url
        self.whisper_model = whisper_model
        self.metadata = None
        self.title = None
        self.label = None
        self.language = None
        self.chapters = None
        self._temp_dir = tempfile.mkdtemp()
        self._raw_video_path = None
        self._video_parts_folderpath = None
        self._audios_to_transcribe = []

    def get_metadata(self):
        ydl_opts = {
            'skip_download': True,
            'extract_flat': True,
        }
        
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            self.metadata = ydl.extract_info(self.video_url, download=False)

        self.title = self.metadata["title"]
        self.label = self.metadata["title"].lower().replace(" ","_")
        self.language = self.metadata["language"]
        self.chapters = self.metadata["chapters"]


    def download(self):
        output_filename = f"{transcripter._temp_dir}/{self.label}"
        
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'outtmpl': f'{output_filename}.%(ext)s',
        }

        self._raw_video_path = f'{output_filename}.mp3'
        
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([VIDEO_URL])

    def prepare(self):
        if self.chapters:
            self._video_parts_folderpath = self._raw_video_path.replace(".mp3", "")
            pathlib.Path(self._video_parts_folderpath).mkdir(parents=True, exist_ok=True)
        
            for i, c in enumerate(self.chapters):
                print(f"[splitting] - Part {i+1} - {c['title']}")
                output_path = f"{self._video_parts_folderpath}/{str(i+1).zfill(4)}__{c['title']}__.mp3"
                cut_audio(self._raw_video_path, c["start_time"], c["end_time"], output_path)
                c.update({"audio_path": output_path})
                self._audios_to_transcribe.append(c)
        else:
            self._audios_to_transcribe.append(
                {
                    "title": self.title,
                    "audio_path": self._raw_video_path,
                }
            )

    def transcript(self):
        response = []
        for i, a in enumerate(self._audios_to_transcribe):
            print(f"[transcripting] - Part {i+1} - {a['audio_path']}")
            result = self.whisper_model.transcribe(a["audio_path"], language=self.language)
            a.update({"transcription": result})
            response.append(a)
        return response
            
    def cleanup(self):
        if self._temp_dir:
            shutil.rmtree(self._temp_dir)
            self._temp_dir = None
            self._raw_video_path = None
            self._video_parts_folderpath = None
            self._audios_to_transcribe = []
            print("Diretório temporário removido.")

    def run(self):
        self.get_metadata()
        self.download()
        self.prepare()
        response = self.transcript()
        self.cleanup()
        return response

# Transcrever

In [92]:
%%time

VIDEO_URL = "https://www.youtube.com/watch?v=UuxLY87MqwU"

transcripter = YoutubeTranscripter(VIDEO_URL, model)
response = transcripter.run()
# transcripter.get_metadata()
# transcripter.download()
# transcripter.prepare()
# transcripter.transcript()
# transcripter.cleanup()

[youtube] Extracting URL: https://www.youtube.com/watch?v=UuxLY87MqwU
[youtube] UuxLY87MqwU: Downloading webpage
[youtube] UuxLY87MqwU: Downloading ios player API JSON
[youtube] UuxLY87MqwU: Downloading web creator player API JSON
[youtube] UuxLY87MqwU: Downloading m3u8 information
[youtube] Extracting URL: https://www.youtube.com/watch?v=UuxLY87MqwU
[youtube] UuxLY87MqwU: Downloading webpage
[youtube] UuxLY87MqwU: Downloading ios player API JSON
[youtube] UuxLY87MqwU: Downloading web creator player API JSON
[youtube] UuxLY87MqwU: Downloading m3u8 information
[info] UuxLY87MqwU: Downloading 1 format(s): 140
[download] Destination: /tmp/tmpyp0ulyl_/how_to_build_a_knowledge_graph_from_scratch_even_if_you_are_not_really_a_full-blown_developer.m4a
[download] 100% of   38.19MiB in 00:00:25 at 1.47MiB/s     
[FixupM4a] Correcting container of "/tmp/tmpyp0ulyl_/how_to_build_a_knowledge_graph_from_scratch_even_if_you_are_not_really_a_full-blown_developer.m4a"
[ExtractAudio] Destination: /tmp/t

### Resultado

In [97]:
from IPython.display import Markdown
from textwrap import dedent

formatted_text = []
for i,r in enumerate(response):
    chapter_text = dedent(f"""
    ## {i+1}. {r['title']}
    {r['transcription']['text']}
    """)

    formatted_text.append(chapter_text)

formatted_text = "\n\n".join(formatted_text)

Markdown(formatted_text)


## 1. Intro
 here good morning good afternoon good evening depending on where you are in the world I'm happy to be able to talk to you a little bit about a knowledge graph that I have put together over the last few years actually from scratch and what I actually meant with that is that I didn't have the luxury to go out and download data from some other source and represented in a graph a lot of the data that you'll see is actually mostly manually entered as well as same automatically entered into into the database and I'm not a really a full-blown developer which means I don't write code on a daily basis as my job but I do write scripts to automate certain things and that has also be done in my example here so a little bit about myself I grew up in Austria and initially started to study my molecular microbiology then switched over to molecular biology decided to get my PhD in Switzerland and then had an opportunity to move to California to do some postdoc time and during that time I decided I wanted to kind of combine biology with computer technology and particular databases and that ended up lending a job in information technology and I've spent the last 25 or so years in that field in particular in the biotechnology space where I got to learn a lot about drug discovery and the data that is involved in there I'm very excited about interactive visualizations worked a lot with the traditional BI tools and love to see data that can be interacted with that are not just static charts that you see on the screen but you can actually see what's behind the chart and for fun I like to drive fast cars and and put the color rally cars if I have the opportunity to do that so how did I get in



## 2. What is knowledge and how do we capture it?
 to Neo4j. It really started with my interest in cancer research, oncology, and in particular because I had a brush with cancer myself, which fortunately turned out to be not serious. But I also have friends that are affected by it. And I've also seen idols in the business being affected by cancer. It's a very non-discriminating disease and it basically can affect anybody at any point in time. And there have been good improvements in treatment of cancer over the years, but there's still a lot of work to be done. So I wanted to learn more about the space, the domain that is involved in cancer research, the companies, what are they working on, what kind of types of cancer they're working on, what kind of molecules they're developing. And so I thought, okay, as I learn about this, how do I capture this knowledge? Because I can't keep everything in my head. I need some kind of database to do that. And initially, relational database came into the mix. And I thought, well, as we've seen from other presentations, usually you start off with a schema that is very fixed. So you have to know everything up front and then you populate the database. So it's not very flexible. So I was looking for something more flexible. And so I came across the idea of node relationship node or subject assertion object. And I was really excited about this idea and thought, okay, this is the way I want to implement a database to capture this knowledge about cancer growth discovery. So I found this slide actually on a presentation from Alessandro Negri from GraphAware. And I thought this actually captures very nicely how one could capture knowledge in a graph. You start off with the data that's just out there. These are the things that we care about in a particular domain. And then we start labeling it to give it some information. David Vogelpohl- The next step would be to say, well, if I have these individual things, how are they connected with each other? And how can I represent this? How can I define the way they're connected with each other? And that leads you to kind of knowledge in a knowledge graph. And ideally, that leads you to additional insight. And then last but not least, wisdom, we can debate about the wisdom, but definitely, you can get the insights through this connected data landscape.



## 3. The Initial Idea
 So the initial idea that I had, and I represented this in the graph just a few years back, labeled competitive intelligence and cancer drug discovery, was that in this space we have lots of different companies, small companies, startups, medium-sized, and very large companies, the pharma giants. And all of these, they're working on what's called therapeutic molecules. These are the drugs that they're marketing in the end when they are approved to treat a particular disease. These therapeutic molecules interact typically with a molecular target. This molecular target is a protein inside your body that has a critical role in the disease process. And the case that I'm interested in here is in driving cancer. And you want to influence that with the therapeutic molecule, with the drug, in a way that obviously benefits the patient, either stops the cancer or eliminates it altogether. So I thought, well, there must be some network in this. And when you look at the bottom right, I kind of show you the basic network that exists there, where you have a company that builds a molecule or designs a molecule that acts on a molecular target. But then there are other companies that work on a different molecule, but they act on the same target. So you can create then sort of a network of companies that work on the same molecular target and treat the same disease or sometimes also different diseases. So it's a start of a network, but it's by no means the end of it.



## 4. The Drug Discovery Process
 When I started to look into the whole drug discovery process that's outlined in these different boxes here, there are very many things involved in bringing a drug to the market. And so the initial idea that I had was really at the very beginning here where I said, okay, target a molecule, then we obviously have to have a disease. The molecules go through some steps along the way to optimize and identify a single molecule that will be pushed through this drug discovery process that is then safe for entering clinical trials. And then the clinical trials themselves are going through various phases before an approval could happen that brings the drug on the market that you can then as a patient receive to treat the particular disease. So looking at this space, I started with this simple model, but as you can guess, this is going to increase in terms of node types very rapidly. And it certainly did. So just as a reminder, why cancer, the statistics are still pretty abysmal when we look at how many deaths are predicted, even in 2019, taken from the American Cancer Society. So there's definitely still lots of work to be done and we need to see how we can support the companies that are actually bringing these drugs to the market. And it costs a lot of money as well to move this forward.



## 5. CanDIG - Answering simple and complex questions
 So, looking at the knowledge graph idea, I wanted to have an environment that allows me to answer simple as well as complex questions. So I wanted to quickly see, well, if I have come across a company in the past, can I look it up by name, for instance? Or a molecule, can I find it by its name? Or any aliases or alternate names that this molecule might be known by? But I also wanted to ask some more complex questions. For instance, well, which of these molecules interact with a specific molecular target or a target family? What is the drug pipeline, for instance, for a specific company? Which of the molecules are they developing themselves? Which of the molecules are they developing in collaboration with others? Then which companies are working on therapeutic molecules that act on a specific molecular target? And which diseases are they targeting? As I mentioned, a molecule may be involved in a disease process, but this disease process isn't necessarily isolated. It may involve multiple parts in a human body. And one molecule could be used in treating multiple diseases. Last example here. Which therapeutic molecules are in apothecarylophase in a clinical trial that is targeting breast cancer, irrespective of a subtype. So there's got to be an opportunity to consume taxonomies of a disease, for instance, to get you the right answer. So with all these questions in mind, I definitely wanted a flexible system. program a can have been



## 6. Simple Data Entry Forms for Concepts
 Well, easier said than done. When I started, there was really nothing. And I resorted to something that I was familiar with that had a decent scripting language and a good framework to build a UI. And I created some input screens that you see here to enter the initial concepts about companies and the basic information that I wanted to know, like such as a name, a description, maybe an image, and then potential link outs to other information that I didn't want to capture directly in my knowledge graph. And I wanted to also have the option to have a minimal entry possible because sometimes all I knew was the name of something and then that would be connected to other things.



## 7. Named Relationships follow
 So then I needed also relationships. I needed to figure out how to connect a molecule as shown in the example on the left hand side labeled subjects with an object, with a company in this case, through a named relationship. So I defined my relationships such as developed by company or has location in city. So I can use that also as a potential filtering mechanism in the future once I have a knowledge graph in place. And in addition, there were some link outs to other source information and buttons for a quick lookup.



## 8. Connect different Concepts via Named Relationships
 Of course, a molecule is then connected not just to a company, but it's also connected to a lot of other information that one would like to know about a particular drug molecule in a pipeline, such as the highest clinical trial phase that this molecule might be in, what it potentially is indicated for, in this case, pancreatic cancer, what type of molecule it is. If you're familiar with this space, traditionally a drug was just a small molecule, a chemical that you consumed in a pill form. But the biotech industry started to bring proteins that typically had to be injected into the mix. And now the design has gone much further where proteins and small molecules are combined in quite sophisticated structures. And sometimes we even have cells that are now used as drugs. So one couldn't necessarily even say therapeutic molecule and would have to be changed to therapeutic agent, which is something that I actually would like to do on my knowledge graph in the future. IZ works for a moment and the future. Thanks, David. Thanks, people! Thanks, everyone! Scientist says, isingo the effect of what Azure Brand is the driver of the engine and so the purpose of theского government profiling? Would you like to accept videoiegenINGS from the vehicle that I really have an apostle from the developer? Recypable LuPおся exhilarate. Can you suggest that Alibas, could you P m, the spot that allows you to design out a giant art situation this morning. But there's a greatlusion step on the narrative of the Internet. Sometimes I really Rashomon in other comercializo. Thanks for joining us, please be if you don't like to be seen in aλαWEI This is the key, which I really want to be seen during the world's So there will be an�� 포인 gang.



## 9. Content domains in the graph
 So if you're familiar with Neo4j, this will remind you, obviously, of the property graph model. You have nodes with properties. You have relationships with properties. And I had designed this in FileMaker Pro initially by setting up two tables, one for subjects, one for objects, and then another table that connects everything together through relationships. And I didn't know Neo4j at the time when this started, but it allowed me to collect some data over time. And then I came across Neo4j and I said, well, this is exactly what I need to continue on this idea. And as I started out with a simple set of concepts, it obviously grew over time and it is still growing. This slide shows you some examples of these additional concepts where I started to get into the actual product that a company sells. I got into some of the financials that gives me a perspective of how profitable are some of these drugs. And then I also wanted to get into the biology and the chemistry more and started adding bioprocess or biological structure in addition to the target, the molecule and the disease. So the graph is still growing as it is, as more data becomes available and as additional questions may come to the foreground.unn goero. Okay. any right to mention that you can see in this program will be the exact same looks here as you see, and how this Speakers may be built into a spectrum of Maßnahmen to improve habits, and how it affects kids with primitives. Shllofi has the CHOC PMZE.alarany. GU consoles and stuff playing frog attacks, you're available to him for other times how it adds to the dangers of significance, to physical progress, and seems to be a game ofexcputation at意思 level and unraveling the plante reading. I've asked my specific suggestions on the הא�emian and I wrote myация EGM Abondo Al novemberoj



## 10. Semi-automated ETL with scripts
 So, I said I didn't have Neo4j. I also didn't want to enter all the data manually. So one piece that I looked into was to get data more automatically. And I looked into clinicaltrials.gov, which lists all the companies that are actually developing drugs and enter them into human trials. And I'd set up a script that allowed me to semi-automate and extract transform load from that database on a weekly basis. So I do this actually, weekly extract the data from clinical trials, and then go through this process by looking up cancer-related entries and assign them. There's a lot of stuff to it as well. On the lower left in the slide here, you see some of the issues that are actually popping up, which is the terminology that is used to describe a disease or the drug is not very strict in this database. So if you really want to harmonize some of that, it requires some manual intervention in this ETL process to establish the proper connections. And that is a process that I go through. It's quite labor-intensive. And you also have to have the knowledge at times to make the right decisions to bring this data together. So just a little bit about what's in the database. So this database is not large. This is not about big data, but it's about highly connected data. So at this point, I got up to about 86 different types of data, different types of labels. I have 160 relationship types that connect these different node types together, 16,000 nodes and 89,000 relationships. The majority, as you can see, is clinical trial data with lots of dates associated with it. Then we have the therapeutic molecules, which are kind of at the center of this graph, the molecular target, and the company, the organization, where I mostly have companies, but I also have some data about some research organizations as well. So I had the data. I had this in the FileMaker database. But how do I get it out into Neo4j? I resorted to a pipelining tool called KNIME, an open source tool, which is actually quite great to read in files, then process it, manipulate it somehow, and then potentially write files again. And so I did export data out of FileMaker, then did some transformation in there to create load CSV files that could be used in a script that I would run to get the data into the Neo4j database. I started off with a standalone database, community edition initially, and then when Neo4j offered the opportunity to get into Neo4j desktop, I started switching over to use desktop to store the knowledge graph. The load process, I wrote a script in sublime text to initially just define some simple constraints. And the only constraints that I have is that I have some unique identifiers for each node type, and I have a uniqueness constraint on the name to make sure that I avoid duplicates, because that is something that you can come across quite often, especially when you're dealing with things like molecules that have many different names throughout the development process. But I wanted to represent it only once in the database, not 15 different ways, just because they have a different name. Then I loaded the nodes first, very simple scripts here, followed by the relationships, which I put together by looking up the nodes initially, and then assigning a particular relationship based on the value that I had extracted from the FileMaker database. And last but not least, I also looked into using Cypher to add some content into the data based on the data that was actually there, such as associating, for instance, a company to a therapeutic area, a particular disease area, that they might be working on based on the type of molecule and the disease that the molecule is supposed to address. And that all went into one file and one Cypher statement and that is used.



## 11. CanDIG Technology Stack
 to basically load the database. So the technology stack that I've been dealing with to build all this is FileMaker Pro Advanced to enable me to write scripts and capture the data initially, the ETL process through KNIME and a load script, getting it into Neo4j. And then I had the ability to write queries through Cypher that lets me actually answer the questions that I set out in the very beginning to this knowledge graph. In addition now, it also offers the opportunity to write, create reports through APOC, through the awesome procedures on Cypher that allows me to put those into a visualization tool such as Tableau to maybe create a dashboard. I could create a GraphQL API. I tried a few examples of those and I'm very excited to get more into this to maybe put a web application on top of it. And I also can visualize this data in Neo4j Bloom and I'd like to get to that actually in a few minutes to show you this life if the demo gods are with us here. And then last but not least, we have the GraphAlgorithms playground that one could use to do GraphAlgorithms.



## 12. Cypher query examples
 Okay, so here are some Cypher query examples. One example, for instance, for some real-time queries, if I wanted to know what is the pipeline of a particular company. The thing I want to point out here is that because not all the data is always available, I had to use quite frequently an optional match phrase here to say, well, if it's there, show me the data. If it's not there, then obviously still show me the original molecule that I wanted to see, even if it doesn't have anything else other than maybe just a name. So that flexibility is really important in this context. The second part here, we have some APOC-generated reports. For instance, if I wanted to get all the data about molecular targets out of the database, I can use APOC to generate this report. And that is data that kind of goes into a visualization tool, such as Tableau. Now let's get to visualization, graph visualization. I was very excited when Neo4j developed Bloom and made it available for startups. And I jumped on it very quickly, ended up writing a little blog about this too. And rather than actually showing this slide here, I want to get into Bloom if I can, and show you some examples of how you can interactively work with the data. So then want toêtes for data. And also now design. And what's the data? What are the main Previously looking for us to find the eigen conventional object to our audience? Here Flight Number v眼 is an idea that there's an Julian Xu<|no|>政治, right? Well, there's an idea that a Darwin Vikeloda gw



## 13. High-level views into this graph landscape
 One example that I want to show here, and I'm not sure whether I can do it online here, is that Bloom doesn't just allow you to look up the individual entities in the knowledge graph, such as a company or a target or a molecule. You can also do these global visualizations that show you, for instance, like on the left-hand side here, companies and therapeutic molecules, which gives you a perspective of how highly connected some of these areas are in the space of big companies that are collaborating with each other, working on the same molecule. And then you have such a halo around these companies, which represents the newer companies, the startups that are just getting into this business and trying to make a mark in that. Similar with therapeutic molecules and molecular targets, the centers here highlight those that are heavily worked on. Those are targets that are so-called validated that many companies then try to develop drugs for, whereas the novel things are on the bottom side here, which don't have too many connections. Those are the most common ones. So let's see demo time. Let's see whether I can share my screen. All right. Okay, so if I go here, I hope people can see my screen. Can you confirm, Karin? Karin? Yes, we can see it. Okay. Perfect. Perfect. Perfect. All right. So here I just wanted to show an example of a molecular target that actually has been in the press quite frequently recently because it is a cancer oncogene that has been very resistant to being targeted by a drug. And so in this case, I have basically a representation of the molecular target. One can double click on it, get it basic information. I don't have much here other than there are many alternate names for the same target, which is sort of a common problem in biology that we have many names for the same thing. And sometimes the same name for different things. So in this case, you can also see then the relationships, which is really where the interesting part comes into the mix. I have these various named relationships that define the relationships to other things. And you can also look up the neighbors directly. So in this case, you see that I have connections to 19 therapeutic molecules, one event, one source reference, one company directly, and one molecular class. So in this case, what I can do is I can highlight all of this and reveal this in the graph, which then lets me further explore what I would like to be, what I might be interested in. I might want to know, well, why do I have revolution medicines on there? What is the GTPAs? Are there other ones? Or what do we know about the molecular molecule here? A typical question, which unfortunately I don't have a good solution for so far is I would love to just highlight all of these molecules that are connected to this target and then say, okay, give me just the companies that these targets are being developed by. But I cannot yet do that. So that's kind of a hint for the Bloom team to see, is that a possibility maybe in the future? What I can do is I can expand this, but then we get into sort of a typical graph problem where you might get a hairball, which is a little bit more difficult to decipher. But Bloom gives you still some tools that you can untangle this graph where I can basically say, okay, I don't care about this. I don't care about that, at least at this point in time. And I can quickly just deselect these examples and say, okay, dismiss all of this and then just leave me what might be interesting for me, which are the molecules and the individual companies. And then I have information about the companies laid up here, including also a hyperlink that lets you then get into more information outside of the context of the knowledge graph. And I wanted to say thank you for enabling those hyperlinks because I think that's a very important feature in sort of the graph visualization tool set. Well, let's see how much time I have. I have a little bit of time here. You have 12 minutes total. Okay. Let me show you. You have more than that. You have like 17 minutes. Okay. All right. Something maybe quickly to showcase as well. I prepared this already beforehand. So, I'm going to go ahead and see how much time I have. So, one can use also the Neo4j browser to look at the structure of the knowledge graph. You can do DB schema, but unfortunately that gives you a hairball. So, you have to be more specific and use here the APARC meta sub graph to show just the relationships between company therapeutic molecule and molecular target. So, a similar example here, for instance, for clinical trials, where I have the clinical trial here that's connected to many different dates that come with the clinical trial record. But then also two therapeutic molecule companies, research institutions, disease and development stage. I was actually wrong about the time. There's like 10 minutes left. But, you can continue, keep going. And anybody who wants to stay can stay for the Q&A, all that stuff. I might just have to jump off in 10 minutes. Okay. Okay. Now, I'll do that quickly. I just want to point out. Because you do have questions in Q&A for sure. But, go keep going what you're doing because this is interesting. I think I can speak for everyone when I say that. I just want to point out. So, I wanted to show briefly that I can run these different queries that I set out in the beginning that were my goal to answer specific questions. And, also want to highlight that in some cases, you don't necessarily just want the graph, but you want a tabular representation of the data, which you can certainly get through running these queries or reports, as I have shown earlier. So, I just want to jump back quickly since I know I am out on time here. And, okay, where are my slides here? I hope this one was the last slide here. Let's take this. Okay. Okay. I think.



## 14. Feeding interactive visualization dashboards
 I have a couple. I just wanted to. Ah, OK. Last one, interactive visualization dashboards can also be generated by this here. This is, for instance, some financial data that I get out of the database here showing the profits that companies can make with these drugs and bring them in perspective when you look at multiple different drugs together. That shows you then what are the real big moneymakers versus what maybe doesn't address such a big market size. And then last but not least, everybody knows now that the world is highly connected. If you're interested in this knowledge domain, connect with me. Everything starts out simple and quickly gets complex. So try to retain simplicity wherever it's possible. And when you're presenting about knowledge graph, also think about non-technical audience that's just interested in the knowledge, not necessarily how you got there. Interactive graph visualizations, I think, are a key to enable data explorations of such a knowledge space. And the graphs really opened the door for me to capturing, learning, understanding, and interrogating complex data landscapes to gain additional insights. And I just want to say thank you to everybody who makes open source tools available to a broad audience so they can explore these new opportunities, in particular Neo4j and also NIME. And if you want to explore that visual interrogation a little bit further, I put a blog out there on medium. And then last but not least, we want to get to the Hunger Games questions as well, before we forget that. And if you want to answer those, go to this address here and address them. I'll take a look at the Q&A section to see what questions are. I'm converting some of the messages from the chat to the Q&A. If that way, you can just go right through them and click answer. Okay, okay. Just go through them. All right. I'll try to address. It's my first knowledge graph. So I don't know whether that's what people think about a knowledge graph or knowledge base. For me, that was a knowledge graph. The results of clinical trials, potentially, yes, they are of interest. And at the moment, I try to capture positive or negative results as part of an event in the graph. It's a very good question, difficult to address. Interfacing NIME with Neo4j. Honestly, I don't directly interface it at this point in time. I basically just export files out of NIME in a format that Neo4j can consume then through a script. So NIME is a data pipelining tool that allows you to read in data from a file or from a database, manipulate the data, convert it in many different ways, and then output it again into files or another database. I haven't tried connecting to Neo4j directly. Something I would certainly be interested in. Who uses Candig at this point in time? It is still, as I look at it, as a prototype. So I'm the primary user. I'm trying to expose it to a few scientists, biologists that might have an interest in it. And now that I have Neo4j Bloom available, I can actually do that. Because before, no biologist is really going to run cipher queries until, unless they're really, really interested in this domain. So Bloom opens the door to that possibility. Clinical trials definitely is providing volume of the data. But then I'm also reading a lot of trade publications such as First Biotech or InPoint News that provide information about novel companies that are just coming on the marketplace that are really new. And I wanted to track them as early as possible. And the link I can share in the chat box. Here we are at my next day. .



## 15. Closing Remarks
 All right, so let's see. Can I do that? I might have to do that differently. I can't get to that link directly here. Sorry. Yeah, the last one about named entity recognition tools. I would love to get into NLP. There's a lot of content that could be looked at actually in what I store in the database, and you could create more tags. I haven't had the time yet. So this has been sort of a side project of mine to get into graphs and to see what you can actually accomplish with it, what answers you can provide. And, yeah, I wish I had more time and hands available to address those. I'd be interested in collaborating. I'd be interested who would be interested in this kind of data and what they would want to do with it. So, yes, please reach out to me via LinkedIn or Twitter. That would be great. All right. Okay, perfect. Thank you, Justin. Appreciate posting the link to the Bloom presentation. I think everything is going to be recorded. I don't know exactly how people can get to the recording after this is over, but I'm pretty sure we'll find out from Neo4j where you can find it. And I'm excited. The power didn't go out in Southern California so far. So I guess the wind must not be blowing that hard so far. Make sure you answer the questions for the Hunger Games. All right. Thank you, everyone, for joining in. I hope that was interesting content and topic for you. Let me know if there are any questions. We're happy to get back to you. Thank you.


### [Extra] - Informações e Metadadaos sobre vídeos do Youtube

In [3]:
import yt_dlp

def get_youtube_video_metadata(video_url):
    ydl_opts = {
        'skip_download': True,  # Não baixa o vídeo
        'extract_flat': True,   # Não extrai streams de mídia, só metadados
    }
    
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)

    return info_dict

In [4]:
video_metadata = get_youtube_video_metadata(VIDEO_URL)

[youtube] Extracting URL: https://www.youtube.com/watch?v=MnBV8zLq-_Y
[youtube] MnBV8zLq-_Y: Downloading webpage
[youtube] MnBV8zLq-_Y: Downloading ios player API JSON
[youtube] MnBV8zLq-_Y: Downloading web creator player API JSON
[youtube] MnBV8zLq-_Y: Downloading player 96d06116
[youtube] MnBV8zLq-_Y: Downloading m3u8 information


In [5]:
video_metadata.keys()

dict_keys(['id', 'title', 'formats', 'thumbnails', 'thumbnail', 'description', 'channel_id', 'channel_url', 'duration', 'view_count', 'average_rating', 'age_limit', 'webpage_url', 'categories', 'tags', 'playable_in_embed', 'live_status', 'release_timestamp', '_format_sort_fields', 'automatic_captions', 'subtitles', 'comment_count', 'chapters', 'heatmap', 'like_count', 'channel', 'channel_follower_count', 'uploader', 'uploader_id', 'uploader_url', 'upload_date', 'timestamp', 'availability', 'original_url', 'webpage_url_basename', 'webpage_url_domain', 'extractor', 'extractor_key', 'playlist', 'playlist_index', 'display_id', 'fulltitle', 'duration_string', 'release_year', 'is_live', 'was_live', 'requested_subtitles', '_has_drm', 'epoch', 'requested_formats', 'format', 'format_id', 'ext', 'protocol', 'language', 'format_note', 'filesize_approx', 'tbr', 'width', 'height', 'resolution', 'fps', 'dynamic_range', 'vcodec', 'vbr', 'stretched_ratio', 'aspect_ratio', 'acodec', 'abr', 'asr', 'audi

In [9]:
title = video_metadata["title"].lower().replace(" ","_")
language = video_metadata["language"]

print(f"title = {title}")
print(f"language = {language}")

title = ansible_vault
language = en


# Download audio from Youtube

In [10]:
%%time

output_filename = f"data/{title}_raw"

ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
    'outtmpl': f'{output_filename}.%(ext)s',
}

with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([VIDEO_URL])

[youtube] Extracting URL: https://www.youtube.com/watch?v=MnBV8zLq-_Y
[youtube] MnBV8zLq-_Y: Downloading webpage
[youtube] MnBV8zLq-_Y: Downloading ios player API JSON
[youtube] MnBV8zLq-_Y: Downloading web creator player API JSON
[youtube] MnBV8zLq-_Y: Downloading m3u8 information
[info] MnBV8zLq-_Y: Downloading 1 format(s): 251
[download] Destination: data/ansible_vault_raw.webm
[download] 100% of    5.05MiB in 00:00:04 at 1.02MiB/s   
[ExtractAudio] Destination: data/ansible_vault_raw.mp3
Deleting original file data/ansible_vault_raw.webm (pass -k to keep)
CPU times: user 409 ms, sys: 66.7 ms, total: 476 ms
Wall time: 12.1 s


As informações disponíveis são:

# Cortar audio

In [11]:
raw_filename = "data/raw_audio.mp3"
output_filename = "data/output_audio.mp3"
start_time = 0
end_time = 10


audio = AudioSegment.from_file(output_filename)

start_time_ms = start_time * 1000
end_time_ms = end_time * 1000
trimmed_audio = audio[start_time_ms:end_time_ms]
trimmed_audio.export(output_filename, format="mp3")
print(f"Áudio recortado salvo em: {output_filename}")

Audio(output_filename, autoplay=False)


FileNotFoundError: [Errno 2] No such file or directory: 'data/output_audio.mp3'

# Speech-to-Text

In [7]:
print(whisper.available_models())

['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3', 'large']


In [8]:
model = whisper.load_model("large").to(device)

In [24]:
%%time
audio_filename = f"data/{title}_raw.mp3"

result = model.transcribe(audio_filename, language=language)
print(result["text"])

 My name is Alex Dorjson. I'm an Ansible Solutions Specialist and today I'm going to be talking about Ansible Vault and what it can be used for. So first let's talk about what Ansible Vault is. Ansible Vault is just a command line utility that's installed as part of Ansible that provides a way for you to encrypt different variables and files. So now I don't have to worry about my sensitive data being out there in plain text. Many cases I see this used to protect different machine credentials, some of your hosts and group files, especially if I'm connecting to other utilities or APIs and I can use this to encrypt just individual strings or entire files. I'll personally say I generally use entire files rather than strings just because it's easier to maintain and easier to rekey if I need to. So important reminder, this is only designed to protect data at rest. So obviously if I'm trying to use this in a standard variable and I have a debug statement in my playbook I can still print out t

In [19]:
%%time
audio_filename = "data/raw_audio.mp3"

model = whisper.load_model("small").to(device)
result = model.transcribe(audio_filename, language="pt")
print(result["text"])

 Vamos pro jogo das frases, jogam um ande-marcão. Nesse jogo, vocês escreveram frases lá fora e elas estão nessa caixa. Cada um vai pegar duas frases e colocar no bolso. Durante a cena, eles tiram essa frase e lêm a frase, mas não basta ler. Tem que justificar a frase dentro da cena já começada. E a cena leva o título de Eu Queria Voar. E o jogo das frases começa agora. Jinoveva, volta pro poleiro! Eu sei por que você tá aí fora! Por quê? Eu vi você ontem à noite acordado olhando as corujas de um lado pro outro. Até os morcegos você ficou olhando. Os morcegos, eles parecem ratos, não sabe o que eles fazem. Eles vão, Jinoveva, eu sei. Eles vão... Aceitam como você é, Jinoveva, e outra coisa. Acho que esqueci o ferro ligado. Só um minuto. Isso, vai lá! Vai que o CIRAM vai queimar! Vai queimar tudo que... Agora... É que agora é a chocadeira, eu deixo o ferrinho do lado da chocadeira. Mais fácil do que eu ficar. Tem mais o que fazer. Rebeca, você tem que ter mais ambição. Bicou caovos e fi

In [7]:
%%time

# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio("data/output_audio.mp3")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions(language="pt")
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)

Detected language: pt
candidato Pablo Maisal, se a cidade tá mal, relaxe e vote Pablo Maisal. Pegou a 100 installership, se é uma corrida eleitoral
CPU times: user 2.3 s, sys: 8.69 ms, total: 2.31 s
Wall time: 2.09 s
