In [2]:
!pip install -U transformers==4.48.3 -qqq

# Base de Dados

In [1]:
import pandas as pd

#  baixar arquivo de https://github.com/emdemor/News-of-the-Brazilian-Newspaper/blob/main/data/brazilian-news.parquet
df = pd.read_parquet("data/brazilian-news.parquet")


In [2]:
temp = df.sample(10000)
texts = temp["text"].to_list() + temp["title"].to_list()

texts = [x[:1000] for x in texts if x]

# Modelo




In [3]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, logging
from datasets import Dataset

torch.backends.cuda.matmul.allow_tf32 = True

# Desativa o paralelismo dos tokenizers para evitar avisos
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Ajusta a verbosidade para INFO para ver mais detalhes nos logs
logging.set_verbosity_info()

# 1. Criação de um corpus fictício com algumas frases em português
texts = [
    "O gato dorme no sofá.",
    "A inteligência artificial está transformando o mundo.",
    "Eu gosto de programar em Python.",
    "Este é um exemplo de corpus em português."
]

# Converte a lista de textos em um Dataset
dataset = Dataset.from_dict({"text": texts})

# 2. Carrega o tokenizador do modelo "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Função para tokenizar o dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=32)

# Aplica a tokenização no dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 3. Configura um data collator para treinamento com masked language modeling (MLM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# 4. Carrega o modelo "answerdotai/ModernBERT-base" para MLM
model = AutoModelForMaskedLM.from_pretrained(
    "answerdotai/ModernBERT-base",
    device_map="auto",   # ou device_map={"": "cuda:0"}
    torch_dtype=torch.float16  # ou o dtype desejado
)


# 5. Define os argumentos de treinamento com logging_steps para imprimir logs a cada 5 passos
training_args = TrainingArguments(
    output_dir="./ModernBERT-portuguese-adapted",
    overwrite_output_dir=True,
    num_train_epochs=10,             # Utilize mais épocas para um treinamento real
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,                # Log a cada 5 passos
    logging_dir="./logs",           # Diretório para os logs (opcional)
)

# 6. Configura o Trainer com o modelo, os argumentos, o dataset e o data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# 7. Inicia o treinamento; você verá os logs de avanço no console.
trainer.train()

# 8. Salva o modelo e o tokenizador adaptados
model.save_pretrained("./ModernBERT-portuguese-adapted")
tokenizer.save_pretrained("./ModernBERT-portuguese-adapted")


loading file tokenizer.json from cache at /root/.cache/huggingface/models/models--answerdotai--ModernBERT-base/snapshots/8949b909ec900327062f0ebf497f51aef5e6f0c8/tokenizer.json
loading file tokenizer.model from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/models/models--answerdotai--ModernBERT-base/snapshots/8949b909ec900327062f0ebf497f51aef5e6f0c8/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/models/models--answerdotai--ModernBERT-base/snapshots/8949b909ec900327062f0ebf497f51aef5e6f0c8/tokenizer_config.json
loading file chat_template.jinja from cache at None
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

loading configuration file config.json from cache at /root/.cache/huggingface/models/models--answerdotai--ModernBERT-base/snapshots/8949b909ec900327062f0ebf497f51aef5e6f0c8/config.json
Model config ModernBertConfig {
  "_name_or_path": "answerdotai/ModernBERT-base",
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  

Step,Training Loss
5,1.1368
10,0.0
15,0.0
20,0.0


Saving model checkpoint to ./ModernBERT-portuguese-adapted/checkpoint-10
Configuration saved in ./ModernBERT-portuguese-adapted/checkpoint-10/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/checkpoint-10/model.safetensors
Deleting older checkpoint [ModernBERT-portuguese-adapted/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./ModernBERT-portuguese-adapted/checkpoint-20
Configuration saved in ./ModernBERT-portuguese-adapted/checkpoint-20/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/checkpoint-20/model.safetensors
Deleting older checkpoint [ModernBERT-portuguese-adapted/checkpoint-20] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in ./ModernBERT-portuguese-adapted/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/model.safetensors
tokenizer config file saved in ./ModernBERT-portuguese-adapted/tokenizer_config.json
Speci

('./ModernBERT-portuguese-adapted/tokenizer_config.json',
 './ModernBERT-portuguese-adapted/special_tokens_map.json',
 './ModernBERT-portuguese-adapted/tokenizer.json')

In [4]:
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM
import torch

# Carrega o tokenizador e o modelo da pasta onde foram salvos
tokenizer = AutoTokenizer.from_pretrained("./ModernBERT-portuguese-adapted")
model = AutoModelForMaskedLM.from_pretrained("./ModernBERT-portuguese-adapted")
if torch.cuda.is_available():
    model.to("cuda")

# Cria um pipeline de "fill-mask" para a tarefa de preenchimento de máscara
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Exemplo de frase com token de máscara (geralmente [MASK])
sentence = "O gato [MASK] no sofá."

# Executa o pipeline para obter as predições para o token mascarado
results = fill_mask(sentence)

for result in results:
    print(f"Token: {result['token_str']} | Score: {result['score']:.4f}")


loading file tokenizer.json
loading file tokenizer.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
loading configuration file ./ModernBERT-portuguese-adapted/config.json
Model config ModernBertConfig {
  "_name_or_path": "./ModernBERT-portuguese-adapted",
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
 

Token: " | Score: nan
Token: # | Score: nan
Token: <|padding|> | Score: nan
Token: |||IP_ADDRESS||| | Score: nan
Token: ! | Score: nan


# Mudar o tokenizador

In [3]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, logging
from datasets import Dataset

torch.backends.cuda.matmul.allow_tf32 = True

os.environ["TOKENIZERS_PARALLELISM"] = "false"
logging.set_verbosity_info()



In [4]:
# # 1. Criação de um corpus fictício com algumas frases em português
# texts = [
#     "O gato dorme no sofá.",
#     "A inteligência artificial está transformando o mundo.",
#     "Eu gosto de programar em Python.",
#     "Este é um exemplo de corpus em português."
# ]

dataset = Dataset.from_dict({"text": texts})

In [5]:
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")

loading configuration file config.json from cache at /root/.cache/huggingface/models/models--neuralmind--bert-base-portuguese-cased/snapshots/94d69c95f98f7d5b2a8700c420230ae10def0baa/config.json
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_s

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=32)

In [3]:


tokenized_dataset = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=new_tokenizer, mlm=True, mlm_probability=0.15)

# 3. Carrega o modelo original para MLM
model = AutoModelForMaskedLM.from_pretrained(
    "answerdotai/ModernBERT-base",
    device_map="auto",
    torch_dtype=torch.float16
)

# 4. Redimensiona a camada de embeddings para o novo vocabulário
new_vocab_size = len(new_tokenizer)
model.resize_token_embeddings(new_vocab_size)

# 5. Garanta que o modelo esteja na GPU (movendo novamente após o redimensionamento)
model.to("cuda")

# 6. (Opcional) Ao tokenizar uma entrada para teste, mova os tensores para a GPU
example_text = "Este é um teste de adaptação de tokenizador."
inputs = new_tokenizer(example_text, return_tensors="pt").to("cuda")
outputs = model(**inputs)

print("Saída do modelo com o novo tokenizador:", outputs)


# 5. Define os argumentos de treinamento com logging_steps para imprimir logs a cada 5 passos
training_args = TrainingArguments(
    output_dir="./ModernBERT-portuguese-adapted",
    overwrite_output_dir=True,
    num_train_epochs=10,             # Utilize mais épocas para um treinamento real
    per_device_train_batch_size=2,
    learning_rate=0.00001,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,                # Log a cada 5 passos
    logging_dir="./logs",           # Diretório para os logs (opcional)
)

# 6. Configura o Trainer com o modelo, os argumentos, o dataset e o data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# 7. Inicia o treinamento; você verá os logs de avanço no console.
trainer.train()

# 8. Salva o modelo e o tokenizador adaptados
model.save_pretrained("./ModernBERT-portuguese-adapted")
new_tokenizer.save_pretrained("./ModernBERT-portuguese-adapted")

loading configuration file config.json from cache at /root/.cache/huggingface/models/models--neuralmind--bert-base-portuguese-cased/snapshots/94d69c95f98f7d5b2a8700c420230ae10def0baa/config.json
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_s

Map:   0%|          | 0/19955 [00:00<?, ? examples/s]

loading configuration file config.json from cache at /root/.cache/huggingface/models/models--answerdotai--ModernBERT-base/snapshots/8949b909ec900327062f0ebf497f51aef5e6f0c8/config.json
Model config ModernBertConfig {
  "_name_or_path": "answerdotai/ModernBERT-base",
  "architectures": [
    "ModernBertForMaskedLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 50281,
  "classifier_activation": "gelu",
  "classifier_bias": false,
  "classifier_dropout": 0.0,
  "classifier_pooling": "mean",
  "cls_token_id": 50281,
  "decoder_bias": true,
  "deterministic_flash_attn": false,
  "embedding_dropout": 0.0,
  "eos_token_id": 50282,
  "global_attn_every_n_layers": 3,
  "global_rope_theta": 160000.0,
  "gradient_checkpointing": false,
  "hidden_activation": "gelu",
  "hidden_size": 768,
  "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
  "intermediate_size": 1152,
  "layer_norm_eps": 1e-05,
  "local_attention": 128,
  "local_rope_theta": 10000.0,
  

Saída do modelo com o novo tokenizador: MaskedLMOutput(loss=None, logits=tensor([[[  4.3398,   1.9277,   6.0703,  ...,   0.6348,   2.3750,  -0.5469],
         [  3.0078,  -1.8711,   4.4531,  ...,   2.8477,   5.6719,  -0.8979],
         [  1.5713,  -0.5005,  -3.9258,  ...,  -1.5059,   5.1719,  -6.7070],
         ...,
         [-12.4062,  -1.8916,   6.3242,  ...,   1.7012,  13.8750,  -7.0977],
         [  8.8438,   2.2246,  10.6094,  ...,  -5.7734,   3.0801,  -1.6514],
         [  6.4961,   4.0195,  13.4062,  ...,  -1.3145,  -0.0541,  -7.0078]]],
       device='cuda:0', dtype=torch.float16, grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)


The following columns in the training set don't have a corresponding argument in `ModernBertForMaskedLM.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `ModernBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 19,955
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 99,780
  Number of trainable parameters = 133,833,826


Step,Training Loss
5,16.3853
10,0.0
15,0.0
20,0.0
25,0.0


Saving model checkpoint to ./ModernBERT-portuguese-adapted/checkpoint-10
Configuration saved in ./ModernBERT-portuguese-adapted/checkpoint-10/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/checkpoint-10/model.safetensors
Deleting older checkpoint [ModernBERT-portuguese-adapted/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./ModernBERT-portuguese-adapted/checkpoint-20
Configuration saved in ./ModernBERT-portuguese-adapted/checkpoint-20/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/checkpoint-20/model.safetensors
Deleting older checkpoint [ModernBERT-portuguese-adapted/checkpoint-20] due to args.save_total_limit
Saving model checkpoint to ./ModernBERT-portuguese-adapted/checkpoint-30
Configuration saved in ./ModernBERT-portuguese-adapted/checkpoint-30/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/checkpoint-30/model.safetensors


KeyboardInterrupt: 

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `ModernBertForMaskedLM.forward` and have been ignored: token_type_ids, text. If token_type_ids, text are not expected by `ModernBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 100
  Number of trainable parameters = 133,833,826


Step,Training Loss
5,15.6455
10,0.0
15,0.0
20,0.0
25,0.0
30,0.0
35,0.0
40,0.0
45,0.0
50,0.0


Saving model checkpoint to ./ModernBERT-portuguese-adapted/checkpoint-10
Configuration saved in ./ModernBERT-portuguese-adapted/checkpoint-10/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/checkpoint-10/model.safetensors
Deleting older checkpoint [ModernBERT-portuguese-adapted/checkpoint-10] due to args.save_total_limit
Saving model checkpoint to ./ModernBERT-portuguese-adapted/checkpoint-20
Configuration saved in ./ModernBERT-portuguese-adapted/checkpoint-20/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/checkpoint-20/model.safetensors
Deleting older checkpoint [ModernBERT-portuguese-adapted/checkpoint-20] due to args.save_total_limit
Saving model checkpoint to ./ModernBERT-portuguese-adapted/checkpoint-30
Configuration saved in ./ModernBERT-portuguese-adapted/checkpoint-30/config.json
Model weights saved in ./ModernBERT-portuguese-adapted/checkpoint-30/model.safetensors
Deleting older checkpoint [ModernBERT-portuguese-adapted/checkpoint-30] due

('./ModernBERT-portuguese-adapted/tokenizer_config.json',
 './ModernBERT-portuguese-adapted/special_tokens_map.json',
 './ModernBERT-portuguese-adapted/vocab.txt',
 './ModernBERT-portuguese-adapted/added_tokens.json',
 './ModernBERT-portuguese-adapted/tokenizer.json')

# ------ Tests

The documents are clustered into taxonomies and the corpus can be loaded in complete or taxonomy modes. To load a single taxonomy, it is possible to pass a code as a parameter to the loading script (see the example bellow). Codes are 3-letters string and possible values are:

dat : datasets and other corpora;

jud : judicial branch;

leg : legislative branch;

pub : public domain works;

soc : social media;

uni : university domains;

wik : wikis.

In [1]:
import random
from datasets import load_dataset

# https://huggingface.co/datasets/carolina-c4ai/corpus-carolina
# dataset = load_dataset("carolina-c4ai/corpus-carolina")
dataset = load_dataset("carolina-c4ai/corpus-carolina", taxonomy="wik")



all_texts = dataset["corpus"]["text"]
sample_texts = random.sample(all_texts, 10)


Downloading data:   0%|          | 0/193 [00:00<?, ?files/s]

Generating corpus split: 0 examples [00:00, ? examples/s]

In [None]:
sample_texts

In [5]:
import os
# Desativa o paralelismo do tokenizers para evitar os avisos
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
import torch

# 1. Criação de um corpus fictício com algumas frases em português
texts = [
    "O gato dorme no sofá.",
    "A inteligência artificial está transformando o mundo.",
    "Eu gosto de programar em Python.",
    "Este é um exemplo de corpus em português."
]

# Converte a lista de textos em um Dataset
dataset = Dataset.from_dict({"text": texts})

In [6]:

# 2. Carrega o tokenizador do modelo "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

# Função para tokenizar o dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=32)

# Aplica a tokenização no dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 3. Configura um data collator para treinamento com masked language modeling (MLM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# 4. Carrega o modelo "answerdotai/ModernBERT-base" para MLM
model = AutoModelForMaskedLM.from_pretrained("answerdotai/ModernBERT-base")

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [7]:
# Se houver GPU disponível, move o modelo para GPU (necessário para Flash Attention 2.0)
if torch.cuda.is_available():
    print("moving to cuda")
    model.to("cuda")

moving to cuda


In [8]:
# 5. Define os argumentos de treinamento
training_args = TrainingArguments(
    output_dir="./ModernBERT-portuguese-adapted",
    overwrite_output_dir=True,
    num_train_epochs=1,             # Utilize mais épocas em um cenário real
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
)

In [9]:
# 6. Configura o Trainer com o modelo, os argumentos, o dataset e o data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

In [10]:
# 7. Inicia o treinamento (continuação do pré-treinamento)
trainer.train()


# 8. Salva o modelo e o tokenizador adaptados
model.save_pretrained("./ModernBERT-portuguese-adapted")
tokenizer.save_pretrained("./ModernBERT-portuguese-adapted")



Step,Training Loss


('./ModernBERT-portuguese-adapted/tokenizer_config.json',
 './ModernBERT-portuguese-adapted/special_tokens_map.json',
 './ModernBERT-portuguese-adapted/tokenizer.json')

In [7]:
import whisper
import torch
import yt_dlp
import yt_dlp as youtube_dl
from pydub import AudioSegment
from IPython.display import Audio

device = "cuda" if torch.cuda.is_available() else "cpu"

In [168]:
from functools import lru_cache
import re

def remove_special_characters(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

@lru_cache(maxsize=128)
def get_youtube_video_metadata(video_url):
    ydl_opts = {
        'skip_download': True,  # Não baixa o vídeo
        'extract_flat': True,   # Não extrai streams de mídia, só metadados
    }
    
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(video_url, download=False)

    return info_dict



In [18]:
VIDEO_URL = "https://www.youtube.com/watch?v=vX3A96_F3FU"

# VIDEO_URL = "https://www.youtube.com/watch?v=evfLsSRtryk"


In [172]:
import json

class YoutubeTranscripter:
    CACHE_FOLDER = '/root/.cache/diskcache'
    def __init__(self, video_url, language=None):
        self._cache = dc.Cache(self.CACHE_FOLDER)
        self.video_url = video_url
        self.metadata = self._get_metadata()
        self.language = language if language else self.metadata["language"]
        self.title = self.metadata["title"]
        self.label = remove_special_characters(self.title.lower()).replace(" ","_")
        self.chapters = self.metadata["chapters"]

    def _get_metadata(self):
        index = ("metadata", self.video_url)
        if index not in self._cache:
            print(f"Download do vídeo: {index}")
            self._cache[index] = json.dumps(
                get_youtube_video_metadata(self.video_url),
                ensure_ascii=False
            )
        return json.loads(self._cache[index])

    def _download_audio(self):
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'outtmpl': f'{output_filename}.%(ext)s',
        }
        
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([VIDEO_URL])

    def clear_cache(self):
        self._cache.clear()
        print("Cache limpo.")


In [173]:
yt_transcriper = YoutubeTranscripter(VIDEO_URL)

In [161]:
sdf = yt_transcriper._get_metadata()

In [175]:
yt_transcriper.label

'graph_rag_improving_rag_with_knowledge_graphs'

In [131]:
# yt_transcriper._cache.get('metadata_https://www.youtube.com/watch?v=vX3A96_F3FU')

In [158]:
yt_transcriper.clear_cache()

Cache limpo.


In [166]:


clean_text = remove_special_characters(yt_transcriper.title.lower())
print(clean_text)  # Saída: "Olá Bemvindo ao mundo de Python 2024"


graph rag improving rag with knowledge graphs


In [None]:
output_filename = f"data/{title}_raw"

ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
    'outtmpl': f'{output_filename}.%(ext)s',
}

with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([VIDEO_URL])

In [66]:
yt_transcriper = YoutubeTranscripter(VIDEO_URL)

In [67]:
yt_transcriper.chapters

[{'start_time': 0.0,
  'title': 'Introduction to GraphRAG and Its Cost Issue',
  'end_time': 44.0},
 {'start_time': 44.0,
  'title': 'Understanding Traditional RAG',
  'end_time': 106.0},
 {'start_time': 106.0,
  'title': 'Limitations of Traditional RAG',
  'end_time': 142.0},
 {'start_time': 142.0, 'title': 'Introduction to GraphRAG', 'end_time': 159.0},
 {'start_time': 159.0,
  'title': 'Technical Details of GraphRAG',
  'end_time': 346.0},
 {'start_time': 346.0,
  'title': 'Setting Up GraphRAG on Your Local Machine',
  'end_time': 382.0},
 {'start_time': 382.0,
  'title': 'Running the Indexing Process',
  'end_time': 720.0},
 {'start_time': 720.0,
  'title': 'Running Queries with GraphRAG',
  'end_time': 866.0},
 {'start_time': 866.0,
  'title': 'Cost Implications and Alternatives',
  'end_time': 958}]

In [75]:
import diskcache as dc

cache = dc.Cache('./cache-directory')

def expensive_function(n):
    if ("expensive_function", n) in cache:
        return cache[("expensive_function", n)]
    
    result = n * n
    cache[("expensive_function", n)] = result
    return result

print(expensive_function(5))  # Calcula e armazena em cache
print(expensive_function(5))  # Recupera do cache


25
25


In [71]:
cache[("expensive_function", 4)]

KeyError: ('expensive_function', 4)

In [19]:
video_metadata = get_youtube_video_metadata(VIDEO_URL)

[youtube] Extracting URL: https://www.youtube.com/watch?v=vX3A96_F3FU
[youtube] vX3A96_F3FU: Downloading webpage
[youtube] vX3A96_F3FU: Downloading ios player API JSON
[youtube] vX3A96_F3FU: Downloading web creator player API JSON
[youtube] vX3A96_F3FU: Downloading m3u8 information


In [20]:
video_metadata["language"]

'en'

In [21]:
video_metadata["title"]

'Graph RAG: Improving RAG with Knowledge Graphs'

In [22]:
if video_metadata["chapters"]:
    _download_video()

In [57]:
# video_metadata["chapters"]

In [1]:
from absl import logging

# Inicializar o sistema de logs da Abseil
logging.set_verbosity(logging.ERROR)


In [1]:
# %%time
# !pip uninstall tensorflow -y
# !pip install tensorflow[and-cuda]

In [1]:
import sys
import os
import tensorflow as tf
from absl import logging

# Redirecionar STDERR temporariamente
sys.stderr = open(os.devnull, 'w')

# Inicializar o TensorFlow (ou qualquer código que gere os avisos)
physical_devices = tf.config.list_physical_devices('GPU')

# Restaurar STDERR
sys.stderr = sys.__stderr__

# Continuar o código normal
logging.set_verbosity(logging.ERROR)


I0000 00:00:1726462539.725237     341 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462539.748697     341 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462539.748960     341 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [5]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3946435560444492848
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14230618112
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2834319952901452859
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.9"
xla_global_id: 416903419
]


I0000 00:00:1726462255.734598      60 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462255.734887      60 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462255.735040      60 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462255.815505      60 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [6]:
import os
import tensorflow as tf

# Configurar TensorFlow para mostrar apenas erros
os.environ['TF_CPP_MIN_LOG_LEVEL']

'3'