In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset

# 1. Build a small toy corpus made of a few Portuguese sentences
texts = [
    "O gato dorme no sofá.",
    "A inteligência artificial está transformando o mundo.",
    "Eu gosto de programar em Python.",
    "Este é um exemplo de corpus em português."
]




In [2]:

# Checkpoint shared by the tokenizer and the model, kept in one place so the
# two can never point at different checkpoints.
# NOTE(review): the ValueError recorded for this cell ("does not recognize this
# architecture") means the installed Transformers predates ModernBERT support;
# that support is reported to have landed in Transformers 4.48.0 — confirm and
# upgrade before re-running.
MODEL_NAME = "answerdotai/ModernBERT-base"
# Every example is truncated or padded to exactly this many tokens.
MAX_SEQ_LENGTH = 32
# Probability with which the collator masks input tokens.
MLM_PROBABILITY = 0.15

# Wrap the list of sentences in a Dataset with a single "text" column
dataset = Dataset.from_dict({"text": texts})

# 2. Load the tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples, max_length=MAX_SEQ_LENGTH):
    """Tokenize a batch of examples taken from the dataset.

    Args:
        examples: Batch mapping whose "text" entry holds the sentences.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )


# Apply the tokenization to the whole dataset, one batch at a time
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 3. Data collator for masked language modeling (MLM) training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=MLM_PROBABILITY,
)

# 4. Load the same checkpoint with a masked-language-modeling head
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

ValueError: The checkpoint you are trying to load has model type `modernbert` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date.

In [None]:

# Single source of truth for where checkpoints and the final artifacts are
# written; previously the same literal was repeated in three places.
OUTPUT_DIR = "./ModernBERT-portuguese-adapted"

# 5. Define the training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=1,             # Increase the number of epochs for a real training run
    per_device_train_batch_size=2,
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
)

# 6. Set up the Trainer with the model, the arguments, the dataset and the data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# 7. Start training (continued pre-training)
trainer.train()

# 8. Save the adapted model and tokenizer side by side in the output directory
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


In [7]:
import diskcache as dc
import torch
import whisper
import yt_dlp
import yt_dlp as youtube_dl
from IPython.display import Audio
from pydub import AudioSegment

# Use the GPU when PyTorch reports an available CUDA device; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [168]:
from functools import lru_cache
import re

def remove_special_characters(text):
    """Remove punctuation and symbols from ``text``.

    Letters, digits and whitespace are kept. Matching is Unicode-aware, so
    accented letters such as "á" or "ç" survive (the previous ASCII-only
    character class silently dropped them). Underscores are removed like any
    other symbol, as before.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every character that is not a letter, digit or
        whitespace removed.
    """
    # ``\w`` also matches "_", which must still be stripped, hence the
    # explicit second alternative.
    return re.sub(r'[^\w\s]|_', '', text)

@lru_cache(maxsize=128)
def get_youtube_video_metadata(video_url):
    """Return the yt-dlp info dictionary for ``video_url`` without downloading it.

    Results are memoized per URL by ``lru_cache`` (up to 128 entries), so a
    repeated call with the same URL returns the very same dict object.

    Args:
        video_url: URL of the video to inspect.

    Returns:
        The dictionary produced by ``YoutubeDL.extract_info``.
    """
    # Do not download the video.
    # NOTE(review): the original author enabled 'extract_flat' to get metadata
    # only; confirm what this option does for a single-video URL.
    options = dict(skip_download=True, extract_flat=True)

    with yt_dlp.YoutubeDL(options) as extractor:
        return extractor.extract_info(video_url, download=False)



In [18]:
VIDEO_URL = "https://www.youtube.com/watch?v=vX3A96_F3FU"

# VIDEO_URL = "https://www.youtube.com/watch?v=evfLsSRtryk"


In [172]:
import json

class YoutubeTranscripter:
    """Metadata and audio-download helper for a single YouTube video.

    The video's metadata is obtained through ``get_youtube_video_metadata`` and
    stored as a JSON string in a ``diskcache`` cache located at
    ``CACHE_FOLDER``, keyed by ``("metadata", video_url)``.

    Requires ``diskcache`` imported as ``dc`` and ``yt_dlp`` to be in scope.

    Attributes:
        video_url: URL the instance was created for.
        metadata: Decoded metadata dictionary.
        language: Explicit ``language`` argument, or the metadata's
            "language" entry (``None`` when the metadata has none).
        title: The metadata's "title" entry.
        label: Lower-cased title with special characters removed and spaces
            replaced by underscores.
        chapters: The metadata's "chapters" entry (``None`` when absent).
    """

    CACHE_FOLDER = '/root/.cache/diskcache'

    def __init__(self, video_url, language=None):
        """Load (or fetch and cache) the metadata for ``video_url``.

        Args:
            video_url: URL of the video.
            language: Optional language override; when falsy the language
                reported by the metadata is used.
        """
        self._cache = dc.Cache(self.CACHE_FOLDER)
        self.video_url = video_url
        self.metadata = self._get_metadata()
        # .get() keeps construction working when the key is missing.
        self.language = language or self.metadata.get("language")
        self.title = self.metadata["title"]
        self.label = remove_special_characters(self.title.lower()).replace(" ", "_")
        self.chapters = self.metadata.get("chapters")

    def _get_metadata(self):
        """Return the video's metadata, fetching it only on a cache miss.

        Returns:
            The metadata dictionary decoded from the cached JSON string.
        """
        index = ("metadata", self.video_url)
        if index not in self._cache:
            print(f"Download do vídeo: {index}")
            # Stored as JSON text; ensure_ascii=False keeps non-ASCII characters readable.
            self._cache[index] = json.dumps(
                get_youtube_video_metadata(self.video_url),
                ensure_ascii=False
            )
        return json.loads(self._cache[index])

    def _audio_download_options(self, output_filename=None):
        """Build the yt-dlp options used to download the audio as MP3.

        Args:
            output_filename: Output path without extension. Defaults to
                ``data/<label>_raw``.

        Returns:
            The options dictionary to pass to ``YoutubeDL``.
        """
        if output_filename is None:
            output_filename = f"data/{self.label}_raw"
        return {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'outtmpl': f'{output_filename}.%(ext)s',
        }

    def _download_audio(self, output_filename=None):
        """Download this instance's video audio.

        Previously this method read an undefined ``output_filename`` and the
        notebook-global ``VIDEO_URL``; it now uses the instance's own URL and
        a label-based default path.

        Args:
            output_filename: Output path without extension. Defaults to
                ``data/<label>_raw``.
        """
        ydl_opts = self._audio_download_options(output_filename)

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([self.video_url])

    def clear_cache(self):
        """Remove every entry from the cache and report it."""
        self._cache.clear()
        print("Cache limpo.")


In [173]:
yt_transcriper = YoutubeTranscripter(VIDEO_URL)

In [161]:
sdf = yt_transcriper._get_metadata()

In [175]:
yt_transcriper.label

'graph_rag_improving_rag_with_knowledge_graphs'

In [131]:
# yt_transcriper._cache.get('metadata_https://www.youtube.com/watch?v=vX3A96_F3FU')

In [158]:
yt_transcriper.clear_cache()

Cache limpo.


In [166]:


clean_text = remove_special_characters(yt_transcriper.title.lower())
print(clean_text)  # Lower-cased title without punctuation; for this video: "graph rag improving rag with knowledge graphs"


graph rag improving rag with knowledge graphs


In [None]:
# Base path (without extension) for the downloaded audio.
# The cell used to read a `title` variable that is not defined anywhere in this
# notebook (NameError); the transcripter's sanitized label is used instead,
# which also avoids spaces and punctuation in the file name.
output_filename = f"data/{yt_transcriper.label}_raw"

# Fetch the best available audio stream and convert it to 192 kbps MP3.
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
    'outtmpl': f'{output_filename}.%(ext)s',
}

# Download from the same object the label came from, so the file name and its
# content always refer to the same video.
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([yt_transcriper.video_url])

In [66]:
yt_transcriper = YoutubeTranscripter(VIDEO_URL)

In [67]:
yt_transcriper.chapters

[{'start_time': 0.0,
  'title': 'Introduction to GraphRAG and Its Cost Issue',
  'end_time': 44.0},
 {'start_time': 44.0,
  'title': 'Understanding Traditional RAG',
  'end_time': 106.0},
 {'start_time': 106.0,
  'title': 'Limitations of Traditional RAG',
  'end_time': 142.0},
 {'start_time': 142.0, 'title': 'Introduction to GraphRAG', 'end_time': 159.0},
 {'start_time': 159.0,
  'title': 'Technical Details of GraphRAG',
  'end_time': 346.0},
 {'start_time': 346.0,
  'title': 'Setting Up GraphRAG on Your Local Machine',
  'end_time': 382.0},
 {'start_time': 382.0,
  'title': 'Running the Indexing Process',
  'end_time': 720.0},
 {'start_time': 720.0,
  'title': 'Running Queries with GraphRAG',
  'end_time': 866.0},
 {'start_time': 866.0,
  'title': 'Cost Implications and Alternatives',
  'end_time': 958}]

In [75]:
import diskcache as dc

cache = dc.Cache('./cache-directory')

def expensive_function(n):
    """Return ``n * n``, memoized in the module-level ``cache``.

    The entry is stored under the key ``("expensive_function", n)``; when that
    key is already present the stored value is returned instead of recomputing.
    """
    key = ("expensive_function", n)
    if key not in cache:
        # Cache miss: compute, remember, and hand back the fresh value.
        squared = n * n
        cache[key] = squared
        return squared
    return cache[key]

print(expensive_function(5))  # On a cold cache: computes the value and stores it
print(expensive_function(5))  # Served from the cache


25
25


In [71]:
cache[("expensive_function", 4)]

KeyError: ('expensive_function', 4)

In [19]:
video_metadata = get_youtube_video_metadata(VIDEO_URL)

[youtube] Extracting URL: https://www.youtube.com/watch?v=vX3A96_F3FU
[youtube] vX3A96_F3FU: Downloading webpage
[youtube] vX3A96_F3FU: Downloading ios player API JSON
[youtube] vX3A96_F3FU: Downloading web creator player API JSON
[youtube] vX3A96_F3FU: Downloading m3u8 information


In [20]:
video_metadata["language"]

'en'

In [21]:
video_metadata["title"]

'Graph RAG: Improving RAG with Knowledge Graphs'

In [22]:
# Only proceed when the video has chapter information.
# NOTE(review): `_download_video` is not defined in any visible cell of this
# notebook, so this call raises NameError unless it is defined elsewhere —
# confirm which helper was intended.
if video_metadata["chapters"]:
    _download_video()

In [57]:
# video_metadata["chapters"]

In [1]:
from absl import logging

# Configure Abseil logging so that only messages at ERROR verbosity are emitted
logging.set_verbosity(logging.ERROR)


In [1]:
# %%time
# !pip uninstall tensorflow -y
# !pip install tensorflow[and-cuda]

In [1]:
import sys
import os
import tensorflow as tf
from absl import logging

# Temporarily redirect Python-level stderr while TensorFlow enumerates GPUs.
# The stream that was active before the redirect is restored afterwards, even
# if the call fails. In a notebook that stream is the kernel's own object, not
# sys.__stderr__, so restoring sys.__stderr__ would detach stderr from the
# notebook output. The os.devnull handle is closed by the `with` block instead
# of being leaked.
# NOTE(review): the recorded output of this cell still shows cuda_executor
# messages, so they are apparently not written through sys.stderr — confirm
# whether this redirect is still useful.
_previous_stderr = sys.stderr
with open(os.devnull, 'w') as _devnull:
    sys.stderr = _devnull
    try:
        # Initialize TensorFlow (the call that triggers the messages)
        physical_devices = tf.config.list_physical_devices('GPU')
    finally:
        # Restore stderr
        sys.stderr = _previous_stderr

# Continue with the normal code
logging.set_verbosity(logging.ERROR)


I0000 00:00:1726462539.725237     341 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462539.748697     341 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462539.748960     341 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [5]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3946435560444492848
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14230618112
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2834319952901452859
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.9"
xla_global_id: 416903419
]


I0000 00:00:1726462255.734598      60 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462255.734887      60 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462255.735040      60 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726462255.815505      60 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [6]:
import os
import tensorflow as tf

# Display the current TensorFlow C++ log-level setting. This only reads the
# environment variable (KeyError if it is unset); it does not configure anything.
os.environ['TF_CPP_MIN_LOG_LEVEL']

'3'