In [1]:
import os 
import gc
import spacy
import torch
import GPUtil
import warnings
import chromadb
import numpy as np
import pandas as pd
import transformers
from tqdm import tqdm
from chromadb.config import Settings
from sentence_transformers.util import pytorch_cos_sim
from deepmultilingualpunctuation import PunctuationModel
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder, SentenceTransformer, util
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
warnings.simplefilter('ignore')

In [2]:
# Switch the working directory to the project root so that the `src` package is
# importable and relative paths such as './data/vectordb' resolve.
# The guard makes the cell idempotent: the original unconditionally moved one
# directory up, so every re-run of the cell climbed one level higher.
if not os.path.isdir('src'):
    os.chdir(os.path.dirname(os.getcwd()))
path = os.getcwd()
print(f'path: {path}')

path: c:\Diego\5. Proyectos\Language Models\1. LLM learning from YouTube


In [3]:
# Wildcard import of the project helpers. The names used in this notebook that are
# not defined or imported anywhere else here (get_device, setup_vector_db,
# get_video_data, semantic_chunking, process_chunks, populate_database)
# presumably come from this module. TODO confirm and switch to explicit imports.
from src.utils.utils import *

#### Leveraging GPU for Performance
To optimize performance, we'll use GPU acceleration if available.

In [4]:
# Pick the compute device with the project helper; the returned value is later
# passed to setup_vector_db. In the recorded run the helper reported an available GPU.
device = get_device()

GPU is available
GPU name: NVIDIA GeForce RTX 4090 Laptop GPU


#### Setting Up the Vector Database
We'll use Chroma as our vector database to store and retrieve our processed video content

In [5]:
# Name of the model handed to setup_vector_db.
model_name = 'all-MiniLM-L6-v2'

# Collect the vector-store settings first, then unpack the helper's four return values.
vector_db_kwargs = dict(
    model_name=model_name,
    path_name='./data/vectordb',
    collection_name='youtube_knowledgebase',
    device=device,
)
encoder_model, max_seq_length, collection, client = setup_vector_db(**vector_db_kwargs)

print(f'embedding model max length is: {max_seq_length}')

embedding model max length is: 254


#### Extracting Video Data
Fetching transcripts and metadata for each video ID.

In [6]:
# Read the YouTube API key from the environment so that no credential has to be
# typed into (or committed with) the notebook. Falls back to the original empty
# string when YOUTUBE_API_KEY is not set.
youtube_api_key = os.environ.get('YOUTUBE_API_KEY', '')

# IDs of the videos to process, in the order they are fetched.
video_id = [
    'qZyROOYq4LI',
    'vsu7HW0ouVA',
    'BBozTcgOFGc',
    '4io43JYVpZ0',
    '1z_9RTbbGcU',
    'mdKst8zeh-U',
    '1-VGkaqDxbY',
    'CIWq_k2tiYg',
    'zoPtrb2eMCQ',
    'NjDq9amO-s0',
]

In [7]:
# Fetch the data for every ID in `video_id` using the API key. Later cells read the
# 'transcript', 'metadata' and 'video_id' entries of each returned item.
video_data_list = get_video_data(video_id, youtube_api_key)

100%|██████████| 10/10 [00:09<00:00,  1.06it/s]


#### Semantic Chunking
One of the key steps in our process is semantic chunking, which helps us break down the transcript into meaningful segments

In [8]:
# Load the small English spaCy pipeline that is handed to the chunker.
nlp = spacy.load("en_core_web_sm")

# Lazily pull each transcript, then chunk them one by one in the original order.
transcripts = (entry['transcript'] for entry in video_data_list)
chunks_list = [semantic_chunking(text, encoder_model, nlp) for text in transcripts]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


385 sentences extracted from transcript
509 sentences extracted from transcript
173 sentences extracted from transcript
160 sentences extracted from transcript
170 sentences extracted from transcript
246 sentences extracted from transcript
161 sentences extracted from transcript
188 sentences extracted from transcript
190 sentences extracted from transcript
342 sentences extracted from transcript


#### Processing Chunks and Populating the Database
Finally, we'll process our chunks and store them in our vector database

In [9]:
# Pair every video with its chunk list and wrap the pairing in a progress bar.
video_chunk_pairs = tqdm(zip(video_data_list, chunks_list), total=len(video_data_list))

for video_data, chunks in video_chunk_pairs:
    # Process the chunks (with their embeddings) and store them in the collection.
    processed_chunks, embeddings = process_chunks(chunks, max_seq_length, encoder_model)
    populate_database(
        collection,
        processed_chunks,
        embeddings,
        video_data['metadata'],
        video_data['video_id'],
    )

100%|██████████| 10/10 [00:06<00:00,  1.58it/s]
