In [1]:
import os 
import gc
import torch
import GPUtil
import warnings
import chromadb
import numpy as np
import pandas as pd
import transformers
from torch import cuda, bfloat16
from huggingface_hub import HfApi
from chromadb.config import Settings
from IPython.display import display, HTML
from huggingface_hub import login as hf_login
from sentence_transformers import CrossEncoder
from sentence_transformers.util import pytorch_cos_sim
from sentence_transformers import CrossEncoder, SentenceTransformer, util
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, BitsAndBytesConfig, pipeline

warnings.simplefilter('ignore')

In [2]:
path = os.path.dirname(os.getcwd())
os.chdir(path)
print(f'path: {path}')    

path: c:\Diego\5. Proyectos\Language Models\1. LLM learning from YouTube


In [3]:
from src.utils.utils import *

#### Leveraging GPU for Perfomance 
To optimize performance, we'll use GPU accelaration if available

In [4]:
device = get_device()

GPU is available
GPU name: NVIDIA GeForce RTX 4090 Laptop GPU


#### Accessing the Vector Database
First, we'll access our previously created Chroma database

In [5]:
# load DB from disk
client = chromadb.PersistentClient(
    path='data/vectordb'
    )
collection = client.get_collection(
    name='youtube_knowledgebase'
    )

In [6]:
# query the available titles in the collection
titles_list = np.unique([col['title'] for col in collection.get()['metadatas']])
print('-------------------------------------------------------------')
for idx, title in enumerate(titles_list,start=1):
    print(f'{idx}. {title}')

-------------------------------------------------------------
1. Secret Teachings of Plato & Theology of Arithmetic - Pythagorean Origins of Sacred Geometry
2. The Nazi Quest To Find The Holy Grail | Myth Hunters
3. The Occult Philosophy of Cornelius Agrippa - 1 of X - Life and Works
4. The Real Assassin's Creed: Deadliest Special Forces Of The Dark Ages | Ancient Black Ops | Chronicle
5. The Testament of Solomon - The Origins of Solomonic Magic, Occultism & Demonology
6. What is Hermeticism?
7. Who is Metatron? The Origins of the Angel from the 3rd Book of Enoch - Sefer Hekhalot Mysticism
8. Who is Set - The Egyptian God of the Desert, Violence & Foreigners
9. Who is Thoth?  The Egyptian God of Writing, Magic, the Moon and Fate who Became Hermes Trismegistus
10. Who is Yahweh - How a Warrior-Storm God became the God of the Israelites and World Monotheism


#### Setting Up the Language Model
For the generation part of RAG, we'll use a large language model. In this case, we're using Mistral-7B-Instruct

In [7]:
# initialize Mistral model
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# set quantization configuration
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# load the Mistral model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

# create a pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=4000,
    temperature=0.6, # Higher values (e.g., 1.0) make output more random, lower values (e.g., 0.1) make it more deterministic.
    top_p=0.9 # The model considers the smallest set of tokens whose cumulative probability exceeds this value.
    )

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
print_gpu_stats()

GPU Name: NVIDIA GeForce RTX 4090 Laptop GPU
GPU Load: 1.00%
GPU Free Memory: 11.55GB
GPU Used Memory: 4.12GB
GPU Total Memory: 15.99GB
GPU Temperature: 42.00 °C
GPU UUID: GPU-485b456e-7834-020d-296e-c970b89f2e0f


#### Understanding the Overall Structure of the Video

In [9]:
# retrieving transcript
title = 'What is Hermeticism?'
flag = [col['title']==title for col in collection.get()['metadatas']]
documents = pd.DataFrame(collection.get()['documents'],columns=['documents'])
documents = documents[flag].reset_index(drop=True) 
documents = documents['documents'].to_list()

In [10]:
# sections and subsections prompt
context = ' '.join(documents)
prompt = f"""
Summarize the following text about '{title}'. 
Divide your summary into 3-5 main sections or topics. For each section:
1. Provide a brief title.
2. Write a 100 words explanation of the main key points discussed in that section.

Text to summarize:
---------------------
<chunk>
{context}
</chunk>
---------------------

Summary:
"""    


In [11]:
response = pipe(prompt, 
                max_new_tokens=len(prompt)+2000,
                truncation=True)
response = response[0]['generated_text']
summary = response.split("Summary:")[1].strip()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [12]:
# summarization prompt
fin_prompt = f"""
Summarize the following text about '{title}'. It should look like an abstract, rember to look at the text structure to not forget any detail for the global summary.
The global summary should not have more than 500 words. 

Text structure:
---------------------
<structure>
{summary}
</structure>
---------------------

Text to summarize:
---------------------
<context>
{context}
</context>
---------------------

Summary:
"""    

In [13]:
response = pipe(fin_prompt, 
                max_new_tokens=len(prompt)+2000,
                truncation=True)
response = response[0]['generated_text']
fin_summary = response.split("Summary:")[1].strip()

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [14]:
summ = '<br>'.join(summary.split('\n'))

long_text = f'''
<h2 style="color: #FF0000;">YouTube Video:</h2>
<p>{title}</p>
<hr style="border-top: 2px solid #e0e0e0;">

<h2 style="color: #4a86e8;">Final Summary:</h2>
<p>{fin_summary}</p>
<hr style="border-top: 2px solid #e0e0e0;">

<h2 style="color: #6aa84f;">Detailed Summary:</h2>
<p>{summ}</p>
'''

html_code = f'''
<div style="overflow:auto; border:1px solid #ddd; padding:20px; font-family: Arial, sans-serif; line-height: 1.6;">
    {long_text}
</div>
'''

display(HTML(html_code))

#### Q&A our Documents

In [15]:
# query 
query = 'Tell me more about the merging of Toth and Hermes'
results = collection.query(query_texts=query, n_results=20)
df = pd.DataFrame({
            'id':results['ids'][0], 
            'score':results['distances'][0],
            'channel':[item['channel'] for sublist in results['metadatas'] for item in sublist],
            'title':[item['title'] for sublist in results['metadatas'] for item in sublist],
            'content':results['documents'][0],
            })

In [16]:
# re-ranking
reranker = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
model = CrossEncoder(reranker, max_length=512)
scores = model.predict([(query, doc) for doc in results["documents"][0]])
df['rerank'] = (scores - scores.min())/(scores.max()-scores.min())
df['score_'] = df['rerank']*.6 + df['rerank']*.4
df.sort_values('score_',ascending=True,inplace=True)

In [17]:
top_k = 5
sources = df.iloc[:5,[2,3]]
sources = (sources['channel'] + ': ' + sources['title']).unique().tolist()
sources = '<br>'.join(sources)
context = df.iloc[:5,4]
context = ' '.join(context)

In [18]:
qa_prompt = f"""
You are a helpful and smart assistant, whose sole purpose is to answer questions related\n
to the user's context. If the given context is not sufficient to answer the question, you need\n 
to reply that you can answer based on partial information. If you are answering, please provide a\n 
detailed answer based on the context provided of at least 200 words.\n
Context information is below.
---------------------
<context>
{context}
</context>
---------------------
Query: {query}.
Answer:
"""


In [19]:
response = pipe(qa_prompt, 
                max_new_tokens=len(qa_prompt)+2000,
                truncation=True)
response = response[0]['generated_text']
qa_answer = response.split("Answer:")[1].strip()

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [20]:
long_text = f'''
<h2 style="color: #4a86e8;">Query:</h3>
<p>{query}</p>
<hr style="border-top: 2px solid #e0e0e0;">

<h2 style="color: #6aa84f;">Answer:</h3>
<p>{qa_answer}</p>
<hr style="border-top: 2px solid #e0e0e0;">

<h2 style="color: #e69138;">Sources:</h3>
<p>{sources}</p>
'''

html_code = f'<div style="overflow:auto; border:1px solid #ddd; padding:10px;">{long_text}</div>'
display(HTML(html_code))