<a href="https://colab.research.google.com/github/frank-morales2020/Cloud_curious/blob/master/fine_tuning_vectordb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Pytorch & other libraries
!pip install torch tensorboard --quiet

# Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

#FlashAttention only supports Ampere GPUs or newer. #NEED A100 , L4  IN GOOGLE COLAB
#!pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# Uncomment only if you're using A100 GPU
#!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet

!pip install mistral_inference -q

!pip install trl==0.8.6 -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

Let's illustrate how to combine fine-tuning and vector databases in Python using a simplified example with the transformers library (for fine-tuning)

1. Prepare Your Data and Fine-tune the Model:

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from datasets import load_dataset

# Load a dataset (example: CNN/Daily Mail for text summarization)
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Load pre-trained model and tokenizer (e.g., T5 for text summarization)
model_name = "t5-small"  # Choose an appropriate model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Prepare dataset for fine-tuning
def preprocess_function(examples):
    # Tokenize input and target text
    inputs = tokenizer(examples["article"], padding="max_length", truncation=True)
    targets = tokenizer(examples["highlights"], padding="max_length", truncation=True)
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": targets["input_ids"]}

Downloading readme:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [3]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [4]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    #evaluation_strategy="epoch",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    num_train_epochs=0.05,
    weight_decay=0.01,
    eval_steps=250,
    save_strategy="steps",
    save_steps=250,
    metric_for_best_model = "loss",
    logging_steps=250,
    optim="adamw_torch_fused",
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="cosine",
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    #hub_token=access_token_write,
    load_best_model_at_end=True,
)

# Create Trainer and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_model")



Epoch,Training Loss,Validation Loss
0,0.3244,0.316324


2. Create & Populate the Vector Database (Chroma):

In [5]:
!pip install chromadb -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m91.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.0/107.0 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [18]:
dataset["train"]["highlights"][0]

"Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund ."

In [6]:
import chromadb

# Create a Chroma client
client = chromadb.Client()

# Create a collection
collection = client.create_collection(name="knowledge_base")

# Add documents (here, we'll use summaries from our dataset)
for i, summary in enumerate(dataset["train"]["highlights"][:100]):  # Add a subset for this example
    collection.add(
        documents=[summary],
        ids=[f"doc_{i}"],
        metadatas=[{"source": "cnn_dailymail"}]  # Optional metadata
    )

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:06<00:00, 12.5MiB/s]


3. Use the Fine-tuned Model with Vector Database Retrieval:

In [23]:
# Load the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_model")

# Define a function to get relevant documents from the vector database
def get_relevant_docs(query, top_k=3):
    # Decode the query tensor into a string
    query_string = tokenizer.decode(query[0], skip_special_tokens=True)

    results = collection.query(
        query_texts=[query_string],
        #query_texts=[query],
        n_results=top_k,
    )
    return results["documents"][0]

# Example usage
article = "This is a news article about Harry Potter"
summary = model.generate(tokenizer(article, return_tensors="pt")["input_ids"])

# Get relevant documents from the vector database
relevant_docs = get_relevant_docs(summary[0], top_k=2)

In [24]:
# Use the relevant docs to augment the summary or provide additional context
print("Generated Summary:", tokenizer.decode(summary[0], skip_special_tokens=True))
print("Relevant Documents:", relevant_docs)

Generated Summary: This is a news article about Harry Potter.
Relevant Documents: ['Amount almost double what was spent in 2004 election cycle .\nLower TV production costs help more candidates advertise .\nMitt Romney leads presidential candidates in TV spending .\nAdvertisers face challenge of cutting through clutter of ads, analyst says .', 'Documents say after suicide attempt, Jeffs repeatedly banged head on cell wall .\nTranscripts say Jeffs confessed to "immorality," said he is not "the prophet"\nJeffs\' attorneys say he has recanted statements .\nJeffs due to be sentenced November 20 on accomplice to rape charge .']


Key Points:

* You'll need to replace the example dataset and task with your own.
* This is a simplified example. In a real-world scenario, you'd likely use a more advanced embedding model, optimize the retrieval process, and integrate this into a larger application.