# varie cose iniziali da fare

## import necessari

In [None]:
# COLAB ONLY: uncomment the line below to install the pinned dependencies
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [1]:
import os
import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from langchain.llms import HuggingFacePipeline

import warnings
warnings.filterwarnings("ignore")

import langchain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain import PromptTemplate, LLMChain
from langchain.vectorstores import FAISS

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import format_document
from langchain_core.messages import get_buffer_string
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts.chat import ChatPromptTemplate

## caricamento del dataset e creazione di quello da popolare

In [2]:
# Load the test split with the questions and their correct answers.
test_df = load_dataset(path="saracandu/harry-potter-trivia-human", split="test")
test_df

Dataset({
    features: ['question', 'answer', 'text'],
    num_rows: 256
})

In [3]:
# New dataframe holding the questions and the ground truths; one answer
# column per LLM is added to it later on.
comp_df = pd.DataFrame(
    data={
        'question': test_df['question'],
        'ground_truth': test_df['answer'],
    }
)

## carica i documenti su cui il RAG gioca

In [4]:
# Read the PDF files matched by the glob inside 'HP content/' with PyPDFLoader,
# using multiple threads and showing a progress bar.
loader = DirectoryLoader(
    path='HP content/',
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    use_multithreading=True,
    show_progress=True,
)
documents = loader.load()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:21<00:00, 11.70s/it]


In [5]:
import re

# Clean up: collapse every run of tabs, newlines and spaces into one space.
# A single regex pass handles runs of any length.
_WHITESPACE_RUN = re.compile(r"[ \t\n]+")

for doc in documents:
    doc.page_content = _WHITESPACE_RUN.sub(" ", doc.page_content)

# Check that it worked
documents[186].page_content

'186could feel it vibrating and let go; it hung in midair, unsupported, at exactly the right height for him to mount it. His eyes moved from thegolden registration number at the top of the handle, right down to theperfectly smooth, streamlined birch twigs that made up the tail. "Who sent it to you?" said Ron in a hushed voice."Look and see if there\'s a card," said Harry.Ron ripped apart the Firebolt\'s wrappings."Nothing! Blimey, who\'d spend that much on you?""Well," said Harry, feeling stunned, "I\'m betting it wasn\'t the Dursleys." I bet it was Dumbledore," said Ron, now walking around and around the Firebolt, taking in every glorious inch. "He sent you the InvisibilityCloak anonymously...." "That was my dad\'s, though," said Harry. "Dumbledore was just Passing it on to me. He wouldn\'t spend hundreds of Galleons on me. He can\'t gogiving students stuff like this --" "That\'s why he wouldn\'t say it was from him!" said Ron. "In case some git like Malfoy said it was favoritism. Hey

In [6]:
# Split every cleaned document into chunks of at most 1000 characters, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Embed the chunks and index them in a FAISS vector store.
db = FAISS.from_documents(
    texts,
    HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'),
)

# Retrieve only the single best-matching chunk per query. The number of
# results must be passed through `search_kwargs`: a bare `k=1` keyword is not
# a documented option of `as_retriever`, so the store's default result count
# would apply instead.
retriever = db.as_retriever(search_kwargs={"k": 1})

# Llama2 - 7b

## ORIGINAL Llama2 - 7b

In [7]:
# Interactive Hugging Face Hub login widget, shown before the model cells below.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
# ---------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------
model_name = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token, on the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------
# bitsandbytes settings
# ---------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"          # quantization type (fp4 or nf4)
bnb_4bit_compute_dtype = "float16"   # name of the compute dtype for 4-bit models
use_nested_quant = False             # nested (double) quantization switch

# Resolve the dtype name to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Print a hint when float16 is used on a GPU with compute capability >= 8.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

# ---------------------------------------------------------------
# Load the model with the quantization config above
# ---------------------------------------------------------------
original_llama2_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Text-generation pipeline around the base Llama-2 model: sampling disabled,
# at most 100 new tokens per answer.
response_generation_pipeline = pipeline(
    "text-generation",
    model=original_llama2_model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    repetition_penalty=1.1,
    temperature=0.0,
    do_sample=False,
    return_full_text=True,
)
# Wrap the pipeline so it can be used as a LangChain LLM.
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

In [13]:
# Ask the model every test question directly (no retrieval) and collect the answers.
llama_outputs = [
    response_generation_llm.invoke(question)
    for question in test_df['question']
]

In [14]:
# Add the answers of the base Llama-2 run as a new column of the comparison table.
comp_df['llama2-7b'] = llama_outputs

In [15]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_1.csv', index=False)

## ORIGINAL RAG Llama2 - 7b

In [10]:
# Prompt with two placeholders: the retrieved context and the question.
# The string ends with a newline followed by a single space.
prompt_template = (
    "\n"
    "### [INST] Instruction: Answer directly and completely the question based on your Harry Potter world.\n"
    "Here is context to help:\n"
    "{context}\n"
    "\n"
    "### QUESTION:\n"
    "{question} [/INST]\n"
    " "
)

# Build the prompt object from the template.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the current LLM.
llm_chain = LLMChain(prompt=prompt, llm=response_generation_llm)

In [11]:
# The incoming question is sent both to the retriever (filling "context") and,
# unchanged, to "question"; the resulting dict feeds the LLM chain.
# NOTE(review): the retriever output is passed into {context} as-is —
# presumably rendered via its string form; confirm this is the intended context.
rag_chain = dict(context=retriever, question=RunnablePassthrough()) | llm_chain

In [12]:
# Run every test question through the RAG chain, keeping the 'text' field of each result.
rag_llama_outputs = [
    rag_chain.invoke(question)['text']
    for question in test_df['question']
]

In [13]:
# Add the answers of the RAG + base Llama-2 run as a new column.
comp_df['rag-llama2-7b'] = rag_llama_outputs

In [14]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_2.csv', index=False)

## CUSTOM 64 Llama2 - 7b

In [7]:
# ---------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------
model_name = "saracandu/llama-2-7b-LoRA-64-harrypotter"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token, on the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------
# bitsandbytes settings
# ---------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"          # quantization type (fp4 or nf4)
bnb_4bit_compute_dtype = "float16"   # name of the compute dtype for 4-bit models
use_nested_quant = False             # nested (double) quantization switch

# Resolve the dtype name to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Print a hint when float16 is used on a GPU with compute capability >= 8.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

# ---------------------------------------------------------------
# Load the model with the quantization config above
# ---------------------------------------------------------------
custom_llama2_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Text-generation pipeline around `custom_llama2_model`: sampling disabled,
# at most 100 new tokens per answer.
response_generation_pipeline = pipeline(
    "text-generation",
    model=custom_llama2_model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    repetition_penalty=1.1,
    temperature=0.0,
    do_sample=False,
    return_full_text=True,
)
# Wrap the pipeline so it can be used as a LangChain LLM.
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

In [11]:
# Ask the model every test question directly (no retrieval) and collect the answers.
hp_llama_outputs = [
    response_generation_llm.invoke(question)
    for question in test_df['question']
]

In [12]:
# Add the answers of the `custom_llama2_model` run as a new column.
comp_df['hp-llama2-7b'] = hp_llama_outputs

In [13]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_3.csv', index=False)

## CUSTOM 64 RAG Llama2 - 7b

In [14]:
# Prompt with two placeholders: the retrieved context and the question.
# The string ends with a newline followed by a single space.
prompt_template = (
    "\n"
    "### [INST] Instruction: Answer directly and completely the question based on your Harry Potter world.\n"
    "Here is context to help:\n"
    "{context}\n"
    "\n"
    "### QUESTION:\n"
    "{question} [/INST]\n"
    " "
)

# Build the prompt object from the template.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the current LLM.
llm_chain = LLMChain(prompt=prompt, llm=response_generation_llm)

In [15]:
# The incoming question is sent both to the retriever (filling "context") and,
# unchanged, to "question"; the resulting dict feeds the LLM chain.
# NOTE(review): the retriever output is passed into {context} as-is —
# presumably rendered via its string form; confirm this is the intended context.
rag_chain = dict(context=retriever, question=RunnablePassthrough()) | llm_chain

In [16]:
# Run every test question through the RAG chain, keeping the 'text' field of each result.
hp_rag_llama_outputs = [
    rag_chain.invoke(question)['text']
    for question in test_df['question']
]

In [17]:
# Add the answers of the RAG + `custom_llama2_model` run as a new column.
comp_df['rag-hp-llama2-7b'] = hp_rag_llama_outputs

In [18]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_4.csv', index=False)

## CUSTOM 128 Llama2 - 7b

In [7]:
# ---------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------
model_name = "saracandu/llama-2-7b-LoRA-128-harrypotter"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token, on the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------
# bitsandbytes settings
# ---------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"          # quantization type (fp4 or nf4)
bnb_4bit_compute_dtype = "float16"   # name of the compute dtype for 4-bit models
use_nested_quant = False             # nested (double) quantization switch

# Resolve the dtype name to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Print a hint when float16 is used on a GPU with compute capability >= 8.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

# ---------------------------------------------------------------
# Load the model with the quantization config above
# ---------------------------------------------------------------
custom_128_llama2_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Text-generation pipeline around `custom_128_llama2_model`: sampling disabled,
# at most 100 new tokens per answer.
response_generation_pipeline = pipeline(
    "text-generation",
    model=custom_128_llama2_model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    repetition_penalty=1.1,
    temperature=0.0,
    do_sample=False,
    return_full_text=True,
)
# Wrap the pipeline so it can be used as a LangChain LLM.
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

In [10]:
# Ask the model every test question directly (no retrieval) and collect the answers.
hp_llama_128_outputs = [
    response_generation_llm.invoke(question)
    for question in test_df['question']
]

In [11]:
# Add the answers of the `custom_128_llama2_model` run as a new column.
comp_df['hp-128-llama2-7b'] = hp_llama_128_outputs

In [12]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_5.csv', index=False)

## CUSTOM 128 RAG Llama2 - 7b

In [13]:
# Prompt with two placeholders: the retrieved context and the question.
# The string ends with a newline followed by a single space.
prompt_template = (
    "\n"
    "### [INST] Instruction: Answer directly and completely the question based on your Harry Potter world.\n"
    "Here is context to help:\n"
    "{context}\n"
    "\n"
    "### QUESTION:\n"
    "{question} [/INST]\n"
    " "
)

# Build the prompt object from the template.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the current LLM.
llm_chain = LLMChain(prompt=prompt, llm=response_generation_llm)

In [14]:
# The incoming question is sent both to the retriever (filling "context") and,
# unchanged, to "question"; the resulting dict feeds the LLM chain.
# NOTE(review): the retriever output is passed into {context} as-is —
# presumably rendered via its string form; confirm this is the intended context.
rag_chain = dict(context=retriever, question=RunnablePassthrough()) | llm_chain

In [15]:
# Run every test question through the RAG chain, keeping the 'text' field of each result.
hp_rag_128_llama_outputs = [
    rag_chain.invoke(question)['text']
    for question in test_df['question']
]

In [16]:
# Add the answers of the RAG + `custom_128_llama2_model` run as a new column.
comp_df['rag-hp-128-llama2-7b'] = hp_rag_128_llama_outputs

In [17]:
# Save the comparison table after the last Llama-2 run (index column omitted).
comp_df.to_csv('llama2-comparison.csv', index=False)

# Mistral - 7b

In [None]:
# Start a fresh dataframe holding the questions and the ground truths; one
# answer column per LLM is added to it later on.
comp_df = pd.DataFrame(
    data={
        'question': test_df['question'],
        'ground_truth': test_df['answer'],
    }
)

## ORIGINAL Mistral - 7b

In [7]:
# ---------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token, on the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------
# bitsandbytes settings
# ---------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"          # quantization type (fp4 or nf4)
bnb_4bit_compute_dtype = "float16"   # name of the compute dtype for 4-bit models
use_nested_quant = False             # nested (double) quantization switch

# Resolve the dtype name to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Print a hint when float16 is used on a GPU with compute capability >= 8.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

# ---------------------------------------------------------------
# Load the model with the quantization config above
# ---------------------------------------------------------------
my_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config
)

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [8]:
# Text-generation pipeline around the base Mistral model (`my_model`).
# `do_sample=False` is set explicitly, as in every other pipeline of this
# notebook, so that decoding is greedy regardless of the generation defaults
# shipped with the checkpoint and all models are compared on equal terms.
response_generation_pipeline = pipeline(
    model=my_model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=False,
    temperature=0.0,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=100,
)
# Wrap the pipeline so it can be used as a LangChain LLM.
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

In [9]:
# Ask the model every test question directly (no retrieval) and collect the answers.
mistral_outputs = [
    response_generation_llm.invoke(question)
    for question in test_df['question']
]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [10]:
# Add the answers of the base Mistral run as a new column of the comparison table.
comp_df['mistral-7b'] = mistral_outputs

In [11]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_7.csv', index=False)

## ORIGINAL RAG Mistral - 7b

In [12]:
# Prompt with two placeholders: the retrieved context and the question.
# The string ends with a newline followed by a single space.
prompt_template = (
    "\n"
    "### [INST] Instruction: Answer directly and completely the question based on your Harry Potter world.\n"
    "Here is context to help:\n"
    "{context}\n"
    "\n"
    "### QUESTION:\n"
    "{question} [/INST]\n"
    " "
)

# Build the prompt object from the template.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the current LLM.
llm_chain = LLMChain(prompt=prompt, llm=response_generation_llm)

In [13]:
# The incoming question is sent both to the retriever (filling "context") and,
# unchanged, to "question"; the resulting dict feeds the LLM chain.
# NOTE(review): the retriever output is passed into {context} as-is —
# presumably rendered via its string form; confirm this is the intended context.
rag_chain = dict(context=retriever, question=RunnablePassthrough()) | llm_chain

In [14]:
# Run every test question through the RAG chain, keeping the 'text' field of each result.
rag_mistral_outputs = [
    rag_chain.invoke(question)['text']
    for question in test_df['question']
]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [15]:
# Add the answers of the RAG + base Mistral run as a new column.
comp_df['rag-mistral-7b'] = rag_mistral_outputs

In [16]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_8.csv', index=False)

## CUSTOM 64 Mistral - 7b

In [7]:
# ---------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------
model_name = 'saracandu/mistral-7b-LoRA-64-harrypotter'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token, on the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------
# bitsandbytes settings
# ---------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"          # quantization type (fp4 or nf4)
bnb_4bit_compute_dtype = "float16"   # name of the compute dtype for 4-bit models
use_nested_quant = False             # nested (double) quantization switch

# Resolve the dtype name to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Print a hint when float16 is used on a GPU with compute capability >= 8.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

# ---------------------------------------------------------------
# Load the model with the quantization config above
# ---------------------------------------------------------------
custom_mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Text-generation pipeline around `custom_mistral_model`: sampling disabled,
# at most 100 new tokens per answer.
response_generation_pipeline = pipeline(
    "text-generation",
    model=custom_mistral_model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    repetition_penalty=1.1,
    temperature=0.0,
    do_sample=False,
    return_full_text=True,
)
# Wrap the pipeline so it can be used as a LangChain LLM.
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

In [10]:
# Ask the model every test question directly (no retrieval) and collect the answers.
hp_mistral_outputs = [
    response_generation_llm.invoke(question)
    for question in test_df['question']
]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [11]:
# Add the answers of the `custom_mistral_model` run as a new column.
comp_df['hp-mistral-7b'] = hp_mistral_outputs

In [12]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_9.csv', index=False)

## CUSTOM 64 RAG Mistral - 7b

In [9]:
# Prompt with two placeholders: the retrieved context and the question.
# The string ends with a newline followed by a single space.
prompt_template = (
    "\n"
    "### [INST] Instruction: Answer directly and completely the question based on your Harry Potter world.\n"
    "Here is context to help:\n"
    "{context}\n"
    "\n"
    "### QUESTION:\n"
    "{question} [/INST]\n"
    " "
)

# Build the prompt object from the template.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the current LLM.
llm_chain = LLMChain(prompt=prompt, llm=response_generation_llm)

In [10]:
# The incoming question is sent both to the retriever (filling "context") and,
# unchanged, to "question"; the resulting dict feeds the LLM chain.
# NOTE(review): the retriever output is passed into {context} as-is —
# presumably rendered via its string form; confirm this is the intended context.
rag_chain = dict(context=retriever, question=RunnablePassthrough()) | llm_chain

In [11]:
# Run every test question through the RAG chain, keeping the 'text' field of each result.
hp_rag_mistral_outputs = [
    rag_chain.invoke(question)['text']
    for question in test_df['question']
]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [12]:
# Add the answers of the RAG + `custom_mistral_model` run as a new column.
comp_df['rag-hp-mistral-7b'] = hp_rag_mistral_outputs

In [13]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_10.csv', index=False)

## CUSTOM 128 Mistral - 7b

In [14]:
# ---------------------------------------------------------------
# Tokenizer
# ---------------------------------------------------------------
model_name = "saracandu/mistral-7b-LoRA-128-harrypotter"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token, on the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# ---------------------------------------------------------------
# bitsandbytes settings
# ---------------------------------------------------------------
use_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_quant_type = "nf4"          # quantization type (fp4 or nf4)
bnb_4bit_compute_dtype = "float16"   # name of the compute dtype for 4-bit models
use_nested_quant = False             # nested (double) quantization switch

# Resolve the dtype name to the torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

# Print a hint when float16 is used on a GPU with compute capability >= 8.
if use_4bit and compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print(
            "=" * 80,
            "Your GPU supports bfloat16: accelerate training with bf16=True",
            "=" * 80,
            sep="\n",
        )

# ---------------------------------------------------------------
# Load the model with the quantization config above
# ---------------------------------------------------------------
custom_128_mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config
)

tokenizer_config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [15]:
# Text-generation pipeline around `custom_128_mistral_model`: sampling disabled,
# at most 100 new tokens per answer.
response_generation_pipeline = pipeline(
    "text-generation",
    model=custom_128_mistral_model,
    tokenizer=tokenizer,
    max_new_tokens=100,
    repetition_penalty=1.1,
    temperature=0.0,
    do_sample=False,
    return_full_text=True,
)
# Wrap the pipeline so it can be used as a LangChain LLM.
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

In [16]:
# Ask the model every test question directly (no retrieval) and collect the answers.
hp_mistral_128_outputs = [
    response_generation_llm.invoke(question)
    for question in test_df['question']
]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [17]:
# Add the answers of the `custom_128_mistral_model` run as a new column.
comp_df['hp-128-mistral-7b'] = hp_mistral_128_outputs

In [18]:
# Save the comparison table collected so far (index column omitted).
comp_df.to_csv('comparison_11.csv', index=False)

## CUSTOM 128 RAG Mistral - 7b

In [19]:
# Prompt with two placeholders: the retrieved context and the question.
# The string ends with a newline followed by a single space.
prompt_template = (
    "\n"
    "### [INST] Instruction: Answer directly and completely the question based on your Harry Potter world.\n"
    "Here is context to help:\n"
    "{context}\n"
    "\n"
    "### QUESTION:\n"
    "{question} [/INST]\n"
    " "
)

# Build the prompt object from the template.
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain that fills the prompt and sends it to the current LLM.
llm_chain = LLMChain(prompt=prompt, llm=response_generation_llm)

In [20]:
# The incoming question is sent both to the retriever (filling "context") and,
# unchanged, to "question"; the resulting dict feeds the LLM chain.
# NOTE(review): the retriever output is passed into {context} as-is —
# presumably rendered via its string form; confirm this is the intended context.
rag_chain = dict(context=retriever, question=RunnablePassthrough()) | llm_chain

In [21]:
# Run every test question through the RAG chain, keeping the 'text' field of each result.
hp_rag_128_mistral_outputs = [
    rag_chain.invoke(question)['text']
    for question in test_df['question']
]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

In [22]:
# Add the answers of the RAG + `custom_128_mistral_model` run as a new column.
comp_df['rag-hp-128-mistral-7b'] = hp_rag_128_mistral_outputs

In [23]:
# Save the comparison table after the last Mistral run (index column omitted).
# The file carries the `.csv` extension, matching 'llama2-comparison.csv'.
comp_df.to_csv('mistral-comparison.csv', index=False)