In [1]:
# Quietly install the notebook's dependencies.
# First line: unpinned packages, upgraded to their latest release (-U).
# Second line: packages pinned to exact versions.
!pip install -q -U torch datasets  tensorflow langchain playwright html2text sentence_transformers faiss-cpu
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7
# NOTE(review): transformers is not installed here; a later cell installs it from GitHub.
# !pip install -q -U transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m670.2/670.2 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.4/802.4 kB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.2/37.2 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m70.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m29

In [16]:
import locale

# Force every caller to see UTF-8 as the preferred encoding.
# NOTE(review): presumably a workaround for hosted notebooks whose shell
# escapes (`!cmd`) insist on a UTF-8 locale -- confirm it is still needed.
# The replacement accepts the same optional `do_setlocale` argument as the
# standard-library function, so callers that use
# `locale.getpreferredencoding(False)` keep working instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [2]:
# NOTE(review): playwright is already installed (with -U) by the first setup
# cell, so this cell repeats that install.
!pip install -q -U playwright

In [None]:
# Install transformers straight from the GitHub repository's default branch
# instead of from a PyPI release.
# NOTE(review): no commit or tag is pinned, so a re-run may install different code.
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
# !pip install -q datasets loralib sentencepiece
# !pip -q install bitsandbytes accelerate xformers einops
# !pip -q install langchain

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


### Import necessary libraries

In [None]:
import os
import torch
import transformers
from transformers import (
  AutoTokenizer,
  AutoModelForCausalLM,
  BitsAndBytesConfig,
  pipeline
)

from transformers import BitsAndBytesConfig

from langchain.embeddings.huggingface import HuggingFaceEmbeddings


from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

import nest_asyncio

### Tokenizer

Input sequences need to be padded so that they are of equal length. We use the EOS (end-of-sequence) token as the padding token.

In [None]:
# Hugging Face Hub id of the checkpoint to load.
model_name = 'mistralai/Mistral-7B-Instruct-v0.1'

# Fetch the configuration published with the checkpoint.
model_config = transformers.AutoConfig.from_pretrained(model_name)

# Load the matching tokenizer, then pad on the right-hand side and reuse the
# EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

### Bits and Bytes parameters

Reducing the model's memory footprint by adjusting its precision and quantization settings.

In [None]:
# Settings consumed when the BitsAndBytes quantization config is built.
use_4bit = True                     # passed as load_in_4bit
bnb_4bit_quant_type = "nf4"         # passed as bnb_4bit_quant_type
bnb_4bit_compute_dtype = "float16"  # attribute name looked up on torch to get the dtype
use_nested_quant = False            # passed as bnb_4bit_use_double_quant

#### Setting up Quantization Configuration

In [None]:
# Resolve the dtype name (e.g. 'float16') to the corresponding torch dtype object.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to the model loader.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16.
# The capability query needs an initialized CUDA device, so only run it when
# CUDA is actually available; otherwise this informational check would crash
# the cell before the real (and clearer) model-loading error is reached.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    # Compute capability 8.x and newer is reported as bfloat16-capable.
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)


#### Loading pre-trained configuration

In [None]:
# Load the causal language model for `model_name`, applying the quantization
# settings held in `bnb_config`.
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)

#### Testing out a random prompt with Mistral 7B

In [None]:
# Tokenize and encode the input prompt, placing the tensors on the same device
# as the model instead of assuming a literal 'cuda' device.
encoded_prompt = tokenizer(
    "[INST]Tell me what you know about fantasy soccer? [/INST]",
    return_tensors='pt',
).to(model.device)
inputs_not_chat = encoded_prompt['input_ids']

# The tokenizer's pad token is its EOS token, so generate() cannot infer which
# positions are padding; pass the attention mask and pad token id explicitly.
generated_ids = model.generate(
    inputs_not_chat,
    attention_mask=encoded_prompt['attention_mask'],
    pad_token_id=tokenizer.pad_token_id,
    max_new_tokens=1000,
    do_sample=True,
)

# Convert back to human interpretable form
decoded = tokenizer.batch_decode(generated_ids)

In [None]:
# Display the decoded text produced by the generation cell.
decoded

#### Checking number of parameters in the model

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarize how many of a model's parameters are trainable.

    Despite its name, this function prints nothing: it returns the summary so
    the caller decides how to display it.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` provides ``numel()`` and a
            ``requires_grad`` flag.

    Returns:
        A three-line string with the trainable parameter count, the total
        parameter count, and the trainable percentage (two decimals). A model
        without any parameters reports ``0.00%`` instead of raising
        ``ZeroDivisionError``.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        param_count = param.numel()
        all_model_params += param_count
        if param.requires_grad:
            trainable_model_params += param_count

    # Guard the ratio: a parameter-free model would otherwise divide by zero.
    if all_model_params:
        trainable_percentage = 100 * trainable_model_params / all_model_params
    else:
        trainable_percentage = 0.0

    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {trainable_percentage:.2f}%"

# The helper only returns the summary string, so print it explicitly.
print(print_number_of_trainable_model_parameters(model))

### Setting up playwright

In [38]:
# NOTE(review): playwright is already installed by the setup cells at the top
# of the notebook; this cell repeats that install.
!pip install playwright



In [None]:
# Run Playwright's own `install` command.
# NOTE(review): presumably this fetches the browser binaries the Chromium-based
# loader drives -- confirm.
!playwright install

In [42]:
# NOTE(review): html2text is already installed by the first setup cell; this
# cell repeats that install.
!pip install html2text

Collecting html2text
  Downloading html2text-2020.1.16-py3-none-any.whl (32 kB)
Installing collected packages: html2text
Successfully installed html2text-2020.1.16


In [48]:
# NOTE(review): the first setup cell already installs `sentence_transformers`;
# presumably the same package under a different spelling -- confirm.
!pip install sentence-transformers

### Creating a Vector Database

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader
from langchain.vectorstores import FAISS
import nest_asyncio

# NOTE(review): presumably required so the async Chromium loader can run
# inside the notebook's already-running event loop -- confirm.
nest_asyncio.apply()

# Source articles whose text provides the additional context.
articles = [
    "https://www.fantasypros.com/2023/11/rival-fantasy-nfl-week-10/",
    "https://www.fantasypros.com/2023/11/5-stats-to-know-before-setting-your-fantasy-lineup-week-10/",
    "https://www.fantasypros.com/2023/11/nfl-week-10-sleeper-picks-player-predictions-2023/",
    "https://www.fantasypros.com/2023/11/nfl-dfs-week-10-stacking-advice-picks-2023-fantasy-football/",
    "https://www.fantasypros.com/2023/11/players-to-buy-low-sell-high-trade-advice-2023-fantasy-football/",
]

# Step 1: scrape the articles.
loader = AsyncChromiumLoader(articles)
docs = loader.load()

# Step 2: convert the scraped HTML to plain text.
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

# Step 3: split the text into chunks (chunk_size=100, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0)
chunked_documents = text_splitter.split_documents(docs_transformed)

# Step 4: embed the chunks and load them into a FAISS index.
db = FAISS.from_documents(
    chunked_documents,
    HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2'),
)

# Step 5: expose the index as a retriever that runs a similarity search
# with k=4.
retriever = db.as_retriever(search_type="similarity", search_kwargs={'k': 4})

In [None]:
# Sanity-check the index: run a similarity search for a sample question and
# print the text of the first hit.
# NOTE(review): this rebinds `docs`, which previously held the scraped articles.
query = "Tell me about Patrick Mahomes"
docs = db.similarity_search(query)
print(docs[0].page_content)

`retriever`: acts as the interface between the vector database (which supplies the additional context) and the LLM.

### Building the LLM Chain

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Text-generation pipeline around the already-loaded model and tokenizer.
text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task='text-generation',
    # `temperature` only affects sampling-based decoding. Without
    # do_sample=True generation is greedy and the temperature below is ignored.
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=300,
)

# Prompt skeleton; `{context}` and `{question}` are filled in per call.
prompt_template = """
### [INST]
Instruction: Answer the question based on your fantasy football knowledge.
Here is some context to help:

{context}

### QUESTION:
{question}

[/INST]
"""

# Wrap the pipeline so LangChain can drive it as an LLM.
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Creating a prompt from the prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Creating the LLM chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)


At this point our `LLMChain` does not yet have access to the external context data source.

In [None]:
# Baseline: call the chain with an empty context, so the prompt carries only
# the question.
llm_chain.invoke({"context":"",
                  "question":"Should I pick Alvin Kamara for my fantasy team?"})

We see that the response is pretty generic. Let's now provide it with additional context, i.e., we integrate our FAISS database with the LLM chain.

In [None]:
# NOTE(review): this rebinds `retriever` with default settings, replacing the
# retriever that was configured when the vector store was built.
retriever = db.as_retriever()

query = "Should I pick up Alvin Kamara for my fantasy team?"

# Map the incoming query onto the prompt's two inputs -- "context" comes from
# the retriever, "question" from RunnablePassthrough() -- then pipe the result
# into the LLM chain.
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain

rag_chain.invoke(query)

In [None]:
# Ask the RAG chain a trade question.
query = "I have Josh Jacobs, should I trade him for Kareem Hunt?"

rag_chain.invoke(query)

In [None]:
# Ask the RAG chain a two-part question (trade decision plus alternatives).
query = "Should I trade Saquon Barkley? What are some alternatives."

rag_chain.invoke(query)

## Building a conversational RAG

Here I incorporate the conversation history and add a second LLM that is responsible for generating a standalone question that can appropriately query the vector database.

In [None]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# Pipeline for the standalone-question step.
# A temperature of 0.0 means "always pick the most likely token", which is
# greedy decoding. It is expressed as do_sample=False, because a temperature
# passed without sampling enabled is ignored by generation.
standalone_query_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task='text-generation',
    do_sample=False,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)

standalone_query_generation_llm = HuggingFacePipeline(pipeline=standalone_query_generation_pipeline)

# Pipeline for the answer step.
# Sampling must be switched on for the low temperature (0.2) to have any
# effect; without do_sample=True this pipeline would also decode greedily.
response_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)

response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)


The key point is that `temperature` is set to 0.2 in the response generation pipeline, whereas in the standalone query generation LLM it is 0.0 (effectively deterministic decoding). This keeps the chance of hallucination low in the standalone question, so that relevant context is retrieved.

### Chain for Standalone Questions Generation

