In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import hashlib
import os
from textwrap import fill

import gradio as gr
import torch
import transformers
from IPython.display import Markdown, display

from langchain import HuggingFacePipeline
from langchain import PromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain, RetrievalQA, ConversationalRetrievalChain
from langchain.document_loaders import UnstructuredMarkdownLoader, UnstructuredURLLoader, PyMuPDFLoader, PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    )
from langchain.schema import AIMessage, HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the instruction-tuned model in 4-bit and wrap it in a text-generation
# pipeline. The cell is safe to re-run: the pipeline factory is reached through
# the `transformers` module, so rebinding the name `pipeline` below does not
# break a second execution.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

# 4-bit NF4 weights with double quantization; compute is done in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): trust_remote_code=True allows code shipped in the hub repo to
# run locally; confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quantization_config
)

# Start from the checkpoint's own generation defaults, then override.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024
# Sampling stays enabled, but the tiny temperature makes it close to greedy.
generation_config.temperature = 0.0001
generation_config.top_p = 0.95
generation_config.do_sample = True
generation_config.repetition_penalty = 1.15
# Set the padding id explicitly; generate() otherwise falls back to the EOS id
# and logs "Setting `pad_token_id` to `eos_token_id`" on every call.
generation_config.pad_token_id = tokenizer.eos_token_id

# The result keeps the name `pipeline` because later cells reference it. The
# factory is called as `transformers.pipeline` so that this assignment, which
# shadows the imported function, cannot make a re-run of the cell fail.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)

Loading checkpoint shards: 100%|██████████| 2/2 [03:36<00:00, 108.46s/it]


In [4]:
# Expose the Hugging Face text-generation pipeline as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipeline)

In [5]:
# Sanity check: send a raw question (no prompt template) straight to the LLM.
query = "Explain the difference between ChatGPT and open source LLMs in a couple of lines."
result = llm(query)

# Show the question in bold, followed by the answer as a paragraph.
display(
    Markdown(f"<b>{query}</b>"),
    Markdown(f"<p>{result}</p>"),
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>Explain the difference between ChatGPT and open source LLMs in a couple of lines.</b>

<p>
ChatGPT is a proprietary model developed by OpenAI, while open source LLMs are models that are made available for anyone to use, modify, and distribute under an open-source license.</p>

In [6]:
# Sentence-embedding model used to index and query the document chunks.
# Runs on the GPU when one is visible to torch and falls back to the CPU
# otherwise, so the cell does not fail on a machine without CUDA.
embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-large",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    # Embeddings are requested normalized.
    encode_kwargs={"normalize_embeddings": True},
)

(…)9d85c1cec4167ed28b43b04c2/.gitattributes: 100%|██████████| 1.52k/1.52k [00:00<00:00, 5.62MB/s]
(…)ec4167ed28b43b04c2/1_Pooling/config.json: 100%|██████████| 191/191 [00:00<00:00, 697kB/s]
(…)4d9ec9d85c1cec4167ed28b43b04c2/README.md: 100%|██████████| 67.9k/67.9k [00:00<00:00, 606kB/s]
(…)9ec9d85c1cec4167ed28b43b04c2/config.json: 100%|██████████| 619/619 [00:00<00:00, 4.69MB/s]
model.safetensors: 100%|██████████| 670M/670M [00:37<00:00, 18.1MB/s] 
(…)85c1cec4167ed28b43b04c2/onnx/config.json: 100%|██████████| 632/632 [00:00<00:00, 4.74MB/s]
model.onnx: 100%|██████████| 1.34G/1.34G [01:08<00:00, 19.5MB/s]
(…)d28b43b04c2/onnx/special_tokens_map.json: 100%|██████████| 125/125 [00:00<00:00, 901kB/s]
(…)1cec4167ed28b43b04c2/onnx/tokenizer.json: 100%|██████████| 712k/712k [00:00<00:00, 6.02MB/s]
(…)7ed28b43b04c2/onnx/tokenizer_config.json: 100%|██████████| 342/342 [00:00<00:00, 2.68MB/s]
(…)9d85c1cec4167ed28b43b04c2/onnx/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.01MB/s]
pytorch_m

In [7]:
# Instruction prompt with a fixed persona block followed by the user's text.
# NOTE(review): the bare "<>" markers look like system-prompt tags whose
# contents were stripped somewhere along the way; confirm the intended markup.
template = (
    "\n"
    "[INST] <>\n"
    "Act as a Machine Learning engineer who is teaching high school students.\n"
    "<>\n"
    "\n"
    "{text} [/INST]\n"
)

# `text` is the only placeholder in the template.
prompt = PromptTemplate(template=template, input_variables=["text"])

In [8]:
# Ask a question through the persona prompt defined in the previous cell.
query = "Explain what are Deep Neural Networks in 2-3 sentences"
result = llm(
    prompt.format(text=query)
)

# Show the question in bold, followed by the answer as a paragraph.
display(
    Markdown(f"<b>{query}</b>"),
    Markdown(f"<p>{result}</p>"),
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>Explain what are Deep Neural Networks in 2-3 sentences</b>

<p>Deep neural networks (DNN) are a type of artificial intelligence model that simulates the structure and function of the human brain. They consist of multiple layers of interconnected nodes, or neurons, that process information by passing it through successive layers until an output is generated. DNNs can be trained on large amounts of data to recognize patterns and make predictions, making them useful for tasks such as image recognition, speech recognition, and natural language processing.</p>

In [12]:
from glob import glob

# Directory that holds the PDFs to index. Set the PDF_PAPERS_DIR environment
# variable to point the notebook at another folder; when it is unset the
# original hard-coded location is used, so existing runs behave as before.
pdf_paths = os.environ.get("PDF_PAPERS_DIR", "/home/guilherme/phd/papers/")

In [13]:
# Loader over the PDF files found under `pdf_paths`; nothing is read yet.
pdfs = PyPDFDirectoryLoader(path=pdf_paths)

In [16]:
# Read the PDFs through the directory loader and keep the resulting documents.
documents = pdfs.load()

In [18]:
# How many documents were loaded (presumably one per PDF page; confirm).
len(documents)

120

In [21]:
# Split into chunks of at most 1024 characters, overlapping by 64.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
)

# NOTE(review): only the first 10 loaded documents are chunked and indexed;
# confirm this subset is intentional rather than a leftover from a quick trial.
texts_chunks = text_splitter.split_documents(
    documents[:10]
)

# Number of chunks produced.
len(texts_chunks)

61

In [24]:
# Pick a document outside the indexed slice (documents[:10]).
# NOTE(review): not referenced again in the visible cells; confirm it is still needed.
query_doc = documents[11]

In [22]:
# Build (or refresh) the persisted Chroma index for the chunks.
#
# Each chunk gets a deterministic id: its position plus a SHA-1 of its text.
# Without explicit ids the store generates fresh random ones on every call,
# so re-running this cell against the same "db" directory would append a
# second copy of every chunk and skew retrieval. With stable ids a re-run
# writes to the same entries instead.
# NOTE(review): entries written by earlier runs under random ids are not
# removed; delete the "db" directory once if it already contains duplicates.
chunk_ids = [
    f"{position}-{hashlib.sha1(chunk.page_content.encode('utf-8')).hexdigest()}"
    for position, chunk in enumerate(texts_chunks)
]

db = Chroma.from_documents(
    texts_chunks,
    embeddings,
    ids=chunk_ids,
    persist_directory="db",
)

In [32]:
# Prompt for retrieval-augmented answers: retrieved context first, then the
# user's question.
# NOTE(review): the bare "<>" markers look like system-prompt tags whose
# contents were stripped somewhere along the way; confirm the intended markup.
template = (
    "\n"
    "[INST] <>\n"
    "Use the following information to answer the question at the end.\n"
    "<>\n"
    "\n"
    "{context}\n"
    "\n"
    "{question} [/INST]\n"
)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# "stuff" chain over the 3 chunks returned by the retriever for each query;
# the source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs=dict(k=3)),
    return_source_documents=True,
    chain_type_kwargs=dict(prompt=prompt),
)

In [33]:
# Ask the retrieval chain a question about the indexed papers.
query = "What are these papers about?"
result_ = qa_chain(query)

# The chain returns a mapping; the answer text is under "result".
result = result_["result"].strip()

# Show the question in bold, followed by the answer as a paragraph.
display(
    Markdown(f"<b>{query}</b>"),
    Markdown(f"<p>{result}</p>"),
)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>What are these papers about?</b>

<p>These papers are about using transformers in reinforcement learning (RL) to improve the performance of RL algorithms. They discuss various approaches that have been developed to enhance the architecture and optimize the trajectories of RL models using transformers. Some examples of these approaches include replacing LSTMs with gated transformer-XL in V-MPO, adding recurrence to the Factorized World Prediction (FWP) network, and combining contrastive loss and a hybrid LSTM-transformer in COBERL. The papers also highlight several applications of transformer-based RL (TRL) in areas such as robotic manipulation, text-based games, and autonomous driving.</p>