In [1]:
import os
import tempfile
from pathlib import Path

import joblib
import requests
import torch
import transformers
from dotenv import load_dotenv
from IPython.display import display_markdown
from langchain.chains.conversational_retrieval.base import ConversationalRetrievalChain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import BSHTMLLoader
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_community.vectorstores.faiss import FAISS

# --- Static configuration -------------------------------------------------
# HuggingFace hub identifiers of the generator LLM and the embedding model
LLM_MODEL: str = "mistralai/Mistral-7B-Instruct-v0.2"
EMBED_MODEL: str = "sentence-transformers/all-mpnet-base-v2"

# Parent directory of the current working directory
PROJECT_ROOT = Path().resolve().parent

# --- Hardware checks ------------------------------------------------------
# The notebook needs a CUDA GPU with bfloat16 support; stop right away otherwise
assert torch.cuda.is_available()
assert torch.cuda.is_bf16_supported()

# --- Environment ----------------------------------------------------------
# Read variables from a .env file (if any), then require the HuggingFace token
load_dotenv()
HF_TOKEN: str = os.environ["HF_TOKEN"]

# Device string of the currently selected CUDA device, e.g. "cuda:0"
device = f"cuda:{torch.cuda.current_device()}"

# joblib cache rooted at ./.cache
memory = joblib.Memory(".cache")

In [2]:
# Model configuration shipped with the pretrained checkpoint on the HuggingFace hub
model_config = transformers.AutoConfig.from_pretrained(LLM_MODEL, token=HF_TOKEN)

# BitsAndBytes quantization settings that shrink the LLM's memory footprint:
# 4-bit NF4 weights, double quantization, bfloat16 for compute
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Download (if needed) and load the quantized LLM
model = transformers.AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    token=HF_TOKEN,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
)

# The model is only used for inference, so switch it to evaluation mode
model.eval()

print(f"Model loaded on {device}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on cuda:0


In [3]:
# The text-generation pipeline needs a tokenizer to convert plaintext to tokens and back
tokenizer = transformers.AutoTokenizer.from_pretrained(LLM_MODEL, token=HF_TOKEN)

# Tokenizer tweaks: pad on the right, reusing the end-of-sequence token as the padding token
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# HuggingFace text-generation pipeline built from the quantized model and its tokenizer.
# Sampling options (do_sample / temperature) are deliberately not set here.
generate_text = transformers.pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    repetition_penalty=1.1,  # without this the output begins repeating
    max_new_tokens=8192,  # upper bound on the number of generated tokens
    return_full_text=True,
)

# LangChain wrapper around the pipeline
llm = HuggingFacePipeline(pipeline=generate_text)

# Sentence-embedding model used to index and query the document chunks
embedding = HuggingFaceEmbeddings(
    model_kwargs={"device": "cuda"},
    model_name=EMBED_MODEL,
)

In [5]:
# Get content from the EEML 2024 website.
# The loader is given a file path, so the HTML is staged in a temporary directory
# that is cleaned up as soon as the documents have been loaded.
with tempfile.TemporaryDirectory() as tmpdir:
    html_path = Path(tmpdir) / "eeml.html"

    # Fail fast on network problems: never hang forever, never index an HTTP error page
    response = requests.get("https://www.eeml.eu/", timeout=30)
    response.raise_for_status()

    # Use UTF-8 explicitly for both writing and reading: the page contains non-ASCII
    # names, and the platform default encoding is not guaranteed to handle them
    html_path.write_text(response.text, encoding="utf-8")

    docs = BSHTMLLoader(str(html_path), open_encoding="utf-8").load()

In [6]:
# Split text into chunks for the vector store and to respect the LLM's context length
# TODO: Chunk size could probably be smaller
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(docs)

# Initialize FAISS (Facebook AI Similarity Search) embeddings database.
vectorstore = FAISS.from_documents(all_splits, embedding)

# The number of chunks to retrieve has to go through `search_kwargs`; a bare `k=...`
# keyword on `as_retriever` is not forwarded to the similarity search.
# (The variable name keeps its original spelling because later cells reference it.)
retreiver = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)

In [7]:
# Prompt sent to the LLM for every question: instructions, the retrieved context
# fenced in triple backquotes, the question, and an "### Answer:" marker at the end.
PROMPT_TEMPLATE = """
You are a helpful AI QA assistant. When answering questions, use the context enclosed by triple backquotes if it is relevant.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Reply your answer in markdown format.

```
{context}
```

### Question:
{question}

### Answer:
"""

# Surrounding blank lines of the literal are stripped before building the template
prompt_template = PromptTemplate(
    template=PROMPT_TEMPLATE.strip(),
    input_variables=["context", "question"],
)

# Assemble the complete chain: retriever + LLM + custom prompt.
# TODO: Injecting the prompt through `combine_docs_chain_kwargs` is not pretty,
#       but no other way has been found so far.
llm_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retreiver,
    combine_docs_chain_kwargs={"prompt": prompt_template},
    return_source_documents=False,
    verbose=False,
)

def answer_question(question: str, history: "list | None" = None) -> str:
    """Ask the retrieval chain a question and return only the generated answer.

    Args:
        question: The question to answer.
        history: Earlier conversation turns, forwarded to the chain as
            ``chat_history``. ``None`` (the default) means an empty history.
            NOTE(review): the element format is whatever the chain accepts for
            ``chat_history``; confirm against the LangChain documentation before
            passing a non-empty history.

    Returns:
        The text after the last ``### Answer:`` marker in the chain's ``answer``
        field, with surrounding whitespace removed.
    """
    # Conversation history is not implemented yet, so callers normally omit it
    chat_history = [] if history is None else history

    response = llm_chain.invoke({"question": question, "chat_history": chat_history})

    # The generation pipeline is created with return_full_text=True, so the output is
    # expected to echo the prompt; keep only what follows the final answer marker.
    return response["answer"].split("### Answer:")[-1].strip()

In [8]:
# Ask a general question about the event and render the reply as Markdown
question = "What is the EEML conference about?"
display_markdown(answer_question(question=question), raw=True)

The Eastern European Machine Learning (EEML) Summer School is an annual event focused on machine learning research and development. It brings together leading researchers, industry experts, and students to learn about the latest advancements in machine learning and related fields. The conference covers various topics such as basics of machine learning, multimodal learning, natural language processing, advanced deep learning architectures, generative models, reinforcement learning, and AI for science. Speakers include renowned researchers and professionals from top institutions and companies like Google DeepMind, New York University, McGill University, and the University of Cambridge. The event also offers opportunities for networking, collaboration, and learning through tutorials and workshops.

In [9]:
# Ask for the list of speakers and render the reply as Markdown
question = "Who are the speakers?"
display_markdown(answer_question(question=question), raw=True)

The speakers for the Eastern European Machine Learning Summer School 2024 include Aleksandra Faust from Google DeepMind, Alfredo Canziani from New York University, Chris Dyer from Google DeepMind, Doina Precup from McGill University and Google DeepMind, Jovana Mitrović from Google DeepMind, Kyunghyun Cho from New York University, Martin Vechev from ETH Zürich, INSAIT, Michael Bronstein from the University of Oxford, Mihaela van der Schaar from the University of Cambridge, Nenad Tomašev from Google DeepMind, Razvan Pascanu from Google DeepMind, Sander Dieleman from Google DeepMind, Velibor Ilić from the Institute for AI Research and Development of Serbia, Vladimir Gligorijević from Genentech, Yee Whye Teh from the University of Oxford, Anastasija Ilić from Google DeepMind, Andreea Deac from Isomorphic Labs, Cristian Bodnar from Microsoft Research, Ioana Bica from Google DeepMind, Iulia Duță from the University of Cambridge, Matko Bošnjac from Google DeepMind, Ognjen Milinković from the University of Belgrade, Petar Veličković from Google DeepMind, and teaching assistants TBD. The organizing team includes Doina Precup from McGill University and Google DeepMind, Razvan Pascanu from Google DeepMind, Viorica Patraucean from Google DeepMind, Branislav Kisačanin from NVIDIA, Dubravko Ćulibrk from the Institute for AI Research and Development of Serbia, Matko Bošnjak from Google DeepMind, and Nemanja Rakićević from Google DeepMind. Technical support will be provided by Gabriel Marchidan from IasiAI and Feel IT Services. Partners include The Institute for Artificial Intelligence Research and Development of Serbia and various sponsors. For more information or to sponsor the event, contact contact@eeml.eu.

In [10]:
# Ask about one specific person and render the reply as Markdown
question = "Is Petar Veličković involved and what is his affiliation?"
display_markdown(answer_question(question=question), raw=True)

Yes, Petar Veličković is involved in Eastern European Machine Learning Summer School 2023 as a speaker. His affiliation is with Google DeepMind and the University of Cambridge.