In [4]:
from IPython.display import Markdown
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import requests
import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
import plotly.express as px
from langchain import hub

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_milvus.retrievers import MilvusCollectionHybridSearchRetriever
from langchain_milvus.utils.sparse import BM25SparseEmbedding
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from pymilvus import (
    Collection,
    CollectionSchema,
    DataType,
    FieldSchema,
    RRFRanker,
    connections,
)

In [5]:
# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials for the model
# clients created below — confirm which variables are required.
load_dotenv()

True

In [10]:
# Create the Gemini chat model used by the RAG chain later in the notebook and
# send a trivial prompt as a smoke test that the client is working.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
result = llm.invoke("hello!")
print(result.content)

Hello! How can I help you today? 



In [11]:
# Dense embedding model. The length of a single query embedding is displayed so
# it can be checked against the `dim` declared for the dense vector field in the
# collection schema further down.
embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
len(embedding.embed_query("dog"))

768

In [12]:
# Load the session records from JSON into a DataFrame; its "text" column is the
# corpus that gets embedded and indexed below.
sessions = pd.read_json("data/pydata_eindhoven_2024_sessions.json")

In [13]:
# Address passed to `connections.connect` when connecting pymilvus to the server.
CONNECTION_URI = "http://localhost:19530"

In [14]:
# Session texts as a NumPy array (`.values`); used both as the BM25 corpus and
# as the documents inserted into the collection.
texts = sessions["text"].values

In [15]:
# Build the BM25 sparse embedder from the session texts, then display the sparse
# embedding of one text (a dict of index -> weight) as a sanity check.
sparse_embedding_func = BM25SparseEmbedding(corpus=texts)
sparse_embedding_func.embed_query(texts[1])

{0: 0.9591461605237726,
 1: 15.324768712979717,
 5: 1.3499267169490157,
 11: 0.4883314709749867,
 12: 1.0216512475319814,
 15: 1.7578579175523736,
 16: 1.3499267169490157,
 17: 0.4883314709749867,
 20: 1.3499267169490157,
 21: 1.0216512475319814,
 23: 0.4727775561284613,
 27: 0.4883314709749867,
 31: 1.475197886261558,
 42: 0.4883314709749867,
 46: 0.9591461605237726,
 47: 0.9766629419499734,
 66: 0.0,
 68: 0.0,
 78: 1.3499267169490157,
 79: 1.7578579175523736,
 81: 1.4649944129249601,
 82: 0.4883314709749867,
 83: 0.4883314709749867,
 84: 0.4883314709749867,
 85: 0.4883314709749867,
 92: 0.23638877806423064,
 93: 0.4883314709749867,
 94: 0.737598943130779,
 95: 0.4883314709749867,
 102: 0.9766629419499734,
 104: 0.4795730802618863,
 120: 1.3499267169490157,
 121: 0.4795730802618863,
 122: 0.0,
 123: 6.129907485191888,
 124: 7.006124747451109,
 125: 1.7578579175523736,
 126: 7.0314316702094946,
 127: 1.9533258838999468,
 128: 2.3353749158170363,
 129: 1.7578579175523736,
 130: 4.049780


Establish the connection to Milvus using the connection URI defined above

In [17]:
# Connect pymilvus to the server at CONNECTION_URI; the Collection created
# below is created after this connection is established.
connections.connect(uri=CONNECTION_URI)

Define field names and their data types

In [18]:
# Field names are kept in variables because the insert and retriever cells
# refer to the same fields by these names.
pk_field = "doc_id"
dense_field = "dense_vector"
sparse_field = "sparse_vector"
text_field = "text"
fields = [
    # String primary key, declared with auto_id=True; the insert cell does not
    # supply a value for it.
    FieldSchema(
        name=pk_field,
        dtype=DataType.VARCHAR,
        is_primary=True,
        auto_id=True,
        max_length=100,
    ),
    # dim=768 matches the embedding length displayed for
    # `embedding.embed_query` earlier in the notebook.
    FieldSchema(name=dense_field, dtype=DataType.FLOAT_VECTOR, dim=768),
    # Sparse vector field; filled from `sparse_embedding_func` in the insert cell.
    FieldSchema(name=sparse_field, dtype=DataType.SPARSE_FLOAT_VECTOR),
    # Raw session text, returned by the retriever as the document content.
    FieldSchema(name=text_field, dtype=DataType.VARCHAR, max_length=65_535),
]

Create a collection with the defined schema

In [34]:
# Bind the fields into a schema (dynamic fields disabled) and obtain the
# collection handle used by every later cell.
# NOTE(review): the collection name is hard-coded; re-running against a server
# where it already exists with a different schema may fail — confirm.
schema = CollectionSchema(fields=fields, enable_dynamic_field=False)
collection = Collection(
    name="PyDataSessions1107", schema=schema, consistency_level="Strong"
)

Define index for dense and sparse vectors

In [35]:
# Create one index per vector field. The field-name variables from the schema
# cell are reused (rather than repeating the string literals) so the schema,
# the indexes and the retriever cannot drift apart if a field is renamed.
# Both indexes use inner product ("IP"), matching the search params used by
# the retriever.
dense_index = {"index_type": "FLAT", "metric_type": "IP"}
collection.create_index(dense_field, dense_index)
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
collection.create_index(sparse_field, sparse_index)
collection.flush()

Insert entities into the collection and load the collection

In [36]:
# Embed the whole corpus with ONE batched call to the remote dense embedding
# model instead of issuing a separate request per session text; the result is
# one vector per input text, in input order.
corpus = list(texts)
dense_vectors = embedding.embed_documents(corpus)

# BM25 is computed locally, so it is still evaluated per document exactly as
# before; only the remote dense embedding is batched.
entities = [
    {
        dense_field: dense_vector,
        sparse_field: sparse_embedding_func.embed_documents([text])[0],
        text_field: text,
    }
    for text, dense_vector in zip(corpus, dense_vectors)
]

# Insert every session and load the collection so it can be searched.
collection.insert(entities)
collection.load()

## Use retriever

In [53]:
# Hybrid retriever over the dense and sparse vector fields of the collection.
# `anns_fields`, `field_embeddings` and `field_search_params` are given in the
# same (dense, sparse) order. Both searches use inner product, the same metric
# the indexes were created with.
sparse_search_params = {"metric_type": "IP"}
dense_search_params = {"metric_type": "IP", "params": {}}
retriever = MilvusCollectionHybridSearchRetriever(
    collection=collection,
    rerank=RRFRanker(k=60),  # reranker that merges the two result lists
    anns_fields=[dense_field, sparse_field],
    field_embeddings=[embedding, sparse_embedding_func],
    field_search_params=[dense_search_params, sparse_search_params],
    top_k=4,
    text_field=text_field,  # field used as the returned document text
)

In [54]:
# Query with only the opening words of a talk title to see what the hybrid
# retriever returns for a short, keyword-like query.
retriever.invoke(
    "predicting the"
)

[Document(page_content="# How I lost 1000€ betting on CS:GO with machine learning and Python\nPeople have been using machine learning for sports betting for decades. Logistic regression applied to horse racing made someone a multi-millionaire in the 80s. While fun, betting is a losing proposition for most. The house always wins, right?\r\n\r\nWith a friend, I thought we could beat the house in e-sports by leveraging modern ML tools like LightGBM. E-sports betting is less sophisticated than football or horse racing i.e. the market is less efficient. There is a lot of online data and unknown teams. It was a space ripe for money-making, or so we thought.\r\n\r\nFirst, I will explain the theory behind e-sports betting with ML: what is an edge, financial decision-making, the expected value and decision rule for one bet, multiple bets with the Kelly criterion, probability calibration and the winner's curse.\r\n\r\nThen, I will explain how we built a web scraper to extract features, developed

### Build the RAG chain



In [55]:
# Pull the "rlm/rag-prompt" prompt from the LangChain hub; the chain below feeds
# it a "context" and a "question" input.
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in order, separated by a blank
        line. An empty iterable yields an empty string.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

# RAG pipeline: the input question is sent to the retriever, the retrieved
# documents are merged by `format_docs`, and both are fed to the prompt, then
# the chat model, then a parser that reduces the output to a string.
rag_chain = (
    # "context" comes from retrieval; "question" is the input passed through as-is.
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [56]:
# Ask a topical question end-to-end and render the model's answer as Markdown.
Markdown(
    rag_chain.invoke(
        "I'm really into responsible AI. Which talk(s) should I go to?"
    )
)

The most relevant talk for you is "Risks and Mitigations for a Safe and Responsible AI," presented by Maria Medina. This talk focuses on the ethical and safety considerations of building AI solutions, aligning with your interest in responsible AI. The other talks focus on evaluating LLM frameworks, running GenAI on local machines, and predicting cycling races using neural networks, which do not directly address responsible AI principles. 


## Test the full system on previously failing queries

In [57]:
# Retrieval only: the query is just the first word of a talk title (with a
# trailing space), one of the queries this section re-tests.
retriever.invoke(
    "Maximizing "
)

[Document(page_content='# Computer vision at the Dutch Tennis Federation: Utilizing YOLO to create insights for coaches\nThrough single-camera tennis match footage, via a YOLO-driven computer vision system, and culminating in actionable insights for strength and conditioning coaches, the Dutch Tennis Federation offers a pathway for creating tennis data and insights. In our presentation, we will delve into technical specifications and algorithms of our system, navigate through the challenges of working with tennis video footage, and elaborate on our approach to actively engage coaches in our co-creation approach. After the presentation, you will have a deeper understanding of the intricate workings behind implementing such system in a competitive tennis environment. All output of the project will be presented on Github.\n\n## Description\nTennis is seen within the community more as a skill sport than a physical sport. In this way, tennis is an exception compared to other ball sports, in

In [58]:
# Retrieval only: the same title prefix embedded in a natural-language
# question, to inspect which documents the chain would receive as context.
retriever.invoke(
    "What's the talk starting with Maximizing about?"
)

[Document(page_content="# The Levels of RAG 🦜\nLLM's can be supercharged using a technique called RAG, allowing us to overcome dealbreaker problems like hallucinations or no access to internal data. RAG is gaining more industry momentum and is becoming rapidly more mature both in the open-source world and at major Cloud vendors. But what can we expect from RAG? What is the current state of the tech in the industry? What use-cases work well and which are more challenging? Let's find out together!\n\n## Description\nRetrieval Augmented Generation (RAG) is a popular technique to combine retrieval methods like vector search together with Large Language Models (LLM's). This gives us several advantages like retrieving extra information based on a user search query: allowing us to quote and cite LLM-generated answers. Because the underlying techniques are very broadly applicable, many types of data can be used to build up a RAG system, like textual data, tables, graphs or even images.\r\n\r\n

In [44]:
# Full RAG chain on the title-prefix question; the answer is rendered as Markdown.
Markdown(
    rag_chain.invoke(
        "What's the talk starting with Maximizing about?"
    )
)

The talk titled "Maximizing marketplace experimentation: switchback design for small samples and subtle effects" explores switchback design as an alternative to A/B testing. It addresses challenges like small sample sizes and subtle effects in industries like airlines and ride-sharing.  The talk will cover the basics of switchback design, its benefits, and a real-world case study. 


In [45]:
# Retrieval only: another short, truncated query.
# NOTE(review): presumably the start of a talk title — confirm against the data.
retriever.invoke(
    "I'm gonna run "
)

[Document(page_content="# Predicting the Spring Classics of cycling with my first neural network\nLast year I attended PyData Eindhoven for the first time. I got inspired and now I’m back to present my first neural network, a network that was trained to predict the Spring Classics of cycling! With this neural network, I’m attempting to beat my friends, and myself, in a well-known fantasy cycling game.\n\n## Description\nLast year I attended PyData Eindhoven for the first time. I got inspired and now I’m back to present my first neural network, a network that was trained to predict the Spring Classics of cycling! With this neural network, I’m attempting to beat my friends, and myself, in a well-known fantasy cycling game.\r\n\r\nIn this talk, I will elaborate on the process of building a model from scratch. This will include data collection, model training and finetuning, and of course a discussion of the predicted results. The predictions will also be compared to an existing cycling pr

In [46]:
# NOTE(review): this repeats, verbatim, the retriever query already run earlier
# in this section; consider removing one of the two cells.
retriever.invoke(
    "What's the talk starting with Maximizing about?"
)

[Document(page_content="# The Levels of RAG 🦜\nLLM's can be supercharged using a technique called RAG, allowing us to overcome dealbreaker problems like hallucinations or no access to internal data. RAG is gaining more industry momentum and is becoming rapidly more mature both in the open-source world and at major Cloud vendors. But what can we expect from RAG? What is the current state of the tech in the industry? What use-cases work well and which are more challenging? Let's find out together!\n\n## Description\nRetrieval Augmented Generation (RAG) is a popular technique to combine retrieval methods like vector search together with Large Language Models (LLM's). This gives us several advantages like retrieving extra information based on a user search query: allowing us to quote and cite LLM-generated answers. Because the underlying techniques are very broadly applicable, many types of data can be used to build up a RAG system, like textual data, tables, graphs or even images.\r\n\r\n

Drop the collection

In [None]:
# Destructive cleanup: drops the collection and the data inserted above. After
# this runs, the retriever cells will not work until the collection is
# re-created and re-populated.
collection.drop()