## ChromaDB + Ollama

In [1]:
#!pip install chromadb==0.5.23
# !pip install ollama==0.4.7

In [29]:
import os
import sys
import pickle
import chromadb
import torch
from chromadb import Documents, EmbeddingFunction, Embeddings
from ollama import Client
import numpy as np
from scipy.spatial.distance import cosine

# Add "src" path to Python path
sys.path.append(os.path.abspath("../src"))

# Import custom embedding function
from embedding_utils import get_ollama_embedding
from sage_utils import get_new_sage_embedding
from graphsage import GraphSAGE


In [25]:
# Load graph
with open("../data/multihop_graph_w_sage100_embeddings.pkl", "rb") as f:
    G = pickle.load(f)

# Load raw entities (for article summaries retrieval)
with open('../data/multihop_dataset_raw_entities.pkl', 'rb') as f:
    raw_entities = pickle.load(f)

In [3]:
G.nodes["Sam Altman_PER"].keys()

dict_keys(['type', 'embedding', 'SAGE_embedding'])

In [4]:
# Instantiate client
chroma_client = chromadb.Client()

In [6]:
#chroma_client.delete_collection("semantic_rag")
#chroma_client.delete_collection("SAGE_rag")

In [5]:
# Create a collection
collection_semrag = chroma_client.create_collection(name="semantic_rag")

# Iterate over the nodes and filter those with type "chunk"
for node_id in G.nodes:
    node_data = G.nodes[node_id]
    if node_data.get("type") == "chunk":
        doc_text = node_data.get("text", "")
        doc_embedding = node_data.get("embedding", None)
        
        # Ensure that doc_embedding is a list (or array) of floats
        # Add the node to the collection by specifying IDs, documents, and embeddings
        collection_semrag.add(
            ids=[str(node_id)],         # The ID will be the node's ID (converted to a string)
            documents=[doc_text],         # The node's text
            embeddings=[doc_embedding]    # The precomputed embedding you already have
        )

In [6]:
# Create a collection
collection_SAGErag = chroma_client.create_collection(name="SAGE_naive_rag")

# Iterate over the nodes and filter those with type "chunk"
for node_id in G.nodes:
    node_data = G.nodes[node_id]
    if node_data.get("type") == "chunk":
        doc_text = node_data.get("text", "")
        doc_embedding = node_data.get("SAGE_embedding", None)
        
        # Ensure that doc_embedding is a list (or array) of floats
        # Add the node to the collection by specifying IDs, documents, and embeddings
        collection_SAGErag.add(
            ids=[str(node_id)],         # The ID will be the node's ID (converted to a string)
            documents=[doc_text],         # The node's text
            embeddings=[doc_embedding]    # The precomputed embedding you already have
        )

In [7]:
# All nodes indexed

# Create a collection
collection_SAGEmixedrag = chroma_client.create_collection(name="SAGE_mixed_rag")

# Iterate over the nodes adding them to the collection
for node_id in G.nodes:
    node_data = G.nodes[node_id]
    doc_embedding = node_data.get("SAGE_embedding", None)
        
    # Ensure that doc_embedding is a list (or array) of floats
    # Add the node to the collection by specifying IDs, and embeddings
    collection_SAGEmixedrag.add(
        ids=[str(node_id)],         # The ID will be the node's ID (converted to a string)
        embeddings=[doc_embedding]    # The precomputed embedding you already have
        )

In [8]:
# Define el modelo (debe tener la misma arquitectura que el modelo guardado)
model = GraphSAGE(in_channels=1024,   # Input features (BGE-M3 embeddings)
    hidden_channels=512,  # First hidden layer (alto para máxima capacidad)
    out_channels=256,   # Output embeddings (más ricos)
    num_layers=2)  # Reemplaza con la clase de tu modelo
model.load_state_dict(torch.load("../data/graphsage_100_model.pth"))
model.eval()  # Cambia el modelo a modo de evaluación

print("Modelo cargado exitosamente")


Modelo cargado exitosamente


In [9]:
# Set device for model inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device
model = model.to(device)

In [10]:
# Suppose you have a query embedding 'query_embedding'
query = "Some query here"
query="Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?"
query_bgem3 = get_ollama_embedding(query)["embedding"]
query_sage = get_new_sage_embedding(model=model, new_feature=torch.tensor(query_bgem3, dtype=torch.float32).reshape(1024), device=device)

In [11]:
query_sage = np.array(query_sage.cpu())

In [12]:
# Query the collection (semantic)
sem_results = collection_semrag.query(
    query_embeddings=[query_bgem3],
    n_results=5
)

print(sem_results)

{'ids': [['31_chunk_1', '374_chunk_1', '10_chunk_1', '529_chunk_1', '53_chunk_2']], 'embeddings': None, 'documents': [['On Nov. 8, in the crypto exchange’s final few days, he tried to pull out his cash and crypto. How much? Almost $100,000, he said. And was he ever able to? “Never,” he told prosecutors.\n\nShortly afterward, as the trial neared late afternoon, the government called Adam Yedidia to the stand. A fast-talking graduate of MIT, he and Bankman-Fried were close friends in college, he said. And after Bankman-Fried left Jane Street, the high-frequency trading firm where the former billionaire got his start in finance after MIT, he persuaded Yedidia to join him as a trader at Alameda and then as a developer at FTX.\n\nWhen Yedidia first took the stand, Danielle Sassoon, one of the lead prosecutors, said that the college friend of Bankman-Fried had legal immunity during his testimony. Why did he make such a deal with the government, she asked.\n\n“I was concerned that I had unwit

In [13]:
# Query the collection (SAGE naive)
SAGE_naive_results = collection_SAGErag.query(
    query_embeddings=[query_sage],
    n_results=5
)

print(SAGE_naive_results)

{'ids': [['31_chunk_1', '175_chunk_0', '30_chunk_1', '53_chunk_2', '30_chunk_0']], 'embeddings': None, 'documents': [['On Nov. 8, in the crypto exchange’s final few days, he tried to pull out his cash and crypto. How much? Almost $100,000, he said. And was he ever able to? “Never,” he told prosecutors.\n\nShortly afterward, as the trial neared late afternoon, the government called Adam Yedidia to the stand. A fast-talking graduate of MIT, he and Bankman-Fried were close friends in college, he said. And after Bankman-Fried left Jane Street, the high-frequency trading firm where the former billionaire got his start in finance after MIT, he persuaded Yedidia to join him as a trader at Alameda and then as a developer at FTX.\n\nWhen Yedidia first took the stand, Danielle Sassoon, one of the lead prosecutors, said that the college friend of Bankman-Fried had legal immunity during his testimony. Why did he make such a deal with the government, she asked.\n\n“I was concerned that I had unwitt

In [None]:
# Query the collection (SAGE mixed)
SAGE_mixed_results = collection_SAGEmixedrag.query(
    query_embeddings=[query_sage],
    n_results=5
)

print(SAGE_mixed_results) 

{'ids': [['175', '30', '10', 'Jacquelyn Melinek', '31']], 'embeddings': None, 'documents': [[None, None, None, None, None]], 'uris': None, 'data': None, 'metadatas': [[None, None, None, None, None]], 'distances': [[17.60219383239746, 17.62687110900879, 17.770408630371094, 17.940692901611328, 18.134260177612305]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [32]:
def top_k_similar_nodes(G, node_ids, query_sage, k=5):
    """
    Given a list of node IDs from a graph G, compute cosine similarity between
    each node's 'SAGE_embedding' and a reference embedding 'query_sage'.
    Returns the IDs of the top K most similar nodes.

    Parameters:
        G (networkx.Graph): The graph.
        node_ids (list): List of node IDs to compare.
        query_sage (list or np.array): Reference embedding.
        k (int): Number of top similar nodes to return.

    Returns:
        List of top K node IDs with highest cosine similarity to query_sage.
    """
    similarities = []

    for node_id in node_ids:
        embedding = G.nodes[node_id].get("SAGE_embedding")
        if embedding is not None:
            # scipy's cosine() returns distance, so we subtract from 1 to get similarity
            sim = 1 - cosine(query_sage, embedding)
            similarities.append((node_id, sim))

    # Sort by similarity descending and return top K node IDs
    top_k = sorted(similarities, key=lambda x: x[1], reverse=True)[:k]
    return [node_id for node_id, _ in top_k]

In [39]:
hop_0 = SAGE_mixed_results['ids'][0]  

In [41]:
hop_0_articles = []
hop_0_chunks = []  
hop_0_PER = []
hop_0_LOC = []
hop_0_ORG = []   

for hop_0_node in hop_0:

    try:
        hop_0_node = int(hop_0_node)
    except:
        pass  
    
    node_type = G.nodes[hop_0_node]["type"]  
    if node_type == 'article':
        hop_0_articles.append(hop_0_node)
    elif node_type == 'chunk':
        hop_0_chunks.append(hop_0_node)
    elif node_type == 'entity_PER':
        hop_0_PER.append(hop_0_node)
    elif node_type == 'entity_LOC':
        hop_0_LOC.append(hop_0_node)
    elif node_type == 'entity_ORG':
        hop_0_ORG.append(hop_0_node)

In [42]:
hop_1 = [] 
for similar_node in hop_0:
    print(similar_node)

    try:
        similar_node = int(similar_node)
    except:
        pass  
    
    for neighbor in list(G.neighbors(similar_node)):
        hop_1.append(neighbor)

print(hop_1)

175
30
10
Jacquelyn Melinek
31
['Elizabeth Lopatto', 'The Verge', 'technology', '175_chunk_0', '175_chunk_1', '175_chunk_2', 'Sam Bankman - Fried_PER', 'SBF_ORG', 'FTX_ORG', 'Caroline Ellison_PER', 'Gary Wang_PER', 'Alameda_ORG', 'Binance_ORG', 'Ingrid Lunden', 'TechCrunch', 'technology', '30_chunk_0', '30_chunk_1', 'Sam Bankman - Fried_PER', 'SBF_ORG', 'FTX_ORG', 'Alameda Research_ORG', 'Sam Bankman_PER', 'Fried_PER', 'Michael Lewis_PER', 'Trump_PER', 'Biden_PER', 'Jacquelyn Melinek', 'TechCrunch', 'technology', '10_chunk_0', '10_chunk_1', 'SBF_ORG', 'FTX_ORG', 'Alameda Research_ORG', 'Caroline Ellison_PER', 'Gary Wang_PER', 'Alameda_ORG', 'Sam Bankman_PER', 'Fried_PER', 'FTT_ORG', 'Binance_ORG', 'U. S._LOC', 'BlockFi_ORG', 10, 374, 465, 'technology', 'TechCrunch', 'Ben Weiss', 'Fortune', 'business', '31_chunk_0', '31_chunk_1', 'Sam Bankman - Fried_PER', 'FTX_ORG', 'Alameda Research_ORG', 'Caroline Ellison_PER', 'Gary Wang_PER', 'Alameda_ORG', 'Bankman - Fried_PER', 'Marc - Antoine Ju

In [43]:
hop_1_articles = []
hop_1_chunks = []  
hop_1_PER = []
hop_1_LOC = []
hop_1_ORG = []   

for hop_1_node in hop_1:
    node_type = G.nodes[hop_1_node]["type"]  
    if node_type == 'article':
        hop_1_articles.append(hop_1_node)
    elif node_type == 'chunk':
        hop_1_chunks.append(hop_1_node)
    elif node_type == 'entity_PER':
        hop_1_PER.append(hop_1_node)
    elif node_type == 'entity_LOC':
        hop_1_LOC.append(hop_1_node)
    elif node_type == 'entity_ORG':
        hop_1_ORG.append(hop_1_node)

In [44]:
hop_1_articles_f = top_k_similar_nodes(G, hop_1_articles, query_sage, k=2)
hop_1_chunks_f = top_k_similar_nodes(G, hop_1_chunks, query_sage, k=2)
hop_1_PER_f = top_k_similar_nodes(G, hop_1_PER, query_sage, k=2)
hop_1_LOC_f = top_k_similar_nodes(G, hop_1_LOC, query_sage, k=2)
hop_1_ORG_f = top_k_similar_nodes(G, hop_1_ORG, query_sage, k=2)

In [45]:
hop_01_articles = set(hop_1_articles_f + hop_0_articles)
hop_01_chunks = set(hop_1_chunks_f + hop_0_chunks)
hop_01_PER = set(hop_1_PER_f + hop_0_PER)
hop_01_LOC = set(hop_1_LOC_f + hop_0_LOC)
hop_01_ORG = set(hop_1_ORG_f + hop_0_ORG)

print(hop_01_articles)
print(hop_01_chunks)
print(hop_01_PER)
print(hop_01_LOC)
print(hop_01_ORG)

{10, 175, 374, 30, 31}
{'175_chunk_0', '31_chunk_1'}
{'Sam Bankman - Fried_PER'}
{'U. S._LOC'}
{'SBF_ORG'}


In [47]:
G.nodes['175_chunk_0']['text'] 

'The trial of Sam Bankman-Fried is likely to be more consequential than just whether the man himself is found guilty. Depending on what evidence is introduced during the trial, it could be rough for the entire crypto industry.\n\n“How much damage can this trial do to the already beaten-down reputation of the industry at this point?” asks Yesha Yadav, a law professor at Vanderbilt University. “This trial is going to be an excruciating moment for the industry because no one knows what kind of evidence might come out.”\n\nBankman-Fried, the founder of FTX and Alameda Research, is facing seven counts of criminal charges: two counts of wire fraud, and five counts of conspiracy charges. FTX was a failed cryptocurrency exchange founded in 2019. According to a now-deleted profile from FTX investors Sequoia Capital, FTX was founded because of Bankman-Fried’s frustration with other exchanges when he was running Alameda Research, his crypto trading firm. According to the SEC, FTX was a fraud “fro

In [None]:
retrieved_article_summaries = [] 
for article in hop_01_articles:
    retrieved_article_summaries.append(raw_entities[article]['summary'])

retrieved_text_chunks = []
for chunk in hop_01_chunks:
    retrieved_text_chunks.append(G.nodes[chunk]['text'])

retrieved_entities = []
for i in [hop_01_PER, hop_01_LOC, hop_01_ORG]:
    for ret_entity in i:
        retrieved_entities.append(ret_entity.split('_')[0])

In [52]:
mixed_retrieval_text_fragments = retrieved_article_summaries + retrieved_text_chunks
mixed_retrieval_text_fragments

['Sam Bankman-Fried’s (SBF) criminal trial began Tuesday, addressing seven counts of fraud and conspiracy related to the collapse of his crypto exchange, FTX. Founded in 2019, FTX quickly grew, reaching a $32 billion valuation before filing for bankruptcy in November 2022.\n\nFTX’s downfall stemmed from concerns about the financial relationship between FTX and its affiliated trading firm, Alameda Research, which held a significant amount of FTT, FTX’s native token. When Binance began withdrawing funds, a “bank run” ensued. \n\nFollowing the collapse, SBF resigned, and John J. Ray III was appointed CEO. SBF was arrested in the Bahamas and extradited to the U.S., facing charges that carry a potential 115-year prison sentence. Key figures, including former CTO Gary Wang and Alameda CEO Caroline Ellison, have pleaded guilty and will cooperate with prosecutors.\n\nThe collapse of FTX triggered broader issues within the crypto industry, contributing to the bankruptcy filings of BlockFi and G

In [53]:
mixed_retrieval_entities = retrieved_entities
mixed_retrieval_entities

['Sam Bankman - Fried', 'U. S.', 'SBF']

In [55]:
mixed_retrieval_prompt = f"""
You are an expert assistant whose task is to answer the user's question using the provided relevant context. 
Use the context to support your answer and ensure that your response is clear, accurate, and concise.
The context comprises diverse text fragments containing potentially relevant information, as well as different 
selected entities connected to the query.

Text fragments:
{mixed_retrieval_text_fragments}

Entities:
{mixed_retrieval_entities} 

User Question:
{query}

Answer:
"""

print(mixed_retrieval_prompt)


You are an expert assistant whose task is to answer the user's question using the provided relevant context. 
Use the context to support your answer and ensure that your response is clear, accurate, and concise.
The context comprises diverse text fragments containing potentially relevant information, as well as different 
selected entities connected to the query.

Text fragments:
['Sam Bankman-Fried’s (SBF) criminal trial began Tuesday, addressing seven counts of fraud and conspiracy related to the collapse of his crypto exchange, FTX. Founded in 2019, FTX quickly grew, reaching a $32 billion valuation before filing for bankruptcy in November 2022.\n\nFTX’s downfall stemmed from concerns about the financial relationship between FTX and its affiliated trading firm, Alameda Research, which held a significant amount of FTT, FTX’s native token. When Binance began withdrawing funds, a “bank run” ensued. \n\nFollowing the collapse, SBF resigned, and John J. Ray III was appointed CEO. SBF wa

In [56]:
client = Client(
  host='http://localhost:11434',
  #headers={'x-some-header': 'some-value'}
)
response = client.chat(model='hf.co/unsloth/QwQ-32B-GGUF:Q6_K', messages=[
  {
    'role': 'user',
    'content': f"""{mixed_retrieval_prompt} 
""",
  },
])

print(response['message']['content'])
# or access fields directly from the response object
# print(response.message.content)

<think>
Okay, so I need to figure out who the individual is that's associated with the cryptocurrency industry, facing criminal trial on fraud and conspiracy charges as reported by The Verge and TechCrunch, and accused by prosecutors of committing fraud for personal gain. Let me start by going through the information provided.

First, looking at the entities listed: Sam Bankman-Fried, U.S., and SBF. SBF is likely an abbreviation for Sam Bankman-Fried. The user's question mentions "the individual," so it's probably Sam Bankman-Fried. But I need to confirm this with the context given in the articles provided.

Looking at the articles, they mention "Sam Bankman-Fried" multiple times. For example, in the first article excerpt, it says: "Bankman-Fried is accused of using 'billions of dollars in misappropriated FTX customer deposits' to help buy more than $200 million of real estate for himself..." and "Additionally, Bankman-Fried allegedly used more than $100 million of customer funds to ma

In [17]:
client = Client(
  host='http://localhost:11434',
  #headers={'x-some-header': 'some-value'}
)
response = client.chat(model='hf.co/unsloth/QwQ-32B-GGUF:Q6_K', messages=[
  {
    'role': 'user',
    'content': f"""
You are an expert assistant whose task is to answer the user's question using the provided context. Use the context to support your answer and ensure that your response is clear, accurate, and concise.

Context:
{sem_results}

User Question:
{query}

Answer:
""",
  },
])

print(response['message']['content'])
# or access fields directly from the response object
# print(response.message.content)

<think>
Okay, let's tackle this question. The user is asking about an individual in the crypto industry facing a criminal trial for fraud and conspiracy, specifically mentioned by The Verge and TechCrunch, and accused by prosecutors of fraud for personal gain.

First, I need to look through the provided context to find relevant information. The context has several chunks. Let's go through each one.

The first chunk talks about someone named Bankman-Fried. It mentions his trial, prosecutors, and how he couldn't retrieve $100k. There's a reference to Adam Yedidia testifying against him. The context also mentions FTX's collapse, which aligns with the crypto industry. 

Another chunk discusses Binance and Zhao, but the question is about someone facing fraud charges, so maybe not Zhao since the focus here is on Bankman-Fried's trial details. 

The third chunk mentions Bankman-Fried's defense attorney and the aftermath of FTX's collapse. The fourth chunk is about Trump, which seems unrelated

In [18]:
client = Client(
  host='http://localhost:11434',
  #headers={'x-some-header': 'some-value'}
)
response = client.chat(model='hf.co/unsloth/QwQ-32B-GGUF:Q6_K', messages=[
  {
    'role': 'user',
    'content': f"""
You are an expert assistant whose task is to answer the user's question using the provided context. Use the context to support your answer and ensure that your response is clear, accurate, and concise.

Context:
{SAGE_results}

User Question:
{query}

Answer:
""",
  },
])

print(response['message']['content'])
# or access fields directly from the response object
# print(response.message.content)

<think>
Okay, let's tackle this question step by step. The user is asking about an individual in the cryptocurrency industry who's facing a criminal trial for fraud and conspiracy, as reported by The Verge and TechCrunch. The key points here are the individual's association with crypto, the specific charges (fraud and conspiracy), and the mention by both outlets.

First, I'll look through the provided context from TechCrunch. The user mentioned both The Verge and TechCrunch, but since the context here is only from TechCrunch, I'll focus on that. 

Looking at the documents in the context:

1. The first document talks about Klarna's IPO plans and the unicorn term, so probably not related.
2. The second document mentions FTX's collapse, BlockFi, Genesis, and refers to Bankman-Fried. It says he's represented by Mark Cohen, who also handled Ghislaine Maxwell's case. The trial is starting, and the focus is on the investors and creditors affected. This seems relevant because FTX was a major c