In [None]:
# Combine retrieval-augmented generation (RAG) and a FAISS vector index with an LLM to extract the data relevant to a user query from the latest documents
# RAGVectorDBFaiss_Llama3_InsuranceClaim
# 

In [1]:
#!pip install numpy torch transformers faiss-cpu pandas

In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode an access token in the notebook: read it from the environment
# and fall back to an interactive (non-echoing) prompt when it is not set.
hfg_token = os.environ.get("HUGGINGFACE_TOKEN", "").strip()
if not hfg_token:
    hfg_token = getpass("Hugging Face access token: ").strip()
if not hfg_token:
    raise ValueError("A Hugging Face access token is required to download the model.")

# Authenticate with the Hugging Face Hub, then expose the token to later cells
# and child processes through the same environment variable as before.
login(token=hfg_token)
os.environ["HUGGINGFACE_TOKEN"] = hfg_token


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import sqlite3
import json

# Checkpoint used for both embedding and generation ('gpt2' is a lightweight alternative)
model_name = "meta-llama/Meta-Llama-3-8B"

# Load the tokenizer first, then the causal language model weights for the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Schema of the table that keeps each document's text next to its serialized embedding
create_documents_table_sql = '''
CREATE TABLE IF NOT EXISTS documents (
    id INTEGER PRIMARY KEY,
    text TEXT NOT NULL,
    embedding TEXT NOT NULL
)
'''

# Open (or create) the SQLite database file and grab a cursor for the following cells
conn = sqlite3.connect('documents.db')
cursor = conn.cursor()

# Create the table on first use and persist the change
cursor.execute(create_documents_table_sql)
conn.commit()

In [5]:
# Some tokenizers ship without a padding token; reuse the end-of-sequence token in that case
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token


In [6]:
# Sample insurance documents (health and auto claims) used for the demonstration
documents = [
    "Policy Holder: John Doe, Claim Amount: $5000, Diagnosis: Flu.",
    "Policy Holder: Jane Smith, Claim Amount: $15000, Diagnosis: ADHD.",
    "Policy Holder: Sam Brown, Claim Amount: $10000, Diagnosis: Ulcer.",
    "Auto Insurance Claimant: Elliot, Claim Amount: $2,500, Paid: No",
    "Auto Insurance Claimant: John, Claim Amount: $3,000, Paid: Partial",
    "Auto Insurance Claimant: David, Claim Amount: $7,500, Paid: Yes"
]

# Step 1: Embed every document, persist it to SQLite, and build a FAISS index for vector retrieval
# NOTE: re-running this cell inserts the same documents into the table again.
embeddings = []

# Inference only: disable autograd so no gradient graph is kept for each forward pass
with torch.no_grad():
    for doc in documents:
        # Encode the document using the LLM
        inputs = tokenizer(doc, return_tensors='pt')
        outputs = model(**inputs)
        # Mean-pool the model's output logits over the token axis to get one row
        # vector per document (these are logits, not hidden states). FAISS needs
        # float32, so cast explicitly before converting to NumPy.
        doc_embedding = outputs.logits.mean(dim=1).detach().float().numpy()
        embeddings.append(doc_embedding)
        # Persist only THIS document's vector as a flat JSON list; serializing the
        # running `embeddings` list would store every earlier vector again.
        json_embed = json.dumps(doc_embedding[0].tolist())
        cursor.execute('INSERT INTO documents (text, embedding) VALUES (?, ?)', (doc, json_embed))
conn.commit()

# Stack the per-document row vectors into one (n_documents, dim) matrix
embeddings = np.vstack(embeddings)
# Exact (brute-force) L2 index sized to the embedding dimension
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)  # Add embeddings to the index


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [None]:
# Fetch a single stored row to sanity-check what was written to the table
cursor.execute("SELECT * FROM documents Limit 1;")
rows = cursor.fetchall()

# The cursor's description of the last query carries the column names in its first field
column_names = [column_meta[0] for column_meta in cursor.description]
print(column_names)

# Show the fetched rows (at most one because of the LIMIT), followed by a blank separator
for row in rows:
    print(row)
print("\n")

In [None]:
['id', 'text', 'embedding']
(1, 'The quick brown fox jumps over the lazy dog.', b'BM\xa8\xc2d\xce\xa8\xc2\x15\x87\xb1\xc2J\xa9\xb3\xc2m\
xa1\xb0\xc2x\x83\xb0\xc2\xda)\xaa\xc2\x1c\xa0\xab\xc2\xc1\xb6\xa9\xc2@\x88\xac\xc2\xfc\x84\xaf\xc2\xe8\x88\x9f\
xc2N\xa7\xa2\xc2\xb2\xb0\xa2\xc25~\xa7\xc2\x12\x13\xb1\xc
 

In [8]:
# Dump every vector held by the FAISS index, reconstructing each one by its position
for position in range(index.ntotal):
    print(f"Vector {position}: {index.reconstruct(position)}")

Vector 0: [ 4.063478   3.634199   5.0635753 ... -5.2163105 -5.216363  -5.216341 ]
Vector 1: [ 4.0459576  3.3205242  4.744379  ... -5.6514874 -5.651537  -5.6515217]
Vector 2: [ 4.16554    3.1222484  4.652902  ... -5.5586405 -5.558699  -5.558677 ]
Vector 3: [ 5.640151   3.7278209  5.2984076 ... -5.414397  -5.414443  -5.4143996]
Vector 4: [ 5.697008   3.4584913  5.0969944 ... -5.6340694 -5.6341114 -5.634085 ]
Vector 5: [ 5.749886   3.707664   5.148497  ... -5.6200976 -5.620149  -5.6201205]


In [9]:
# Step 2: Define a function to retrieve similar documents
def retrieve_similar_documents(query, top_k=2):
    """Return the stored documents whose embeddings are closest to the query.

    The query is embedded the same way as the indexed documents (mean of the
    model's output logits over the token axis) and searched against the
    module-level FAISS ``index``.

    Args:
        query: Query text to embed and search for.
        top_k: Maximum number of documents to return.

    Returns:
        A list of at most ``top_k`` entries of the module-level ``documents``
        list, nearest first. Empty when the index holds no vectors or
        ``top_k`` is not positive.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with the label -1, which would silently select documents[-1].
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # Encode the query; inference only, so skip autograd bookkeeping
    query_inputs = tokenizer(query, return_tensors='pt')
    with torch.no_grad():
        query_output = model(**query_inputs)
        # FAISS expects a float32 array of shape (n_queries, dim)
        query_embedding = query_output.logits.mean(dim=1).detach().float().numpy()

    # Search for the closest documents in the FAISS index
    distances, indices = index.search(query_embedding, k)
    # Defensive filter: drop any padding label that may still be present
    return [documents[i] for i in indices[0] if i >= 0]


In [10]:
# Example retrieval: find the stored claims closest to a claim-amount query
query = "Claim Amount: $10000"
similar_docs = retrieve_similar_documents(query=query)
print("Similar Documents:", similar_docs)

Similar Documents: ['Auto Insurance Claimant: John, Claim Amount: $3,000, Paid: Partial', 'Policy Holder: Sam Brown, Claim Amount: $10000, Diagnosis: Ulcer.']


In [22]:
# Build a text-generation pipeline for the checkpoint, loading the weights in bfloat16
generation_model_kwargs = {"torch_dtype": torch.bfloat16}
text_generator = pipeline(
    task="text-generation",
    model=model_name,
    model_kwargs=generation_model_kwargs,
    device_map="auto",
)

# The instruction comes first, followed by the retrieved documents joined into one string
prompt = "Give as output only the claim amount in the given text"
prompt_text = " ".join(similar_docs)

# Generate up to 100 new tokens and keep the text of the first returned sequence
generations = text_generator(f"{prompt} {prompt_text}", max_new_tokens=100)
prompt_output = generations[0]['generated_text']

# Show the generated text (the pipeline output includes the prompt itself)
print(prompt_output)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Give as output only the claim amount in the given text Auto Insurance Claimant: John, Claim Amount: $3,000, Paid: Partial Policy Holder: Sam Brown, Claim Amount: $10000, Diagnosis: Ulcer. The output should be: 3000 10000
```
import re

text = "Auto Insurance Claimant: John, Claim Amount: $3,000, Paid: Partial Policy Holder: Sam Brown, Claim Amount: $10000, Diagnosis: Ulcer"
print(re.findall(r'Claim Amount: \$(\d+)', text))
```
```
['3000', '10000']
```
CLICK HERE to find out more related problems solutions.


In [24]:
# # Load embeddings from the database
# cursor.execute('SELECT id, embedding FROM documents')
# rows = cursor.fetchall()

# # Prepare data for FAISS
# ids = []
# embeddings = []
# for row in rows:
#     ids.append(row[0])
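#     # The embedding column holds JSON text: decode it before building the float32 array FAISS expects
#     embeddings.append(np.array(json.loads(row[1]), dtype=np.float32))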


# embeddings = np.stack(embeddings)

# # Create FAISS index
# dimension = embeddings.shape[1]
# index = faiss.IndexFlatL2(dimension)
# index.add(embeddings)

# # Save FAISS index
# faiss.write_index(index, 'faiss_index.bin')