<a href="https://colab.research.google.com/github/droy9/nyc-officer-complaints-RAG/blob/main/complaintsrag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install portkey_ai

In [None]:
from google.colab import userdata

# RAG over the NYC officer complaint data

Complaint Data

In [None]:
import pandas as pd

def narrative(r):
    """Render one complaint record as a three-paragraph English narrative.

    Args:
        r: A row-like object indexable by column name (e.g. a pandas Series
            passed by ``DataFrame.apply(..., axis=1)`` or a plain dict). The
            keys read are: incident_date, complaint_id, first_name, last_name,
            location_type, contact_reason, fado_type, allegation_cat,
            contact_outcome, ccrb_disposition, penalty_rec, status_cat,
            officer_race, officer_gender, days_on_force,
            officer_incident_rank, officer_current_rank, impacted_race and
            impacted_gender.

    Returns:
        A string made of three paragraphs separated by blank lines: the
        complaint overview, the officer statistics, and the statistics of the
        victim / alleged victim. Fields equal to the placeholder "Unlisted"
        are described as unknown.
    """
    unlisted = "Unlisted"

    def describe(key, known, unknown, sentinel=unlisted):
        # Pick the "known" template (filled with the value) unless the field
        # holds the placeholder that marks missing data.
        value = r[key]
        return known.format(value) if value != sentinel else unknown

    # --- Paragraph 1: the complaint itself --------------------------------
    date = describe("incident_date", "On {}", "On an unknown date")
    if r["first_name"] != unlisted and r["last_name"] != unlisted:
        on = f", {r['complaint_id']} was filed against officer {r['first_name']}, {r['last_name']}"
    else:
        # Keep the complaint id and the verb so the sentence stays complete
        # even when the officer's name is missing.
        on = f", {r['complaint_id']} was filed against an unknown officer"
    # No period here: the sentence continues with the reason clause below.
    where = describe("location_type", " at a(n) {}", " at an unknown location")
    reason = f" for the reason: \"{r['contact_reason']}\"."
    alleg = (
        f" The allegation was classified with the FADO type of {r['fado_type']}"
        f" with a specific allegation of {r['allegation_cat']}."
    )
    outcome = f" The contact outcome was {r['contact_outcome']}."
    # Compare on the stripped value so the check works whether or not the
    # stored text carries surrounding whitespace.
    # NOTE(review): assumes "Complainant Unavailable" is the disposition that
    # should be reported as unavailable - confirm against the dataset.
    if str(r["ccrb_disposition"]).strip() != "Complainant Unavailable":
        ccrb_disposition = f" The ccrb_disposition is \"{r['ccrb_disposition']}\"."
    else:
        ccrb_disposition = " The ccrb_disposition is unavailable."
    penalty = describe(
        "penalty_rec",
        " The penalty received is {}.",
        " There was no penalty received.",
        sentinel="Not Applicable",
    )
    status_cat = describe(
        "status_cat",
        " The complaint status is {} as of 4/1/2021.",
        " The complaint status is unknown.",
    )
    overview = "".join(
        [date, on, where, reason, alleg, outcome, ccrb_disposition, penalty, status_cat]
    )

    # --- Paragraph 2: the officer -----------------------------------------
    officer_statistics = "".join([
        describe("officer_race", "The officer race is {}.", "The officer race is unknown."),
        describe("officer_gender", " The officer sex is {}.", " The officer sex is unknown."),
        describe(
            "days_on_force",
            " The officer was on force for {} days when this dataset was last recorded.",
            " The officer was on force for an unknown number of days when this dataset was last recorded.",
        ),
        describe(
            "officer_incident_rank",
            " The officer rank at the time of the incident was {}.",
            " The officer rank during the incident is unknown.",
        ),
        describe(
            "officer_current_rank",
            " The officer current rank is {}.",
            " The officer current rank is unknown.",
        ),
    ])

    # --- Paragraph 3: the victim / alleged victim -------------------------
    impacted_statistics = "".join([
        describe(
            "impacted_race",
            "The race of the victim / alleged victim is {}.",
            "The race of the victim / alleged victim is unknown.",
        ),
        describe(
            "impacted_gender",
            " The gender of the victim / alleged victim is {}.",
            " The gender of the victim / alleged victim is unknown.",
        ),
    ])

    return f"{overview}\n\n{officer_statistics}\n\n{impacted_statistics}"


# Lift pandas' display limits so the full narrative text is shown untruncated.
for display_option in ("display.max_rows", "display.max_columns", "display.max_colwidth"):
    pd.set_option(display_option, None)

# Load the cleaned complaints; .head() keeps only the first five rows, so
# everything derived from `df` below covers just those rows.
df = pd.read_csv("complaintclean.csv").head()

# One narrative string per row; the bare name makes the notebook display it.
summary = df.apply(narrative, axis=1)
summary


In [None]:
# Attach the generated narratives to the DataFrame as a new 'summary' column
df['summary'] = summary

# Display the first few rows with the new 'summary' column
display(df.head())

In [None]:
# Write the DataFrame (including the 'summary' column) to a CSV file; adjust the
# file path to where the output should be saved. index=False omits the row index.
df.to_csv("complaintclean_narrative.csv", index=False)

In [None]:
##For RAG
!pip install pandas faiss-cpu sentence-transformers portkey-ai streamlit

In [None]:
#To Build Index
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

# Input: the CSV holding the 'complaint_id' and 'summary' columns.
CSV_PATH = "complaintclean_narrative.csv" # change to the location of your CSV
# Outputs: per-chunk metadata (JSON) and the FAISS index file written further below.
META_PATH  = "eebo_cc_meta.json"
INDEX_PATH = "eebo_cc_allmini.cosine.faiss"

df = pd.read_csv(CSV_PATH, low_memory=False)
# Keep only the two columns used for indexing, drop rows without a summary, and
# renumber the rows 0..n-1 so the row position can be stored as "row_id".
df = df[["complaint_id", "summary"]].dropna(subset=["summary"]).reset_index(drop=True)

def chunk_text(t, size=800, overlap=100):
    """Split text into fixed-size character windows that overlap.

    Args:
        t: Value to split; it is converted with ``str()`` first.
        size: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters each chunk shares with the end of the
            previous chunk. Must satisfy ``0 <= overlap < size``.

    Returns:
        A list of chunks in order. Text no longer than ``size`` (including
        the empty string) is returned as a single-element list.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)``. An overlap >= size would never advance the window
            and loop forever; a negative overlap would silently skip text.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if not 0 <= overlap < size:
        raise ValueError(f"overlap must satisfy 0 <= overlap < size, got {overlap}")

    t = str(t)
    if len(t) <= size:
        return [t]

    step = size - overlap  # distance between consecutive window starts
    chunks = []
    for start in range(0, len(t), step):
        end = start + size
        chunks.append(t[start:end])
        # Stop once a window reaches the end so no redundant tail chunk is added.
        if end >= len(t):
            break
    return chunks

# Build the texts to embed (docs) and, in lockstep, the metadata needed to map
# each embedded chunk back to its source row (meta).
docs = []
meta = []
for row_idx, record in df.iterrows():
    heading = str(record["complaint_id"] or "")
    pieces = chunk_text(str(record["summary"]), size=200)
    for piece_no, piece in enumerate(pieces):
        # Each chunk is prefixed with the complaint id so the id is embedded too.
        docs.append(f"{heading}\n\n{piece}".strip())
        meta.append({"row_id": int(row_idx), "chunk_id": int(piece_no), "title": heading})

# Embed every chunk; normalized vectors make inner product equal cosine similarity.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embs = model.encode(docs, convert_to_numpy=True, normalize_embeddings=True)

dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)  # cosine if vectors are normalized
index.add(embs)

# Persist the index and the chunk metadata; the i-th metadata entry describes
# the i-th vector added to the index.
faiss.write_index(index, INDEX_PATH)
with open(META_PATH, "w", encoding="utf-8") as f:
    json.dump(meta, f)

print(f"Built index with {len(docs)} chunks → {INDEX_PATH}")


In [13]:
# Chat using LLM APIs via Portkey
import os, json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from portkey_ai import Portkey
from google.colab import userdata

INDEX_PATH = "eebo_cc_allmini.cosine.faiss"
META_PATH  = "eebo_cc_meta.json"
CSV_PATH   = "complaintclean_narrative.csv"  # change to the location of your CSV

# --- Load retrieval artifacts ---
index = faiss.read_index(INDEX_PATH)
with open(META_PATH) as f:
    META = json.load(f)
# Apply the same filtering and renumbering that was used when the index was
# built, so that the positional "row_id" stored in META points at the same row.
df = (
    pd.read_csv(CSV_PATH, low_memory=False)[["complaint_id", "summary"]]
    .dropna(subset=["summary"])
    .reset_index(drop=True)
)

# Embedder (must be the same model that was used to build the index)
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def retrieve(query, k=4, max_ctx_chars=2000):
    """Return up to ``k`` indexed chunks most similar to ``query``.

    Args:
        query: The question text to embed and search with.
        k: Number of nearest chunks to request from the index.
        max_ctx_chars: Maximum number of characters of context kept per hit.

    Returns:
        A list of dicts with keys ``score``, ``row_id``, ``chunk_id``,
        ``title`` and ``context``. ``context`` is the complaint id plus the
        full summary of the row the chunk came from (not just the matching
        chunk), truncated to ``max_ctx_chars``. The list is shorter than ``k``
        when the index holds fewer than ``k`` vectors.
    """
    q = embedder.encode([query], normalize_embeddings=True).astype(np.float32)
    D, I = index.search(q, k)
    hits = []
    for idx, score in zip(I[0], D[0]):
        # FAISS fills missing results with label -1; without this guard
        # META[-1] would silently return the last chunk as a bogus hit.
        if idx < 0:
            continue
        m = META[int(idx)]
        row = df.iloc[m["row_id"]]
        context = (str(row["complaint_id"]) + "\n\n" + str(row["summary"])).strip()
        hits.append({
            "score": float(score),
            "row_id": m["row_id"],
            "chunk_id": m["chunk_id"],
            "title": m["title"],
            "context": context[:max_ctx_chars]
        })
    return hits

# --- Portkey client (your format) ---
# Make sure your Portkey API key is correctly set up in Colab's secrets with the name `PORT_KEY`
# and has the required permissions.
portkey = Portkey(
  api_key = userdata.get('PORT_KEY')
)

# Model identifier passed as `model=` on every chat completion request below.
MODEL = "@first-integrati-db9427/gemini-2.5-flash-lite"

def call_llm_portkey(system_prompt, user_prompt):
    """Send one system + user message pair through Portkey and return the reply text.

    The request uses the module-level ``MODEL`` and caps the reply at 512 tokens.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = portkey.chat.completions.create(
        model=MODEL,
        messages=conversation,
        max_tokens=512,
    )
    # Only the first choice is used; its message content is the answer text.
    first_choice = completion.choices[0]
    return first_choice.message.content

def answer(query):
    """Answer ``query`` with the LLM, grounded in the two best-matching complaints.

    Args:
        query: The user's question.

    Returns:
        A ``(answer_text, citations)`` tuple. ``citations`` is a newline-joined
        list with one ``- [n] row_id=... chunk=... title=...`` line per
        retrieved context, numbered to match the ``[n]`` markers in the prompt.
    """
    contexts = retrieve(query, k=2)
    ctx_block = "\n\n---\n".join(
        f"[{i+1}] Title: {c['title']}\n{c['context']}" for i, c in enumerate(contexts)
    )
    cites = "\n".join(
        f"- [{i+1}] row_id={c['row_id']} chunk={c['chunk_id']} title={c['title']}"
        for i, c in enumerate(contexts)
    )
    # Adjacent string literals are concatenated verbatim, so every piece must
    # carry its own trailing space/newline or the sentences run together.
    system = (
        "You are a helpful assistant. Use the provided CONTEXT to answer, "
        "but if you need to, use verified external knowledge and make a disclaimer to let "
        "the user know that you're using external data.\n"
        "Don't be afraid to summarize all of the data.\n"
        "If insufficient, say you don't know."
    )
    user = f"QUESTION: {query}\n\nCONTEXT:\n{ctx_block}\n\nReturn a concise answer"
    out = call_llm_portkey(system, user)
    return out, cites

if __name__ == "__main__":
    # Simple interactive loop: read a question, print the answer and its sources.
    print("RAG chat ready (Portkey). Type a question (Ctrl+C to quit).")
    while True:
        try:
            q = input("\nYou: ").strip()
            if not q:
                # Ignore empty input and prompt again.
                continue
            ans, src = answer(q)
            print("\nAssistant:", ans)
            print("\nSources:\n", src)
        except (KeyboardInterrupt, EOFError):
            # EOFError is raised by input() when stdin is closed (e.g. Ctrl+D
            # or exhausted piped input); exit as cleanly as on Ctrl+C.
            print("\nBye!")
            break


Bye!
