In [4]:
import os
import getpass
from langchain.retrievers import EnsembleRetriever
from langchain.chat_models import init_chat_model
from langchain_community.retrievers import WikipediaRetriever
import requests
from langchain.schema.document import Document
from langchain.schema.retriever import BaseRetriever
from langchain.document_loaders import TextLoader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough


# Collect the API keys the notebook needs. Each key is requested only when it
# is not already present in the environment, and always through getpass so the
# secret is never echoed into the notebook output.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

if not os.environ.get("OPENSANCTIONS_API_KEY"):
    os.environ["OPENSANCTIONS_API_KEY"] = getpass.getpass(
        "Enter your OpenSanctions API Key: "
    )

# Kept as a module-level name for any later cell that reads it directly.
api_key = os.environ["OPENSANCTIONS_API_KEY"]

In [5]:
# Chat model used for the final risk assessment; served through Groq.
model = init_chat_model(
    "deepseek-r1-distill-llama-70b",
    model_provider="groq",
)

In [6]:
class OpenSanctionsRetriever(BaseRetriever):
    """Retriever that screens named parties against the OpenSanctions match API.

    The query string may tag the parties to screen as comma-separated parts,
    e.g. ``"Person: Jane Doe, Company: Acme Ltd"``. Each tagged name is sent
    to the ``/match/default`` endpoint and every returned candidate entity
    becomes one ``Document``.
    """

    def _get_relevant_documents(self, query: str, *, run_manager=None):
        """Query the OpenSanctions API and return one document per candidate.

        Args:
            query: Free text optionally containing ``Person:`` and/or
                ``Company:`` tags, separated by commas.
            run_manager: Callback manager supplied by LangChain; unused here.

        Returns:
            A list of ``Document`` objects. ``page_content`` is a readable
            summary (names, match flag, topics, datasets) and ``metadata``
            holds the keys ``id``, ``name``, ``match``, ``topics`` and
            ``datasets``. The list is empty when the query carries no tagged
            name.

        Raises:
            ValueError: If ``OPENSANCTIONS_API_KEY`` is not set.
            requests.HTTPError: If the API answers with an error status.
        """
        person_name = ""
        company_name = ""
        # NOTE: parts are split on commas, so a tagged name that itself
        # contains a comma (e.g. "Tesla, Inc.") is cut at the first comma.
        for part in query.split(","):
            if "Company:" in part:
                company_name = part.split("Company:")[-1].strip()
            elif "Person:" in part:
                person_name = part.split("Person:")[-1].strip()

        # Only ask about names that were actually provided; an empty name
        # cannot match anything and just wastes an API call.
        queries = {}
        if person_name:
            queries["query-A"] = {
                "schema": "Person",
                "properties": {"name": [person_name]},
            }
        if company_name:
            queries["query-B"] = {
                "schema": "Company",
                "properties": {"name": [company_name]},
            }
        if not queries:
            return []

        # Read the key from the environment instead of embedding it in source.
        api_key = os.environ.get("OPENSANCTIONS_API_KEY")
        if not api_key:
            raise ValueError(
                "OPENSANCTIONS_API_KEY environment variable is not set."
            )

        # NOTE(review): the header carries the bare key, as before; the
        # OpenSanctions docs appear to use the "ApiKey <key>" form - confirm.
        response = requests.post(
            "https://api.opensanctions.org/match/default",
            headers={"Authorization": api_key},
            json={"queries": queries},
            timeout=30,  # never hang the whole chain on a stalled request
        )
        response.raise_for_status()
        response_json = response.json()

        documents = []
        for query_response in response_json.get("responses", {}).values():
            for result in query_response.get("results", []):
                properties = result.get("properties", {})
                names = properties.get("name", [])
                # Sorted so that repeated runs yield identical documents.
                topics = sorted(set(properties.get("topics", [])))
                datasets = sorted(set(result.get("datasets", [])))
                match = result.get("match")

                entity_info = {
                    "id": result["id"],
                    "name": names,
                    "match": match,
                    "topics": topics,
                    "datasets": datasets,
                }

                # Downstream formatting only forwards page_content to the
                # model, so the screening signal has to be spelled out here
                # rather than living in metadata alone.
                name_text = ", ".join(names) if names else result["id"]
                topics_text = ", ".join(topics) if topics else "none"
                datasets_text = ", ".join(datasets) if datasets else "none"
                page_content = (
                    f"Sanctions data for {name_text}. "
                    f"Match: {match}. "
                    f"Topics: {topics_text}. "
                    f"Datasets: {datasets_text}."
                )
                documents.append(
                    Document(page_content=page_content, metadata=entity_info)
                )
        return documents

In [7]:
# Step 1: read Instructions.md and index it for similarity search.
INSTRUCTIONS_PATH = "Instructions.md"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

loader = TextLoader(INSTRUCTIONS_PATH)
docs = loader.load()

# Embeddings come from a local sentence-transformers model.
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Build the FAISS index over the loaded documents; a retriever is derived
# from it in a later cell.
vectorstore = FAISS.from_documents(docs, embedding_model)


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [8]:
# One retriever per knowledge source feeding the ensemble.
wiki_retriever = WikipediaRetriever()  # public background on the parties
OpenSanctions_retriever = OpenSanctionsRetriever()  # sanctions screening
instructions_retriever = vectorstore.as_retriever()  # indexed Instructions.md

In [9]:
# Fuse the three sources; the weights are listed in the same order as the
# retrievers, with the sanctions retriever given the largest share.
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        wiki_retriever,
        OpenSanctions_retriever,
        instructions_retriever,
    ],
    weights=[0.25, 0.5, 0.25],
)

In [10]:
# Prompt template with two placeholders: the retrieved context and the raw
# transaction record.
RISK_SCORING_TEMPLATE = """
    You are an agent that helps data analysts in a financial institution by risk scoring a transaction 
    among entities/corporations. It can also include individuals. Given an input transaction, you need to output 
    the risk score (0 to 1) of the transaction, confidence score (0 to 1) and reason for your answer.
    Context: {context}
    Transaction: {transaction}
    """

prompt = ChatPromptTemplate.from_template(RISK_SCORING_TEMPLATE)


def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line; an empty string
        when ``docs`` is empty.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# Retrieval branch: the input string is used as the retriever query and the
# resulting documents are flattened into a single context string.
context_branch = ensemble_retriever | format_docs

# The same input string is also forwarded unchanged as the transaction.
chain_inputs = {"context": context_branch, "transaction": RunnablePassthrough()}

chain_test = chain_inputs | prompt | model | StrOutputParser()

In [11]:
# Sample transaction record passed through the chain as a smoke test.
sample_transaction = "TXN20250322113045,2025-03-22T11:30:45Z,Tesla, Inc.,TESLA987654321,JPMorgan Chase Bank, USA,Adani Green Energy Ltd.,ADANIGREEN123456,State Bank of India, India,500000000,USD,Wire Transfer,Investment in Renewable Energy Collaboration,Completed"
chain_test.invoke(sample_transaction)

'<think>\nOkay, so I\'ve got this transaction to assess: TXN20250322113045 from Tesla, Inc. to Adani Green Energy Ltd. through JPMorgan Chase Bank and State Bank of India. The amount is $500 million, which is a pretty large sum. Let me break this down step by step.\n\nFirst, I\'ll look at the entities involved. Tesla is a well-known, reputable company in the electric vehicle and renewable energy sector. Adani Green Energy is a major player in renewable energy, especially in India. Both of these companies are established corporations, so their entity types seem legitimate. The banks involved, JPMorgan Chase and State Bank of India, are also reputable financial institutions, which adds to the credibility.\n\nNow, checking the addresses and jurisdictions. Tesla is based in the USA, which is a low-risk country. Adani Green Energy is in India, which isn\'t typically a high-risk jurisdiction, though I should consider if there are any recent sanctions or issues. The banks are in the USA and I