This notebook demonstrates how to build an agentic application for analyzing a synthetic financial fraud dataset.
It integrates ArangoDB for graph storage, NVIDIA cuGraph and NetworkX for graph analytics, and LangChain/LangGraph for
natural language query processing. The application answers questions about a fraud ring graph using AQL (ArangoDB Query Language)
and NetworkX algorithms, and translates the results back into natural language.

### Step 0: Package Installation & setup

In [8]:
# Install required packages for graph processing, database interaction, and language model integration
!pip install nx-arangodb nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com langchain langchain-community langchain-groq langgraph

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com


In [9]:
# Verify NVIDIA GPU availability (optional, for cuGraph acceleration).
# `nvidia-smi` lists the GPUs visible to the driver; `nvcc --version` prints the installed CUDA compiler version.
!nvidia-smi
!nvcc --version

/bin/bash: line 1: nvidia-smi: command not found
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [10]:
# Import necessary libraries
import os
import re
from random import randint
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from arango import ArangoClient
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_groq import ChatGroq
from langchain_community.graphs import ArangoGraph
from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain
from langchain_core.tools import tool
from pydantic import BaseModel, Field
from typing import Literal
from langchain_core.prompts import ChatPromptTemplate

In [11]:
# Set the nx-cugraph auto-configuration flag *before* importing nx_arangodb
# (presumably the flag is read at import time — see the note on the import line).
os.environ["NX_CUGRAPH_AUTOCONFIG"] = "True"
import nx_arangodb as nxadb  # Must import after setting environment variable

In [14]:
# Connect to the ArangoDB cloud database.
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set ARANGO_PASSWORD (and optionally ARANGO_HOST / ARANGO_USERNAME)
# before running this cell, e.g. through the platform's secrets manager.
# NOTE(review): the password that used to be embedded in this cell has been
# exposed and should be rotated.

db = ArangoClient(
    hosts=os.environ.get("ARANGO_HOST", "https://bad64b5a91c6.arangodb.cloud:8529")
).db(
    username=os.environ.get("ARANGO_USERNAME", "root"),
    # Deliberately no default: a missing password raises KeyError naming the variable.
    password=os.environ["ARANGO_PASSWORD"],
    verify=True,
)

### Step 1: Choose & prepare your dataset for NetworkX

In [7]:
# Read the synthetic fraud transactions (one CSV row per transaction) into a DataFrame
fraud_ring_graph = pd.read_csv("/content/fraud_23pct_synthetic_dataset_fixed.csv")

# Last expression of the cell: render the frame as the cell output
fraud_ring_graph

Unnamed: 0,Transaction_ID,Sender_account,Sender_age,Sender_is_elderly,Receiver_account,Receiver_age,Receiver_is_elderly,Amount,Date,Is_fraud,Sender_gender,Receiver_gender,Type_of_fraud,Method_of_contact,Loss,Time_of_day,Resolution_status
0,TXN-ZGH4A9ZJ,6953697,38,0,206718272,21,0,457.46,2023-05-15,0,Female,Male,Legitimate,Direct,0.00,Morning,
1,TXN-CIJQN6C4,89029013,20,0,30635852,62,1,407.64,2023-03-22,0,Male,Male,Legitimate,Direct,0.00,Morning,
2,TXN-NNS9PZEM,674715057,69,1,453651788,44,0,657.94,2023-12-23,1,Female,Male,Investment Fraud,Email,657.94,Afternoon,Reported
3,TXN-UE6EU8UI,896619255,73,1,284968123,33,0,240.21,2023-11-19,0,Female,Female,Legitimate,Direct,0.00,Evening,
4,TXN-3LKEISJE,175484941,64,1,435131719,21,0,878.00,2023-11-08,0,Male,Female,Legitimate,Direct,0.00,Morning,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,TXN-44JCK2ZC,864406382,25,0,166950383,37,0,124.67,2023-08-30,0,Female,Male,Legitimate,Direct,0.00,Evening,
9996,TXN-LYFAY0XA,727600539,83,1,578909930,33,0,146.02,2023-09-07,1,Male,Male,Tech Support Scam,Phone,146.02,Morning,Reported
9997,TXN-X6ZLIM1B,505988121,29,0,463837684,62,1,57.42,2023-01-13,0,Female,Male,Legitimate,Direct,0.00,Afternoon,
9998,TXN-JACEN74I,149578017,41,0,596499669,21,0,69.78,2023-12-23,0,Female,Female,Legitimate,Direct,0.00,Night,


### Step 2: Convert and Load Graph Data into NetworkX

In [None]:
# Build a directed multigraph of accounts: every CSV row becomes one
# Sender_account -> Receiver_account edge carrying the transaction fields.
# A MultiDiGraph is used so that repeated transfers between the same pair of
# accounts are kept as separate edges.
G = nx.from_pandas_edgelist(
    fraud_ring_graph,
    source='Sender_account',
    target='Receiver_account',
    edge_attr=['Transaction_ID', 'Amount', 'Date', 'Is_fraud', 'Type_of_fraud', 'Loss'],
    create_using=nx.MultiDiGraph(),
)

# Collect per-account attributes. The first row in which an account shows up
# (sender is examined before receiver within a row) decides its attributes;
# later occurrences are ignored.
node_attributes = {}
for _, row in fraud_ring_graph.iterrows():
    for role in ('Sender', 'Receiver'):
        account = row[f'{role}_account']
        if account in node_attributes:
            continue
        node_attributes[account] = {
            'account': str(account),  # account number kept explicitly as a string
            'age': row[f'{role}_age'],
            'is_elderly': row[f'{role}_is_elderly'],
            'gender': row[f'{role}_gender'],
        }

# Attach the collected attributes to the graph's nodes
nx.set_node_attributes(G, node_attributes)

# Print one node's attributes as a quick sanity check
sample_node = list(G)[0]
print(f"Sample node ({sample_node}) attributes: {G.nodes[sample_node]}")

In [None]:
# Optional: draw the whole transaction graph using a spring (force-directed) layout
fig, ax = plt.subplots(figsize=(15, 9))
pos = nx.spring_layout(G, iterations=15, seed=1721)  # fixed seed keeps the picture reproducible
plot_options = dict(node_size=10, with_labels=False, width=0.15)  # small unlabeled nodes, thin edges
nx.draw_networkx(G, pos=pos, ax=ax, **plot_options)
plt.show()

### Step 3: Persist the Graph in ArangoDB

In [15]:
# Attach to the graph named "knowledge_graph" in the ArangoDB cloud database.
# As written, nothing is uploaded (loading is disabled for the demo). To load the
# in-memory NetworkX graph, additionally pass:
#     incoming_graph_data=G,   # the graph to load
#     write_batch_size=500,    # batch size for writing
#     overwrite_graph=True     # overwrite existing graph if present
G_adb = nxadb.Graph(name="knowledge_graph", db=db)

[04:32:20 +0000] [INFO]: Graph 'knowledge_graph' exists.
INFO:nx_arangodb:Graph 'knowledge_graph' exists.
[04:32:20 +0000] [INFO]: Default node type set to 'knowledge_graph_node'
INFO:nx_arangodb:Default node type set to 'knowledge_graph_node'


In [16]:
# Enable GPU acceleration on the nx-arangodb backend (if available).
# NOTE(review): the `nvidia-smi` check earlier in this notebook printed
# "command not found" — confirm a GPU is actually present before relying on this flag.
nx.config.backends.arangodb.use_gpu = True

In [17]:
# Smoke-test the connection: fetch three randomly chosen node documents via AQL
random_nodes_aql = """
    FOR node IN knowledge_graph_node
        SORT RAND()
        LIMIT 3
        RETURN node
"""
result = G_adb.query(random_nodes_aql)
print(list(result))

[{'_key': '788', '_id': 'knowledge_graph_node/788', '_rev': '_jVPJ20---4', 'account': '784543540', 'age': 84, 'is_elderly': 1, 'gender': 'Male'}, {'_key': '799', '_id': 'knowledge_graph_node/799', '_rev': '_jVPJ20--_D', 'account': '785374109', 'age': 30, 'is_elderly': 0, 'gender': 'Female'}, {'_key': '525', '_id': 'knowledge_graph_node/525', '_rev': '_jVPJ2z2--W', 'account': '711026086', 'age': 86, 'is_elderly': 1, 'gender': 'Female'}]


### Step 4: Build the Agentic App with LangChain & LangGraph

In [18]:
# Create an ArangoGraph wrapper around the database connection for LangChain integration;
# it is handed to ArangoGraphQAChain as `graph=` in the AQL tool defined later.
arango_graph = ArangoGraph(db)

In [19]:
# Provide the GROQ API key for language model access.
# The key is taken from the environment when already set; otherwise it is asked
# for interactively, so it is never saved inside the notebook.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked.
if not os.environ.get("GROQ_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    os.environ["GROQ_API_KEY"] = getpass("GROQ API key: ")

In [20]:
# Define a tool to convert natural language to AQL and back to text.
# The docstring doubles as the tool description exposed by `@tool`, so it is
# written for the model/user rather than for maintainers.
@tool
def text_to_aql_to_text(query: str) -> str:
    """
    Translates a natural language query into an AQL query, executes it on the ArangoDB graph,
    and converts the result back to natural language.

    Args:
        query (str): Natural language query (e.g., "Show details of account 12345.")

    Returns:
        str: Result in natural language format

    Examples:
        "Show all transactions above $100."
        "Find all accounts linked to elderly individuals."
    """
    # temperature=0 makes AQL generation as repeatable as the model allows
    # (the former 0.2 contradicted the stated goal of deterministic output).
    llm = ChatGroq(temperature=0, model_name="llama-3.3-70b-versatile")

    # Build the question -> AQL -> answer chain. The examples are few-shot hints
    # that show the model the collection/graph names and the edge fields.
    chain = ArangoGraphQAChain.from_llm(
        llm=llm,
        graph=arango_graph,
        verbose=True,
        # Required opt-in: the chain executes model-generated AQL against the database.
        allow_dangerous_requests=True,
        top_k=20,                     # Limit to top 20 results
        max_aql_generation_attempts=5,  # Retry AQL generation up to 5 times
        aql_examples="""
        # Example: Transactions with loss > $100
        FOR startNode IN knowledge_graph_node
          FOR v, e IN 1..1 OUTBOUND startNode._id GRAPH 'knowledge_graph'
            FILTER e.Loss > 100
            RETURN {
              transaction_id: e.Transaction_ID,
              amount: e.Amount,
              date: e.Date,
              is_fraud: e.Is_fraud,
              type_of_fraud: e.Type_of_fraud,
              loss: e.Loss,
              from_account: startNode.account,
              to_account: v.account
            }
        # Example: Fraudulent transactions involving elderly
        FOR startNode IN knowledge_graph_node
            FILTER startNode.age > 60
            FOR v, e IN 1..1 OUTBOUND startNode._id GRAPH 'knowledge_graph'
                FILTER e.Is_fraud == 1
                RETURN {
                    transaction_id: e.Transaction_ID,
                    amount: e.Amount,
                    date: e.Date,
                    type_of_fraud: e.Type_of_fraud,
                    loss: e.Loss,
                    elderly_account: startNode.account
                }
        """
    )

    # Run the chain and return only its natural-language answer
    result = chain.invoke(query)
    return str(result["result"])


In [21]:
# Define a tool to perform NetworkX graph analytics from natural language queries.
# The tool's docstring doubles as the description exposed by `@tool`.

def _extract_python_code(llm_text: str) -> str:
    """Return the first fenced code block in `llm_text`, or the whole text when no fence is present."""
    fenced = re.search(
        r"```(?:python|py)?[ \t]*\n(.*?)(?:```|\Z)",
        llm_text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    return (fenced.group(1) if fenced else llm_text).strip()


def _run_generated_code(code: str, graph):
    """Execute `code` with `nx` and `G` in scope and return whatever it bound to `result` (None if nothing)."""
    # A single namespace is used on purpose: with separate globals/locals dicts,
    # lambdas and comprehensions in the generated code cannot see names that the
    # same code assigned at its top level.
    namespace = {'nx': nx, 'G': graph}
    # SECURITY: this runs model-generated code in-process with full interpreter
    # privileges. Only use in a trusted, disposable environment.
    exec(code, namespace)
    return namespace.get('result')


@tool
def text_to_nx_algorithm_to_text(query: str) -> str:
    """
    Executes a NetworkX algorithm based on a natural language query, using data from ArangoDB,
    and returns the result in natural language.

    Args:
        query (str): Natural language query (e.g., "Find the most influential accounts.")

    Returns:
        str: Analysis result in natural language

    Notes:
        Use only for graph analytics (e.g., centrality, clustering), not simple traversals.
    """
    # temperature=0 keeps code generation as repeatable as the model allows
    llm = ChatGroq(temperature=0, model_name="llama-3.3-70b-versatile")

    # Fetch every edge together with the attributes of both endpoint accounts
    aql_query = """
        FOR edge IN knowledge_graph_node_to_knowledge_graph_node
            FOR sender IN knowledge_graph_node
                FILTER sender._id == edge._from
            FOR receiver IN knowledge_graph_node
                FILTER receiver._id == edge._to
                RETURN {
                    source: sender.account,
                    target: receiver.account,
                    edge_attrs: {
                        Transaction_ID: edge.Transaction_ID,
                        Amount: edge.Amount,
                        Date: edge.Date,
                        Is_fraud: edge.Is_fraud,
                        Type_of_fraud: edge.Type_of_fraud,
                        Loss: edge.Loss
                    },
                    sender_attrs: {
                        age: sender.age,
                        is_elderly: sender.is_elderly,
                        gender: sender.gender
                    },
                    receiver_attrs: {
                        age: receiver.age,
                        is_elderly: receiver.is_elderly,
                        gender: receiver.gender
                    }
                }
    """
    edges_data = list(G_adb.query(aql_query))
    if not edges_data:
        return "No data found in the graph to analyze."

    # Rebuild an in-memory graph keyed by account number. Each edge carries one
    # attribute, 'edge_attrs', holding the dict of transaction fields.
    df_edges = pd.DataFrame(edges_data)
    G = nx.from_pandas_edgelist(
        df_edges,
        source='source',
        target='target',
        edge_attr='edge_attrs',
        create_using=nx.MultiDiGraph()
    )

    # Node attributes, including the 'account' attribute the prompt advertises.
    # Sender entries are written first and receiver entries afterwards, matching
    # the precedence of the previous implementation.
    node_attrs = {
        row['source']: {'account': row['source'], **row['sender_attrs']}
        for row in edges_data
    }
    node_attrs.update({
        row['target']: {'account': row['target'], **row['receiver_attrs']}
        for row in edges_data
    })
    nx.set_node_attributes(G, node_attrs)

    # Ask the model for NetworkX code. The prompt states the real graph type and
    # the variable name the answer must be stored in, because that is what is read back.
    prompt = f"""
    Given a NetworkX graph `G` (an `nx.MultiDiGraph`: directed, parallel edges allowed) with schema:
    - Nodes: 'account' (string), 'age' (int), 'is_elderly' (0/1), 'gender' (Male/Female)
    - Edges: 'Transaction_ID' (string), 'Amount' (float), 'Date' (YYYY-MM-DD), 'Is_fraud' (0/1), 'Type_of_fraud' (string), 'Loss' (float)
    Note: Edge attributes are nested under 'edge_attrs'.
    Write Python code using NetworkX to answer: "{query}"
    - Use appropriate algorithms (e.g., nx.pagerank).
    - Access edge attributes via `data["edge_attrs"]["attribute_name"]`.
    - `G` and `nx` are already defined; do not recreate them.
    - If an algorithm does not support multigraphs, convert first (e.g., `nx.DiGraph(G)`).
    - Store the final answer (e.g., dict, list) in a variable named `result`.
    - Respond with a single Python code block and no explanation.
    """
    code_response = llm.invoke(prompt)
    code = _extract_python_code(code_response.content)
    print(code)  # shown so the reader can inspect what is about to be executed

    # Execute the generated code; any failure is reported as text instead of raised
    try:
        result = _run_generated_code(code, G)
    except Exception as e:
        return f"Error executing NetworkX code: {str(e)}"
    if result is None:
        return "The generated code did not produce a valid result."

    # Convert result to natural language
    result_prompt = f"""
    Analyze this NetworkX result for query "{query}":
    Result: {result}
    Provide a concise natural language response with key insights and possible actions.
    """
    nl_response = llm.invoke(result_prompt)
    return nl_response.content.strip()

In [22]:
from pydantic import BaseModel, Field
from typing import Literal
from langchain_core.prompts import ChatPromptTemplate

# Define a Pydantic model for tool selection.
# It is passed to `with_structured_output` when the classifier is built, so the
# classifier's answer is constrained to one of the three literal strings below.
class QueryTool(BaseModel):
    # Which tool(s) to run, written as a bracketed list: the AQL tool only, the
    # NetworkX tool only, or both. `execute_tools` compares against these exact strings.
    selected_tools: Literal[
        '[text_to_aql_to_text]',
        '[text_to_nx_algorithm_to_text]',
        '[text_to_aql_to_text, text_to_nx_algorithm_to_text]',
    ] = Field(..., description="Determines which tool(s) to use.")

# Prompt for classifying queries.
# The system message describes the two tools, the expected answer format and
# three worked examples; the human message carries the user's question through
# the `{query}` placeholder.
query_classifier_prompt = ChatPromptTemplate.from_messages([
    ("system", """
You are an AI assistant responsible for classifying user queries about a financial fraud graph to determine which tools should be used.

**Available Tools:**
1. `text_to_aql_to_text`: For retrieving structured data from ArangoDB (e.g., listing transactions).
2. `text_to_nx_algorithm_to_text`: For graph analytics using NetworkX (e.g., centrality, clustering).
3. Both in sequence: For hybrid queries needing retrieval and analysis.

**Instructions:**
- Analyze the query and decide which tool(s) to use.
- Return your decision in the format: `[selected_tools=<tool_selection>]`
  - Examples: `[selected_tools="[text_to_aql_to_text]"]`, `[selected_tools="[text_to_nx_algorithm_to_text]"]`, `[selected_tools="[text_to_aql_to_text, text_to_nx_algorithm_to_text]"]`
- Provide reasoning before your decision.

**Examples:**
- Query: "Show transactions flagged as fraudulent."
  - Reasoning: This is a data retrieval task.
  - Output: `[selected_tools="[text_to_aql_to_text]"]`

- Query: "Find the most influential fraudsters in the network., Find the top 5 accounts that have the most influence in fraudulent transactions, considering both direct and indirect connections."
  - Reasoning: This requires graph analytics.
  - Output: `[selected_tools="[text_to_nx_algorithm_to_text]"]`

- Query: "Who are the most influential fraudsters connected to elderly accounts?"
  - Reasoning: This needs data retrieval (fraudsters connected to elderly) and analytics (influence ranking).
  - Output: `[selected_tools="[text_to_aql_to_text, text_to_nx_algorithm_to_text]"]`
"""),
    ("human", "Query: {query}")
])

In [23]:
# Initialize the classifier LLM.
# temperature=0 keeps the routing decision repeatable for a given query.
# NOTE(review): confirm "mixtral-8x7b-32768" is still served by Groq; the tools
# in this notebook use "llama-3.3-70b-versatile".
classifier_llm = ChatGroq(model="mixtral-8x7b-32768", temperature=0)
# Chain: prompt -> LLM whose output is parsed into a QueryTool instance
classifier = query_classifier_prompt | classifier_llm.with_structured_output(QueryTool)

In [26]:
def execute_tools(query: str, classifier_output: str) -> str:
    """
    Run the tool(s) named in the classifier's decision and return their answer.

    Args:
        query: The user's natural language question.
        classifier_output: Text containing ``selected_tools=<selection>``, where
            ``<selection>`` is one of the bracketed tool lists the classifier may emit.

    Returns:
        The selected tool's answer, or "Invalid tool selection." when the marker
        is missing or the selection is not recognised.
    """
    # partition() never raises: a missing marker yields an empty `marker`
    _, marker, selection = classifier_output.partition("selected_tools=")
    if not marker:
        return "Invalid tool selection."
    # Tolerate surrounding whitespace or quotes around the bracketed list
    selected_tools = selection.strip().strip("\"'")

    # Tools are called through .invoke(); calling a tool object directly is deprecated.
    if selected_tools == '[text_to_aql_to_text]':
        return text_to_aql_to_text.invoke({"query": query})
    elif selected_tools == '[text_to_nx_algorithm_to_text]':
        return text_to_nx_algorithm_to_text.invoke({"query": query})
    elif selected_tools == '[text_to_aql_to_text, text_to_nx_algorithm_to_text]':
        # Hybrid: retrieve first, then hand the retrieved data to the analytics tool
        aql_result = text_to_aql_to_text.invoke({"query": query})
        nx_query = f"{query} based on the following data: {aql_result}"
        return text_to_nx_algorithm_to_text.invoke({"query": nx_query})
    else:
        return "Invalid tool selection."

# Entry point of the two-stage pipeline: classify the query, then run the chosen tool(s)
def query_graph(query: str) -> str:
    """
    Answer a natural language question about the fraud graph.

    The classifier chain picks the tool(s); the decision is printed and then
    passed to `execute_tools`, whose return value is returned unchanged.
    """
    # Stage 1: ask the classifier which tool(s) fit the query
    decision = classifier.invoke({"query": query})
    classifier_output = "selected_tools={}".format(decision.selected_tools)
    print(f"Classifier Decision: {classifier_output}")

    # Stage 2: dispatch to the selected tool(s)
    return execute_tools(query, classifier_output)

In [None]:
# 7. Experiment with example queries
# Note: Some may work, some may not!
# The examples are kept in lists so that every one of them stays reachable
# (successive assignments to a single variable would keep only the last).

# simple queries (plain data retrieval)
simple_queries = [
    "Show transactions flagged as fraudulent.",
    "Show all transactions above $100.",
    "Find all direct connections of account 500326438.",
    "Show details of account 287501362.",
    "Find all accounts linked to elderly individuals.",
    "How many fraud transactions involved elderly people?",
]

# complex queries (graph analytics)
complex_queries = [
    "Find the top 5 accounts that have the most influence in fraudulent transactions, considering both direct and indirect connections.",
]

# hybrid queries (retrieval followed by analytics)
hybrid_queries = [
    "Who are the most influential fraudsters connected to elderly accounts?",
]

# `query` ends up with the same value as before: the last example listed
query = hybrid_queries[-1]

In [27]:
# Run one example end to end; the recorded output below shows it being routed to the AQL tool
query_graph("Find all accounts linked to elderly individuals.")

Classifier Decision: selected_tools=[text_to_aql_to_text]


[1m> Entering new ArangoGraphQAChain chain...[0m


  return text_to_aql_to_text(query)


AQL Query (1):[32;1m[1;3m
WITH knowledge_graph_node
FOR node IN knowledge_graph_node
  FILTER node.is_elderly == 1
  RETURN {
    account: node.account,
    age: node.age,
    gender: node.gender
  }
[0m
AQL Result:
[32;1m[1;3m[{'account': '30635852', 'age': 62, 'gender': 'Male'}, {'account': '674715057', 'age': 69, 'gender': 'Female'}, {'account': '896619255', 'age': 73, 'gender': 'Female'}, {'account': '175484941', 'age': 64, 'gender': 'Male'}, {'account': '991307112', 'age': 80, 'gender': 'Female'}, {'account': '876106572', 'age': 61, 'gender': 'Male'}, {'account': '936575986', 'age': 61, 'gender': 'Male'}, {'account': '739328947', 'age': 63, 'gender': 'Male'}, {'account': '925602213', 'age': 65, 'gender': 'Male'}, {'account': '69212356', 'age': 87, 'gender': 'Male'}, {'account': '796285932', 'age': 65, 'gender': 'Female'}, {'account': '47654552', 'age': 73, 'gender': 'Male'}, {'account': '731448745', 'age': 80, 'gender': 'Male'}, {'account': '756420240', 'age': 63, 'gender': '

'Based on the provided information, here is a natural language summary:\n\n"We have found 20 accounts linked to elderly individuals. The accounts belong to individuals with ages ranging from 61 to 87, with a mix of males and females. The accounts are associated with the following details: \n\n- 10 male accounts with ages 61, 62, 63, 64, 65, 65, 72, 73, 73, 80, 80, 83, 87 \n- 10 female accounts with ages 63, 65, 69, 71, 73, 79, 80, 81, 87 \n\nThese accounts can be further reviewed to determine their specific needs and requirements."'