In [None]:
!pip install neo4j openai pandas numpy


Collecting neo4j
  Downloading neo4j-6.0.3-py3-none-any.whl.metadata (5.2 kB)
Downloading neo4j-6.0.3-py3-none-any.whl (325 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-6.0.3


In [None]:
import os
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from openai import OpenAI

# ---------- CONFIG ----------
# Connection settings and secrets are read from the environment first; the
# literals below are placeholder fallbacks only, so real credentials never
# have to be written into (and saved with) the notebook.

# Neo4j Aura
NEO4J_URI = os.environ.get("NEO4J_URI", "XXXXXXXX")
NEO4J_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "XXXXXXXXXXXXXXX")

# OpenAI
# setdefault keeps a key that is already exported instead of overwriting it.
os.environ.setdefault("OPENAI_API_KEY", "XXXXXXXXXXXXXXXXXXXX")
OPENAI_MODEL_EMBED = "text-embedding-3-small"
OPENAI_MODEL_CHAT  = "gpt-4.1-mini"   # you can swap to another model if you want

# Paths to store embeddings on disk
EMBEDDINGS_PATH = "/content/drive/MyDrive/Patent_Data/500_exports/patent_embeddings_500.parquet"

# ---------- CLIENTS ----------

# One driver and one OpenAI client shared by the cells below.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
client = OpenAI()


In [None]:
def fetch_all_patents(tx):
    """
    Fetch all Patent nodes (id, title, abstract) as a DataFrame.

    Args:
        tx: Object exposing ``run(query)`` that yields records indexable by
            the returned column names (the transaction handed in by
            ``session.execute_read``).

    Returns:
        pandas.DataFrame with columns ``patent_id``, ``title`` and
        ``abstract``. The ``patent_id`` values are stringified ids; a missing
        title or abstract becomes "". The three columns are present even when
        the query returns no rows, so downstream column access never raises
        ``KeyError``.
    """
    query = """
    MATCH (p:Patent)
    RETURN p.id AS id, p.title AS title, p.abstract AS abstract
    """
    rows = [
        {
            "patent_id": str(record["id"]),
            # Absent properties come back as None; normalise them to "".
            "title": record["title"] or "",
            "abstract": record["abstract"] or "",
        }
        for record in tx.run(query)
    ]
    # Naming the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["patent_id", "title", "abstract"])

# Run the fetch through the session's execute_read; the session is closed
# when the `with` block exits.
with driver.session() as session:
    df_patents = session.execute_read(fetch_all_patents)

# Report the row count, then show the first rows as the cell's output.
print("Patents loaded from Neo4j:", len(df_patents))
df_patents.head()


Patents loaded from Neo4j: 500


Unnamed: 0,patent_id,title,abstract
0,11856881,"Detection of plant diseases with multi-stage, ...",A computer system is provided comprising a cla...
1,11856943,"Control system, agricultural utility vehicle a...",A control system for an agricultural utility v...
2,11857204,Surgical instrument,"A surgical instrument (25), in particular a su..."
3,11857257,Functional oct data processing,A method of processing functional OCT image da...
4,11857271,Markerless navigation using AI computer vision,"Provided herein are devices, systems, and meth..."


In [None]:
def embed_texts(texts, batch_size=32):
    """
    Embed a list of texts through the OpenAI embeddings endpoint, in batches.

    Args:
        texts: Sliceable sequence of strings to embed.
        batch_size: Number of texts sent per API request.

    Returns:
        A list of float32 numpy vectors, appended in the order each batch's
        response lists them.
    """
    embedded = []
    for start in range(0, len(texts), batch_size):
        reply = client.embeddings.create(
            model=OPENAI_MODEL_EMBED,
            input=texts[start : start + batch_size],
        )
        # One vector per item in the response payload.
        embedded.extend(
            np.array(entry.embedding, dtype=np.float32) for entry in reply.data
        )
    return embedded

# Text handed to the embedding model: title and abstract in a single string.
# Adjust this template to change what each patent vector represents.
df_patents["embed_text"] = (
    "Title: " + df_patents["title"] + ". " + "Abstract: " + df_patents["abstract"]
)

texts = df_patents["embed_text"].to_list()
embs = embed_texts(texts, batch_size=32)

# Stack the per-patent vectors into one (n_patents, dim) matrix.
emb_matrix = np.stack(embs, axis=0)
print("Embedding matrix shape:", emb_matrix.shape)

# Persist to Parquet: one row per patent, the embedding stored as a list column.
df_emb = df_patents.loc[:, ["patent_id", "title", "abstract"]].assign(
    embedding=emb_matrix.tolist()
)
df_emb.to_parquet(EMBEDDINGS_PATH, index=False)
print(f"✅ Saved embeddings to {EMBEDDINGS_PATH}")


Embedding matrix shape: (500, 1536)
✅ Saved embeddings to /content/drive/MyDrive/Patent_Data/500_exports/patent_embeddings_500.parquet


In [None]:
# Read back the embeddings written by the previous cell.
df_emb = pd.read_parquet(EMBEDDINGS_PATH)

# Rebuild the dense matrix (one row per patent) used for similarity search.
emb_matrix = np.vstack([np.array(vector) for vector in df_emb["embedding"]])

# Row i of emb_matrix belongs to patent_ids[i].
patent_ids = df_emb["patent_id"].to_list()

print("Loaded embedding matrix:", emb_matrix.shape)


Loaded embedding matrix: (500, 1536)


In [None]:
def embed_query(query: str) -> np.ndarray:
    """
    Embed a single query string and return its vector as float32.

    The query is sent as a one-element batch using ``OPENAI_MODEL_EMBED``;
    the first embedding in the response is returned.
    """
    reply = client.embeddings.create(model=OPENAI_MODEL_EMBED, input=[query])
    first_embedding = reply.data[0].embedding
    return np.array(first_embedding, dtype=np.float32)


def cosine_sim_matrix(vec: np.ndarray, mat: np.ndarray) -> np.ndarray:
    """
    Cosine similarity between one vector and every row of a matrix.

    vec: shape (d,)
    mat: shape (N, d)
    returns: shape (N,) similarity scores

    A small epsilon (1e-8) is added to each norm so zero vectors do not
    cause a division by zero.
    """
    unit_query = vec / (np.linalg.norm(vec) + 1e-8)
    row_norms = np.linalg.norm(mat, axis=1, keepdims=True) + 1e-8
    unit_rows = mat / row_norms
    return unit_rows @ unit_query


In [None]:
def fetch_patent_subgraph(patent_ids):
    """
    Collect graph context from Neo4j for the given patents.

    For every Patent whose ``id`` is in ``patent_ids`` the query returns its
    title, abstract and patent_date plus the distinct inventor ids, assignee
    ids, CPC codes and assignee locations linked to it.

    Returns:
        dict keyed by stringified patent id. Each value is a dict with keys
        ``title``, ``abstract``, ``patent_date`` (missing values replaced by
        "") and ``inventors``, ``assignees``, ``cpc_codes``, ``locations``
        (lists with falsy entries removed).
    """
    query = """
    MATCH (p:Patent)
    WHERE p.id IN $patent_ids
    OPTIONAL MATCH (p)-[:PATENTED_BY]->(inv:Inventor)
    OPTIONAL MATCH (p)-[:ASSIGNED_TO]->(a:Assignee)
    OPTIONAL MATCH (p)-[:CLASSIFIED_AS]->(c:CPC)
    OPTIONAL MATCH (a)-[:LOCATED_IN]->(loc:Location)
    RETURN
        p.id AS patent_id,
        p.title AS title,
        p.abstract AS abstract,
        p.patent_date AS patent_date,
        collect(DISTINCT inv.id) AS inventors,
        collect(DISTINCT a.id) AS assignees,
        collect(DISTINCT c.code) AS cpc_codes,
        collect(DISTINCT loc.location_clean) AS locations
    """

    def _keep_truthy(values):
        # A missing list becomes []; None/empty entries are dropped.
        return [value for value in (values or []) if value]

    # Records are consumed while the session is still open.
    with driver.session() as session:
        contexts = {
            str(row["patent_id"]): {
                "title": row["title"] or "",
                "abstract": row["abstract"] or "",
                "patent_date": row["patent_date"] or "",
                "inventors": _keep_truthy(row["inventors"]),
                "assignees": _keep_truthy(row["assignees"]),
                "cpc_codes": _keep_truthy(row["cpc_codes"]),
                "locations": _keep_truthy(row["locations"]),
            }
            for row in session.run(query, patent_ids=patent_ids)
        }
    return contexts


In [None]:
def build_context_block(pid: str, info: dict) -> str:
    """
    Render one patent's context as a newline-separated text block.

    Args:
        pid: Patent identifier, always emitted as the first line.
        info: Context dict as produced by ``fetch_patent_subgraph``. Keys
            that are missing or falsy are skipped, so an empty dict yields
            only the "Patent ID" line.

    Returns:
        The lines "Patent ID", "Title", "Abstract", "Publication date",
        "CPC codes", "Assignees", "Inventors" and "Assignee locations" (in
        that order, only those with data) joined by newlines.
    """
    lines = [f"Patent ID: {pid}"]

    # Single-valued fields, in output order: (info key, label).
    scalar_fields = (
        ("title", "Title"),
        ("abstract", "Abstract"),
        ("patent_date", "Publication date"),
    )
    for key, label in scalar_fields:
        value = info.get(key)
        if value:
            lines.append(f"{label}: {value}")

    # Multi-valued fields, in output order: (info key, label).
    list_fields = (
        ("cpc_codes", "CPC codes"),
        ("assignees", "Assignees"),
        ("inventors", "Inventors"),
        ("locations", "Assignee locations"),
    )
    for key, label in list_fields:
        values = info.get(key)
        if values:
            # str() guards against non-string graph properties (e.g. numeric
            # ids), which would make str.join raise TypeError.
            joined = ", ".join(str(value) for value in values)
            lines.append(f"{label}: {joined}")

    return "\n".join(lines)


In [None]:
def graph_rag_answer(question: str, k: int = 5) -> dict:
    """
    Answer a question with retrieval over patent embeddings plus graph context.

    Pipeline:
      1. Embed the question.
      2. Rank all stored patent vectors by cosine similarity and keep the top k.
      3. Fetch the graph neighbourhood of those patents from Neo4j.
      4. Render one text block per patent, prefixed with its score.
      5. Ask the chat model to answer using only that context.

    Returns:
        dict with keys ``question``, ``top_patent_ids``, ``context_preview``
        (first 2000 characters of the context) and ``answer``.
    """
    # Steps 1-2: score every patent against the question, best first.
    similarity = cosine_sim_matrix(embed_query(question), emb_matrix)
    best_rows = np.argsort(-similarity)[:k]
    top_patent_ids = [patent_ids[row] for row in best_rows]

    # Step 3: graph metadata for the retrieved patents.
    graph_info = fetch_patent_subgraph(top_patent_ids)

    # Step 4: one block per patent; ids without graph data get an empty dict.
    rendered = []
    for row, pid in zip(best_rows, top_patent_ids):
        score = float(similarity[row])
        body = build_context_block(pid, graph_info.get(pid, {}))
        rendered.append(f"(score={score:.3f})\n{body}")
    full_context = "\n\n---\n\n".join(rendered)

    # Step 5: single-turn chat completion over the assembled prompt.
    prompt = f"""
You are a patent analysis assistant. The user will ask a question about technology, patents, or similar topics.

You are given a set of relevant patents and their graph metadata
(CPC codes, inventors, assignees, locations). Use ONLY this information
to answer, and if something is unclear, say so explicitly.

User question:
{question}

Relevant patent graph context:
{full_context}

Please answer clearly, and refer to specific patent IDs and titles when helpful.
"""
    conversation = [
        {"role": "system", "content": "You are a careful and precise patent analysis assistant."},
        {"role": "user", "content": prompt},
    ]
    reply = client.chat.completions.create(
        model=OPENAI_MODEL_CHAT,
        messages=conversation,
        temperature=0.2,
    )

    result = {
        "question": question,
        "top_patent_ids": top_patent_ids,
        "context_preview": full_context[:2000],  # just for debugging
        "answer": reply.choices[0].message.content,
    }
    return result


In [None]:
# Demo: topical question. With the default k=5, the model only sees the five
# most similar patents and their graph context.
res = graph_rag_answer(
    "Which patents relate to deep learning for agricultural or plant disease detection?"
)
print(res["answer"])




The patents that relate to deep learning for agricultural or plant disease detection are:

1. Patent ID 11856881 - "Detection of plant diseases with multi-stage, multi-scale deep learning"
   - This patent specifically addresses plant disease detection using multi-stage, multi-scale deep learning models applied to images of plants.

2. Patent ID 11864494 - "AI-optimized harvester configured to maximize yield and minimize impurities"
   - This patent involves using machine learning models (which can include deep learning) to detect impurities in harvested plants, which is related to plant quality detection in agriculture.

3. Patent ID 11868100 - "System and method for irrigation management using machine learning workflows"
   - While this patent focuses on irrigation management, it uses machine learning workflows analyzing data from sensors related to crops, which is relevant to agricultural monitoring.

The other patents listed either focus on medical image detection (11861829) or cla

In [None]:
# Demo: a two-topic question; retrieval still returns a single top-k list
# (default k=5) shared by both topics.
res = graph_rag_answer(
    "Are there any patents related to knowledge graphs or recommendation systems?"
)
print(res["answer"])




Yes, there are several patents related to knowledge graphs and recommendation systems in the provided data:

1. Knowledge Graphs:
   - Patent ID 11861311: "System and method for knowledge graph construction using capsule neural network" describes a system for constructing knowledge graphs by defining entities and relations, converting sentences into embeddings, and using neural network techniques to learn entity and relation capsules.
   - Patent ID 11860929: "System and method for unifying feature vectors in a knowledge graph" involves multi-datatype searching by generating and expanding vectors associated with different datatypes within a knowledge graph.
   - Patent ID 11861308: "Mapping natural language utterances to operations over a knowledge graph" focuses on processing natural language queries by mapping operands and operators to nodes and operations in a knowledge graph to generate query results.

2. Recommendation Systems:
   - Patent ID 11860726: "Recommending remediation ac

In [None]:
# Demo: question mixing topic and assignee attributes.
# NOTE(review): the context built by build_context_block carries assignee ids
# and locations but no company-size field, so "large" is not something the
# supplied context can establish — read that part of the answer with care.
res = graph_rag_answer(
    "Which patents involve cloud computing and are assigned to large US-based companies?"
)
print(res["answer"])




Among the patents listed, the following involve cloud computing and are assigned to large US-based companies:

1. Patent ID 11865649: "Welding systems and methods utilizing cloud computing and data storage"  
   - Assignee: ILLINOIS TOOL WORKS INC. (a large US-based company)  
   - This patent explicitly involves cloud computing for welding data management and transactions.

2. Patent ID 11861342: "Enhanced cloud-computing environment deployment"  
   - Assignee: MicroStrategy Incorporated (a large US-based company)  
   - This patent relates to deployment tools and workflows for cloud computing environments.

3. Patent ID 11857872: "Content adaptive data center routing and forwarding in cloud computing environments"  
   - Assignee: NVIDIA CORPORATION (a large US-based company)  
   - This patent addresses cloud computing environments for optimized application session routing.

The other patents either do not specify an assignee that is a large US-based company (e.g., Aviatrix Systems

In [None]:
# Demo: corpus-wide aggregate question.
# NOTE(review): graph_rag_answer passes only its top-k (default k=5) retrieved
# patents to the model, so the answer can only count assignees among those
# patents, not across the whole graph.
res = graph_rag_answer(
    "Which assignee published the most number of patents?"
)
print(res["answer"])



Based on the provided patent graph context, each patent lists a different assignee except for one patent (ID 11868368) which does not specify an assignee.

Here is the breakdown of assignees and their number of patents:

- ILLINOIS TOOL WORKS INC.: 1 patent (ID 11865649, "Welding systems and methods utilizing cloud computing and data storage")
- NATIONAL TSING HUA UNIVERSITY: 1 patent (ID 11861702, "Method and apparatus for renewable energy allocation based on reinforcement learning")
- VMware LLC: 1 patent (ID 11868644, "Techniques for tracking frequently accessed memory")
- STATE FARM MUTUAL AUTOMOBILE INSURANCE COMPANY: 1 patent (ID 11861566, "Vehicle telematics systems and methods")
- Patent ID 11868368 ("System and method for implementing consensus in distributed ledger arrangement") does not list an assignee.

Since all assignees have published exactly one patent each in this set, no single assignee has published the most number of patents.

Therefore, based on the given informat

In [None]:
from neo4j import GraphDatabase
from openai import OpenAI

# Clients used by the text-to-Cypher cells below.
client = OpenAI()

# Close any driver left over from an earlier cell before rebinding the name;
# otherwise its connection pool stays open with nothing referencing it.
_stale_driver = globals().get("driver")
if _stale_driver is not None:
    _stale_driver.close()

# Use the configured NEO4J_USER rather than a second hard-coded user name.
driver = GraphDatabase.driver(
    NEO4J_URI,
    auth=(NEO4J_USER, NEO4J_PASSWORD)
)

# LLM prompt template
# System prompt used by nl_to_cypher: it lists the graph schema and asks the
# model to reply with a single Cypher query and no explanation.
# NOTE(review): fetch_patent_subgraph reads p.patent_date, c.code and
# loc.location_clean, none of which are listed in the schema below — confirm
# the property names against the database so generated queries can use them.
SYSTEM_PROMPT = """
You are a Cypher expert. Convert the user’s natural-language question
into a SINGLE Cypher query compatible with Neo4j.

Schema:
(:Patent {id, title, abstract, year})
(:Inventor {id})
(:Assignee {id})
(:Location {id, location})
(:CPC {id})

Relationships:
(:Patent)-[:PATENTED_BY]->(:Inventor)
(:Patent)-[:ASSIGNED_TO]->(:Assignee)
(:Patent)-[:CLASSIFIED_AS]->(:CPC)
(:Assignee)-[:LOCATED_IN]->(:Location)

Return Cypher only, no explanation.
"""

def nl_to_cypher(question):
    """
    Translate a natural-language question into a Cypher query string.

    The question is sent to the chat model together with ``SYSTEM_PROMPT``;
    Markdown code-fence markers are removed from the reply.

    Args:
        question: The user's question in natural language.

    Returns:
        The generated Cypher text with surrounding whitespace stripped, or ""
        when the model returns no content.
    """
    import re  # local import so this cell does not depend on another cell's imports

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question}
        ]
    )

    # The message content may be None; treat that as an empty reply instead
    # of failing on .strip().
    raw = completion.choices[0].message.content or ""

    # Drop fence markers that end a line, with any language tag or none
    # (```cypher, ```Cypher, ```sql, bare ```). The end-of-line lookahead
    # keeps query text that directly follows a fence on the same line.
    cypher = re.sub(r"```[\w+-]*[ \t]*(?=\r?\n|\Z)", "", raw)

    # Remaining inline fence markers are removed exactly as before.
    cypher = cypher.replace("```cypher", "")
    cypher = cypher.replace("```", "")

    return cypher.strip()

def run_cypher(cypher):
    """
    Execute a Cypher query in a managed read transaction and return its rows.

    The query text comes from an LLM, so it is run through
    ``session.execute_read`` (as the patent-loading cell does) rather than an
    auto-commit ``session.run``.
    NOTE(review): read mode is presumably rejected for writes on a clustered
    or Aura deployment but may not be enforced on a single instance — confirm
    for the target database and do not treat it as the only safeguard.

    Args:
        cypher: Cypher query text to execute (no parameters).

    Returns:
        list of dicts, one per record, as produced by ``record.data()``.
    """
    def _collect(tx):
        # Records must be consumed before the transaction function returns.
        return [record.data() for record in tx.run(cypher)]

    with driver.session() as session:
        return session.execute_read(_collect)

def ask_graph(question):
    """
    Answer a question by generating Cypher, running it and summarising the rows.

    The generated query is printed for inspection. When the query returns no
    rows the literal "No results." is returned; otherwise the rows (as their
    Python repr) are sent to the chat model and its summary text is returned.
    """
    generated_query = nl_to_cypher(question)
    print("Generated Cypher:\n", generated_query)

    rows = run_cypher(generated_query)
    if not rows:
        return "No results."

    # Ask the LLM to turn the raw records into a readable summary.
    chat_messages = [
        {"role": "system", "content": "Summarize the Neo4j results clearly."},
        {"role": "user", "content": str(rows)},
    ]
    reply = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
    )
    return reply.choices[0].message.content

# Example question:
# ask_graph prints the generated Cypher first; the outer print then shows the
# LLM summary of the query results (or "No results.").
print(ask_graph("List ten patents assigned to Google LLC"))


Generated Cypher:
 MATCH (a:Assignee {id: 'Google LLC'})<-[:ASSIGNED_TO]-(p:Patent)
RETURN p 
LIMIT 10
Here are key details from the recent patent applications submitted on January 2nd and January 9th, 2024:

1. **MUSS—Map User Submission States** (Patent ID: 11860857)
   - **Abstract**: An interactive user interface for submitting and reviewing updates related to points of interest, utilizing a machine learning model for content classification and approval notifications.

2. **Scaling High-Level Statistical Languages to Large, Distributed Datasets** (Patent ID: 11861331)
   - **Abstract**: A system for large-scale data processing that dynamically translates high-level statistical operations into efficient low-level operations for distributed execution.

3. **Augmentation of Code Completion and Code Synthesis with Semantic Checking** (Patent ID: 11861333)
   - **Abstract**: Method for providing context-aware autofill suggestions in a development environment, using machine learning for 

In [None]:
# Example: a question that filters patents by where their assignee is located.
print(ask_graph("List ten patents assigned to companies based out of Kyoto, Japan"))

Generated Cypher:
 MATCH (p:Patent)-[:ASSIGNED_TO]->(a:Assignee)-[:LOCATED_IN]->(l:Location {location: "Kyoto, Japan"})
RETURN p
LIMIT 10
Here are the summarized results of the Neo4j query:

1. **Patent ID: 11861826**
   - **Title**: Imaging data processing device
   - **Date**: January 2, 2024
   - **Abstract**: Describes a device that superimposes and aligns stained and mass spectrometric images of a sample. It features a grid for precise image deformation, allowing efficient changes in image representation based on user specifications.

2. **Patent ID: 11862445**
   - **Title**: Imaging mass spectrometer
   - **Date**: January 2, 2024
   - **Abstract**: Outlines an imaging mass spectrometer that integrates data from mass spectrometry and Raman analysis, normalizing signal intensity and aligning spatial resolutions for statistical analysis of the imaging graphics obtained.

3. **Patent ID: 11862027**
   - **Title**: Flight path calculation system, flight path calculation program, and

In [None]:
# Example: a question that filters patents by CPC classification.
print(ask_graph("List some patents classified as Y04"))

Generated Cypher:
 MATCH (p:Patent)-[:CLASSIFIED_AS]->(c:CPC {id: 'Y04'})
RETURN p
The Neo4j results provide information on two patents filed on January 2, 2024:

1. **Patent ID: 11861634**
   - **Title**: Utility portals for managing demand-response events
   - **Abstract**: This patent describes a method for creating a utility portal interface that interacts with a utility computer system to manage demand response events. It includes features for displaying energy-consuming locations, updating energy demand profiles dynamically based on selections made by the utility, and transmitting commands to connected thermostats to execute the events.

2. **Patent ID: 11862972**
   - **Title**: Collaborative service provisioning of distributed energy resources
   - **Abstract**: This patent outlines a system and method for aggregating distributed energy resources (DERs) to fulfill requests. The technology involves a routing system that identifies and evaluates DER contributors based on profiles