In [4]:
import pandas as pd

df = pd.read_csv("preperation/extracted_data.csv")

In [5]:
df = df.iloc[:20, :]

In [6]:
from openai import AzureOpenAI
import os
import dotenv

dotenv.load_dotenv()

client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-03-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Function to generate synthetic patent text
def generate_patent_text(title):
    prompt = f"Create a text to simulate a patent in the automotive industry on \"{title}\""
    deployment_name = "gpt-35-turbo-instruct"
    response = client.completions.create(
        model=deployment_name,
        prompt=prompt,
        temperature=0.7,
        max_tokens=800
    )
    return response.choices[0].text.strip()

# Function to save synthetic patent text to file
def save_patent_text_to_file(id, text):
    directory = "synthetic_patents"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, f"{id.replace("/", "_")}.txt"), "w") as file:
        file.write(text)

# Generate synthetic patent texts and save them to files
for index, row in df.iterrows():
    title = row['title']
    patent_text = generate_patent_text(title)
    save_patent_text_to_file(row['patent_id'], patent_text)
    print(f"Patent text for ID {row['patent_id']} saved successfully.")

print("All patent texts saved successfully.")


Patent text for ID patent/US10427604B2/en saved successfully.
Patent text for ID patent/US20210234767A1/en saved successfully.
Patent text for ID patent/US10979875B2/en saved successfully.
Patent text for ID patent/US10320836B2/en saved successfully.
Patent text for ID patent/US11790420B2/en saved successfully.
Patent text for ID patent/US10613579B2/en saved successfully.
Patent text for ID patent/US11250648B2/en saved successfully.
Patent text for ID patent/US11627433B2/en saved successfully.
Patent text for ID patent/US11380107B2/en saved successfully.
Patent text for ID patent/CN111630404B/en saved successfully.
Patent text for ID patent/US11411917B2/en saved successfully.
Patent text for ID patent/US10846690B2/en saved successfully.
Patent text for ID patent/US11513187B2/en saved successfully.
Patent text for ID patent/US11462061B2/en saved successfully.
Patent text for ID patent/US11038363B2/en saved successfully.
Patent text for ID patent/EP3835810B1/en saved successfully.
Patent

In [7]:

# Create an empty list to store text data
text_data = []


# Function to read text files and return text content
def read_text_file(file_path):
    with open(file_path, "r") as file:
        return file.read()
    
# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Read the corresponding text file
    file_path = f"synthetic_patents/{row['patent_id'].replace("/", "_")}.txt"
    if os.path.exists(file_path):
        text_content = read_text_file(file_path)
        text_data.append({"patent_id": row["patent_id"], "text": text_content})

# Create a DataFrame from the text data
text_df = pd.DataFrame(text_data)

# Join the text DataFrame with the original DataFrame on the "id" column
merged_df = pd.merge(df, text_df, on="patent_id", how="left")

In [8]:
merged_df


Unnamed: 0,position,rank,patent_id,serpapi_link,title,priority_date,filing_date,grant_date,publication_date,inventor,assignee,publication_number,language,thumbnail,pdf,page,text
0,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,Title: Vision System for a Vehicle\n\nAbstract...
1,2,1,patent/US20210234767A1/en,https://serpapi.com/search.json?engine=google_...,Vehicle middleware,2011-11-16,2021-04-16,,2021-07-29,Christopher P. Ricci,Autoconnect Holdings Llc,US20210234767A1,en,https://patentimages.storage.googleapis.com/aa...,https://patentimages.storage.googleapis.com/42...,1.0,Title: Vehicle Middleware System for Enhanced ...
2,3,2,patent/US10979875B2/en,https://serpapi.com/search.json?engine=google_...,System and method for wireless interface selec...,2011-01-14,2018-09-11,2021-04-13,2021-04-13,Lillian Lei Dai,"Cisco Technology, Inc.",US10979875B2,en,https://patentimages.storage.googleapis.com/46...,https://patentimages.storage.googleapis.com/31...,1.0,"Title: ""System and Method for Wireless Interfa..."
3,4,3,patent/US10320836B2/en,https://serpapi.com/search.json?engine=google_...,Automotive ECU controller and data network hav...,2017-01-03,2018-07-16,2019-06-11,2019-06-11,Tal Efraim Ben David,Karamba Security Ltd.,US10320836B2,en,https://patentimages.storage.googleapis.com/50...,https://patentimages.storage.googleapis.com/40...,1.0,Title: Automotive ECU controller and data netw...
4,5,4,patent/US11790420B2/en,https://serpapi.com/search.json?engine=google_...,Visual discovery tool for automotive manufactu...,2016-10-18,2021-01-04,2023-10-17,2023-10-17,Jeffrey Stuart Cotton,"Autoalert, Llc",US11790420B2,en,https://patentimages.storage.googleapis.com/38...,https://patentimages.storage.googleapis.com/f7...,1.0,Title: Visual Discovery Tool for Automotive Ma...
5,6,5,patent/US10613579B2/en,https://serpapi.com/search.json?engine=google_...,Automotive display device,2015-03-02,2019-01-30,2020-04-07,2020-04-07,Seong Min Wang,"Samsung Display Co., Ltd.",US10613579B2,en,https://patentimages.storage.googleapis.com/68...,https://patentimages.storage.googleapis.com/2d...,1.0,Title: Automotive Display Device for Enhanced ...
6,7,6,patent/US11250648B2/en,https://serpapi.com/search.json?engine=google_...,Predictive maintenance of automotive transmission,2019-12-18,2019-12-18,2022-02-15,2022-02-15,Poorna Kale,"Micron Technology, Inc.",US11250648B2,en,https://patentimages.storage.googleapis.com/d3...,https://patentimages.storage.googleapis.com/36...,1.0,Title: Predictive Maintenance System for Autom...
7,8,7,patent/US11627433B2/en,https://serpapi.com/search.json?engine=google_...,Enhanced automotive passive entry,2017-02-10,2021-12-21,2023-04-11,2023-04-11,Brent M. Ledvina,Apple Inc.,US11627433B2,en,https://patentimages.storage.googleapis.com/7f...,https://patentimages.storage.googleapis.com/95...,1.0,Title: Enhanced Automotive Passive Entry Syste...
8,9,8,patent/US11380107B2/en,https://serpapi.com/search.json?engine=google_...,Power and data center (PDC) for automotive app...,2018-06-29,2019-06-28,2022-07-05,2022-07-05,Lee Bauer,Aptiv Technologies Limited,US11380107B2,en,https://patentimages.storage.googleapis.com/83...,https://patentimages.storage.googleapis.com/ca...,1.0,"Title: ""Power and Data Center (PDC) for Automo..."
9,10,9,patent/CN111630404B/en,https://serpapi.com/search.json?engine=google_...,"Detection, mitigation and avoidance of mutual ...",2017-11-29,2018-11-25,2023-10-20,2023-10-20,约拉姆·斯特廷尔,阿尔贝机器人有限公司,CN111630404B,en,,https://patentimages.storage.googleapis.com/a0...,1.0,"Title: Detection, Mitigation and Avoidance of ..."




Chunking

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter_char = RecursiveCharacterTextSplitter(
    chunk_size = 512,
    chunk_overlap  = 20,
    length_function = len,
    separators=["<ele>", ". ", " "],
    keep_separator=False
)

In [10]:
import pandas as pd

def chunk_documents(content_df: pd.DataFrame, text_splitter, word_count_threshold: int = 10, element_separator: str = "<ele>") -> pd.DataFrame:
    """
    Split a document into chunks according to the configuration of the text splitter
    """

    
    # Split content into chunks
    content_df["text_chunks"] = content_df["text"].apply(text_splitter.split_text)
    content_df = content_df.explode("text_chunks")
    content_df["text_chunks"] = content_df["text_chunks"].str.replace(element_separator, ", ")
    
    content_df["text_chunks"] = content_df["title"] + ": " + content_df["text_chunks"]
    
    # Drop failed extractions
    print(f'Files with extraction errors: {set(content_df[content_df["text_chunks"].isna()]["title"].to_list())}')
    content_df = content_df.dropna(axis=0, subset=["text_chunks"])
    
    # Drop chunks with less words than the threshold allows
    content_df = content_df[content_df["text_chunks"].apply(lambda n: len(n.split())) > word_count_threshold]

    return content_df

# Example usage:
# chunked_df = chunk_documents(df, text_splitter_char)


In [11]:

chunk_df = chunk_documents(merged_df, text_splitter_char)

Files with extraction errors: set()


In [12]:
chunk_df.text_chunks.values[:3]

array(["Vision system for a vehicle: Title: Vision System for a Vehicle\n\nAbstract:\nThe present invention relates to a vision system for a vehicle, specifically designed to improve the safety and efficiency of driving. This system utilizes advanced technology to provide a 360-degree view of the surroundings of the vehicle, allowing for better awareness of potential hazards and obstacles. The vision system is integrated into the vehicle's existing control systems, making it easy to install and operate",
       "Vision system for a vehicle: This patent presents a detailed description of the components and functionality of the vision system, along with its potential applications in the automotive industry.\n\nBackground:\nWith the increasing number of vehicles on the road, there is a growing concern for the safety of drivers and passengers. Many accidents occur due to limited visibility, blind spots, and other factors that impede the driver's ability to see their surroundings",
       "

NER

In [13]:
import os
import pandas as pd
import ast

# Initialize AzureOpenAI client
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-03-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Few-shot prompt for entity extraction
few_shot_prompt = """RETURN ONLY A VALID JSON OBJECT!!! NO ADDITIONAL TEXT

Extract relevant entities such as people, technologies, places, and organizations from the given text.

Example 1:
Text: "The invention relates to an advanced LiDAR technology for environmental perception."
{"technologies": ["LiDAR"], "places": [], "people": [], "organizations": []}


Example 2:
Text: "The vehicle was tested in urban environments such as New York City and Los Angeles."
{"technologies": [], "places": ["New York City", "Los Angeles"], "people": [], "organizations": []}

Example 3:
Text: "The engineers John Smith and Alice Johnson collaborated on the development of the navigation system."
{"technologies": ["navigation system"], "places": [], "people": ["John Smith", "Alice Johnson"], "organizations": []}

Example 4:
Text: "The company XYZ Inc. funded the research project on autonomous vehicle navigation."
{"technologies": ["autonomous vehicle navigation"], "places": [], "people": [], "organizations": ["XYZ Inc."]}

Example 4:
Text: "A glass filled with liquids."
{"technologies": [], "places": [], "people": [], "organizations": []}


RETURN ONLY A VALID JSON OBJECT!!! NO ADDITIOANL TEXT. JUST A JSON
IF THERE ARE NOT ENTITIES RETURN AN EMPTY JSON
---"""


# Function to extract entities from text using Azure OpenAI API
def extract_entities(text):
    prompt = f"{few_shot_prompt}\nText: \"{text}\"\n"
    deployment_name = "gpt-35-turbo-instruct"
    response = client.completions.create(
        model=deployment_name,
        prompt=prompt,
        max_tokens=100,
        temperature=0.3,
        top_p=1,
        stop=["\n"]
    )
    # Parse the response as JSON
    entities_json = response.choices[0].text.strip()
    return entities_json


# Create an empty list to store extracted entity data
entity_data = []

# Iterate over each row in the DataFrame
for index, row in chunk_df.iterrows():
    text = row["text_chunks"]
    # Extract entities from the text
    entities = extract_entities(text)
    if entities is not None:
        entity_data.append(entities)
    else:
        entity_data.append('')


chunk_df["entities"] = entity_data

In [14]:
from utils.retriever import retriever_model_inference

chunk_df["embeddings"] = retriever_model_inference(chunk_df.text_chunks.values.tolist())

True


In [15]:
chunk_df["embeddings"].values[0].shape

(1536,)

In [16]:
vector_dim=1536
partition_nr=10

from utils.faiss_storage import index_initializaton, train_index, add_index_vectors
import numpy as np
import faiss

# 1. initialize index
#index = index_initializaton(vector_dim, partition_nr)
path_to_index = "a4_db_data.index"

def check_index_state(index):
    return index.is_trained

def vector_db_setup(
    vector_dim: int, partition_nr: int, df: pd.DataFrame
):
    # 1. initialize index
    index = index_initializaton(vector_dim, partition_nr)
    print("Index initialized")

    # 2. check index state - and train if required
    if check_index_state(index):
        pass
    else:
        vectors = df["embeddings"].values
        vectors = np.array([i.astype(np.float32) for i in chunk_df["embeddings"].values])
        print("Shape of input vectors:", vectors.shape)
        train_index(index, vectors)
        print("Index trained")
        # 3. add vectors to index
        add_index_vectors(index, vectors)
        print("Vectors added to index")

    return index



index = vector_db_setup(vector_dim, partition_nr, chunk_df)

Index initialized
Shape of input vectors: (185, 1536)
Index trained
Vectors added to index


In [17]:
faiss.write_index(index, path_to_index)

In [18]:
index = faiss.read_index(path_to_index)
index.ntotal

185

In [19]:
chunk_df.drop(columns=["embeddings"]).to_csv("a4_db_data.csv", index=False)

In [20]:
pd.read_csv("a4_db_data.csv").head(3)

Unnamed: 0,position,rank,patent_id,serpapi_link,title,priority_date,filing_date,grant_date,publication_date,inventor,assignee,publication_number,language,thumbnail,pdf,page,text,text_chunks,entities
0,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,Title: Vision System for a Vehicle\n\nAbstract...,Vision system for a vehicle: Title: Vision Sys...,
1,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,Title: Vision System for a Vehicle\n\nAbstract...,Vision system for a vehicle: This patent prese...,"{""technologies"": [""vision system""], ""places"": ..."
2,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,Title: Vision System for a Vehicle\n\nAbstract...,Vision system for a vehicle: Traditional visio...,
