In [1]:
import pandas as pd

# Load the extracted patent metadata from a CSV file in the working directory
df = pd.read_csv("extracted_data.csv")

In [2]:
# Keep only the first two rows (all columns); every later cell works on this 2-row frame
df = df.iloc[:2, :]

In [3]:
from openai import AzureOpenAI
import os
import dotenv

# Load variables from a .env file into the process environment
dotenv.load_dotenv()

# Azure OpenAI client; the key and endpoint are read from environment
# variables (os.getenv returns None when a variable is not set)
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-03-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Function to generate synthetic patent text
def generate_patent_text(title):
    """Request a simulated automotive patent text about ``title``.

    Sends a single completion request through the module-level ``client``
    and returns the text of the first choice with surrounding whitespace
    stripped.
    """
    completion = client.completions.create(
        model="gpt-35-turbo-instruct",
        prompt=f'Create a text to simulate a patent in the automotive industry on "{title}"',
        temperature=0.7,
        max_tokens=800,
    )
    first_choice = completion.choices[0]
    return first_choice.text.strip()

# Function to save synthetic patent text to file
def save_patent_text_to_file(id, text):
    """Write ``text`` to ``synthetic_patents/<id>.txt``, creating the directory if needed.

    Args:
        id: Patent identifier string. Every "/" is replaced with "_" to build
            the file name. The name shadows the builtin ``id`` but is kept so
            existing callers keep working.
        text: Content written to the file.
    """
    directory = "synthetic_patents"
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
    os.makedirs(directory, exist_ok=True)
    # Build the name outside an f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    file_name = id.replace("/", "_") + ".txt"
    with open(os.path.join(directory, file_name), "w") as file:
        file.write(text)

# Generate synthetic patent texts and save them to files
for _, patent_row in df.iterrows():
    patent_id = patent_row['patent_id']
    # One completion request per patent title; the result goes straight to disk
    save_patent_text_to_file(patent_id, generate_patent_text(patent_row['title']))
    print(f"Patent text for ID {patent_id} saved successfully.")

print("All patent texts saved successfully.")


Patent text for ID patent/US10427604B2/en saved successfully.
Patent text for ID patent/US20210234767A1/en saved successfully.
All patent texts saved successfully.


In [4]:

# Create an empty list to store text data
text_data = []  # filled below with one {"patent_id", "text"} dict per patent file found


# Function to read text files and return text content
def read_text_file(file_path):
    """Return the full contents of the text file at ``file_path`` as a string."""
    with open(file_path, "r") as handle:
        contents = handle.read()
    return contents
    
# Iterate over each row in the DataFrame
for _, row in df.iterrows():
    patent_id = row["patent_id"]
    # Single quotes inside the f-string: reusing double quotes is a
    # SyntaxError before Python 3.12.
    file_path = f"synthetic_patents/{patent_id.replace('/', '_')}.txt"
    # Patents without a generated file are skipped here and end up with a
    # missing "text" value after the left merge below.
    if os.path.exists(file_path):
        text_data.append({"patent_id": patent_id, "text": read_text_file(file_path)})

# Create a DataFrame from the text data; explicit columns keep the merge key
# present even when no file was found (an empty list would otherwise produce
# a frame without "patent_id" and the merge would raise KeyError).
text_df = pd.DataFrame(text_data, columns=["patent_id", "text"])

# Left-join the texts onto the original DataFrame on the "patent_id" column
merged_df = pd.merge(df, text_df, on="patent_id", how="left")

In [5]:
# Display the merged frame: the metadata columns plus the generated "text" column
merged_df


Unnamed: 0,position,rank,patent_id,serpapi_link,title,priority_date,filing_date,grant_date,publication_date,inventor,assignee,publication_number,language,thumbnail,pdf,page,text
0,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,Title: Vision System for a Vehicle\n\nAbstract...
1,2,1,patent/US20210234767A1/en,https://serpapi.com/search.json?engine=google_...,Vehicle middleware,2011-11-16,2021-04-16,,2021-07-29,Christopher P. Ricci,Autoconnect Holdings Llc,US20210234767A1,en,https://patentimages.storage.googleapis.com/aa...,https://patentimages.storage.googleapis.com/42...,1.0,Title: Vehicle Middleware for Efficient Data T...


Chunking

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Character-based splitter: sizes are measured with len(), so chunk_size and
# chunk_overlap are counted in characters. Separators are listed from the
# custom "<ele>" marker down to a single space and are not kept in the chunks.
text_splitter_char = RecursiveCharacterTextSplitter(
    separators=["<ele>", ". ", " "],
    keep_separator=False,
    chunk_size=512,
    chunk_overlap=20,
    length_function=len,
)

In [7]:
import pandas as pd

def chunk_documents(content_df: pd.DataFrame, text_splitter, word_count_threshold: int = 10, element_separator: str = "<ele>") -> pd.DataFrame:
    """
    Split each document's text into chunks and return one row per chunk.

    Args:
        content_df: DataFrame with a "text" column (document body) and a
            "title" column. It is not modified.
        text_splitter: Object exposing ``split_text(str)`` that returns a list
            of strings.
        word_count_threshold: A chunk is kept only when its whitespace-separated
            word count (title prefix included) is strictly greater than this.
        element_separator: Literal marker replaced by ", " inside every chunk.

    Returns:
        A new DataFrame with the input columns plus "text_chunks", where each
        chunk is prefixed with "<title>: ". The exploded index is kept, so
        index labels repeat for chunks of the same document.
    """

    def _split(text):
        # Missing or non-string text (e.g. NaN left by a left merge) yields no
        # chunks, so the row is reported below as an extraction error instead
        # of raising inside the splitter.
        return text_splitter.split_text(text) if isinstance(text, str) else []

    # Split content into chunks; assign() builds a new frame so the caller's
    # DataFrame does not silently gain a "text_chunks" column.
    chunked = content_df.assign(text_chunks=content_df["text"].apply(_split))
    chunked = chunked.explode("text_chunks")
    # regex=False: the separator is a literal marker, and the default of
    # `regex` differs between pandas versions.
    chunked["text_chunks"] = chunked["text_chunks"].str.replace(element_separator, ", ", regex=False)

    chunked["text_chunks"] = chunked["title"] + ": " + chunked["text_chunks"]

    # Drop failed extractions (rows whose chunk is missing)
    failed_titles = set(chunked.loc[chunked["text_chunks"].isna(), "title"].to_list())
    print(f'Files with extraction errors: {failed_titles}')
    chunked = chunked.dropna(axis=0, subset=["text_chunks"])

    # Drop chunks with fewer words than the threshold allows
    word_counts = chunked["text_chunks"].str.split().str.len()
    # copy() returns an independent frame so callers can add columns without
    # triggering SettingWithCopyWarning.
    return chunked[word_counts > word_count_threshold].copy()

# Example usage (the input frame must contain "title" and "text" columns):
# chunk_df = chunk_documents(merged_df, text_splitter_char)


In [8]:

# Chunk every patent text with the character splitter, using the defaults
# word_count_threshold=10 and element_separator="<ele>"
chunk_df = chunk_documents(merged_df, text_splitter_char)

Files with extraction errors: set()


In [9]:
# Peek at the first three chunk strings
chunk_df.text_chunks.values[:3]

array(['Vision system for a vehicle: Title: Vision System for a Vehicle\n\nAbstract:\n\nThe present invention relates to a vision system for a vehicle, particularly for use in automobiles. The system comprises of a set of cameras and sensors placed strategically around the vehicle to provide a comprehensive and real-time view of the vehicle’s surroundings',
       'Vision system for a vehicle: The collected data is then processed and analyzed to provide the driver with enhanced situational awareness, enabling them to make informed decisions while driving.\n\nBackground:\n\nThe automotive industry has witnessed a significant increase in the use of advanced driver assistance systems (ADAS) to improve vehicle safety and performance. These systems heavily rely on various sensors and cameras to provide a 360-degree view of the vehicle’s surroundings',
       'Vision system for a vehicle: However, the existing systems have limitations in terms of accuracy, reliability, and cost. Therefore, t

NER

In [34]:
import os
import pandas as pd
import ast

# Initialize AzureOpenAI client
# Rebinds the global `client` with the same settings as the client created in
# the earlier cell; credentials come from environment variables.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-03-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Few-shot prompt for entity extraction
# Examples are numbered 1-5; the last one shows the expected output when the
# text contains no entities. The completion is expected right after a final
# 'Text: "..."' line appended by the caller.
few_shot_prompt = """RETURN ONLY A VALID JSON OBJECT!!! NO ADDITIONAL TEXT

Extract relevant entities such as people, technologies, places, and organizations from the given text.

Example 1:
Text: "The invention relates to an advanced LiDAR technology for environmental perception."
{"technologies": ["LiDAR"], "places": [], "people": [], "organizations": []}


Example 2:
Text: "The vehicle was tested in urban environments such as New York City and Los Angeles."
{"technologies": [], "places": ["New York City", "Los Angeles"], "people": [], "organizations": []}

Example 3:
Text: "The engineers John Smith and Alice Johnson collaborated on the development of the navigation system."
{"technologies": ["navigation system"], "places": [], "people": ["John Smith", "Alice Johnson"], "organizations": []}

Example 4:
Text: "The company XYZ Inc. funded the research project on autonomous vehicle navigation."
{"technologies": ["autonomous vehicle navigation"], "places": [], "people": [], "organizations": ["XYZ Inc."]}

Example 5:
Text: "A glass filled with liquids."
{"technologies": [], "places": [], "people": [], "organizations": []}


RETURN ONLY A VALID JSON OBJECT!!! NO ADDITIONAL TEXT. JUST A JSON
IF THERE ARE NO ENTITIES RETURN AN EMPTY JSON
---"""


# Function to extract entities from text using Azure OpenAI API
def extract_entities(text):
    """Ask the completion model for the entities mentioned in ``text``.

    The few-shot prompt is followed by a final ``Text: "<text>"`` line, and
    the request passes a newline as its stop sequence. The reply is returned
    as a raw, whitespace-stripped string; it is NOT parsed or validated as
    JSON here.
    """
    full_prompt = f'{few_shot_prompt}\nText: "{text}"\n'
    completion = client.completions.create(
        model="gpt-35-turbo-instruct",
        prompt=full_prompt,
        max_tokens=100,
        temperature=0.3,
        top_p=1,
        stop=["\n"],
    )
    raw_reply = completion.choices[0].text
    return raw_reply.strip()


# Create an empty list to store extracted entity data
entity_data = []

# One extraction request per chunk, in row order; a None result is stored as ''
for chunk_text in chunk_df["text_chunks"]:
    extracted = extract_entities(chunk_text)
    entity_data.append('' if extracted is None else extracted)


# The list is assigned by position, so it must have one entry per row
chunk_df["entities"] = entity_data

In [35]:
from utils.retriever import retriever_model_inference

# Embed every chunk with the project helper; assumes it returns one vector per
# input string, in input order — TODO confirm in utils.retriever
chunk_df["embeddings"] = retriever_model_inference(chunk_df.text_chunks.values.tolist())

True


In [42]:
# Shape of the first embedding vector
chunk_df["embeddings"].values[0].shape

(1536,)

In [61]:
# Embedding dimensionality; matches the (1536,) shape shown for the first embedding
vector_dim=1536
# Passed to index_initializaton; presumably the number of index partitions —
# verify against utils.faiss_storage
partition_nr=10

from utils.faiss_storage import index_initializaton, train_index, add_index_vectors
import numpy as np
import faiss

# 1. initialize index
#index = index_initializaton(vector_dim, partition_nr)
# File the FAISS index is written to and read back from in the cells below
path_to_index = "a4_db_data.index"

def check_index_state(index):
    """Return the ``is_trained`` attribute of ``index``."""
    trained = index.is_trained
    return trained

def vector_db_setup(
    vector_dim: int, partition_nr: int, df: pd.DataFrame
):
    """Create an index, train it if needed, and add the embeddings from ``df``.

    Args:
        vector_dim: Passed to ``index_initializaton``; expected to equal the
            length of each embedding vector.
        partition_nr: Passed to ``index_initializaton`` (presumably the number
            of index partitions — verify in utils.faiss_storage).
        df: DataFrame with an "embeddings" column holding one equal-length
            numeric vector per row.

    Returns:
        The index object with all vectors from ``df`` added.
    """
    # 1. initialize index
    index = index_initializaton(vector_dim, partition_nr)
    print("Index initialized")

    # Build one float32 matrix (one row per embedding) from the DataFrame that
    # was passed in, not from a notebook global.
    vectors = np.asarray(df["embeddings"].tolist(), dtype=np.float32)
    print("Shape of input vectors:", vectors.shape)

    # 2. check index state - and train if required
    if not check_index_state(index):
        train_index(index, vectors)
        print("Index trained")

    # 3. add vectors to index; done whether or not training was needed, so an
    # index that is already trained is not returned empty
    add_index_vectors(index, vectors)
    print("Vectors added to index")

    return index



# Build the index from the chunk embeddings
index = vector_db_setup(vector_dim, partition_nr, chunk_df)

Index initialized
Shape of input vectors: (17, 1536)
Index trained
Vectors added to index


In [62]:
# Persist the index to disk at path_to_index
faiss.write_index(index, path_to_index)

In [63]:
# Reload the index from disk and show how many vectors it holds
index = faiss.read_index(path_to_index)
index.ntotal

17

In [65]:
# Save the chunk metadata without the embeddings column (the vectors were added
# to the FAISS index saved above). The DataFrame index is not written; CSV row
# order is presumably what ties rows to index positions — TODO confirm.
chunk_df.drop(columns=["embeddings"]).to_csv("a4_db_data.csv", index=False)

In [67]:
# Read the CSV back and preview the first three rows
pd.read_csv("a4_db_data.csv").head(3)

Unnamed: 0,position,rank,patent_id,serpapi_link,title,priority_date,filing_date,grant_date,publication_date,inventor,assignee,publication_number,language,thumbnail,pdf,page,text,text_chunks,entities
0,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,Title: Vision System for a Vehicle\n\nAbstract...,Vision system for a vehicle: Title: Vision Sys...,
1,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,Title: Vision System for a Vehicle\n\nAbstract...,Vision system for a vehicle: The collected dat...,"{""technologies"": [""Vision system"", ""advanced d..."
2,1,0,patent/US10427604B2/en,https://serpapi.com/search.json?engine=google_...,Vision system for a vehicle,2000-03-02,2018-08-27,2019-10-01,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,Title: Vision System for a Vehicle\n\nAbstract...,"Vision system for a vehicle: However, the exis...","{""technologies"": [], ""places"": [], ""people"": [..."
