In [48]:
import pandas as pd




# Reload the patents table, keep only the listed columns (in this order),
# write the result back to the same file without the index, then preview two rows.
df1 = pd.read_csv("a4_db_patents.csv").loc[
    :,
    [
        "rank", "title", "publication_date", "inventor", "assignee",
        "publication_number", "language", "thumbnail", "pdf", "page",
        "entities", "text_chunks",
    ],
]

df1.to_csv("a4_db_patents.csv", index=False)
df1.head(2)

Unnamed: 0,rank,title,publication_date,inventor,assignee,publication_number,language,thumbnail,pdf,page,entities,text_chunks
0,0,Vision system for a vehicle,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,"{""technologies"": [""vision system""], ""places"": ...",Vision system for a vehicle: Title: Vision Sys...
1,0,Vision system for a vehicle,2019-10-01,Kenneth Schofield,Magna Electronics Inc.,US10427604B2,en,https://patentimages.storage.googleapis.com/24...,https://patentimages.storage.googleapis.com/57...,1.0,,Vision system for a vehicle: This patent prese...


In [49]:
# Display the column labels of the patents frame.
df1.columns

Index(['rank', 'title', 'publication_date', 'inventor', 'assignee',
       'publication_number', 'language', 'thumbnail', 'pdf', 'page',
       'entities', 'text_chunks'],
      dtype='object')

In [50]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker to generate fake data
fake = Faker()

# Seed every random source used below so that a fresh "Restart & Run All"
# regenerates the same synthetic metadata (and hence the same publication
# numbers, which are later used as file names).
SEED = 42
Faker.seed(SEED)
random.seed(SEED)

titles = [
    "Mastering Your Vehicle's Red Alerts: A Comprehensive Guide to Critical Maintenance",
    "Navigating Amber Warnings: Essential Maintenance Tips to Keep Your Vehicle Running Smoothly",
    "Green Light Driving: Optimizing Performance and Efficiency for Your Vehicle",
    "Understanding Red Signals: Troubleshooting and Resolving Critical Automotive Issues",
    "Amber Alert: Your Guide to Intermediate Maintenance and Precautionary Measures",
    "Green Zone Maintenance: Proactive Strategies for Long-Term Vehicle Health",
    "Red Flag Repairs: Step-by-Step Solutions for Urgent Automotive Problems",
    "Amber Zone Awareness: Identifying Potential Issues and Taking Preventive Action",
    "Driving in the Green Lane: Maintaining Peak Performance and Reliability",
    "Red Alert Resolutions: Swift and Effective Responses to Emergency Automotive Situations"
]


# Number of entries: derived from the titles so every column below has the
# same length even if a title is added or removed.
num_entries = len(titles)

# Column-wise synthetic metadata, one value per title
data = {
    'rank': np.arange(1, num_entries + 1),
    'title': titles,
    'publication_date': [fake.date_between(start_date='-2y', end_date='today') for _ in range(num_entries)],
    'inventor': [fake.name() for _ in range(num_entries)],
    'assignee': [fake.company() for _ in range(num_entries)],
    'publication_number': [str(fake.random_number(digits=10)) for _ in range(num_entries)],
    'language': ['en'] * num_entries,
    'thumbnail': [fake.image_url() for _ in range(num_entries)],
    'pdf': [fake.url() for _ in range(num_entries)],
    'page': [random.randint(1, 10) for _ in range(num_entries)]
}

# Create DataFrame
df = pd.DataFrame(data)


In [26]:
# Display the column labels of the synthetic manuals frame.
df.columns

Index(['rank', 'title', 'publication_date', 'inventor', 'assignee',
       'publication_number', 'language', 'thumbnail', 'pdf', 'page'],
      dtype='object')

In [51]:
from openai import AzureOpenAI
import os
import dotenv

# Load environment variables via python-dotenv before the credentials are read below.
dotenv.load_dotenv()

# Azure OpenAI client. The key and endpoint are read from the environment variables
# AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT instead of being hard-coded.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-03-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Ask the completions deployment for a synthetic manual on a given title
def generate_patent_text(title):
    """Generate a synthetic automotive manual text for ``title``.

    Sends one completion request through the module-level ``client`` and
    returns the text of the first choice with surrounding whitespace removed.
    """
    request = {
        "model": "gpt-35-turbo-instruct",
        "prompt": 'Create a text to simulate a manual on: "{}", related to the automotive industry'.format(title),
        "temperature": 0.7,
        "max_tokens": 800,
    }
    completion = client.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.text.strip()

# Function to save synthetic patent text to file
def save_patent_text_to_file(id, text):
    """Write ``text`` to ``synthetic_manuals/<id>.txt``.

    Args:
        id: identifier string used as the file name; every "/" is replaced
            with "_" so the identifier stays a single path component.
        text: content written to the file (an existing file is overwritten).
    """
    directory = "synthetic_manuals"
    # exist_ok=True replaces the separate exists-check and also works when the
    # directory is already there.
    os.makedirs(directory, exist_ok=True)
    # Build the name outside an f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError before Python 3.12.
    file_name = id.replace("/", "_") + ".txt"
    with open(os.path.join(directory, file_name), "w") as file:
        file.write(text)

# For every row of df: generate a text from its title and store it under its publication number
for publication_number, manual_title in zip(df['publication_number'], df['title']):
    save_patent_text_to_file(publication_number, generate_patent_text(manual_title))
    print(f"Patent text for ID {publication_number} saved successfully.")

print("All patent texts saved successfully.")


Patent text for ID 1740611947 saved successfully.
Patent text for ID 9754827200 saved successfully.
Patent text for ID 1640155484 saved successfully.
Patent text for ID 6773150709 saved successfully.
Patent text for ID 1877850619 saved successfully.
Patent text for ID 2361427102 saved successfully.
Patent text for ID 7395006099 saved successfully.
Patent text for ID 8632135679 saved successfully.
Patent text for ID 5765289575 saved successfully.
Patent text for ID 8281010559 saved successfully.
All patent texts saved successfully.


In [52]:

# Accumulator for one {"publication_number", "text"} record per manual file found on disk
text_data = []


# Read a text file and hand back its content
def read_text_file(file_path):
    """Return the full contents of the text file at ``file_path``."""
    with open(file_path, "r") as source:
        contents = source.read()
    return contents
    
# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Read the corresponding text file. The file name is built outside the
    # f-string because nesting double quotes inside a double-quoted f-string
    # is a SyntaxError before Python 3.12.
    file_name = row['publication_number'].replace("/", "_") + ".txt"
    file_path = f"synthetic_manuals/{file_name}"
    if os.path.exists(file_path):
        text_content = read_text_file(file_path)
        text_data.append({"publication_number": row["publication_number"], "text": text_content})

# Create a DataFrame from the text data. The columns are named explicitly so
# the merge below still has its key column when no file was found at all.
text_df = pd.DataFrame(text_data, columns=["publication_number", "text"])

# Join the text DataFrame with the original DataFrame on the "publication_number" column;
# rows without a text file keep a missing "text" value (left join).
merged_df = pd.merge(df, text_df, on="publication_number", how="left")

In [53]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration used for chunking: lengths are measured with len,
# chunks target 512 units with an overlap of 20, and the separators are
# "<ele>", ". " and " " (separators are not kept in the chunks).
text_splitter_char = RecursiveCharacterTextSplitter(
    separators=["<ele>", ". ", " "],
    keep_separator=False,
    length_function=len,
    chunk_size=512,
    chunk_overlap=20,
)

In [54]:
import pandas as pd

def chunk_documents(content_df: pd.DataFrame, text_splitter, word_count_threshold: int = 10, element_separator: str = "<ele>") -> pd.DataFrame:
    """
    Split each document into chunks according to the configuration of the text splitter.

    Args:
        content_df: frame with at least a "title" and a "text" column. It is
            left unmodified; the result is a new frame.
        text_splitter: object exposing ``split_text(text)`` that returns a
            list of chunk strings.
        word_count_threshold: a chunk (title prefix included) is kept only if
            it has strictly more whitespace-separated words than this.
        element_separator: marker replaced by ", " inside every chunk.

    Returns:
        A new DataFrame with one row per kept chunk: the input columns plus
        "text_chunks" holding "<title>: <chunk>".
    """

    def _split(text):
        # A missing/non-string text cannot be split; returning None routes the
        # row to the "extraction errors" report below instead of crashing.
        return text_splitter.split_text(text) if isinstance(text, str) else None

    # Split content into chunks. assign() builds a new frame, so the caller's
    # DataFrame does not silently gain a "text_chunks" column.
    chunked = content_df.assign(text_chunks=content_df["text"].apply(_split))
    chunked = chunked.explode("text_chunks")
    # regex=False: the separator is a literal marker, independent of the
    # pandas-version default for `regex`.
    chunked["text_chunks"] = chunked["text_chunks"].str.replace(element_separator, ", ", regex=False)

    chunked["text_chunks"] = chunked["title"] + ": " + chunked["text_chunks"]

    # Drop failed extractions
    print(f'Files with extraction errors: {set(chunked[chunked["text_chunks"].isna()]["title"].to_list())}')
    chunked = chunked.dropna(axis=0, subset=["text_chunks"])

    # Drop chunks with less words than the threshold allows
    word_counts = chunked["text_chunks"].apply(lambda chunk: len(chunk.split()))
    chunked = chunked[word_counts > word_count_threshold]

    return chunked

# Example usage:
# chunked_df = chunk_documents(df, text_splitter_char)


In [55]:
# Chunk the merged manuals frame with the character splitter (default threshold and separator).
chunk_df = chunk_documents(merged_df, text_splitter_char)

Files with extraction errors: set()


In [56]:
import os
import pandas as pd
import ast

# Initialize AzureOpenAI client.
# NOTE(review): this rebinds `client` with the same arguments as the client
# created in the earlier generation cell; it reads the same environment variables.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-03-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Few-shot prompt for entity extraction: an instruction, five worked examples
# (each a quoted text followed by a one-line JSON object with the keys
# "technologies", "places", "people", "organizations"), and a closing reminder.
# NOTE(review): the last example is labelled "Example 4" a second time and the
# closing line spells "ADDITIOANL"; both are left untouched here because the
# string is sent to the model verbatim and editing it changes the request.
few_shot_prompt = """RETURN ONLY A VALID JSON OBJECT!!! NO ADDITIONAL TEXT

Extract relevant entities such as people, technologies, places, and organizations from the given text.

Example 1:
Text: "The invention relates to an advanced LiDAR technology for environmental perception."
{"technologies": ["LiDAR"], "places": [], "people": [], "organizations": []}


Example 2:
Text: "The vehicle was tested in urban environments such as New York City and Los Angeles."
{"technologies": [], "places": ["New York City", "Los Angeles"], "people": [], "organizations": []}

Example 3:
Text: "The engineers John Smith and Alice Johnson collaborated on the development of the navigation system."
{"technologies": ["navigation system"], "places": [], "people": ["John Smith", "Alice Johnson"], "organizations": []}

Example 4:
Text: "The company XYZ Inc. funded the research project on autonomous vehicle navigation."
{"technologies": ["autonomous vehicle navigation"], "places": [], "people": [], "organizations": ["XYZ Inc."]}

Example 4:
Text: "A glass filled with liquids."
{"technologies": [], "places": [], "people": [], "organizations": []}


RETURN ONLY A VALID JSON OBJECT!!! NO ADDITIOANL TEXT. JUST A JSON
IF THERE ARE NOT ENTITIES RETURN AN EMPTY JSON
---"""


# Query the Azure OpenAI completions deployment for the entities in a text
def extract_entities(text):
    """Ask the model which entities are mentioned in ``text``.

    The module-level ``few_shot_prompt`` is placed in front of the quoted
    text and generation stops at the first newline. The reply is returned as
    a whitespace-stripped string; it is NOT parsed or validated as JSON here.
    """
    completion = client.completions.create(
        model="gpt-35-turbo-instruct",
        prompt='{}\nText: "{}"\n'.format(few_shot_prompt, text),
        max_tokens=100,
        temperature=0.3,
        top_p=1,
        stop=["\n"],
    )
    # Raw model output; any parsing is left to the caller
    return completion.choices[0].text.strip()


# Extract entities for every chunk, in row order; a None result is stored as ''
entity_data = [
    '' if extracted is None else extracted
    for extracted in map(extract_entities, chunk_df["text_chunks"])
]


# Attach the extracted entity strings as a new column (positional, one per row)
chunk_df["entities"] = entity_data

In [57]:

def retriever_model_inference(text_input: list[str]):
    """Embed each string of ``text_input`` with the "text-embedding-ada-002" deployment.

    Args:
        text_input: texts to embed, sent in a single embeddings request.

    Returns:
        A list of float32 numpy vectors, one per item of the response data.
        An empty input returns an empty list without creating a client.
    """
    # Nothing to embed: skip client creation and the API call entirely.
    if len(text_input) == 0:
        return []

    client = AzureOpenAI(
        # Same variable name as the other clients in this notebook; the name
        # previously used here is kept as a fallback so existing setups still work.
        api_key = os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("AZURE_OPENAI_KEY"),
        azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_version="2024-03-01-preview"
        )

    embeddings = client.embeddings.create(
        input=text_input,
        model="text-embedding-ada-002"
    )

    # dtype 'f' is single precision (float32)
    vectors = [np.array(i.embedding, dtype='f') for i in embeddings.data]

    return vectors


# Embed all chunk texts in one call and store the vectors in a new "embeddings" column.
chunk_df["embeddings"] = retriever_model_inference(chunk_df.text_chunks.values.tolist())

In [58]:
# NOTE(review): in the visible code, vector_dim and partition_nr are only
# referenced by the commented-out index_initializaton call below.
vector_dim=1536
partition_nr=10

from utils.faiss_storage import train_index, add_index_vectors
import numpy as np
import faiss

# 1. initialize index
#index = index_initializaton(vector_dim, partition_nr)
# Path of the FAISS index file that the following cells read, extend and write back
path_to_index = "a4_db_data.index"

def vector_db_setup(df: pd.DataFrame):
    """Load the FAISS index stored at ``path_to_index`` and feed it the embeddings of ``df``.

    Args:
        df: frame with an "embeddings" column holding one vector per row.

    Returns:
        The in-memory index after ``train_index`` and ``add_index_vectors``
        were called with the vectors. The index is not written back to disk here.

    NOTE(review): add_index_vectors presumably appends, so running this twice
    against the same saved index would add the same vectors again — confirm.
    """

    index = faiss.read_index(path_to_index)

    # Size of the stored index before the new vectors are handed over
    print(index.ntotal)

    # Build the float32 matrix from the frame that was passed in. The previous
    # code read the global chunk_df here and ignored the `df` argument.
    vectors = np.array([np.asarray(vec, dtype=np.float32) for vec in df["embeddings"].values])
    train_index(index, vectors)
    # 3. add vectors to index
    add_index_vectors(index, vectors)

    return index



# Extend the stored index with the manual chunk embeddings (prints the index size before adding).
index = vector_db_setup(chunk_df)

185


In [59]:
# Persist the updated index to the same path it was read from.
faiss.write_index(index, path_to_index)

In [60]:
# Reload the index from disk and display how many vectors it holds after the update.
index = faiss.read_index(path_to_index)
index.ntotal

266

In [61]:
# Export the chunk metadata without the raw text and the embedding vectors
manuals_export = chunk_df.drop(columns=["embeddings", "text"])
manuals_export.to_csv("a4_db_manuals.csv", index=False)

In [62]:
# Stack the patent rows and the manual rows (patents first) into one CSV
data = pd.read_csv("a4_db_patents.csv")
manuals = pd.read_csv("a4_db_manuals.csv")

combined = pd.concat([data, manuals])
combined.to_csv("a4_db_data.csv", index=False)