In [2]:
topics = [
    "Administrative Policies and Procedures",
    "Organizational Structure",
    "Employee Handbook",
    "HR Management Systems",
    "Onboarding Processes",
    "New Hire Orientation",
    "Personnel Files Management",
    "Compliance Training",
    "Workplace Safety Procedures",
    "Performance Evaluation Systems",
    "Benefits Administration",
    "Time and Attendance Management",
    "Employee Recognition Programs",
    "Communication Channels",
    "Workforce Planning",
    "Diversity and Inclusion Initiatives",
    "Conflict Resolution Procedures",
    "Employee Development Programs",
    "Change Management Strategies",
    "Succession Planning"
]


In [5]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker to generate fake data
fake = Faker()

num_entries = len(topics)

data = {
    'rank': np.arange(1, num_entries + 1),
    'title': topics,
    'publication_date': [fake.date_between(start_date='-2y', end_date='today') for _ in range(num_entries)],
    'author': [fake.name() for _ in range(num_entries)],  # Changed 'inventor' to 'author'
    'department': [fake.company() for _ in range(num_entries)],  # Changed 'organization' to 'department'
    'publication_number': [str(fake.random_number(digits=10)) for _ in range(num_entries)],  # Changed 'publication_number' to 'document_number'
    'language': ['en'] * num_entries,
    'thumbnail': [fake.image_url() for _ in range(num_entries)],
    'pdf': [fake.url() for _ in range(num_entries)],
    'page': [random.randint(1, 10) for _ in range(num_entries)]
}

# Create DataFrame
df = pd.DataFrame(data)


In [8]:
from openai import AzureOpenAI
import os
import dotenv

dotenv.load_dotenv()

client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-03-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Function to generate synthetic patent text
def generate_patent_text(title):
    prompt = f"Create a text to simulate a adminstrative document within a company on \"{title}\""
    deployment_name = "gpt-35-turbo-instruct"
    response = client.completions.create(
        model=deployment_name,
        prompt=prompt,
        temperature=0.7,
        max_tokens=800
    )
    return response.choices[0].text.strip()

# Function to save synthetic patent text to file
def save_patent_text_to_file(id, text):
    directory = "synthetic_admin"
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, f"{id.replace("/", "_")}.txt"), "w") as file:
        file.write(text)

# Generate synthetic patent texts and save them to files
for index, row in df.iterrows():
    title = row['title']
    patent_text = generate_patent_text(title)
    save_patent_text_to_file(row['publication_number'], patent_text)
    print(f"Doc text for ID {row['publication_number']} saved successfully.")

print("All Docs texts saved successfully.")


Doc text for ID 8309030038 saved successfully.
Doc text for ID 6320554784 saved successfully.
Doc text for ID 979711036 saved successfully.
Doc text for ID 5644000230 saved successfully.
Doc text for ID 2910177507 saved successfully.
Doc text for ID 7967943304 saved successfully.
Doc text for ID 9825861125 saved successfully.
Doc text for ID 9058286412 saved successfully.
Doc text for ID 5632315866 saved successfully.
Doc text for ID 5346207284 saved successfully.
Doc text for ID 2640696840 saved successfully.
Doc text for ID 3360395245 saved successfully.
Doc text for ID 4406538911 saved successfully.
Doc text for ID 5741425636 saved successfully.
Doc text for ID 7297106292 saved successfully.
Doc text for ID 9047011157 saved successfully.
Doc text for ID 1770017717 saved successfully.
Doc text for ID 690856158 saved successfully.
Doc text for ID 5563506326 saved successfully.
Doc text for ID 2007163696 saved successfully.
All Docs texts saved successfully.


In [9]:

# Create an empty list to store text data
text_data = []

# Function to read text files and return text content
def read_text_file(file_path):
    with open(file_path, "r") as file:
        return file.read()
    
# Iterate over each row in the DataFrame
for index, row in df.iterrows():
    # Read the corresponding text file
    file_path = f"synthetic_admin/{row['publication_number'].replace("/", "_")}.txt"
    if os.path.exists(file_path):
        text_content = read_text_file(file_path)
        text_data.append({"publication_number": row["publication_number"], "text": text_content})

# Create a DataFrame from the text data
text_df = pd.DataFrame(text_data)

# Join the text DataFrame with the original DataFrame on the "id" column
merged_df = pd.merge(df, text_df, on="publication_number", how="left")

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter_char = RecursiveCharacterTextSplitter(
    chunk_size = 512,
    chunk_overlap  = 20,
    length_function = len,
    separators=["<ele>", ". ", " "],
    keep_separator=False
)

In [11]:
import pandas as pd

def chunk_documents(content_df: pd.DataFrame, text_splitter, word_count_threshold: int = 10, element_separator: str = "<ele>") -> pd.DataFrame:
    """
    Split a document into chunks according to the configuration of the text splitter
    """

    
    # Split content into chunks
    content_df["text_chunks"] = content_df["text"].apply(text_splitter.split_text)
    content_df = content_df.explode("text_chunks")
    content_df["text_chunks"] = content_df["text_chunks"].str.replace(element_separator, ", ")
    
    content_df["text_chunks"] = content_df["title"] + ": " + content_df["text_chunks"]
    
    # Drop failed extractions
    print(f'Files with extraction errors: {set(content_df[content_df["text_chunks"].isna()]["title"].to_list())}')
    content_df = content_df.dropna(axis=0, subset=["text_chunks"])
    
    # Drop chunks with less words than the threshold allows
    content_df = content_df[content_df["text_chunks"].apply(lambda n: len(n.split())) > word_count_threshold]

    return content_df

# Example usage:
# chunked_df = chunk_documents(df, text_splitter_char)


In [12]:
chunk_df = chunk_documents(merged_df, text_splitter_char)

Files with extraction errors: set()


In [13]:
import os
import pandas as pd
import ast

# Initialize AzureOpenAI client
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version="2024-03-01-preview",
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
)

# Few-shot prompt for entity extraction
few_shot_prompt = """
RETURN ONLY A VALID JSON OBJECT!!! NO ADDITIONAL TEXT

Extract relevant entities such as departments, procedures, technologies, and projects from the given administrative document.

Example 1:
Text: "The new HR management system enhances employee onboarding processes."
{"departments": [], "procedures": ["employee onboarding processes"], "technologies": ["HR management system"], "projects": []}

Example 2:
Text: "The safety procedures were updated for the manufacturing department."
{"departments": ["manufacturing"], "procedures": ["safety procedures"], "technologies": [], "projects": []}

Example 3:
Text: "The IT department implemented a new cybersecurity protocol."
{"departments": ["IT"], "procedures": ["cybersecurity protocol"], "technologies": [], "projects": []}

Example 4:
Text: "The company ABC Inc. initiated a project for workplace diversity training."
{"departments": [], "procedures": [], "technologies": [], "projects": ["workplace diversity training"]}

Example 5:
Text: "The administrative policies were reviewed and updated."
{"departments": [], "procedures": ["administrative policies"], "technologies": [], "projects": []}

RETURN ONLY A VALID JSON OBJECT!!! NO ADDITIONAL TEXT. JUST A JSON
IF THERE ARE NO ENTITIES RETURN AN EMPTY JSON
---"""



# Function to extract entities from text using Azure OpenAI API
def extract_entities(text):
    prompt = f"{few_shot_prompt}\nText: \"{text}\"\n"
    deployment_name = "gpt-35-turbo-instruct"
    response = client.completions.create(
        model=deployment_name,
        prompt=prompt,
        max_tokens=100,
        temperature=0.3,
        top_p=1,
        stop=["\n"]
    )
    # Parse the response as JSON
    entities_json = response.choices[0].text.strip()
    return entities_json


# Create an empty list to store extracted entity data
entity_data = []

# Iterate over each row in the DataFrame
for index, row in chunk_df.iterrows():
    text = row["text_chunks"]
    # Extract entities from the text
    entities = extract_entities(text)
    if entities is not None:
        entity_data.append(entities)
    else:
        entity_data.append('')


chunk_df["entities"] = entity_data

In [14]:

def retriever_model_inference(text_input: list[str]):

    client = AzureOpenAI(
        api_key = os.getenv("AZURE_OPENAI_KEY"),
        azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_version="2024-03-01-preview"
        )

    embeddings = client.embeddings.create(
        input=text_input,
        model="text-embedding-ada-002"
    )

    vectors = [np.array(i.embedding, dtype='f') for i in embeddings.data]


    return vectors


chunk_df["embeddings"] = retriever_model_inference(chunk_df.text_chunks.values.tolist())

In [15]:
vector_dim=1536
partition_nr=10

from utils.faiss_storage import index_initializaton, train_index, add_index_vectors
import numpy as np
import faiss

# 1. initialize index
#index = index_initializaton(vector_dim, partition_nr)
path_to_index = "a4_db_onboarding.index"

def check_index_state(index):
    return index.is_trained

def vector_db_setup(
    vector_dim: int, partition_nr: int, df: pd.DataFrame
):
    # 1. initialize index
    index = index_initializaton(vector_dim, partition_nr)
    print("Index initialized")

    # 2. check index state - and train if required
    if check_index_state(index):
        pass
    else:
        vectors = df["embeddings"].values
        vectors = np.array([i.astype(np.float32) for i in chunk_df["embeddings"].values])
        print("Shape of input vectors:", vectors.shape)
        train_index(index, vectors)
        print("Index trained")
        # 3. add vectors to index
        add_index_vectors(index, vectors)
        print("Vectors added to index")

    return index



index = vector_db_setup(vector_dim, partition_nr, chunk_df)

Index initialized
Shape of input vectors: (143, 1536)
Index trained
Vectors added to index


In [16]:
faiss.write_index(index, path_to_index)

In [17]:
index = faiss.read_index(path_to_index)
index.ntotal

143

In [18]:
chunk_df.drop(columns=["embeddings"]).to_csv("a4_db_onboarding.csv", index=False)

In [7]:
import pandas as pd 

pd.read_csv("a4_db_onboarding.csv").head(3).entities.values[0]

'{"departments": [], "procedures": ["Administrative Policies and Procedures", "administrative policies and procedures", "office hours"], "technologies": [], "projects": []}'