# FAISS Workflow
## 1) Create `.txt` files of unique values per attribute from the split files

In [3]:
import os
import pandas as pd

# This script processes CSV files containing neighbor entries for different attributes,
# extracts unique neighbors, and saves them to separate text files for each attribute.
#
# Input:  every file in `directory` whose name starts with `test_doi_<attribute>` or
#         `train_doi_<attribute>`; each one must contain a 'neighbor' column.
# Output: `unique_<attribute>.txt` in `output_directory`, one value per line, sorted.

# Directory containing the input CSV files
directory = 'splits'
# Directory where the output text files will be saved
output_directory = 'faiss_index/faiss_index_data/individual_values'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# List of attributes to process
attributes = ['bioActivity', 'collectionSite', 'collectionSpecie', 'collectionType', 'name']

# The input directory is only read, so one listing serves every attribute
split_filenames = os.listdir(directory)

for attribute in attributes:
    # Initialize a set to store unique neighbor entries
    unique_neighbors = set()
    # str.startswith accepts a tuple, so both split kinds are matched in one call
    prefixes = (f"test_doi_{attribute}", f"train_doi_{attribute}")

    for filename in split_filenames:
        if not filename.startswith(prefixes):
            continue
        filepath = os.path.join(directory, filename)
        # Read the CSV file
        df = pd.read_csv(filepath)
        # Missing cells are read as float NaN: left in, they either break the
        # sort below (float vs. str comparison) or end up as a literal "nan"
        # line. Drop them, and normalise everything else to text so values of
        # mixed inferred types can be sorted together.
        unique_neighbors.update(df['neighbor'].dropna().astype(str))

    if not unique_neighbors:
        # The (empty) file is still written, as before; just make it visible.
        print(f"Warning: no neighbor values found for attribute '{attribute}'.")

    # Save the unique neighbors to a text file, one per line in sorted order
    with open(os.path.join(output_directory, f'unique_{attribute}.txt'), 'w') as f:
        f.writelines(f"{neighbor}\n" for neighbor in sorted(unique_neighbors))


## 2) Create FAISS indexes from the unique values in the text files

In [1]:
# Create FAISS indexes
import os
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from dotenv import load_dotenv

# Builds one FAISS index per '.txt' file in `input_directory`. Every non-blank
# line of a file becomes one Document; the index is saved under
# `output_directory` as '<filename>.index'.

# Load environment variables from .env file
load_dotenv()

# Set your OpenAI API key (raises KeyError if the variable is not defined)
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory containing the text files
input_directory = 'faiss_index/faiss_index_data/individual_values'
output_directory = 'faiss_index'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Iterate over all text files in the input directory (sorted so that runs
# process the files in a reproducible order)
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.txt'):
        continue
    filepath = os.path.join(input_directory, filename)

    # Read the text file and create Document objects. Blank or whitespace-only
    # lines carry no value to search for, so they are not embedded.
    with open(filepath, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        entities = [
            Document(page_content=text, metadata={'text': text})
            for text in stripped_lines
            if text
        ]

    # An index cannot be built from zero documents; skip instead of crashing
    # and losing the remaining files.
    if not entities:
        print(f"Skipping '{filename}': no values to index.")
        continue

    # Create FAISS index from documents
    faiss_index = FAISS.from_documents(entities, embeddings)

    # Save the FAISS index locally. The '.txt' suffix is deliberately kept in
    # the name (e.g. 'unique_name.txt.index').
    index_path = os.path.join(output_directory, f'{filename}.index')
    faiss_index.save_local(index_path)

print("FAISS indices have been created and saved.")

FAISS indices have been created and saved.


## 2.5) Test the FAISS indexes with a similarity search on custom query values

In [None]:
# Test FAISS indexes
import os
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Read the OpenAI API key from the environment (KeyError if it is not set)
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Embedding client handed to FAISS when an index is loaded
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Folder that holds the saved FAISS indexes
index_directory = 'faiss_index'

# Attribute names keyed by their selection number, counting from 1
attribute_mapping = dict(enumerate(
    [
        'collectionSpecie',
        'collectionSite',
        'bioActivity',
        'name',
        'collectionType',
    ],
    start=1,
))

def load_faiss_index(attribute_number):
    """Load the saved FAISS index for the attribute with the given number.

    Args:
        attribute_number: Key into `attribute_mapping`.

    Returns:
        Whatever `FAISS.load_local` returns for that attribute's index path.

    Raises:
        ValueError: `attribute_number` has no (non-empty) entry in
            `attribute_mapping`.
        FileNotFoundError: nothing exists at the expected index path.
    """
    selected = attribute_mapping.get(attribute_number)
    if not selected:
        raise ValueError(f"Invalid attribute number: {attribute_number}")

    index_name = f'unique_{selected}.txt.index'
    location = os.path.join(index_directory, index_name)
    if os.path.exists(location):
        # NOTE(review): allow_dangerous_deserialization=True presumably means
        # pickled data is loaded from disk - only point this at index files
        # you created yourself.
        return FAISS.load_local(location, embeddings, allow_dangerous_deserialization=True)

    raise FileNotFoundError(f"FAISS index for attribute '{selected}' not found.")

def similarity_search(attribute_number, query, top_k=5):
    """Return the best matches for `query` from one attribute's FAISS index.

    Args:
        attribute_number: Key into `attribute_mapping` selecting which index
            to search.
        query: Text to search for.
        top_k: Maximum number of results to return.

    Returns:
        The (document, score) pairs produced by the index's
        `similarity_search_with_score`.

    Raises:
        ValueError, FileNotFoundError: propagated from `load_faiss_index`.
    """
    faiss_index = load_faiss_index(attribute_number)
    # LangChain names the result-count parameter `k`. Passing it as `top_k`
    # is not an error - the unknown keyword is absorbed by **kwargs - but the
    # requested count is then ignored and the library default is used.
    return faiss_index.similarity_search_with_score(query, k=top_k)

# Example usage: pick an index by number, run one query, print every hit.
# Valid numbers are the keys of attribute_mapping (1 to 5).
attribute_number = 2
# Free-text query to look up in the chosen index
query = "test"

try:
    results = similarity_search(attribute_number, query)
    # Each result is a (document, score) pair.
    # NOTE(review): the score is presumably a distance (lower = closer) - confirm.
    for hit, hit_score in results:
        print(f"Document: {hit.page_content}, Score: {hit_score}")
except (FileNotFoundError, ValueError) as problem:
    # Unknown attribute number or missing index: report it instead of crashing
    print(problem)