In [3]:
import os
import pandas as pd

# Directory containing the files
directory = 'splits'
output_directory = 'faiss_index'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# List of attributes to process
attributes = ['bioActivity', 'collectionSite', 'collectionSpecie', 'collectionType', 'name']

for attribute in attributes:
    # Initialize a set to store unique neighbor entries
    unique_neighbors = set()

    # Iterate over all files in the directory
    for filename in os.listdir(directory):
        if filename.startswith(f"test_doi_{attribute}") or filename.startswith(f"train_doi_{attribute}"):
            filepath = os.path.join(directory, filename)
            # Read the CSV file
            df = pd.read_csv(filepath)
            # Add unique neighbor entries to the set
            unique_neighbors.update(df['neighbor'].unique())

    # Save the unique neighbors to a text file
    with open(os.path.join(output_directory, f'unique_{attribute}.txt'), 'w') as f:
        for neighbor in sorted(unique_neighbors):
            f.write(f"{neighbor}\n")


In [21]:
import os
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Set your OpenAI API key
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory containing the text files
input_directory = 'faiss_index'
output_directory = 'faiss_index_trained'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Iterate over all text files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith('.txt'):
        filepath = os.path.join(input_directory, filename)
        
        # Read the text file and create Document objects
        entities = []
        with open(filepath, 'r') as f:
            for line in f:
                text = line.strip()
                doc = Document(page_content=text, metadata={'text': text})
                entities.append(doc)
        
        # Create FAISS index from documents
        faiss_index = FAISS.from_documents(entities, embeddings)
        
        # Save the FAISS index locally
        index_path = os.path.join(output_directory, f'{filename}.index')
        faiss_index.save_local(index_path)

print("FAISS indices have been created and saved.")

2024-06-19 16:21:32,841 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:21:33,940 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:21:35,286 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:21:37,396 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:21:39,251 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


FAISS indices have been created and saved.


In [10]:
import os
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Set your OpenAI API key
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory containing the FAISS indexes
index_directory = 'faiss_index_trained'

# Mapping from number to attribute
attribute_mapping = {
    1: 'collectionSpecie',
    2: 'collectionSite',
    3: 'bioActivity',
    4: 'name',
    5: 'collectionType'
}

def load_faiss_index(attribute_number):
    attribute = attribute_mapping.get(attribute_number)
    if not attribute:
        raise ValueError(f"Invalid attribute number: {attribute_number}")
    index_path = os.path.join(index_directory, f'unique_{attribute}.txt.index')
    if not os.path.exists(index_path):
        raise FileNotFoundError(f"FAISS index for attribute '{attribute}' not found.")
    return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

def similarity_search(attribute_number, query, top_k=5):
    faiss_index = load_faiss_index(attribute_number)
    docs_with_score = faiss_index.similarity_search_with_score(query, top_k=top_k)
    return docs_with_score

# Example usage
attribute_number = 2  # Change this to the attribute number you want to search (1 to 5)
query = "test"  # Change this to your query string

try:
    results = similarity_search(attribute_number, query)
    for doc, score in results:
        print(f"Document: {doc.page_content}, Score: {score}")
except (FileNotFoundError, ValueError) as e:
    print(e)

Document: Sao Paulo/SP, Score: 0.12348789721727371
Document: Sao Carlos/SP, Score: 0.1887209415435791
Document: Teodoro Sampaio/SP, Score: 0.2107856422662735
Document: Campinas/SP, Score: 0.2311762422323227


In [15]:
import csv
import ast
import os
import re

def remove_duplicates_from_restored(file_path):
    """
    Processes a CSV file to remove duplicate entries from the nested lists in the 'restored' column.

    Args:
        file_path (str): The path to the CSV file to be processed.
        
    The function reads the CSV file, processes each row to remove duplicates from the nested lists
    in the 'restored' column while preserving the first element and the order of subsequent elements.
    The modified data is then written back to the same file.
    """
    with open(file_path, mode='r+', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        fieldnames = reader.fieldnames
        rows = list(reader)
        
        file.seek(0)
        writer = csv.DictWriter(file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        
        for row in rows:
            try:
                restored = ast.literal_eval(row['restored'])
                if len(restored) > 1 and isinstance(restored[1], list):
                    # Deduplicate the nested list while preserving the order
                    seen = set()
                    deduplicated_list = []
                    for item in restored[1]:
                        if item not in seen:
                            deduplicated_list.append(item)
                            seen.add(item)
                    # Update the 'restored' field with the deduplicated list
                    row['restored'] = str([restored[0], deduplicated_list])
            except (ValueError, SyntaxError) as e:
                print(f"Error parsing restored field in row {row}: {e}")
            writer.writerow(row)
        
        file.truncate()

# Iterate through all files in the directory
directory = 'results/LLM_enhanced_complete/'
pattern = re.compile(r'llm_results_gpt4_0.8_doi_\w+_\d+_\d+(?:st|nd|rd|th)\.csv')

for filename in os.listdir(directory):
    if pattern.match(filename):
        file_path = os.path.join(directory, filename)
        remove_duplicates_from_restored(file_path)

Error parsing restored field in row {'true': 'ioxidant', 'restored': " free radical scavenging abilities'", 'edge_type': " 'Antioxidant", None: [" free radical scavenging abilities'", " 'Antioxidant", " free radical scavenging abilities'", " 'Antioxidant", " free radical scavenging abilities'", " 'Antioxidant", " higher free radical scavenging abilities'", " 'Antioxidant", " higher free radical scavenging abilities'", " 'Antioxidant", ' higher free radical scavenging abilities\']]"', 'doi_bioActivity']}: unexpected indent (<unknown>, line 1)


ValueError: dict contains fields not in fieldnames: None

In [19]:
import os
import ast
import csv
import re
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Set your OpenAI API key
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory containing the FAISS indexes
index_directory = 'faiss_index_trained'

# List of attributes to process
attributes = ['bioActivity', 'collectionSite', 'collectionSpecie', 'collectionType', 'name']

def load_faiss_index(attribute):
    index_path = os.path.join(index_directory, f'unique_{attribute}.txt.index')
    if not os.path.exists(index_path):
        raise FileNotFoundError(f"FAISS index directory for attribute '{attribute}' not found.")
    return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

def similarity_search(attribute, query, top_k=5):
    faiss_index = load_faiss_index(attribute)
    docs_with_score = faiss_index.similarity_search_with_score(query, top_k=top_k)
    return docs_with_score

def process_files_for_similarity_search(directory, attributes):
    pattern = re.compile(r'llm_results_gpt4_0.8_doi_\w+_\d+_\d+(?:st|nd|rd|th)\.csv')
    
    for filename in os.listdir(directory):
        if pattern.match(filename):
            file_path = os.path.join(directory, filename)
            attribute = extract_attribute_from_filename(filename, attributes)
            if attribute:
                update_restored_with_similarity_search(file_path, attribute)

def extract_attribute_from_filename(filename, attributes):
    for attribute in attributes:
        if attribute in filename:
            return attribute
    return None

def update_restored_with_similarity_search(file_path, attribute):
    try:
        faiss_index = load_faiss_index(attribute)
    except FileNotFoundError as e:
        print(e)
        return

    with open(file_path, mode='r+', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        fieldnames = reader.fieldnames
        rows = list(reader)
        
        file.seek(0)
        writer = csv.DictWriter(file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        
        for row in rows:
            try:
                restored = ast.literal_eval(row['restored'])
                if len(restored) > 1 and isinstance(restored[1], list):
                    # Ensure all items in the list are strings
                    restored[1] = [str(item) for item in restored[1]]
                    query = ' '.join(restored[1])
                    docs_with_score = similarity_search(attribute, query)
                    new_restored_value = [restored[0], [doc.page_content for doc, score in docs_with_score]]
                    row['restored'] = str(new_restored_value)
            except (ValueError, SyntaxError, TypeError) as e:
                print(f"Error parsing restored field in row {row}: {e}")
            # Ensure only defined fieldnames are written
            filtered_row = {key: row[key] for key in fieldnames if key in row}
            writer.writerow(filtered_row)
        
        file.truncate()

# Example usage
directory = 'results/LLM_enhanced_complete/'
process_files_for_similarity_search(directory, attributes)

Error parsing restored field in row {'true': 'Trypanosoma cruzi\']]"', 'restored': 'doi_bioActivity', 'edge_type': ''}: malformed node or string: <ast.Name object at 0x11807c5b0>
Error parsing restored field in row {'true': 'ioxidant', 'restored': " free radical scavenging abilities'", 'edge_type': " 'Antioxidant"}: unexpected indent (<unknown>, line 1)
Error parsing restored field in row {'true': " of caffeic acid'", 'restored': ' \'Weak free radical scavenging activities towards DPPH\']]"', 'edge_type': 'doi_bioActivity'}: unexpected indent (<unknown>, line 1)
Error parsing restored field in row {'true': 'ioxidant', 'restored': " free radical scavenging abilities'", 'edge_type': " 'Antioxidant"}: unexpected indent (<unknown>, line 1)
Error parsing restored field in row {'true': "0% on HeLa cells and 25% on CHO cells.'", 'restored': " 'Not described.'", 'edge_type': ' \'No antifungal activity noted.\']]"', None: ['doi_bioActivity']}: unexpected indent (<unknown>, line 1)


KeyboardInterrupt: 

In [20]:
import os
import ast
import csv
import re
import logging
from tqdm import tqdm
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set your OpenAI API key
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory containing the FAISS indexes
index_directory = 'faiss_index_trained'

# List of attributes to process
attributes = ['bioActivity', 'collectionSite', 'collectionSpecie', 'collectionType', 'name']

def load_faiss_index(attribute):
    index_path = os.path.join(index_directory, f'unique_{attribute}.txt.index')
    if not os.path.exists(index_path):
        raise FileNotFoundError(f"FAISS index directory for attribute '{attribute}' not found.")
    return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

def similarity_search(attribute, query, top_k=5):
    faiss_index = load_faiss_index(attribute)
    docs_with_score = faiss_index.similarity_search_with_score(query, top_k=top_k)
    return docs_with_score

def process_files_for_similarity_search(directory, attributes):
    pattern = re.compile(r'llm_results_gpt4_0.8_doi_\w+_\d+_\d+(?:st|nd|rd|th)\.csv')
    files = [f for f in os.listdir(directory) if pattern.match(f)]
    
    for filename in tqdm(files, desc="Processing files"):
        file_path = os.path.join(directory, filename)
        attribute = extract_attribute_from_filename(filename, attributes)
        if attribute:
            logging.info(f"Processing file: {filename} with attribute: {attribute}")
            update_restored_with_similarity_search(file_path, attribute)

def extract_attribute_from_filename(filename, attributes):
    for attribute in attributes:
        if attribute in filename:
            return attribute
    return None

def clean_restored_field(restored_str):
    # Remove any leading/trailing whitespace and ensure proper list format
    restored_str = restored_str.strip()
    if not (restored_str.startswith('[') and restored_str.endswith(']')):
        restored_str = f"[{restored_str}]"
    return restored_str

def update_restored_with_similarity_search(file_path, attribute):
    try:
        faiss_index = load_faiss_index(attribute)
    except FileNotFoundError as e:
        logging.error(e)
        return

    with open(file_path, mode='r+', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        fieldnames = reader.fieldnames
        rows = list(reader)
        
        file.seek(0)
        writer = csv.DictWriter(file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        
        for row in tqdm(rows, desc=f"Processing rows in {os.path.basename(file_path)}"):
            try:
                restored_str = clean_restored_field(row['restored'])
                restored = ast.literal_eval(restored_str)
                if len(restored) > 1 and isinstance(restored[1], list):
                    # Ensure all items in the list are strings
                    restored[1] = [str(item) for item in restored[1]]
                    query = ' '.join(restored[1])
                    docs_with_score = similarity_search(attribute, query)
                    new_restored_value = [restored[0], [doc.page_content for doc, score in docs_with_score]]
                    row['restored'] = str(new_restored_value)
            except (ValueError, SyntaxError, TypeError) as e:
                logging.error(f"Error parsing restored field in row {row}: {e}")
            # Ensure only defined fieldnames are written
            filtered_row = {key: row[key] for key in fieldnames if key in row}
            writer.writerow(filtered_row)
        
        file.truncate()

# Example usage
directory = 'results/LLM_enhanced_complete/'
process_files_for_similarity_search(directory, attributes)

Processing files:   0%|          | 0/5 [00:00<?, ?it/s]2024-06-19 16:11:50,720 - INFO - Processing file: llm_results_gpt4_0.8_doi_collectionSite_0_1st.csv with attribute: collectionSite
2024-06-19 16:11:51,106 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:11:51,380 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:11:51,658 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:11:51,903 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:11:52,260 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:11:52,499 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:11:52,905 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:11:53,142 - INFO - HTTP Request: POST https://api.o