In [3]:
import os
import pandas as pd

# This script processes CSV files containing neighbor entries for different attributes,
# extracts unique neighbors, and saves them to separate text files for each attribute.

# Directory containing the input CSV files
directory = 'splits'
# Directory where the output text files will be saved
output_directory = 'faiss_index'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# List of attributes to process
attributes = ['bioActivity', 'collectionSite', 'collectionSpecie', 'collectionType', 'name']

for attribute in attributes:
    # Initialize a set to store unique neighbor entries
    unique_neighbors = set()

    # Iterate over all files in the directory
    for filename in os.listdir(directory):
        if filename.startswith(f"test_doi_{attribute}") or filename.startswith(f"train_doi_{attribute}"):
            filepath = os.path.join(directory, filename)
            # Read the CSV file
            df = pd.read_csv(filepath)
            # Add unique neighbor entries to the set
            unique_neighbors.update(df['neighbor'].unique())

    # Save the unique neighbors to a text file
    with open(os.path.join(output_directory, f'unique_{attribute}.txt'), 'w') as f:
        for neighbor in sorted(unique_neighbors):
            f.write(f"{neighbor}\n")


In [21]:
#create faiss indexes
import os
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Set your OpenAI API key
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory containing the text files
input_directory = 'faiss_index'
output_directory = 'faiss_index_trained'

# Ensure the output directory exists
os.makedirs(output_directory, exist_ok=True)

# Iterate over all text files in the input directory
for filename in os.listdir(input_directory):
    if filename.endswith('.txt'):
        filepath = os.path.join(input_directory, filename)
        
        # Read the text file and create Document objects
        entities = []
        with open(filepath, 'r') as f:
            for line in f:
                text = line.strip()
                doc = Document(page_content=text, metadata={'text': text})
                entities.append(doc)
        
        # Create FAISS index from documents
        faiss_index = FAISS.from_documents(entities, embeddings)
        
        # Save the FAISS index locally
        index_path = os.path.join(output_directory, f'{filename}.index')
        faiss_index.save_local(index_path)

print("FAISS indices have been created and saved.")

2024-06-19 16:21:32,841 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:21:33,940 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:21:35,286 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:21:37,396 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-19 16:21:39,251 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


FAISS indices have been created and saved.


In [10]:
#test Faiss indexes
import os
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Set your OpenAI API key
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory containing the FAISS indexes
index_directory = 'faiss_index_trained'

# Mapping from number to attribute
attribute_mapping = {
    1: 'collectionSpecie',
    2: 'collectionSite',
    3: 'bioActivity',
    4: 'name',
    5: 'collectionType'
}

def load_faiss_index(attribute_number):
    attribute = attribute_mapping.get(attribute_number)
    if not attribute:
        raise ValueError(f"Invalid attribute number: {attribute_number}")
    index_path = os.path.join(index_directory, f'unique_{attribute}.txt.index')
    if not os.path.exists(index_path):
        raise FileNotFoundError(f"FAISS index for attribute '{attribute}' not found.")
    return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

def similarity_search(attribute_number, query, top_k=5):
    faiss_index = load_faiss_index(attribute_number)
    docs_with_score = faiss_index.similarity_search_with_score(query, top_k=top_k)
    return docs_with_score

# Example usage
attribute_number = 2  # Change this to the attribute number you want to search (1 to 5)
query = "test"  # Change this to your query string

try:
    results = similarity_search(attribute_number, query)
    for doc, score in results:
        print(f"Document: {doc.page_content}, Score: {score}")
except (FileNotFoundError, ValueError) as e:
    print(e)

Document: Sao Paulo/SP, Score: 0.12348789721727371
Document: Sao Carlos/SP, Score: 0.1887209415435791
Document: Teodoro Sampaio/SP, Score: 0.2107856422662735
Document: Campinas/SP, Score: 0.2311762422323227


In [5]:
import os
import ast
import csv
import re
import logging
import shutil
from tqdm import tqdm
from langchain.docstore.document import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set your OpenAI API key
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Directory containing the FAISS indexes
index_directory = 'faiss_index_trained'

def load_faiss_index(attribute):
    index_path = os.path.join(index_directory, f'unique_{attribute}.txt.index')
    if not os.path.exists(index_path):
        raise FileNotFoundError(f"FAISS index directory for attribute '{attribute}' not found.")
    return FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)

def similarity_search(attribute, query, top_k=1):
    faiss_index = load_faiss_index(attribute)
    docs_with_score = faiss_index.similarity_search_with_score(query, top_k=top_k)
    return docs_with_score

def clean_restored_field(restored_str):
    # Remove any leading/trailing whitespace and ensure proper list format
    restored_str = restored_str.strip()
    if not (restored_str.startswith('[') and restored_str.endswith(']')):
        restored_str = f"[{restored_str}]"
    return restored_str

def update_restored_with_similarity_search(file_path, attribute):
    # Create a backup of the file to prevent data loss
    backup_file_path = file_path + '_bak.csv'
    shutil.copy(file_path, backup_file_path)
    
    try:
        faiss_index = load_faiss_index(attribute)
    except FileNotFoundError as e:
        logging.error(e)
        return

    with open(file_path, mode='r+', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        fieldnames = reader.fieldnames
        rows = list(reader)
        
        file.seek(0)
        writer = csv.DictWriter(file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        
        for row in tqdm(rows, desc=f"Processing rows in {os.path.basename(file_path)}"):
            try:
                restored_str = clean_restored_field(row['restored'])
                restored = ast.literal_eval(restored_str)
                if len(restored) > 1 and isinstance(restored[1], list):
                    # Ensure all items in the list are strings
                    restored[1] = [str(item) for item in restored[1]]
                    query = ' '.join(restored[1])
                    docs_with_score = similarity_search(attribute, query)
                    if docs_with_score:
                        old_value = row['restored']
                        new_restored_value = [restored[0], docs_with_score[0][0].page_content]
                        row['restored'] = str(new_restored_value)
                        logging.info(f"changed {old_value} to {new_restored_value} in row {row['true']}")
            except (ValueError, SyntaxError, TypeError) as e:
                logging.error(f"Error parsing restored field in row {row}: {e}")
            # Ensure only defined fieldnames are written
            filtered_row = {key: row[key] for key in fieldnames if key in row}
            writer.writerow(filtered_row)
        
        file.truncate()

def process_files_for_similarity_search(directory, attribute):
    pattern = re.compile(rf'llm_results_gpt4_0.8_doi_{attribute}_\d+_\d+(?:st|nd|rd|th)\.csv')
    files = [f for f in os.listdir(directory) if pattern.match(f)]
    
    for filename in tqdm(files, desc="Processing files"):
        file_path = os.path.join(directory, filename)
        logging.info(f"Processing file: {filename} with attribute: {attribute}")
        update_restored_with_similarity_search(file_path, attribute)

# Process all files for the attribute 'bioActivity'
directory = 'results/LLM_enhanced_full/'
attribute = 'name'
process_files_for_similarity_search(directory, attribute)

Processing files:   0%|          | 0/40 [00:00<?, ?it/s]2024-06-20 15:55:16,861 - INFO - Processing file: llm_results_gpt4_0.8_doi_name_2_2nd.csv with attribute: name
2024-06-20 15:55:17,419 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-06-20 15:55:17,426 - INFO - changed ['10.1002/cbdv.200800342', ['(1R*,3S*,5S*,6aR*,7S*,8S*,10R*,10aR*)-1-(Acetyloxy)-3,5,6,6a,7,8,9,10-octahydro-10-hydroxy-7,8-dimethyl-7-[(2Z)-3-methylpenta-2,4-dien-1-yl]naphtho[1,8a-c]furan-3,5-diyl Dibutanoate', 'not provided', 'not provided', 'not provided', 'not provided', '(1R*,3S*,5S*,6aR*,7S*,8R*,10R*,10aR*)-1,3-Bis(acetyloxy)-3,5,6,6a,7,8,9,10-octahydro-10-hydroxy-7,8-dimethyl-7-[(2Z)-3-methylpenta-2,4-dien-1-yl]naphtho[1,8a-c]furan-5-yl Butanoate', '(2R*,4aS*,5S*,7R*,8R*,8aS*)-4,4a-Diformyl-1,2,4a,5,6,7,8,8a-octahydro-5-hydroxy-7,8-dimethyl-8-[(2Z)-3-methylpenta-2,4-dien-1-yl]naphthalen-2-yl Butanoate']] to ['10.1002/cbdv.200800342', 'Caseargrewiin F; (1R*,3S*,5S*,6aR*

In [6]:
# Delete all files with _bak.csv as filename ending in the directory results/LLM_enhanced_full
backup_directory = 'results/LLM_enhanced_full'
for root, dirs, files in os.walk(backup_directory):
    for file in files:
        if file.endswith('_bak.csv'):
            file_path = os.path.join(root, file)
            try:
                os.remove(file_path)
                logging.info(f"Deleted backup file: {file_path}")
            except Exception as e:
                logging.error(f"Error deleting file {file_path}: {e}")



2024-06-20 16:40:15,113 - INFO - Deleted backup file: results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_collectionSpecie_6_3rd.csv_bak.csv
2024-06-20 16:40:15,115 - INFO - Deleted backup file: results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_4_4th.csv_bak.csv
2024-06-20 16:40:15,115 - INFO - Deleted backup file: results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_0_1st.csv_bak.csv
2024-06-20 16:40:15,116 - INFO - Deleted backup file: results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_4_2nd.csv_bak.csv
2024-06-20 16:40:15,118 - INFO - Deleted backup file: results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_bioActivity_3_3rd.csv_bak.csv
2024-06-20 16:40:15,118 - INFO - Deleted backup file: results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_collectionType_8_1st.csv_bak.csv
2024-06-20 16:40:15,120 - INFO - Deleted backup file: results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_collectionSpecie_1_4th.csv_bak.csv
2024-06-20 16:40:15,123 - INFO - Deleted backup file: results/L

In [3]:
import csv
import re
import os
import glob
import ast
import shutil

def process_csv(input_file, output_file):
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile, quoting=csv.QUOTE_ALL)

        for row in reader:
            if reader.line_num == 1:
                writer.writerow(row)
                continue

            true_col, restored_col, edge_type = row

            # Process the true column
            true_col = process_column(true_col)

            # Process the restored column
            restored_col = process_column(restored_col)

            writer.writerow([true_col, restored_col, edge_type])

    print(f"Processed file saved as: {output_file}")

def process_column(col):
    try:
        # Remove outer brackets and split by the first comma
        content = col.strip()[1:-1]
        doi, name = content.split(',', 1)
        
        # Clean up DOI and name
        doi = doi.strip().strip("'\"")
        name = name.strip().strip("'\"")
        
        # Escape double quotes in the name
        name = name.replace('"', '\\"')
        
        # Reconstruct the column with proper quoting
        return f'["{doi}", "{name}"]'
    except:
        # If we can't parse it, return the original
        return col

def process_all_files(input_directory, output_directory):
    # Clean the output directory
    if os.path.exists(output_directory):
        shutil.rmtree(output_directory)
    os.makedirs(output_directory)

    # Pattern for matching file names
    pattern = r'llm_results_gpt4_0.8_doi_name_\d+_\d+(?:st|nd|rd|th)\.csv'
    
    # Get all matching files in the input directory
    for input_file in glob.glob(os.path.join(input_directory, '*.csv')):
        if re.match(pattern, os.path.basename(input_file)):
            output_file = os.path.join(output_directory, os.path.basename(input_file))
            print(f"Processing {input_file}...")
            process_csv(input_file, output_file)

# Usage
input_directory = 'results/LLM_enhanced_full/'
output_directory = 'results/LLM_enhanced_full/modified_names/'
process_all_files(input_directory, output_directory)

Processing results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_2_2nd.csv...
Processed file saved as: results/LLM_enhanced_full/modified_names/llm_results_gpt4_0.8_doi_name_2_2nd.csv
Processing results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_3_4th.csv...
Processed file saved as: results/LLM_enhanced_full/modified_names/llm_results_gpt4_0.8_doi_name_3_4th.csv
Processing results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_7_3rd.csv...
Processed file saved as: results/LLM_enhanced_full/modified_names/llm_results_gpt4_0.8_doi_name_7_3rd.csv
Processing results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_7_1st.csv...
Processed file saved as: results/LLM_enhanced_full/modified_names/llm_results_gpt4_0.8_doi_name_7_1st.csv
Processing results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_6_3rd.csv...
Processed file saved as: results/LLM_enhanced_full/modified_names/llm_results_gpt4_0.8_doi_name_6_3rd.csv
Processing results/LLM_enhanced_full/llm_results_gpt4_0.8_doi_name_6_1

In [5]:
#EVAL from NatUKE
import pandas as pd
from ast import literal_eval
import numpy as np

def hits_at(k, true, list_pred):
    hits = []
    missed_entries = []

    for index_t, t in enumerate(true):
        hit = False
        # get the list of predicteds that's on the second argument
        for index_lp, lp in enumerate(list_pred[index_t]):
            if index_lp >= k:
                break
            if t[1] == lp:
                hits.append(1)
                hit = True
                break
        if not(hit):
            hits.append(0)
            missed_entries.append((index_t, t, list_pred[index_t][:k]))
    return np.mean(hits), missed_entries

def mrr(true, list_pred):
    # using the first list pred to get how many there will be
    rrs = []
    missed_entries = []
    for index_t, t in enumerate(true):
        hit = False
        # get the list of predicteds that's on the second argument
        for index_lp, lp in enumerate(list_pred[index_t]):
            if t[1] == lp:
                rrs.append(1/(index_lp + 1))
                break
            if not hit:
                missed_entries.append((index_t, t, list_pred[index_t]))

    return np.mean(rrs), missed_entries


path = 'results/LLM_enhanced_full/'
file_name = "llm_results"
splits = [0.8]
#edge_groups = ['doi_name', 'doi_bioActivity', 'doi_collectionSpecie', 'doi_collectionSite', 'doi_collectionType']
edge_group = 'doi_name'
#algorithms = ['bert', 'deep_walk', 'node2vec', 'metapath2vec', 'regularization']
algorithms = ['gpt4']
k_at = [2]
dynamic_stages = ['1st', '2nd', '3rd', '4th']

# hits@k
hitsatk_df = {'k': [], 'algorithm': [], 'edge_group': [], 'split': [], 'dynamic_stage': [], 'value': []}
missed_hits = []

for algorithm in algorithms:
    for k in k_at:
        for split in splits:
            for iteration in range(10):
                for dynamic_stage in dynamic_stages:
                    restored_df = pd.read_csv("{}modified_names/{}_{}_{}_{}_{}_{}.csv".format(path, file_name, algorithm, split, edge_group, iteration, dynamic_stage))
                    restored_df['true'] = restored_df['true'].apply(literal_eval)
                    restored_df['restored'] = restored_df['restored'].apply(literal_eval)
                    mean_hits, missed = hits_at(k, restored_df.true.to_list(),restored_df.restored.to_list())
                    hitsatk_df['k'].append(k)
                    hitsatk_df['algorithm'].append(algorithm)
                    hitsatk_df['split'].append(split)
                    hitsatk_df['edge_group'].append(edge_group)
                    hitsatk_df['dynamic_stage'].append(dynamic_stage)
                    hitsatk_df['value'].append(mean_hits)
                    missed_hits.extend(missed)

hitsatk_df = pd.DataFrame(hitsatk_df)
hitsatk_df.to_csv('{}metric_results/full_dynamic_hits@k_{}_{}.csv'.format(path, edge_group, file_name), index=False)
hitsatk_df_mean = hitsatk_df.groupby(by=['k', 'algorithm', 'split', 'edge_group', 'dynamic_stage'], as_index=False).mean()
hitsatk_df_std = hitsatk_df.groupby(by=['k', 'algorithm', 'split', 'edge_group', 'dynamic_stage'], as_index=False).std()
hitsatk_df_mean['std'] = hitsatk_df_std['value']
print(hitsatk_df_mean)

# Save missed hits entries for debugging
missed_hits_df = pd.DataFrame(missed_hits, columns=['index', 'true', 'predictions'])
missed_hits_df.to_csv('{}metric_results/missed_hits@k_{}_{}.csv'.format(path, edge_group, file_name), index=False)

# mrr
mrr_df = {'algorithm': [], 'edge_group': [], 'split': [], 'dynamic_stage': [], 'value': []}
missed_mrr = []

for algorithm in algorithms:
    for split in splits:
        for iteration in range(10):
            for dynamic_stage in dynamic_stages:
                restored_df = pd.read_csv("{}modified_names/{}_{}_{}_{}_{}_{}.csv".format(path, file_name, algorithm, split, edge_group, iteration, dynamic_stage))
                restored_df['true'] = restored_df['true'].apply(literal_eval)
                restored_df['restored'] = restored_df['restored'].apply(literal_eval)
                mean_mrr, missed = mrr(restored_df.true.to_list(),restored_df.restored.to_list())
                mrr_df['algorithm'].append(algorithm)
                mrr_df['split'].append(split)
                mrr_df['edge_group'].append(edge_group)
                mrr_df['dynamic_stage'].append(dynamic_stage)
                mrr_df['value'].append(mean_mrr)
                missed_mrr.extend(missed)

mrr_df = pd.DataFrame(mrr_df)
mrr_df.to_csv('{}metric_results/full_dynamic_mrr_{}_{}.csv'.format(path, edge_group, file_name), index=False)
mrr_df_mean = mrr_df.groupby(by=['algorithm', 'edge_group', 'split', 'dynamic_stage'], as_index=False).mean()
mrr_df_std = mrr_df.groupby(by=['algorithm', 'edge_group', 'split', 'dynamic_stage'], as_index=False).std()
mrr_df_mean['std'] = mrr_df_std['value']
print(mrr_df_mean)

# Save missed MRR entries for debugging
missed_mrr_df = pd.DataFrame(missed_mrr, columns=['index', 'true', 'predictions'])
missed_mrr_df.to_csv('{}metric_results/missed_mrr_{}_{}.csv'.format(path, edge_group, file_name), index=False)

# saving files
hitsatk_df_mean.to_csv('{}metric_results/dynamic_hits@k_{}_{}.csv'.format(path, edge_group, file_name), index=False)
mrr_df_mean.to_csv('{}metric_results/dynamic_mrr_{}_{}.csv'.format(path, edge_group, file_name), index=False)

   k algorithm  split edge_group dynamic_stage     value       std
0  2      gpt4    0.8   doi_name           1st  0.199250  0.013019
1  2      gpt4    0.8   doi_name           2nd  0.243049  0.026573
2  2      gpt4    0.8   doi_name           3rd  0.280748  0.028572
3  2      gpt4    0.8   doi_name           4th  0.335740  0.050884
  algorithm edge_group  split dynamic_stage  value  std
0      gpt4   doi_name    0.8           1st    0.5  0.0
1      gpt4   doi_name    0.8           2nd    0.5  0.0
2      gpt4   doi_name    0.8           3rd    0.5  0.0
3      gpt4   doi_name    0.8           4th    0.5  0.0
