In [5]:
import csv
import ast
import os
import re

def remove_duplicates_from_restored(file_path):
    """
    Remove duplicate entries from the nested list in each row's 'restored' column.

    Every 'restored' cell is parsed with ``ast.literal_eval``. When the parsed
    value has more than one element and its second element is a list, that
    list is deduplicated (first occurrence wins, order preserved) and the cell
    is replaced by ``str([first_element, deduplicated_list])``; any elements
    after the second are therefore dropped. Rows that do not match this shape
    are written back unchanged. The file is rewritten in place with every
    field quoted.

    All rows are parsed, transformed and serialised in memory before the file
    is touched, so a malformed cell raises without modifying the file on disk.
    A file without a header row (i.e. an empty file) is left as it is.

    Args:
        file_path (str): The path to the CSV file to be processed.

    Raises:
        KeyError: If the file has no 'restored' column.
        ValueError, SyntaxError: If a 'restored' cell is not a valid Python literal.
        TypeError: If the nested list contains unhashable items.
    """
    import io  # local import: only needed to stage the rewritten CSV in memory

    with open(file_path, mode='r', newline='', encoding='utf-8') as source:
        reader = csv.DictReader(source)
        fieldnames = reader.fieldnames
        rows = list(reader)

    if fieldnames is None:
        # Empty file: there is no header and no rows to rewrite
        return

    for row in rows:
        restored = ast.literal_eval(row['restored'])
        if len(restored) > 1 and isinstance(restored[1], list):
            # dict.fromkeys keeps the first occurrence of each item, in order
            deduplicated_list = list(dict.fromkeys(restored[1]))
            row['restored'] = str([restored[0], deduplicated_list])

    # Serialise to a buffer first so that a writer error cannot leave a
    # truncated or half-written file behind
    buffer = io.StringIO(newline='')
    writer = csv.DictWriter(buffer, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(rows)

    with open(file_path, mode='w', newline='', encoding='utf-8') as target:
        target.write(buffer.getvalue())

# Deduplicate the 'restored' column of every matching result file in the directory
directory = 'results/LLM_enhanced_complete/'
# The dot in "0.8" is escaped so it matches only a literal period
pattern = re.compile(r'llm_results_gpt4_0\.8_doi_collectionSite_\d+_\d+(?:st|nd|rd|th)\.csv')

# sorted() makes the processing order deterministic across platforms
for filename in sorted(os.listdir(directory)):
    # fullmatch rejects names with trailing text after ".csv" (e.g. backups)
    if pattern.fullmatch(filename):
        file_path = os.path.join(directory, filename)
        remove_duplicates_from_restored(file_path)


In [19]:
import csv
import ast
import os
import re
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

# Read the API key from the environment so it is never stored in the notebook;
# falls back to an empty string, which is what was previously hardcoded here.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
# NOTE(review): allow_dangerous_deserialization=True opts in to loading a
# pickled index — only point this at a "faiss_index" directory you trust.
loaded_faiss_index = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

def _strip_literal_wrappers(text):
    """Return text with each 'rdflib.term.Literal(' and its matching ')' removed."""
    prefix = "rdflib.term.Literal("
    pieces = []
    pos = 0
    while True:
        start = text.find(prefix, pos)
        if start == -1:
            pieces.append(text[pos:])
            break
        pieces.append(text[pos:start])

        # Walk forward to the parenthesis that closes this wrapper, counting
        # nested pairs so that parentheses inside the value are kept.
        inner_start = start + len(prefix)
        cursor = inner_start
        depth = 1
        while cursor < len(text) and depth:
            if text[cursor] == "(":
                depth += 1
            elif text[cursor] == ")":
                depth -= 1
            cursor += 1

        # If the wrapper is never closed, keep everything after the prefix
        inner = text[inner_start:cursor - 1] if depth == 0 else text[inner_start:]
        # The wrapped value may itself contain further wrappers
        pieces.append(_strip_literal_wrappers(inner))
        pos = cursor
    return "".join(pieces)


def clean_annotations(restored):
    """
    Remove rdflib.term.Literal annotations from the entries of restored[1].

    Each entry is converted with ``str`` and every ``rdflib.term.Literal(...)``
    wrapper is stripped, keeping what was inside the parentheses. Only the
    parenthesis that closes a wrapper is removed; parentheses that belong to
    the value itself (e.g. ``'Sao Paulo (SP)'``) are preserved.

    Args:
        restored: A sequence whose first element is kept as-is and whose
            second element is an iterable of entries to clean.

    Returns:
        list: ``[restored[0], cleaned_entries]`` as a new list; the input is
        not modified.
    """
    return [restored[0], [_strip_literal_wrappers(str(item)) for item in restored[1]]]

def process_csv(file_path, faiss_index):
    """
    Replace each entry of the 'restored' nested list with its closest FAISS match.

    Every 'restored' cell is parsed with ``ast.literal_eval``. When the parsed
    value has more than one element and its second element is a list, each
    entry of that list is used as a similarity query against ``faiss_index``
    and replaced by the ``page_content`` of the best hit (entries with no hit
    are kept). The result is passed through ``clean_annotations`` and stored
    back as a string. Progress is printed for every row and query.

    The input file is not modified; the rows are written, fully quoted, to a
    sibling file whose name has ``_processed`` inserted before the extension.

    Args:
        file_path (str): Path of the CSV file to read.
        faiss_index: Object exposing ``similarity_search_with_score(query, k=...)``
            that returns a list of ``(document, score)`` pairs.

    Raises:
        KeyError: If the file has no 'restored' column.
        ValueError, SyntaxError: If a 'restored' cell is not a valid Python literal.
    """
    print(f"Processing file: {file_path}")
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        fieldnames = reader.fieldnames
        rows = list(reader)

    for row in rows:
        restored = ast.literal_eval(row['restored'])
        print(f"Original restored: {restored}")
        if len(restored) > 1 and isinstance(restored[1], list):
            for i, result in enumerate(restored[1]):
                print(f"Performing FAISS search for: {result}")
                # `k` is the parameter that limits the number of hits; only the
                # best match is used below, so request exactly one.
                docs_with_score = faiss_index.similarity_search_with_score(result, k=1)
                if docs_with_score:
                    top_result = docs_with_score[0][0].page_content
                    print(f"Top result: {top_result}")
                    restored[1][i] = top_result
            restored = clean_annotations(restored)
            row['restored'] = str(restored)
            print(f"Updated restored: {row['restored']}")

    # Build the output name from the extension only, so a ".csv" elsewhere in
    # the path (e.g. in a directory name) is left alone and the input file is
    # never overwritten.
    root, extension = os.path.splitext(file_path)
    new_file_path = f"{root}_processed{extension}"
    with open(new_file_path, mode='w', newline='', encoding='utf-8') as new_file:
        writer = csv.DictWriter(new_file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(rows)
    print(f"Finished processing file. Results saved to: {new_file_path}")

# Run the FAISS normalisation over every matching result file in the directory
directory = 'results/LLM_enhanced/'
# The dot in "0.8" is escaped so it matches only a literal period
pattern = re.compile(r'llm_results_gpt4_0\.8_doi_collectionSite_\d+_\d+(?:st|nd|rd|th)\.csv')

# sorted() makes the processing order (and the printed log) deterministic
for filename in sorted(os.listdir(directory)):
    # fullmatch rejects names with trailing text after ".csv"; the
    # "*_processed.csv" outputs written into this directory never match
    if pattern.fullmatch(filename):
        file_path = os.path.join(directory, filename)
        process_csv(file_path, loaded_faiss_index)


Processing file: results/LLM_enhanced/llm_results_gpt4_0.8_doi_collectionSite_1_1st.csv
Original restored: ['10.1002/cbdv.200590016', []]
Updated restored: ['10.1002/cbdv.200590016', []]
Original restored: ['10.1002/hlca.200890147', ['Apore, GO, Brazil']]
Performing FAISS search for: Apore, GO, Brazil
Top result: Apore ,GO
Updated restored: ['10.1002/hlca.200890147', ['Apore ,GO']]
Original restored: ['10.1002/jobm.200800093', ['Ribeirão Preto, State of São Paulo, Brazil']]
Performing FAISS search for: Ribeirão Preto, State of São Paulo, Brazil
Top result: Ribeirao Preto ,SP
Updated restored: ['10.1002/jobm.200800093', ['Ribeirao Preto ,SP']]
Original restored: ['10.1002/jobm.200800093', ['Ribeirão Preto, State of São Paulo, Brazil']]
Performing FAISS search for: Ribeirão Preto, State of São Paulo, Brazil
Top result: Ribeirao Preto ,SP
Updated restored: ['10.1002/jobm.200800093', ['Ribeirao Preto ,SP']]
Original restored: ['10.1002/ps.1278', ['Poços de Caldas, Minas Gerais, Brazil']]
P