In [7]:
import os
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import time

# Read the CSV file
df = pd.read_csv(os.path.join(os.pardir, "preprocess", 'predicted_places.csv'))

# Initialize the SPARQL endpoint
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")



# Function to generate and run SPARQL query
def get_label(wikidata_id):
  query = f"""
  SELECT * WHERE {{
    wd:{wikidata_id} rdfs:label ?label ;
      schema:description ?description .
    FILTER (lang(?label) = "en")
    FILTER (lang(?description) = "en" || lang(?description) = "it")
  }}
  """
  while True:
    try:
      sparql.setQuery(query)
      sparql.setReturnFormat(JSON)
      results = sparql.query().convert()
      break
    except Exception as e:
      if '429' in str(e):
        retry_after = int(e.headers.get('Retry-After', 5))
        print(f"Rate limited. Retrying after {retry_after} seconds.")
        time.sleep(retry_after)
      else:
        raise e
  time.sleep(.1)
  return results["results"]["bindings"][0] if results["results"]["bindings"] else None

# Iterate through each row and print the label
for index, row in df.iterrows():
  wikidata_url = row['wikidata_url']
  try:
    wikidata_id = wikidata_url.split("/")[-1]
    binding = get_label(wikidata_id)
    df.at[index, 'label'] = binding["label"]["value"]
    df.at[index, 'description'] = binding["description"]["value"]
  except Exception as e:
    print(f"URI: {wikidata_url}, Error: {e}")
    df.at[index, 'label'] = None
    df.at[index, 'description'] = None
    continue
  
# Save the DataFrame to a CSV file
df.drop(columns=['geonames_url'], inplace=True)
df.to_csv('test.csv', index=False)

URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/Q3892513, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/Q1438759, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not subscriptable
URI: https://www.wikidata.org/wiki/, Error: 'NoneType' object is not su

In [8]:
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

client = OpenAI(
  api_key=os.getenv("OPENAI_API_KEY"),  # This is the default and can be omitted
)

df = pd.read_csv('test.csv')

for index, row in df.iterrows():
  page = str(row['page'])
  value = row['value']
  label = row['label'] 
  wikidata_url = row['wikidata_url']
  description = row['description']
  try:
    with open(os.path.join(os.pardir, os.pardir, "txt", f"{page}.txt"), "r") as file:
      content = file.read()
  except FileNotFoundError:
    content = ""

  if not pd.isnull(label) and content:
    prompt = """
I am reviewing the following diary entry:

"""+content+"""

The system automatically detected that \""""+value+"""\" can be referenced as <"""+wikidata_url+""">.
This wikidata entry is about this place: \""""+label+"""\" which is described as \""""+description+"""\".

Is this correct?
ANSWER ONLY WITH TRUE if the text is referring to the same place as the wikidata entry, 
FALSE if it is not, 
and UNCLEAR if it is not possible to determine or the description is too vague, not referring to a specific place.
""" 
    chat_completion = client.chat.completions.create(
      messages=[
          {
              "role": "user",
              "content": prompt,
          }
      ],
      model="gpt-4o",
    )
    try:
      df.at[index, 'gpt-4o_result'] = chat_completion.choices[0].message.content
    except Exception as e:
      print(f"Error processing page {page}, value {value}: {e}")
      continue
    
df = df[['page', 'value', 'label', 'description', 'gpt-4o_result', 'wikidata_url']]
df.to_csv('test_with_gpt.csv', index=False)
print("test_with_gpt.csv has been saved")

KeyboardInterrupt: 

In [53]:
# Read the CSV file
df = pd.read_csv('test_with_gpt.csv')

# Calculate TP, FP, FN
TP = len(df[(df['gpt-4o_result'] == 'TRUE') & (df['label'].notna())])
FP = len(df[(df['gpt-4o_result'].isin(['FALSE', 'UNCLEAR'])) & (df['label'].notna())])
FN = len(df[(df['gpt-4o_result'].isna()) & (df['label'].notna())])

# Calculate precision, recall, and F1 score
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"TP: {TP}, FP: {FP}, FN: {FN}")
print(f"Precision: {precision:.2%}, Recall: {recall:.2%}, F1 Score: {f1:.2%}")

TP: 252, FP: 92, FN: 0
Precision: 73.26%, Recall: 100.00%, F1 Score: 84.56%
