## Wikidata ID disambiguation
This code validates and refines Wikidata IDs generated by a Large Language Model (LLM). Each ID is first checked against the Wikidata SPARQL endpoint to see whether its English or Italian label matches the label extracted by the LLM. If it does not, the Wikidata search API is queried to retrieve a list of potential matches for the given label, and GPT-4 is asked to disambiguate among them. The disambiguation prompt incorporates contextual information, such as coordinates previously extracted by the LLM, and specifies that the entities of interest are geospatial features. These include countries, cities, villages, buildings, museums, and cultural heritage sites. Once the correct entity is identified, the disambiguated Wikidata ID is substituted back into the original TRiG file.

In [None]:
import os
import re
import requests
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from openai import OpenAI
from SPARQLWrapper import SPARQLWrapper, JSON

@dataclass
class Entity:
    """A Wikidata-linked entity as extracted from a TRiG file."""
    # Numeric part of the Wikidata ID, stored without the "Q" prefix
    # (the code in this file re-adds "Q" wherever the ID is formatted).
    wikidata_id: str
    # Literal taken from the entity's rdfs:label, or from an rdf:value
    # literal in the file when no rdfs:label is found.
    label: str
    # Literal attached to crm:P168_place_is_defined_by, or None if absent.
    coordinates: Optional[str]
    # Free-text type passed to the GPT prompt as context; the extractor in
    # this file always sets it to "Location".
    entity_type: str

@dataclass
class WikidataCandidate:
    """One search hit returned by the Wikidata entity search."""
    # Numeric part of the Wikidata ID, stored without the "Q" prefix.
    id: str
    # Label reported by the search result.
    label: str
    # Description reported by the search result; empty string when missing.
    description: str

@dataclass
class DisambiguationResult:
    """Outcome of processing one entity, kept for the final report."""
    # Name of the TRiG file the entity was read from.
    file: str
    # Wikidata ID found in the file, without the "Q" prefix.
    original_id: str
    # Label extracted for the entity.
    label: str
    # Coordinates literal extracted for the entity, if any.
    coordinates: Optional[str]
    # ID chosen by the GPT disambiguation step, without the "Q" prefix.
    # None when the original ID validated, when no candidates were found,
    # or when disambiguation failed.
    disambiguated_id: Optional[str]

class WikidataDisambiguator:
    """Validate LLM-proposed Wikidata IDs and re-resolve those that do not match.

    Validation compares the entity's English/Italian labels (fetched through
    the Wikidata SPARQL endpoint) with the expected label. When validation
    fails, candidates are fetched from the Wikidata search API and GPT-4 is
    asked to pick one of them.

    All IDs handled by this class are the numeric part only, without the
    leading "Q".
    """

    # Seconds to wait for the Wikidata search API before giving up, so a
    # stalled connection cannot hang the whole batch.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key: str):
        """Create the OpenAI client and the SPARQL endpoint wrapper."""
        self.client = OpenAI(api_key=api_key)
        self.sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    def validate_wikidata_id(self, wikidata_id: str, expected_label: str) -> Tuple[bool, Optional[str]]:
        """Validate if a Wikidata ID matches the expected label.

        Args:
            wikidata_id: Wikidata ID, with or without the "Q" prefix.
            expected_label: Label the entity is expected to carry.

        Returns:
            ``(True, matched_label)`` when one of the entity's English or
            Italian labels equals ``expected_label`` (case-insensitively,
            ignoring surrounding whitespace); ``(False, None)`` otherwise,
            including when the ID is malformed or the query fails.
        """
        numeric_id = wikidata_id.strip().lstrip("Q")
        # The ID is interpolated into the query text, so refuse anything that
        # is not purely numeric instead of sending a malformed query.
        if not re.fullmatch(r"[0-9]+", numeric_id):
            print(f"Invalid Wikidata ID: {wikidata_id!r}")
            return False, None

        query = f"""
        SELECT ?entityLabel WHERE {{
            BIND(wd:Q{numeric_id} AS ?entity)
            ?entity rdfs:label ?entityLabel.
            FILTER(LANG(?entityLabel) IN ("en", "it"))
        }}
        """
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)

        wanted = expected_label.strip().casefold()
        try:
            results = self.sparql.query().convert()
            for result in results["results"]["bindings"]:
                label = result["entityLabel"]["value"]
                if label.strip().casefold() == wanted:
                    return True, label
            return False, None
        except Exception as e:
            # Best effort: a failed lookup is reported and treated as "not valid".
            print(f"SPARQL Error: {e}")
            return False, None

    def search_wikidata_candidates(self, label: str) -> List["WikidataCandidate"]:
        """Search for potential Wikidata matches for a given label.

        Args:
            label: Text to search for.

        Returns:
            The search hits as ``WikidataCandidate`` objects (IDs without the
            "Q" prefix), or an empty list when the request fails.
        """
        url = "https://www.wikidata.org/w/api.php"
        params = {
            "action": "wbsearchentities",
            "search": label,
            "language": "en",
            "uselang": "it",
            "format": "json",
            "type": "item"
        }

        try:
            response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            # Surface HTTP errors explicitly rather than as a JSON decode failure.
            response.raise_for_status()
            data = response.json()
            return [
                WikidataCandidate(
                    id=result["id"].lstrip('Q'),
                    # A hit without a label must not discard the other hits.
                    label=result.get("label", ""),
                    description=result.get("description", "")
                )
                for result in data.get("search", [])
                if "id" in result
            ]
        except Exception as e:
            print(f"API Error: {e}")
            return []

    def gpt_disambiguation(
        self,
        original_id: str,
        original_label: str,
        candidates: List["WikidataCandidate"],
        coordinates: Optional[str],
        entity_type: str
    ) -> Optional[str]:
        """Use GPT to disambiguate between multiple Wikidata candidates.

        Args:
            original_id: ID currently in the file, without the "Q" prefix.
            original_label: Label extracted for the entity.
            candidates: Candidates GPT may choose from.
            coordinates: Coordinates literal used as context, if any.
            entity_type: Entity type used as context.

        Returns:
            The numeric ID (no "Q") of the chosen candidate, or None when the
            request fails or the reply does not name one of ``candidates``.
        """
        prompt = self._build_gpt_prompt(
            original_id, original_label, candidates, coordinates, entity_type
        )

        try:
            response = self.client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}]
            )
            answer = response.choices[0].message.content
        except Exception as e:
            print(f"GPT Error: {e}")
            return None

        # The result is written back into the TRiG file, so never trust the
        # raw reply: accept it only if it names one of the offered candidates.
        chosen = self._parse_candidate_id(answer, candidates)
        if chosen is None:
            print(f"GPT returned no usable candidate ID: {answer!r}")
        return chosen

    @staticmethod
    def _parse_candidate_id(answer: Optional[str], candidates: List["WikidataCandidate"]) -> Optional[str]:
        """Return the candidate ID named in a GPT reply, or None.

        A reply that is just an ID (optionally with "Q" and a trailing period)
        is taken as-is; in a longer reply only "Q"-prefixed numbers are
        considered, so unrelated numbers in the text are ignored. The ID is
        returned only if it belongs to one of ``candidates``.
        """
        if not answer:
            return None
        cleaned = answer.strip()
        allowed = {str(candidate.id).lstrip("Q") for candidate in candidates}

        exact = re.fullmatch(r"Q?([0-9]+)\.?", cleaned)
        if exact:
            mentioned = [exact.group(1)]
        else:
            mentioned = re.findall(r"\bQ([0-9]+)\b", cleaned)

        for numeric_id in mentioned:
            if numeric_id in allowed:
                return numeric_id
        return None

    def _build_gpt_prompt(
        self,
        original_id: str,
        original_label: str,
        candidates: List["WikidataCandidate"],
        coordinates: Optional[str],
        entity_type: str
    ) -> str:
        """Build the prompt for GPT disambiguation.

        The prompt states the ID under review, the context (entity type and
        coordinates), one line per candidate, and the instruction to answer
        with the numeric ID only.
        """
        prompt = f"""
        The Wikidata ID Q{original_id} with label '{original_label}' needs to be validated.
        Context:
        - Entity type: {entity_type}
        - Coordinates: {coordinates}

        Here are some alternative candidates with their descriptions:
        """
        prompt += "".join(
            f"- {candidate.label} (Q{candidate.id}): {candidate.description}\n"
            for candidate in candidates
        )

        prompt += "\nConsidering the context provided and considering that these entities all refer to geospatial data (i.e. cultural heritage sites, buildings, cities), which candidate is the correct match? Provide only the Wikidata ID number without the Q prefix."
        return prompt

class TrigFileProcessor:
    """Read Wikidata-linked entities from TRiG files and rewrite their IDs.

    The file content is handled as plain text with regular expressions; no
    RDF parser is involved. IDs are the numeric part only, without "Q".
    """

    @staticmethod
    def extract_info(file_path: str) -> List["Entity"]:
        """Extract entity information from a TRIG file.

        Args:
            file_path: Path of the TRiG file to read (UTF-8).

        Returns:
            One ``Entity`` per distinct Wikidata ID found in the file, in
            order of first appearance. Empty when the file cannot be read or
            no entity can be extracted.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except (OSError, UnicodeError) as e:
            print(f"Error reading file {file_path}: {e}")
            return []

        entities = []
        for wikidata_id in TrigFileProcessor._find_wikidata_ids(content):
            entity = TrigFileProcessor._extract_entity(content, wikidata_id)
            if entity:
                entities.append(entity)
        return entities

    @staticmethod
    def _find_wikidata_ids(content: str) -> List[str]:
        """Return the distinct Wikidata IDs referenced in ``content``.

        The same URI usually occurs several times in a file; yielding it once
        avoids validating, disambiguating and rewriting the same ID repeatedly.
        """
        wikidata_pattern = r'<https://www\.wikidata\.org/wiki/Q(\d+)>'
        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(re.findall(wikidata_pattern, content)))

    @staticmethod
    def _extract_entity(content: str, wikidata_id: str) -> Optional["Entity"]:
        """Extract a single entity's information from TRIG content.

        Args:
            content: Full text of the TRiG file.
            wikidata_id: Numeric Wikidata ID (no "Q") to look up.

        Returns:
            The entity, or None when no label can be found for it.
        """
        label_pattern = (
            rf'<https://www\.wikidata\.org/wiki/Q{re.escape(wikidata_id)}>'
            r'\s+rdfs:label\s+"([^"]+)"'
        )
        label_match = re.search(label_pattern, content)

        if not label_match:
            # Fallback: first rdf:value literal anywhere in the file.
            # NOTE(review): this is not tied to the entity's URI - presumably
            # each file describes a single place; confirm.
            value_pattern = r'rdf:value\s+"([^"]+)"'
            label_match = re.search(value_pattern, content)

        if not label_match:
            return None

        # First coordinates literal in the file (likewise not entity-scoped).
        coordinates_pattern = r'crm:P168_place_is_defined_by\s+"([^"]+)"'
        coordinates_match = re.search(coordinates_pattern, content)

        return Entity(
            wikidata_id=wikidata_id,
            label=label_match.group(1),
            coordinates=coordinates_match.group(1) if coordinates_match else None,
            entity_type="Location"
        )

    @staticmethod
    def update_file(file_path: str, original_id: str, new_id: str) -> None:
        """Update Wikidata IDs in a TRIG file.

        Replaces every ``<https://www.wikidata.org/wiki/Q{original_id}>`` URI
        with the URI for ``new_id``. The file is left untouched when the
        original URI does not occur in it. I/O errors are reported, not raised.

        Args:
            file_path: Path of the TRiG file to rewrite in place (UTF-8).
            original_id: Numeric ID (no "Q") to replace.
            new_id: Numeric ID (no "Q") to substitute.
        """
        # The closing ">" is part of the match, so Q12 never matches inside Q127.
        old_uri = f'<https://www.wikidata.org/wiki/Q{original_id}>'
        new_uri = f'<https://www.wikidata.org/wiki/Q{new_id}>'

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()

            if old_uri not in content:
                # Nothing to replace: do not rewrite the file or claim an update.
                print(f"No occurrence of Q{original_id} found in {file_path}; file left unchanged")
                return

            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(content.replace(old_uri, new_uri))

            print(f"Updated Wikidata ID from Q{original_id} to Q{new_id} in {file_path}")

        except (OSError, UnicodeError) as e:
            print(f"Error updating file {file_path}: {e}")

def main(folder_path: str = "./rdf_output", api_key: str = "") -> None:
    """Validate and, where needed, correct the Wikidata IDs of every TRiG file in a folder.

    For each ``.trig`` file the entities are extracted and each ID is
    validated against its label. An ID that fails validation is sent through
    candidate search and GPT disambiguation, and the file is rewritten when a
    different ID is chosen. A per-entity report is printed at the end.

    Args:
        folder_path: Folder containing the TRiG files.
        api_key: OpenAI API key. When empty, the ``OPENAI_API_KEY``
            environment variable is used instead, so the key does not have to
            be written into the source.
    """
    if not os.path.isdir(folder_path):
        print(f"Input folder not found: {folder_path}")
        return

    disambiguator = WikidataDisambiguator(
        api_key=api_key or os.environ.get("OPENAI_API_KEY", "")
    )
    results: List[DisambiguationResult] = []

    # Sorted so the processing order does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.trig'):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Processing file: {filename}")

        try:
            entities = TrigFileProcessor.extract_info(file_path)

            for entity in entities:
                # Validate and disambiguate
                is_valid, _ = disambiguator.validate_wikidata_id(entity.wikidata_id, entity.label)

                disambiguated_id = None
                if not is_valid:
                    candidates = disambiguator.search_wikidata_candidates(entity.label)
                    if candidates:
                        disambiguated_id = disambiguator.gpt_disambiguation(
                            entity.wikidata_id,
                            entity.label,
                            candidates,
                            entity.coordinates,
                            entity.entity_type
                        )

                # Store results
                results.append(DisambiguationResult(
                    file=filename,
                    original_id=entity.wikidata_id,
                    label=entity.label,
                    coordinates=entity.coordinates,
                    disambiguated_id=disambiguated_id
                ))

                # Update file if needed
                if disambiguated_id and disambiguated_id != entity.wikidata_id:
                    TrigFileProcessor.update_file(file_path, entity.wikidata_id, disambiguated_id)

        except Exception as e:
            # Top-level boundary: one bad file must not stop the batch.
            print(f"Error processing {filename}: {e}")

    # Print results
    for result in results:
        print(f"\nFile: {result.file}")
        print(f"Original ID: Q{result.original_id}")
        print(f"Label: {result.label}")
        print(f"Coordinates: {result.coordinates}")
        # disambiguated_id is also None when the original ID validated, so
        # 'Not found' here covers "no disambiguation was needed" as well.
        print(f"Disambiguated ID: {result.disambiguated_id if result.disambiguated_id else 'Not found'}")

if __name__ == "__main__":
    main()

Processing file: 10_date_1_sunday feb 18 1894.trig
Processing file: 10_date_2_monday feb 19 1894.trig
Processing file: 10_event_3c8d1a0d-f8c2-4d4e-9d7f-e9a6d5b8d9c3.trig
Processing file: 10_event_a0d9d7b0-f7c4-4f9f-9d9f-f6d3b6a1d9a6.trig
Processing file: 10_event_d8d9d7b0-f7c4-4f9f-9d9f-f6d3b6a1d9a6.trig
Processing file: 10_place_1_il palmerino maiano.trig
Processing file: 10_place_2_12 lungarno acciajuoli florence.trig
Processing file: 10_place_3_uffizi.trig
Updated Wikidata ID from Q127665 to Q51252 in ./rdf_output\10_place_3_uffizi.trig
Updated Wikidata ID from Q127665 to Q51252 in ./rdf_output\10_place_3_uffizi.trig
Processing file: 11_date_1_tuesday feb 20 1894.trig
Processing file: 11_event_a0d2b9d4-f3f6-4a6b-9d9c-5f6d1d3b8d9a.trig
Processing file: 11_event_e7b2a9c0-f9d4-4f9f-9f9a-c1f6d3b8d9a2.trig
Processing file: 11_event_f6a1d7c5-f8d6-4b1a-9d7e-0c3f6e8d1d3b.trig
Processing file: 11_place_1_bargello.trig
Updated Wikidata ID from Q806813 to Q388448 in ./rdf_output\11_place_1_bar