In [1]:
!pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint that the query below is sent to.
endpoint_url = "https://query-scholarly.wikidata.org/sparql"

# Select up to 100 items that are an instance of (P31) scholarly article
# (Q13442814), together with their English-language labels (?title).
query = """SELECT * WHERE {
  ?paper wdt:P31 wd:Q13442814; rdfs:label ?title.
  FILTER(LANG(?title)="en")
  }
LIMIT 100"""


def get_results(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint and return the converted response.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: SPARQL query text.

    Returns:
        Whatever ``convert()`` yields for a query whose return format was
        set to JSON (later cells index it as
        ``results["results"]["bindings"]``).
    """
    major, minor = sys.version_info[0], sys.version_info[1]
    # TODO adjust user agent; see https://w.wiki/CX6
    client = SPARQLWrapper(
        endpoint_url,
        agent="WDQS-example Python/%s.%s" % (major, minor),
    )
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


# Run the query once; later cells read the rows from results["results"]["bindings"].
results = get_results(endpoint_url, query)




In [2]:
import pandas as pd
# Build a DataFrame from the SPARQL bindings (one row per binding)
df_results = pd.DataFrame(results["results"]["bindings"])

# Every cell is a binding dict; keep only its "value" entry
for column_name in ("paper", "title"):
    df_results[column_name] = df_results[column_name].apply(lambda cell: cell["value"])

# Display the first few rows of the DataFrame
display(df_results.head())

Unnamed: 0,paper,title
0,http://www.wikidata.org/entity/Q14405376,Nomenclatorial transfers
1,http://www.wikidata.org/entity/Q14405740,Annotated checklist of the recent and extinct ...
2,http://www.wikidata.org/entity/Q14503029,The genus Epiphyllum and its allies
3,http://www.wikidata.org/entity/Q14508416,Apertium: a free/open-source platform for rule...
4,http://www.wikidata.org/entity/Q14530181,The nearest visual binaries


# Named Entity Extraction

In [3]:
!pip install spacy scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz (14.8 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [4]:
import requests
import spacy
import pandas as pd
import re
import os
import json

# Load the ScispaCy model; fall back to spaCy's general English model if
# scispacy or the model cannot be loaded.
try:
    import scispacy  # noqa: F401 -- imported to check that scispacy is installed
    nlp = spacy.load("en_core_sci_sm")
except Exception as exc:
    # `except Exception` (not a bare `except:`) so that a kernel interrupt or
    # SystemExit is not silently swallowed. The reason is printed because the
    # fallback model produces different entities and the reader should know
    # which model is actually in use.
    print(f"Could not load en_core_sci_sm ({exc!r}); falling back to en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [5]:
# ========== NER WITH SCISPACY ==========

def extract_entities(report_text):
    """Return the lower-cased text of every entity the loaded model finds.

    Args:
        report_text: Text to run through the module-level ``nlp`` pipeline.

    Returns:
        List of entity strings (``ent.text.lower()``) in the order they
        appear in ``doc.ents``.
    """
    return [span.text.lower() for span in nlp(report_text).ents]

In [6]:
# Run NER on every paper title; each row gets a list of lower-cased entity strings.
df_results["ner_entities"] = df_results["title"].apply(extract_entities)

In [7]:
df_results.head()

Unnamed: 0,paper,title,ner_entities
0,http://www.wikidata.org/entity/Q14405376,Nomenclatorial transfers,[nomenclatorial transfers]
1,http://www.wikidata.org/entity/Q14405740,Annotated checklist of the recent and extinct ...,"[annotated, checklist, pythons, serpentes, pyt..."
2,http://www.wikidata.org/entity/Q14503029,The genus Epiphyllum and its allies,"[genus, epiphyllum, allies]"
3,http://www.wikidata.org/entity/Q14508416,Apertium: a free/open-source platform for rule...,"[apertium, free/open-source, rule-based machin..."
4,http://www.wikidata.org/entity/Q14530181,The nearest visual binaries,"[nearest, visual]"


# Reconciliation to Wikidata

In [8]:
!pip install wikibaseintegrator

Collecting wikibaseintegrator
  Downloading wikibaseintegrator-0.12.12-py3-none-any.whl.metadata (35 kB)
Collecting backoff<3.0.0,>=2.2.1 (from wikibaseintegrator)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting mwoauth<0.5.0,>=0.4.0 (from wikibaseintegrator)
  Downloading mwoauth-0.4.0-py3-none-any.whl.metadata (2.1 kB)
Collecting ujson<6.0.0,>=5.10.0 (from wikibaseintegrator)
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Downloading wikibaseintegrator-0.12.12-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading backoff-2.2.1-py3-none-any.whl (15 kB)
Downloading mwoauth-0.4.0-py3-none-any.whl (12 kB)
Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m3.7 MB/s[0m eta [36m0:

In [9]:
from wikibaseintegrator import WikibaseIntegrator, wbi_helpers
import time

wbi = WikibaseIntegrator()

def reconcile_entity(entity_text, require_exact_match=False):
    """Reconcile a single entity text against Wikidata, excluding scholarly publications.

    The text is looked up with ``wbi_helpers.search_entities`` and only the
    first hit is considered. The hit is fetched and rejected if one of its
    P31 ('instance of') claims points to Q13442814 ('scholarly article').

    Args:
        entity_text: Entity string to search for.
        require_exact_match: When True, the hit is only accepted if
            ``entity_text`` equals (case-insensitively) its English label or
            one of its English aliases. Defaults to False, which accepts the
            first non-scholarly hit as-is.

    Returns:
        The first search hit (a QID) when it is accepted, otherwise None.
        None is also returned when the search yields nothing or when any
        exception occurs; in the latter case the error is printed.
    """
    try:
        # Search for the entity text in Wikidata
        search_results = wbi_helpers.search_entities(search_string=entity_text)
        time.sleep(5)  # Add a delay to avoid rate limiting

        if not search_results:
            # Nothing found
            return None

        # Only the first (top-ranked) result is considered
        qid = search_results[0]

        # Fetch the entity details to check its type
        entity = wbi.item.get(qid)

        # Reject scholarly publications (P31 is 'instance of', Q13442814 is 'scholarly article')
        for claim in entity.claims:
            if claim.mainsnak.property_number != "P31":
                continue
            # Guard the lookup: a claim whose datavalue has no "value" entry
            # (presumably 'unknown value' / 'no value' statements -- TODO confirm)
            # must not abort the whole reconciliation with an exception.
            value = (claim.mainsnak.datavalue or {}).get("value")
            if isinstance(value, dict) and value.get("id") == "Q13442814":
                return None

        if not require_exact_match:
            return qid

        # Collect the English label and aliases for the exact-match check
        names = [label.value for label in entity.labels if label.language == "en"]
        if "en" in entity.aliases.aliases:
            names.extend(alias.value for alias in entity.aliases.aliases["en"])
        known_names = {name.lower() for name in names}

        return qid if entity_text.lower() in known_names else None
    except Exception as e:
        # Best-effort: report the problem and continue with the next entity
        print(f"Error reconciling '{entity_text}': {e}")
        return None

# Reconcile every extracted entity of every row. The new 'reconciled_qid'
# column holds, per row, a list parallel to 'ner_entities' whose items are
# the values returned by reconcile_entity (a QID or None).
df_results['reconciled_qid'] = df_results['ner_entities'].apply(
    lambda entities: list(map(reconcile_entity, entities))
)

# Show the first rows including the new column
display(df_results.head())



Unnamed: 0,paper,title,ner_entities,reconciled_qid
0,http://www.wikidata.org/entity/Q14405376,Nomenclatorial transfers,[nomenclatorial transfers],[None]
1,http://www.wikidata.org/entity/Q14405740,Annotated checklist of the recent and extinct ...,"[annotated, checklist, pythons, serpentes, pyt...","[Q6503489, Q106140535, Q184018, Q25537662, Q18..."
2,http://www.wikidata.org/entity/Q14503029,The genus Epiphyllum and its allies,"[genus, epiphyllum, allies]","[Q34740, Q310964, Q65097022]"
3,http://www.wikidata.org/entity/Q14508416,Apertium: a free/open-source platform for rule...,"[apertium, free/open-source, rule-based machin...","[Q184835, None, Q28324910, Q3331189]"
4,http://www.wikidata.org/entity/Q14530181,The nearest visual binaries,"[nearest, visual]","[Q1374523, Q162668]"


In [10]:
# Write the reconciled table to an Excel workbook (path is relative to the working directory).
df_results.to_excel("main_subject_reconciliation.xlsx")