Querying random speeches for NER evaluation 


Install the dependencies and import the libraries used in this notebook:

In [None]:
!pip install SPARQLWrapper numpy scipy matplotlib pandas networkx
%matplotlib inline

from datetime import datetime
from dateutil.relativedelta import relativedelta

import matplotlib.pyplot as plt
import numpy    as np
import networkx as nx
import pandas   as pd

from collections import Counter, OrderedDict
from rdflib.namespace import XSD
from scipy.stats import binned_statistic
from scipy.interpolate import interp1d
from SPARQLWrapper import SPARQLWrapper, JSON, POST
from IPython.display import display, HTML

Define helper functions that convert the JSON results of a SPARQL query into a pandas DataFrame.

In [None]:
from rdflib.namespace import XSD
import pandas as pd 

# Lookup table from an XSD datatype IRI (as a plain string) to the callable
# that turns the literal's lexical form into a Python value.
DATATYPECONVERTERS = {
    str(XSD.integer): int,
    str(XSD.decimal): float,
    # Date literals are parsed from the 'YYYY-MM-DD' form into datetime.date.
    str(XSD.date): lambda text: datetime.strptime(text, '%Y-%m-%d').date(),
}

def convertDatatype(obj):
  """Convert one SPARQL JSON binding term into a Python value.

  The converter is looked up in DATATYPECONVERTERS by the term's 'datatype'
  entry; terms without a datatype, or with one that is not in the table,
  are passed through ``str``.

  Args:
    obj: mapping that is read via ``.get('datatype')`` and ``.get('value')``.

  Returns:
    The result of applying the selected converter to the term's 'value'.
  """
  converter = DATATYPECONVERTERS.get(obj.get('datatype'), str)
  raw_value = obj.get('value')
  return converter(raw_value)

def JSON2Pandas(results):
    """Build a DataFrame from a SPARQL JSON result document.

    Every binding in ``results["results"]["bindings"]`` becomes one row, with
    each term converted by ``convertDatatype``.

    When the document carries the selected variable names under
    ``results["head"]["vars"]``, they are used as the column list. This keeps
    the columns in SELECT order and keeps columns for variables that are
    unbound in every row (e.g. OPTIONAL matches), including when there are no
    rows at all. Without that list the columns are derived from the binding
    keys, as before.

    Args:
        results: dict-like parsed SPARQL JSON response.

    Returns:
        pandas.DataFrame with one row per binding; unbound variables are NaN.
    """
    bindings = results["results"]["bindings"]
    rows = [
        {var: convertDatatype(term) for var, term in binding.items()}
        for binding in bindings
    ]
    # An absent or empty variable list falls back to pandas' default behaviour
    # of inferring the columns from the row dictionaries.
    columns = results.get("head", {}).get("vars") or None
    return pd.DataFrame(rows, columns=columns)

In [None]:
import getpass

# Ask for the secret interactively so it is never stored in the notebook.
# The value is later sent verbatim as the HTTP "Authorization" header.
# NOTE(review): the endpoint URL below uses plain http, so that header is
# sent unencrypted - confirm whether an https endpoint is available.
authorization = getpass.getpass(prompt='Password:')

# One DataFrame per queried year is collected in this list.
final_results = list()
sparql3 = SPARQLWrapper("http://ldf.fi/semparl/sparql")

# SPARQL query template; the placeholder XXXX is replaced by a four-digit year.
# The inner sub-select picks 200 speeches of that year in pseudo-random order
# (ordering by a hash of RAND() and the speech IRI), skipping speeches whose
# speechType is PuhemiesPuheenvuoro. The outer OPTIONAL attaches the named
# entities referenced by each speech, so a speech may yield several rows or a
# single row without entity columns.
# The lower date bound is inclusive (>=) so that 1 January is part of the
# year, matching the inclusive upper bound on 31 December.
query="""
  PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
  PREFIX sem: <http://semanticweb.cs.vu.nl/2009/11/sem/>
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  PREFIX dct: <http://purl.org/dc/terms/>
  PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
  PREFIX text: <http://jena.apache.org/text#>
  PREFIX : <http://ldf.fi/schema/semparl/>
  prefix linguistics: <http://ldf.fi/schema/semparl/linguistics/>

SELECT ?date ?speech ?content ?namedEntity ?surfaceForm ?category  {
{ SELECT ?date ?speech ?content {
	?speech 
     	:content ?content;
  		dct:date ?date .
FILTER(?date >= "XXXX-01-01"^^xsd:date && ?date <= "XXXX-12-31"^^xsd:date)
FILTER NOT EXISTS { ?speech :speechType <http://ldf.fi/semparl/PuhemiesPuheenvuoro> }
BIND(SHA512(CONCAT(STR(RAND()), STR(?speech))) AS ?random) . # https://stackoverflow.com/questions/5677340/how-to-select-random-dbpedia-nodes-from-sparql
}
    ORDER BY ?random 
	LIMIT 200
}
OPTIONAL{?speech linguistics:referencedNamedEntity ?entity .
  	?entity
    	skos:prefLabel ?namedEntity ;
    	linguistics:surfaceForm ?surfaceForm ;
     	linguistics:category ?category .
  }
}
"""

# The return format and the Authorization header are the same for every
# year, so they are configured once rather than on each iteration.
sparql3.setReturnFormat(JSON)
sparql3.addCustomHttpHeader("Authorization", authorization)

# Run the query template once per year from 1985 to 1995 (inclusive) and
# keep each year's result frame.
for year in range(1985, 1996):
  sparql3.setQuery(query.replace('XXXX', str(year)))
  results = sparql3.query().convert()
  link_data = JSON2Pandas(results)
  final_results.append(link_data)


# Stack the per-year frames into one table. ignore_index=True gives every row
# a unique label; without it each year's rows restart at 0 and the combined
# frame has duplicate index labels.
all_years = pd.concat(final_results, ignore_index=True)

# Empty columns to be filled in by hand during the evaluation.
# The first one is inserted at column position 5; assuming the frame has the
# six SELECT columns in query order, that places it before `category`.
all_years.insert(5, 'recognition was true positive', '')
all_years["category was right"] = ""
all_years["not recognised by NER"] = ""
display(all_years)




In [None]:
from google.colab import files

# Write the evaluation sheet without the row index, then hand the file to the
# Colab download helper.
csv_filename = 'NER_evaluation_years.csv'
all_years.to_csv(csv_filename, index=False)
files.download(csv_filename)