In [1]:
import json, copy

In [2]:
# load event series data

file1 = open('eventseries2305.json')

event_series = json.load(file1)

file1.close()

In [3]:
# load event data

file2 = open('events2305.json', encoding="utf8")

events = json.load(file2)

file2.close()

In [4]:
# filter out events without title

events_with_title = []

for binding in events['results']['bindings']:
    if 'title' in binding:
        events_with_title.append(binding)

# filter out event series without title

event_series_with_title = []

for binding in event_series['results']['bindings']:
    if 'title' in binding:
        event_series_with_title.append(binding)

        

In [5]:
print("number of events present in wikidata, with a title value: ", len(events_with_title))
print("number of event series present in wikidata, with a title value: ", len(event_series_with_title))

number of events present in wikidata, with a title value:  3415
number of event series present in wikidata, with a title value:  118


In [6]:
# subset events with entry "series"

events_with_title_and_series = []

for binding in events['results']['bindings']:
    if 'title' in binding and 'series' in binding:
        events_with_title_and_series.append(binding)
    
print("number of events with a title value, and linked to an event series: ", len(events_with_title_and_series))

# compare linked events with event series, drop the events if there is no match, making sure that corresponding event series exist
# "series" : type uri, value wikidata link

event_series_dummy = [event['series'] for event in event_series_with_title]

subset_events_with_title_and_series = [event for event in events_with_title_and_series if event['series'] in event_series_dummy]

print("number of events, with respective event series present in event series list: ",len(subset_events_with_title_and_series))

number of events with a title value, and linked to an event series:  278
number of events, with respective event series present in event series list:  253


the discrepancy between numbers show that there are events in wikidata with a series value, but these series values are nor present in the event series query

moving on, the list "subset_events_with_title_and_series" will be our ground truth, to test if our matching algorithm works as intended

In [7]:
# drop property series from this subset, to prevent data leakage
subset_train = copy.deepcopy(subset_events_with_title_and_series)

for event in subset_train:
    event.pop('series', None)

subset_test = copy.deepcopy(subset_events_with_title_and_series)

In [8]:
# extract event title values into a list

subset_train_only_titles = []

for titles in subset_train:
    subset_train_only_titles.append(titles['title']['value'])

print(len(subset_train_only_titles))
print(subset_train_only_titles)

253
['Third Conference on Digital Curation Technologies (Qurator 2022)', '37th Italian Conference on Computational Logic', 'Sixteenth International Conference on Concept Lattices and Their Applications (CLA 2022)', '1st International Workshop on Knowledge Graph Generation From Text', '3rd International Workshop on Knowledge Graph Construction (KGCW 2022)', '8th Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', 'Joint Ontology Workshops 2021', '36th Italian Conference on Computational Logic', '7th Linguistic and Cognitive Approaches To Dialog Agents Workshop - LaCATODA 2021', '2nd International Workshop on Knowledge Graph Construction', '11th International Workshop on Enterprise Modeling and Information Systems Architectures', 'Seventh Italian Conference on Computational Linguistics', 'Conference on Digital Curation Technologies (Qurator 2021)', '4th Workshop on Natural Language for Artificial Intelligence', '2nd International Workshop on Information-Commu

In [9]:
# extract event series' title values into a list

event_series_only_title = []

for titles in event_series_with_title:
    event_series_only_title.append(titles['title']['value'])

print(len(event_series_only_title))
print(event_series_only_title)

118
['European Workshop on Human-Computer Interaction and Information Retrieval', 'International Workshop on Neural-Symbolic Learning and Reasoning', 'International Conference on Computational Linguistics and Intelligent Systems', 'Workshop From Objects to Agents', 'International Workshop on Information Management for Mobile Applications', 'International Workshop on Knowledge Discovery on the Web', 'International Workshop on Artificial Intelligence and Cognition', 'International Configuration Workshop', 'Interop-Vlab.It Workshop on Pervasive Computing for Networked Enterprises', 'International Workshop on Modular Ontologies', 'Design and Management of Data Warehouses', 'Joint Ontology Workshops', 'International Workshop on Knowledge Graph Construction', 'Italian Conference on Cybersecurity', 'International Conference on Biomedical Ontology', 'Conference on Digital Curation Technologies', 'International Workshop on Control, Optimisation and Analytical Processing of Social Networks', 'Wo

In [10]:
# spaCy rule-based matching
# simpler than alternatives
# more robust, given that the event series data is not big enough for a statistical model

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

# terms is the list of event series titles to be matched
terms = event_series_only_title.copy()

patterns = [nlp.make_doc(text) for text in terms]
matcher.add("Event Series List", patterns)

events_prediction_event_series = []

for index, titles in enumerate(subset_train_only_titles):

    doc = nlp(titles)

    matches = matcher(doc)

    events_prediction_event_series.insert(index, "")

    for match_id, start, end in matches:
        span = doc[start:end]

        print(span.text)
        events_prediction_event_series[index] = span.text



  from .autonotebook import tqdm as notebook_tqdm


Conference on Digital Curation Technologies
Italian Conference on Computational Logic
Workshop on Knowledge Graph Generation From Text
International Workshop on Knowledge Graph Construction
Joint Workshop on Interfaces and Human Decision Making for Recommender Systems
Joint Ontology Workshops
Italian Conference on Computational Logic
International Workshop on Knowledge Graph Construction
International Workshop on Enterprise Modeling and Information Systems Architectures
Italian Conference on Computational Linguistics
Conference on Digital Curation Technologies
International Workshop on Information-Communication Technologies & Embedded Systems
International Semantic Web Conference
Workshop on Managing the Evolution and Preservation of the Data Web
Italian Conference on Computational Logic
Joint Workshop on Interfaces and Human Decision Making for Recommender Systems
Scientific-practical Workshop Information Technologies: Algorithms, Models, Systems
Joint Ontology Workshops
International

In [11]:
print(len(events_prediction_event_series))
print(events_prediction_event_series)

253
['Conference on Digital Curation Technologies', 'Italian Conference on Computational Logic', '', 'Workshop on Knowledge Graph Generation From Text', 'International Workshop on Knowledge Graph Construction', 'Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', 'Joint Ontology Workshops', 'Italian Conference on Computational Logic', '', 'International Workshop on Knowledge Graph Construction', 'International Workshop on Enterprise Modeling and Information Systems Architectures', 'Italian Conference on Computational Linguistics', 'Conference on Digital Curation Technologies', '', 'International Workshop on Information-Communication Technologies & Embedded Systems', 'International Semantic Web Conference', 'Workshop on Managing the Evolution and Preservation of the Data Web', '', 'Italian Conference on Computational Logic', 'Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', '', 'Scientific-practical Workshop Information Techno

In [12]:
# insert corresponsing "series" value into subset_train

for i, title in enumerate(events_prediction_event_series):
    for entry in event_series_with_title:
        if entry['title']['value'] == title:
            subset_train[i]['series'] = entry.get('series')
            break


In [14]:
# compare test and train

metrics1 = []
metrics2 = []

for train, test in zip(subset_train, subset_test):
    metrics1.append(True if train.get('series') == test.get('series') else False)
    metrics2.append(True if "series" in train else False)

accuracy = (sum(metrics1) / len(metrics1)) * 100
precision = (sum(metrics2) / sum(metrics2)) * 100

print("percantage of true predictions: ", accuracy)
print("percentage of true positives in predictions, aka precision: ", precision)

percantage of true predictions:  65.61264822134387
percentage of true positives in predictions, aka precision:  100.0


In [15]:
#TODO
# spaCy entity linking approach
# a model is needed to ne trained
# this one is more powerful, yet needs more effort
# might be more suitable for wikidata