In [1]:
import json, copy

In [2]:
# load event series data

# NOTE(review): absolute, machine-specific path; consider resolving it relative to the repository.
# The context manager closes the handle even when json.load raises, and the
# encoding is stated explicitly, in line with how events.json is opened.
with open(r'C:\Users\resmi\rwth\kglab\CEUR-WS-Event-Series--SS23\eventseries\src\main\resources\event_series.json', encoding="utf8") as file1:
    event_series = json.load(file1)

In [3]:
# load event data

# NOTE(review): absolute, machine-specific path; consider resolving it relative to the repository.
# The context manager closes the handle even when json.load raises.
with open(r'C:\Users\resmi\rwth\kglab\CEUR-WS-Event-Series--SS23\eventseries\src\main\resources\events.json', encoding="utf8") as file2:
    events = json.load(file2)

In [4]:
# Keep only the event bindings that carry a "title" key.
events_with_title = [
    record for record in events['results']['bindings'] if 'title' in record
]

# Same filter for the event series bindings.
event_series_with_title = [
    record for record in event_series['results']['bindings'] if 'title' in record
]

        

In [5]:
# Report how many events / event series remain after the title filter.
n_events_with_title = len(events_with_title)
n_event_series_with_title = len(event_series_with_title)

print("number of events present in wikidata, with a title value: ", n_events_with_title)
print("number of event series present in wikidata, with a title value: ", n_event_series_with_title)

number of events present in wikidata, with a title value:  3427
number of event series present in wikidata, with a title value:  118


In [6]:
# Events that carry both a "title" and a "series" entry.
events_with_title_and_series = [
    candidate
    for candidate in events['results']['bindings']
    if 'title' in candidate and 'series' in candidate
]

print("number of events with a title value, and linked to an event series: ", len(events_with_title_and_series))

# Keep a linked event only if its "series" entry also occurs among the titled
# event series, so that every remaining event has a counterpart to be matched to.
# Per the original note: "series" has type uri and a wikidata link as value.
event_series_dummy = []
for series_entry in event_series_with_title:
    event_series_dummy.append(series_entry['series'])

subset_events_with_title_and_series = []
for linked_event in events_with_title_and_series:
    if linked_event['series'] in event_series_dummy:
        subset_events_with_title_and_series.append(linked_event)

print("number of events with a title value, and with respective event series present in event series list: ", len(subset_events_with_title_and_series))

number of events with a title value, and linked to an event series:  278
number of events with a title value, and with respective event series present in event series list:  253


The discrepancy between the two numbers shows that some events in Wikidata have a series value that is not present in the results of the event series query.

Moving on, the list "subset_events_with_title_and_series" will be our ground truth for testing whether our matching algorithm works as intended.

In [7]:
# Untouched copy that keeps the "series" entries: the reference to compare against.
subset_test_rb = copy.deepcopy(subset_events_with_title_and_series)

# Working copy with the "series" property removed, to prevent data leakage
# into the matching step.
subset_train_rb = copy.deepcopy(subset_events_with_title_and_series)

for record in subset_train_rb:
    if 'series' in record:
        del record['series']

In [8]:
# Collect the title string of every event in the working copy.
subset_train_only_titles_rb = [record['title']['value'] for record in subset_train_rb]

print(len(subset_train_only_titles_rb))
print(subset_train_only_titles_rb)

253
['Third Conference on Digital Curation Technologies (Qurator 2022)', '37th Italian Conference on Computational Logic', 'Sixteenth International Conference on Concept Lattices and Their Applications (CLA 2022)', '1st International Workshop on Knowledge Graph Generation From Text', '3rd International Workshop on Knowledge Graph Construction (KGCW 2022)', '8th Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', 'Joint Ontology Workshops 2021', '36th Italian Conference on Computational Logic', '7th Linguistic and Cognitive Approaches To Dialog Agents Workshop - LaCATODA 2021', '2nd International Workshop on Knowledge Graph Construction', '11th International Workshop on Enterprise Modeling and Information Systems Architectures', 'Seventh Italian Conference on Computational Linguistics', 'Conference on Digital Curation Technologies (Qurator 2021)', '4th Workshop on Natural Language for Artificial Intelligence', '2nd International Workshop on Information-Commu

In [9]:
# Collect the title string of every event series that has a title.
event_series_only_title = [record['title']['value'] for record in event_series_with_title]

print(len(event_series_only_title))
print(event_series_only_title)

118
['European Workshop on Human-Computer Interaction and Information Retrieval', 'International Workshop on Neural-Symbolic Learning and Reasoning', 'International Conference on Computational Linguistics and Intelligent Systems', 'Workshop From Objects to Agents', 'International Workshop on Information Management for Mobile Applications', 'International Workshop on Knowledge Discovery on the Web', 'International Workshop on Artificial Intelligence and Cognition', 'International Configuration Workshop', 'Interop-Vlab.It Workshop on Pervasive Computing for Networked Enterprises', 'International Workshop on Modular Ontologies', 'Design and Management of Data Warehouses', 'Joint Ontology Workshops', 'International Workshop on Knowledge Graph Construction', 'Italian Conference on Cybersecurity', 'International Conference on Biomedical Ontology', 'Conference on Digital Curation Technologies', 'International Workshop on Control, Optimisation and Analytical Processing of Social Networks', 'Wo

In [10]:
# spaCy rule-based matching
# simpler than alternatives
# more robust, given that the event series data is not big enough for a statistical model

import spacy
from spacy.matcher import PhraseMatcher

# Rule-based matching: find event series titles that occur, token for token,
# inside an event title.
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

# terms is the list of event series titles to be matched
terms = event_series_only_title.copy()

patterns = [nlp.make_doc(text) for text in terms]
matcher.add("Event Series List", patterns)

# One prediction per event title, index-aligned with subset_train_only_titles_rb;
# "" means that no series title was found inside the event title.
events_prediction_event_series_rb = []

for titles in subset_train_only_titles_rb:

    # The patterns are plain tokenized docs, so tokenizing the event title is
    # sufficient; the remaining pipeline components are not needed for matching.
    doc = nlp.make_doc(titles)

    matches = matcher(doc)

    # When several series titles occur in one event title, keep the longest
    # span (the most specific series) instead of whichever match comes last.
    best_span_text = ""
    best_span_length = 0

    for match_id, start, end in matches:
        span_length = end - start
        if span_length > best_span_length:
            best_span_length = span_length
            best_span_text = doc[start:end].text

    events_prediction_event_series_rb.append(best_span_text)



  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Show the number of rule-based predictions, then the predictions themselves.
n_predictions_rb = len(events_prediction_event_series_rb)
print(n_predictions_rb)
print(events_prediction_event_series_rb)

253
['Conference on Digital Curation Technologies', 'Italian Conference on Computational Logic', '', 'Workshop on Knowledge Graph Generation From Text', 'International Workshop on Knowledge Graph Construction', 'Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', 'Joint Ontology Workshops', 'Italian Conference on Computational Logic', '', 'International Workshop on Knowledge Graph Construction', 'International Workshop on Enterprise Modeling and Information Systems Architectures', 'Italian Conference on Computational Linguistics', 'Conference on Digital Curation Technologies', '', 'International Workshop on Information-Communication Technologies & Embedded Systems', 'International Semantic Web Conference', 'Workshop on Managing the Evolution and Preservation of the Data Web', '', 'Italian Conference on Computational Logic', 'Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', '', 'Scientific-practical Workshop Information Techno

In [12]:
# insert corresponding "series" value into subset_train

# Build the title -> series lookup once instead of rescanning the whole event
# series list for every prediction. setdefault keeps the first entry per title,
# i.e. the same entry a first-hit scan over the list would select.
series_by_title_rb = {}
for entry in event_series_with_title:
    series_by_title_rb.setdefault(entry['title']['value'], entry.get('series'))

for i, title in enumerate(events_prediction_event_series_rb):
    # A prediction that is not a known series title (e.g. the "" placeholder)
    # leaves the event without a "series" key.
    if title in series_by_title_rb:
        subset_train_rb[i]['series'] = series_by_title_rb[title]


In [13]:
# compare test and train

# metrics1: the predicted "series" equals the ground-truth "series" of the event.
# metrics2: a "series" key was set on the train copy, i.e. a prediction was made.
metrics1 = []
metrics2 = []

for train, test in zip(subset_train_rb, subset_test_rb):
    metrics1.append(train.get('series') == test.get('series'))
    metrics2.append("series" in train)

# Guard the ratios: with no events, or with no prediction at all, the plain
# divisions would raise ZeroDivisionError; 0.0 is reported in those cases.
accuracy = (sum(metrics1) / len(metrics1)) * 100 if metrics1 else 0.0
precision = (sum(metrics1) / sum(metrics2)) * 100 if any(metrics2) else 0.0

print("total number of events with series property", len(subset_train_rb))
print("total number of correct matches: ", metrics1.count(True))
print("total number of matches: ", metrics2.count(True))
print("percantage of true predictions: ", accuracy)
print("percentage of true positives in predictions, aka precision: ", precision)

total number of events with series property 253
total number of correct matches:  166
total number of matches:  167
percantage of true predictions:  65.61264822134387
percentage of true positives in predictions, aka precision:  99.40119760479041


In [14]:
#TODO
# spaCy entity linking approach
# a model needs to be trained
# this one is more powerful, yet needs more effort
# might be more suitable for wikidata

In [15]:
#Ayan's ngram matcher

In [16]:
import nltk
from nltk import ngrams
from nltk.metrics.distance import jaccard_distance

In [17]:
# Untouched copy that keeps the "series" entries: the reference to compare against.
subset_test_ngram = copy.deepcopy(subset_events_with_title_and_series)

# Working copy with the "series" property removed, to prevent data leakage
# into the n-gram matching step.
subset_train_ngram = copy.deepcopy(subset_events_with_title_and_series)

for record in subset_train_ngram:
    if 'series' in record:
        del record['series']

In [18]:
# Collect the title string of every event in the n-gram working copy.
subset_train_only_titles_ngram = [record['title']['value'] for record in subset_train_ngram]

print(len(subset_train_only_titles_ngram))
print(subset_train_only_titles_ngram)

253
['Third Conference on Digital Curation Technologies (Qurator 2022)', '37th Italian Conference on Computational Logic', 'Sixteenth International Conference on Concept Lattices and Their Applications (CLA 2022)', '1st International Workshop on Knowledge Graph Generation From Text', '3rd International Workshop on Knowledge Graph Construction (KGCW 2022)', '8th Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', 'Joint Ontology Workshops 2021', '36th Italian Conference on Computational Logic', '7th Linguistic and Cognitive Approaches To Dialog Agents Workshop - LaCATODA 2021', '2nd International Workshop on Knowledge Graph Construction', '11th International Workshop on Enterprise Modeling and Information Systems Architectures', 'Seventh Italian Conference on Computational Linguistics', 'Conference on Digital Curation Technologies (Qurator 2021)', '4th Workshop on Natural Language for Artificial Intelligence', '2nd International Workshop on Information-Commu

In [19]:
n = 3  # Number of words in each n-gram

threshold = 0.5  # Minimum required similarity for a partial match

ngram_series = event_series_only_title.copy()
ngram_events = subset_train_only_titles_ngram.copy()

# The word n-grams of a series title do not depend on the event, so they are
# computed once up front instead of once per (event, series) pair.
series_ngram_sets = [
    (series_title, set(ngrams(series_title.split(), n)))
    for series_title in ngram_series
]

# One entry per event title, index-aligned with the event titles;
# "" means that no series reached the threshold.
partially_matched_events_ngram = []

for event in ngram_events:

    event_ngrams = set(ngrams(event.split(), n))

    best_series = ""
    best_similarity = None

    for series, series_ngrams in series_ngram_sets:
        # Titles with fewer than n words yield no n-grams. When both sets are
        # empty the union is empty and the Jaccard distance is undefined, so the
        # pair is skipped rather than judged by a similarity value left over
        # from a previous pair.
        if not (event_ngrams or series_ngrams):
            continue

        similarity = 1 - jaccard_distance(event_ngrams, series_ngrams)

        # Keep the most similar series that reaches the threshold, not merely
        # the last one that does.
        if similarity >= threshold and (best_similarity is None or similarity > best_similarity):
            best_similarity = similarity
            best_series = series

    partially_matched_events_ngram.append(best_series)

In [20]:
# insert corresponding "series" value into subset_train

# Build the title -> series lookup once instead of rescanning the whole event
# series list for every prediction. setdefault keeps the first entry per title,
# i.e. the same entry a first-hit scan over the list would select.
series_by_title_ngram = {}
for entry in event_series_with_title:
    series_by_title_ngram.setdefault(entry['title']['value'], entry.get('series'))

for i, title in enumerate(partially_matched_events_ngram):
    # A prediction that is not a known series title (e.g. the "" placeholder)
    # leaves the event without a "series" key.
    if title in series_by_title_ngram:
        subset_train_ngram[i]['series'] = series_by_title_ngram[title]

In [21]:
# compare test and train

# metrics1_ngram: the predicted "series" equals the ground-truth "series" of the event.
# metrics2_ngram: a "series" key was set on the train copy, i.e. a prediction was made.
metrics1_ngram = []
metrics2_ngram = []

for train, test in zip(subset_train_ngram, subset_test_ngram):
    metrics1_ngram.append(train.get('series') == test.get('series'))
    metrics2_ngram.append("series" in train)

# Guard the ratios: with no events, or with no prediction at all, the plain
# divisions would raise ZeroDivisionError; 0.0 is reported in those cases.
accuracy_ngram = (sum(metrics1_ngram) / len(metrics1_ngram)) * 100 if metrics1_ngram else 0.0
precision_ngram = (sum(metrics1_ngram) / sum(metrics2_ngram)) * 100 if any(metrics2_ngram) else 0.0

print("total number of events with series property", len(subset_train_ngram))
print("total number of correct matches: ", metrics1_ngram.count(True))
print("total number of matches: ", metrics2_ngram.count(True))
print("percantage of true predictions: ", accuracy_ngram)
print("percentage of true positives in predictions, aka precision: ", precision_ngram)

total number of events with series property 253
total number of correct matches:  142
total number of matches:  142
percantage of true predictions:  56.126482213438734
percentage of true positives in predictions, aka precision:  100.0


In [22]:
# Ayan's tf-idf matching

In [23]:
# Untouched copy that keeps the "series" entries: the reference to compare against.
subset_test_tfidf = copy.deepcopy(subset_events_with_title_and_series)

# Working copy with the "series" property removed, to prevent data leakage
# into the tf-idf matching step.
subset_train_tfidf = copy.deepcopy(subset_events_with_title_and_series)

for record in subset_train_tfidf:
    if 'series' in record:
        del record['series']

In [24]:
# Collect the title string of every event in the tf-idf working copy.
subset_train_only_titles_tfidf = [record['title']['value'] for record in subset_train_tfidf]

print(len(subset_train_only_titles_tfidf))
print(subset_train_only_titles_tfidf)

253
['Third Conference on Digital Curation Technologies (Qurator 2022)', '37th Italian Conference on Computational Logic', 'Sixteenth International Conference on Concept Lattices and Their Applications (CLA 2022)', '1st International Workshop on Knowledge Graph Generation From Text', '3rd International Workshop on Knowledge Graph Construction (KGCW 2022)', '8th Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', 'Joint Ontology Workshops 2021', '36th Italian Conference on Computational Logic', '7th Linguistic and Cognitive Approaches To Dialog Agents Workshop - LaCATODA 2021', '2nd International Workshop on Knowledge Graph Construction', '11th International Workshop on Enterprise Modeling and Information Systems Architectures', 'Seventh Italian Conference on Computational Linguistics', 'Conference on Digital Curation Technologies (Qurator 2021)', '4th Workshop on Natural Language for Artificial Intelligence', '2nd International Workshop on Information-Commu

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
import numpy as np

vectorizer = TfidfVectorizer(stop_words = "english")
array1_strings = subset_train_only_titles_tfidf.copy()
array2_strings = event_series_only_title.copy()
# Fit a single vocabulary over both title lists so that event vectors and
# series vectors live in the same feature space.
tfidf_matrix = vectorizer.fit_transform(array1_strings + array2_strings)

# Calculate cosine similarity between array1 and array2:
# rows are event titles, columns are event series titles.
similarity_matrix = cosine_similarity(tfidf_matrix[:len(array1_strings)], tfidf_matrix[len(array1_strings):])

# Threshold for partial match
threshold = 0.7

# For every event keep only its most similar series. Taking every pair above
# the threshold and letting later pairs overwrite earlier ones would select the
# series with the highest column index, not the highest similarity.
best_series_index = similarity_matrix.argmax(axis=1)
best_similarity = similarity_matrix.max(axis=1)
matched_event_index = np.flatnonzero(best_similarity >= threshold)

# (event index, series index) pairs, at most one per event.
matches = np.column_stack((matched_event_index, best_series_index[matched_event_index]))

# One entry per event title, index-aligned with array1_strings;
# "" means that no series reached the threshold.
partially_matched_events_tfidf = [""] * len(similarity_matrix)

for array1_index, array2_index in matches:
    partially_matched_events_tfidf[array1_index] = array2_strings[array2_index]

In [26]:
# Show how many prediction entries the tf-idf matcher produced.
n_predictions_tfidf = len(partially_matched_events_tfidf)
print(n_predictions_tfidf)

253


In [27]:
# Show the series title predicted for each event ("" where none was assigned).
print(partially_matched_events_tfidf)

['Conference on Digital Curation Technologies', 'Italian Conference on Computational Logic', '', 'Workshop on Knowledge Graph Generation From Text', '', 'Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', 'Joint Ontology Workshops', 'Italian Conference on Computational Logic', 'Workshop on Linguistic and Cognitive Approaches To Dialog Agents', 'International Workshop on Knowledge Graph Construction', 'International Workshop on Enterprise Modeling and Information Systems Architectures', 'Italian Conference on Computational Linguistics', 'Conference on Digital Curation Technologies', '', 'International Workshop on Information-Communication Technologies & Embedded Systems', 'International Semantic Web Conference', 'Workshop on Managing the Evolution and Preservation of the Data Web', '', '', 'Joint Workshop on Interfaces and Human Decision Making for Recommender Systems', '', 'Scientific-practical Workshop Information Technologies: Algorithms, Models, Systems

In [28]:
# insert corresponding "series" value into subset_train

# Build the title -> series lookup once instead of rescanning the whole event
# series list for every prediction. setdefault keeps the first entry per title,
# i.e. the same entry a first-hit scan over the list would select.
series_by_title_tfidf = {}
for entry in event_series_with_title:
    series_by_title_tfidf.setdefault(entry['title']['value'], entry.get('series'))

for i, title in enumerate(partially_matched_events_tfidf):
    # A prediction that is not a known series title (e.g. the "" placeholder)
    # leaves the event without a "series" key.
    if title in series_by_title_tfidf:
        subset_train_tfidf[i]['series'] = series_by_title_tfidf[title]

In [29]:
# compare test and train

# metrics1_tfidf: the predicted "series" equals the ground-truth "series" of the event.
# metrics2_tfidf: a "series" key was set on the train copy, i.e. a prediction was made.
metrics1_tfidf = []
metrics2_tfidf = []

for train, test in zip(subset_train_tfidf, subset_test_tfidf):
    metrics1_tfidf.append(train.get('series') == test.get('series'))
    metrics2_tfidf.append("series" in train)

# Guard the ratios: with no events, or with no prediction at all, the plain
# divisions would raise ZeroDivisionError; 0.0 is reported in those cases.
accuracy_tfidf = (sum(metrics1_tfidf) / len(metrics1_tfidf)) * 100 if metrics1_tfidf else 0.0
precision_tfidf = (sum(metrics1_tfidf) / sum(metrics2_tfidf)) * 100 if any(metrics2_tfidf) else 0.0

print("total number of events with series property", len(subset_train_tfidf))
print("total number of correct matches: ", metrics1_tfidf.count(True))
print("total number of matches: ", metrics2_tfidf.count(True))
print("percantage of true predictions: ", accuracy_tfidf)
print("percentage of true positives in predictions, aka precision: ", precision_tfidf)

total number of events with series property 253
total number of correct matches:  169
total number of matches:  169
percantage of true predictions:  66.79841897233202
percentage of true positives in predictions, aka precision:  100.0
