In [1]:
import nltk
from nltk import ngrams
import json
import os
import spacy
from nltk.metrics.distance import jaccard_distance
from spacy.matcher import PhraseMatcher

In [2]:
events_with_series_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "events_with_ordinal.json")
events_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "events_without_matches.json")
series_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "event_series.json")

In [3]:
with open(events_with_series_file) as file:
    events_with_existing_series = json.load(file)
len(events_with_existing_series)

3501

In [4]:
events_with_existing_series = [event for event in events_with_existing_series if 'series' in event.keys()]
len(events_with_existing_series)

319

In [5]:
events_with_existing_series = [event for event in events_with_existing_series if 'title' in event.keys()]
len(events_with_existing_series)

278

### Since we have only 278 entries with titles that have existing matches, we are not treating this dataset as our training corpus. This leads to dealing with metrics of accuracy only, and not precision, recall and f1-score

In [6]:
with open(events_file) as file:
    events = json.load(file)
len(events)

3140

In [7]:
event_titles = [item['title'] for item in events if 'title' in item]
event_labels = [item['eventLabel'] for item in events if 'eventLabel' in item]

In [8]:
print("Event titles: ", len(event_titles))
print("Event labels: ", len(event_labels))

Event titles:  3098
Event labels:  3140


In [9]:
with open(series_file) as file:
    series = json.load(file)
    series_titles = [item["title"]["value"] for item in series["results"]["bindings"] if "title" in item]
len(series_titles)

118

### It has been usually observed that the event titles are larger than the series title
Example - 
EVENT - Sixth International Workshop on Computer Modeling and Intelligent Systems (CMIS 2023) <br>
EVENT_SERIES - International Workshop on Computer Modeling and Intelligent Systems <br>
<br>
EVENT - 5th International Conference on Recent Trends and Applications in Computer Science and Information Technology <br>
EVENT_SERIES - International Conference on Recent Trends and Applications in Computer Science and Information Technology

#### So we take the series as the phrases to be matched with the events - containment matches

In [10]:
# Capturing all the distinct series
series_distinct = []

In [11]:
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in series_titles]
matcher.add("Event_EventSeries_Matcher", patterns)

matching_events = []
for event in event_titles:
    doc = nlp(event)    
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        if event not in matching_events:
            matching_events.append(event)
        if span.text not in series_distinct:
            series_distinct.append(span.text)
#         print(f"Series: '{span.text}' Event: '{event}'")
print("Number of containment matches from event titles: ", len(matching_events))

Number of containment matches from event titles:  140


In [12]:
accuracy = (len(matching_events) / len(events))*100 
print(f"Accuracy = {accuracy}")

Accuracy = 4.45859872611465


In [13]:
event_without_matches = [event for event in events if event not in matching_events]
with open('events_without_containment_matches.json', 'w') as fp:
    json.dump(event_without_matches, fp)

In [14]:
event_titles = [event for event in event_titles if event not in matching_events]
len(event_titles)

2958

In [15]:
if '' in event_titles:
    print(True)

In [16]:
n = 3  # Number of words in each n-gram

threshold = 0.5  # Minimum required similarity for a partial match

partially_matched_events = []
for event in event_titles:
    event_ngrams = set(ngrams(event.split(), n))
    for series in series_titles:
        series_ngrams = set(ngrams(series.split(), n))
        '''There can be cases that series or events don't have 3 words'''
        if(len(event_ngrams.union(series_ngrams)) > 0):
            similarity = 1 - jaccard_distance(event_ngrams, series_ngrams)
            
        if similarity >= threshold:
            print("Partial match found:")
            print(f"#####EVENT#####{event}")
            print(f"######SERIES######{series}")
            if series not in series_distinct:
                series_distinct.append(series)
            print()
            partially_matched_events.append(event)

Partial match found:
#####EVENT#####1st International Workshop on Knowledge Graph Summarization
######SERIES######International Workshop on Knowledge Graph Construction

Partial match found:
#####EVENT#####Digital Humanities in the Nordic Countries 5th Conference
######SERIES######Digital Humanities in the Nordic Countries Conference

Partial match found:
#####EVENT#####12th ZEUS Workshop on Services and their Composition
######SERIES######Central-European Workshop on Services and their Composition

Partial match found:
#####EVENT#####XXXV International Conference of the Spanish Society for Natural Language Processing
######SERIES######Annual Conference of the Spanish Society for Natural Language Processing

Partial match found:
#####EVENT#####1st International Workshop on Knowledge Graph Building
######SERIES######International Workshop on Knowledge Graph Construction

Partial match found:
#####EVENT#####11th Central European Workshop on Services and their Composition
######SERIES####

In [17]:
print("Number of partial matches: ", len(partially_matched_events))

Number of partial matches:  23


In [18]:
accuracy = (len(partially_matched_events) / len(event_titles))*100 
print(f"Accuracy = {accuracy}")

Accuracy = 0.777552400270453


In [19]:
event_titles = [event for event in event_titles if event not in partially_matched_events]
len(event_titles)

2935

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
import numpy as np

vectorizer = TfidfVectorizer(stop_words = text.ENGLISH_STOP_WORDS)
array1_strings = event_titles
array2_strings = series_titles
tfidf_matrix = vectorizer.fit_transform(event_titles + series_titles)

# Calculate cosine similarity between array1 and array2
similarity_matrix = cosine_similarity(tfidf_matrix[:len(array1_strings)], tfidf_matrix[len(array1_strings):])

# Threshold for partial match
threshold = 0.7

# Find partial matches
matches = np.argwhere(similarity_matrix >= threshold)

ctr = 0
partially_matched_events = []
# Print partial matches
for match in matches:
    array1_index = match[0]
    array2_index = match[1]
    print("Partial match found:")
    print(f"#####EVENT#####{array1_strings[array1_index]}")
    print(f"######SERIES######{array2_strings[array2_index]}")
    if series not in series_distinct:
        series_distinct.append(series)
    print()
    partially_matched_events.append(array1_strings[array1_index])

Partial match found:
#####EVENT#####15th Central European Workshop Services and their Composition (ZEUS 2023)
######SERIES######Central-European Workshop on Services and their Composition

Partial match found:
#####EVENT#####First International Workshop on Data Ecosystems
######SERIES######International Workshop on Software Ecosystems

Partial match found:
#####EVENT#####23rd Workshop "From Objects to Agents"
######SERIES######Workshop From Objects to Agents

Partial match found:
#####EVENT#####6th Digital Humanities in the Nordic and Baltic Countries Conference (DHNB 2022)
######SERIES######Digital Humanities in the Nordic Countries Conference

Partial match found:
#####EVENT#####14th Central European Workshop on Services and their Composition (ZEUS 2022)
######SERIES######Central-European Workshop on Services and their Composition

Partial match found:
#####EVENT#####22nd Workshop "From Objects to Agents"
######SERIES######Workshop From Objects to Agents

Partial match found:
#####EV

In [21]:
print("Number of partial matches: ", len(partially_matched_events))

Number of partial matches:  58


In [22]:
accuracy = (len(partially_matched_events) / len(event_titles))*100 
print(f"Accuracy = {accuracy}")

Accuracy = 1.9761499148211243


In [23]:
len(series_distinct)

61