In [99]:
import nltk
from nltk import ngrams
from nltk.metrics.distance import jaccard_distance

In [100]:
import os

# Both input files live in the same resources directory of the eventseries
# project; resolve that directory once and derive the two paths from it.
resources_dir = os.path.abspath("../eventseries/src/main/resources")
events_file = os.path.join(resources_dir, "events_without_matches.json")
series_file = os.path.join(resources_dir, "event_series.json")

In [101]:
# `json` is needed here, before any other cell in the notebook imports it;
# importing it in this cell keeps a fresh "Restart & Run All" from failing
# with a NameError.
import json

# Load the list of event records that have no series match yet.
with open(events_file) as file:
    events = json.load(file)
len(events)

3140

In [102]:
# Collect the title of every event record that actually carries one;
# records without a 'title' key are skipped.
event_titles = [
    event_record['title']
    for event_record in events
    if 'title' in event_record
]
event_titles

['Second International Workshop on Linked Data-driven Resilience Research 2023',
 'International Symposium on Securing NextGeneration Systems Using Future Artificial Intelligence Technologies',
 'Sixth International Workshop on Computer Modeling and Intelligent Systems (CMIS 2023)',
 'Technology-Enhanced Learning in Laboratories Workshop (TELL 2023)',
 '5th International Conference on Recent Trends and Applications in Computer Science and Information Technology',
 'Workshop on Intervening, Teaming, Delegating',
 '7th International Conference on Computational Linguistics and Intelligent Systems. Volume I: Machine Learning Workshop',
 '7th International Conference on Computational Linguistics and Intelligent Systems. Volume II: Computational Linguistics Workshop',
 'REFSQ-2023 Workshops, Doctoral Symposium, Posters & Tools Track and Journal Early Feedback',
 '3rd Edge Computing Workshop',
 'The QPP++ 2023: Query Performance Prediction and Its Evaluation in New Tasks Workshop',
 'Text2Sto

In [103]:
# Number of events that have a title (records without a 'title' key were skipped).
len(event_titles)

3098

In [104]:
# Read the event-series query result and pull the title string out of every
# binding that has a "title" entry.
with open(series_file) as series_fp:
    series = json.load(series_fp)

bindings = series["results"]["bindings"]
series_titles = [binding["title"]["value"] for binding in bindings if "title" in binding]
len(series_titles)

118

### Event titles are usually longer than the title of the series they belong to
Example - 
EVENT - Sixth International Workshop on Computer Modeling and Intelligent Systems (CMIS 2023) <br>
EVENT_SERIES - International Workshop on Computer Modeling and Intelligent Systems <br>
<br>
EVENT - 5th International Conference on Recent Trends and Applications in Computer Science and Information Technology <br>
EVENT_SERIES - International Conference on Recent Trends and Applications in Computer Science and Information Technology

#### So we use the series titles as the phrases to search for inside the event titles (containment matches)

In [105]:
import spacy
from spacy.matcher import PhraseMatcher

# Containment matching: an event counts as matched when its title contains
# one of the series titles as a token sequence.
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in series_titles]
matcher.add("Event_EventSeries_Matcher", patterns)

matching_events = []
for event in events:
    if 'title' not in event:
        continue
    doc = nlp(event['title'])
    # The matcher returns one entry per matched phrase. Record the event only
    # once, even when its title contains several series titles, so that the
    # list (and the count printed below) holds no duplicates.
    if matcher(doc):
        matching_events.append(event)
print("Number of containment matches: ", len(matching_events))

Number of containment matches:  140


In [106]:
# Keep only the events that were not picked up by the containment matcher.
event_without_matches = [
    candidate
    for candidate in events
    if candidate not in matching_events
]

In [107]:
# Number of events left after removing the containment matches.
len(event_without_matches)

3000

In [109]:
import json

# Persist the still-unmatched events as JSON in the current working directory.
with open('events_without_containment_matches.json', 'w') as output_file:
    json.dump(event_without_matches, output_file)

In [97]:
import spacy
from spacy.matcher import PhraseMatcher

# Containment matching on the plain title strings: a title counts as matched
# when it contains one of the series titles as a token sequence.
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in series_titles]
matcher.add("Event_EventSeries_Matcher", patterns)

matching_events = []
for event in event_titles:
    doc = nlp(event)
    # The matcher returns one entry per matched phrase. Record the title only
    # once, even when it contains several series titles, so that the list
    # (and the count printed below) holds no duplicates.
    if matcher(doc):
        matching_events.append(event)
print("Number of containment matches: ", len(matching_events))

Number of containment matches:  0


In [87]:
# Drop every title that is already listed in matching_events, then show how
# many titles remain.
event_titles = [
    title
    for title in event_titles
    if title not in matching_events
]
len(event_titles)

2958

In [81]:
# Sanity check: prints True when at least one title is the empty string,
# and prints nothing otherwise.
if any(title == '' for title in event_titles):
    print(True)

In [86]:
n = 3  # Number of words in each n-gram

threshold = 0.5  # Minimum required similarity for a partial match

# The word n-grams of a series title do not depend on the event, so build
# them once instead of once per (event, series) pair.
series_ngram_sets = [
    (series_title, set(ngrams(series_title.split(), n)))
    for series_title in series_titles
]

# Partial matching: compare the word n-grams of each event title with those
# of each series title using Jaccard similarity (1 - Jaccard distance).
partial_matches = []
for event in event_titles:
    event_ngrams = set(ngrams(event.split(), n))
    # The loop variable is deliberately not called `series`, so it cannot
    # overwrite a variable of that name defined elsewhere in the notebook.
    for series_title, series_ngrams in series_ngram_sets:
        # Titles with fewer than n words produce no n-grams. When both sets
        # are empty the Jaccard distance is undefined, so skip the pair
        # instead of testing a similarity left over from a previous pair.
        if not event_ngrams and not series_ngrams:
            continue
        similarity = 1 - jaccard_distance(event_ngrams, series_ngrams)
        if similarity >= threshold:
            print(f"Partial match found- Event: '{event}' and Series: '{series_title}' with similarity {similarity}")
            partial_matches.append(event)
print("Number of partial matches: ", len(partial_matches))

Partial match found- Event: '1st International Workshop on Knowledge Graph Summarization' and Series: 'International Workshop on Knowledge Graph Construction' with similarity 0.5
Partial match found- Event: 'Digital Humanities in the Nordic Countries 5th Conference' and Series: 'Digital Humanities in the Nordic Countries Conference' with similarity 0.5714285714285714
Partial match found- Event: '12th ZEUS Workshop on Services and their Composition' and Series: 'Central-European Workshop on Services and their Composition' with similarity 0.5714285714285714
Partial match found- Event: 'XXXV International Conference of the Spanish Society for Natural Language Processing' and Series: 'Annual Conference of the Spanish Society for Natural Language Processing' with similarity 0.7
Partial match found- Event: '1st International Workshop on Knowledge Graph Building' and Series: 'International Workshop on Knowledge Graph Construction' with similarity 0.5
Partial match found- Event: '11th Central 