In [1]:
import nltk
from nltk import ngrams
import json, pickle
import os, re
import spacy
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from nltk.metrics.distance import jaccard_distance
from spacy.matcher import PhraseMatcher

In [2]:
events_with_series_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "events_with_ordinal.json")
events_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "events_without_matches.json")
series_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "event_series.json")

In [3]:
with open(events_with_series_file) as file:
    events_with_existing_series = json.load(file)
len(events_with_existing_series)

3525

In [4]:
events_with_existing_series = [event for event in events_with_existing_series if 'series' in event.keys()]
len(events_with_existing_series)

320

In [5]:
events_with_existing_series = [event for event in events_with_existing_series if 'title' in event.keys()]
len(events_with_existing_series)

279

### Since we have only 278 entries with titles that have existing matches, we are not treating this dataset as our training corpus. This leads to dealing with metrics of accuracy only, and not precision, recall and f1-score

In [6]:
with open(events_file) as file:
    events = json.load(file)
len(events)

3163

In [7]:
event_titles = [item['title'] for item in events if 'title' in item]
event_labels = [item['eventLabel'] for item in events if 'eventLabel' in item]

In [8]:
print("Event titles: ", len(event_titles))
print("Event labels: ", len(event_labels))

Event titles:  3121
Event labels:  3163


In [35]:
with open(series_file) as file:
    series = json.load(file)
    series_titles = [item["title"]["value"] for item in series["results"]["bindings"] if "title" in item]
len(series_titles)

119

In [10]:
def remove_stopwords(text_list):
    stop_words = set(stopwords.words('english'))
    cleaned_text_list = []

    # Iterate over each text in the list
    for text in text_list:
        # Tokenize the text and remove stop words
        cleaned_text = ' '.join([word for word in word_tokenize(text) if word.lower() not in stop_words])
        cleaned_text_list.append(cleaned_text)

    return cleaned_text_list

In [11]:
def replace_special_characters_with_space(text_list):
    cleaned_text_list = []

    for text in text_list:
        # Replace special characters with spaces
        cleaned_text = re.sub(r"[^\w\s]", " ", text)
        
        # Remove extra spaces and ensure one space between words
        cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip()
        
        cleaned_text_list.append(cleaned_text)

    return cleaned_text_list

In [12]:
event_titles = remove_stopwords(event_titles)
event_titles = replace_special_characters_with_space(event_titles)
series_titles = remove_stopwords(series_titles)
series_titles = replace_special_characters_with_space(series_titles)

In [None]:
""

### It has been usually observed that the event titles are larger than the series title
Example - 
EVENT - Sixth International Workshop on Computer Modeling and Intelligent Systems (CMIS 2023) <br>
EVENT_SERIES - International Workshop on Computer Modeling and Intelligent Systems <br>
<br>
EVENT - 5th International Conference on Recent Trends and Applications in Computer Science and Information Technology <br>
EVENT_SERIES - International Conference on Recent Trends and Applications in Computer Science and Information Technology

#### So we take the series as the phrases to be matched with the events - containment matches

In [13]:
# Capturing all the distinct series
series_distinct = []

In [14]:
def remove_stopwords(dataframe):
    stop_words = set(stopwords.words('english'))

    # Iterate over each column in the DataFrame
    for column in dataframe.columns:
        # Tokenize the text and remove stop words
        dataframe[column] = dataframe[column].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))

    return dataframe

In [15]:
def replace_special_characters_with_space(dataframe):
    # Iterate over each column in the DataFrame
    for column in dataframe.columns:
        # Replace special characters with spaces
        dataframe[column] = dataframe[column].apply(lambda x: re.sub(r"[^\w\s]", " ", x))        
        
        # Remove extra spaces and ensure one space between words
        dataframe[column] = dataframe[column].apply(lambda x: re.sub(r"\s+", " ", x).strip())
#         dataframe[column] = dataframe[column].apply(lambda x: re.sub(r"(\w)([A-Z])", r"\1 \2", x))
    
    return dataframe

In [16]:
matches_df = pd.read_json("/Users/ayan/Projects/KGLab/main/CEUR-WS-Event-Series--SS23/eventseries/src/main/resources/all_matches.json")
matches_df.dropna(inplace=True)
matches_df

Unnamed: 0,event,event_series
0,Tunisian-Algerian Joint Conference on Applied ...,Tunisian-Algerian Joint Conference on Applied ...
1,Computational Humanities Research Conference 2022,Conference on Computational Humanities Researc...
2,23rd Italian Conference on Theoretical Compute...,Italian Conference on Theoretical Computer Sci...
3,CLEF 2022 - Conference and Labs of the Evaluat...,Conference and Labs of the Evaluation Forum (C...
4,International Conference on Logic Programming ...,International Conference on Logic Programming ...
...,...,...
714,Fifth International Conference on Semantic Tec...,Fifth International Conference on Semantic Tec...
715,4th Annual International Symposium on Informat...,4th Annual International Symposium on Informat...
716,Joint Workshop on Interfaces and Human Decisio...,Joint Workshop on Interfaces and Human Decisio...
717,Workshop on Linked Data on the Web,Workshop on Linked Data on the Web


In [17]:
remove_stopwords(matches_df)
replace_special_characters_with_space(matches_df)

Unnamed: 0,event,event_series
0,Tunisian Algerian Joint Conference Applied Com...,Tunisian Algerian Joint Conference Applied Com...
1,Computational Humanities Research Conference 2022,Conference Computational Humanities Research CHR
2,23rd Italian Conference Theoretical Computer S...,Italian Conference Theoretical Computer Scienc...
3,CLEF 2022 Conference Labs Evaluation Forum,Conference Labs Evaluation Forum CLEF
4,International Conference Logic Programming 202...,International Conference Logic Programming ICLP
...,...,...
714,Fifth International Conference Semantic Techno...,Fifth International Conference Semantic Techno...
715,4th Annual International Symposium Information...,4th Annual International Symposium Information...
716,Joint Workshop Interfaces Human Decision Makin...,Joint Workshop Interfaces Human Decision Makin...
717,Workshop Linked Data Web,Workshop Linked Data Web


## Phrase Matching

### Existing matches

In [18]:
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# Only run nlp.make_doc to speed things up
series_matched_titles = matches_df["event_series"].tolist() 
patterns = [nlp.make_doc(text) for text in series_matched_titles]
event_matched_titles = matches_df["event"].tolist()

matcher.add("Event_EventSeries_Matcher", patterns)

true_positives = 0
false_positives = 0
false_negatives = 0

matching_events = []
for event in event_matched_titles:
    doc = nlp(event)    
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        if event not in matching_events:
            matching_events.append(event)
        if span.text not in series_distinct:
            series_distinct.append(span.text)
        if (matches_df.loc[matches_df['event'] == event, 'event_series'].values[0]) == span.text:
            true_positives+=1
        else:
            false_positives+=1
#         print(f"Series: '{span.text}' Event: '{event}'")

# We consider all the events that did not give out a match as the false negative set.
false_negatives = len(event_matched_titles) - (true_positives + false_positives)

# print("true positives: ", true_positives)
# print("false positives: ", false_positives)
# print("false negatives: ", false_negatives)
precision = true_positives / (true_positives + false_positives)
print("Precision: ", precision)
recall = true_positives / (true_positives + false_negatives)
print("Recall: ", recall)
f1_score = 2 * (precision * recall) / (precision + recall)
print("F1-Score: ", f1_score)

print("Number of containment matches from event titles: ", len(matching_events))

Precision:  0.5869158878504673
Recall:  0.6317907444668008
F1-Score:  0.6085271317829458
Number of containment matches from event titles:  372


### Wikidata

In [19]:
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in series_titles]
matcher.add("Event_EventSeries_Matcher", patterns)

matching_events = []
for event in event_titles:
    doc = nlp(event)    
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        if event not in matching_events:
            matching_events.append(event)
        if span.text not in series_distinct:
            series_distinct.append(span.text)
        print(f"Series: '{span.text}' Event: '{event}'")
print("Number of containment matches from event titles: ", len(matching_events))

Series: 'International Workshop' Event: 'Workshop Algorithms Theories Analysis Event Data International Workshop Petri Nets Twin Transition'
Series: 'International Workshop' Event: 'Third International Workshop Artificial Intelligence Intelligent Assistance Legal Professionals Digital Workplace'
Series: 'International Workshop' Event: 'Second International Workshop Agile Methods Information Systems Engineering Agil ISE 2023'
Series: 'International Workshop' Event: 'Second International Workshop Linked Data driven Resilience Research 2023'
Series: 'International Workshop' Event: 'First International Workshop Semantic Web Constrained Things'
Series: 'International Workshop Semantic Web' Event: 'First International Workshop Semantic Web Constrained Things'
Series: 'International Workshop' Event: '15th Alberto Mendelzon International Workshop Foundations Data Management AMW 2023'
Series: 'International Workshop' Event: 'Sixth International Workshop Computer Modeling Intelligent Systems CMI

Series: 'International Workshop' Event: '8th International Workshop Artificial Intelligence Cognition'
Series: 'International Workshop Artificial Intelligence Cognition' Event: '8th International Workshop Artificial Intelligence Cognition'
Series: 'International Workshop' Event: 'Sixth International Workshop Cultures Participation Digital Age AI Humans Humans AI'
Series: 'International Workshop' Event: '3rd International Workshop Empowering People Dealing Internet Things Ecosystems'
Series: 'International Workshop' Event: '16th International Workshop Value Modelling Business Ontologies'
Series: 'International Workshop' Event: 'First International Workshop Agile Methods Information Systems Engineering Agil ISE 2022'
Series: 'International Workshop' Event: '5th International Workshop Geospatial Linked Data'
Series: 'International Workshop' Event: 'Third International Workshop Semantic Digital Twins'
Series: 'International Workshop' Event: '10th Workshop Cloud Technologies Education 5th I

Series: 'International Workshop' Event: '22nd International Workshop Trust Agent Societies TRUST 2021'
Series: 'International Workshop' Event: 'Fourth International Workshop Computer Modeling Intelligent Systems CMIS 2021'
Series: 'International Workshop Computer Modeling Intelligent Systems' Event: 'Fourth International Workshop Computer Modeling Intelligent Systems CMIS 2021'
Series: 'International Conference Computational Linguistics Intelligent Systems' Event: '5th International Conference Computational Linguistics Intelligent Systems COLINS 2021 Volume Main Conference'
Series: 'International Workshop' Event: '1st International Workshop Knowledge Graphs Online Discourse Αnalysis KnOD 2021'
Series: 'International Workshop' Event: '3rd International Workshop Challenge Computer Vision Endoscopy EndoCV 2021'
Series: 'International Workshop' Event: '2nd International Workshop Cross lingual Event centric Open Analytics'
Series: 'International Workshop' Event: 'Ninth International Worksho

Series: 'International Workshop' Event: '5th International Workshop Innovations Information Communication Science Technology'
Series: 'International Workshop' Event: '1st International Workshop Project Management ITPM 2020'
Series: 'International Workshop' Event: '1st International Workshop Computational Humanities Social Sciences Computing4Human 2020'
Series: 'International Workshop' Event: '1st International Workshop QuANtum SoftWare Engineering pRogramming'
Series: 'International Workshop' Event: '2nd International Workshop Visual Pattern Extraction Recognition Cultural Heritage Understanding'
Series: 'International Workshop' Event: '14th International Workshop Value Modelling Business Ontologies'
Series: 'International Workshop' Event: '1st International Workshop Digital Content Smart Multimedia DCSMart 2019'
Series: 'International Workshop' Event: '4th International Workshop MIning REasoning Legal texts'
Series: 'International Workshop' Event: '7th International Workshop Quantitat

Series: 'International Workshop' Event: '8th International Workshop Combinations Intelligent Methods Applications'
Series: 'International Workshop' Event: '2nd International Workshop Practicing Open Enterprise Modelling within OMiLAB PrOse'
Series: 'International Workshop' Event: '31st International Workshop Description Logics'
Series: 'International Workshop Description Logics' Event: '31st International Workshop Description Logics'
Series: 'International Workshop' Event: '12th International Workshop Applied Problems Theory Probabilities Mathematical Statistics Summer Session framework Conference Information Telecommunication Technologies Mathematical Modeling High Tech Systems APTP MS 2018'
Series: 'International Workshop' Event: '4th International Workshop Measurement Metrics Green Sustainable Software Systems'
Series: 'International Workshop' Event: 'First International Workshop Semantic Web Technologies Health Data Management'
Series: 'International Workshop Semantic Web' Event: '

Series: 'International Workshop' Event: '24th RCRA International Workshop Experimental Evaluation Algorithms Solving Problems Combinatorial Explosion 2017'
Series: 'International Workshop' Event: '11th International Workshop Artificial Intelligence Cultural Heritage'
Series: 'International Workshop' Event: '4th International Workshop Computational History HistoInformatics 2017'
Series: 'Workshop Computational History' Event: '4th International Workshop Computational History HistoInformatics 2017'
Series: 'International Workshop' Event: '4th International Workshop Dataset PROFIling fEderated Search Web Data PROFILES 2017'
Series: 'International Workshop' Event: '2nd International Workshop Semantics Biodiversity'
Series: 'International Workshop' Event: '5th International Workshop Linked Data Information Extraction'
Series: 'International Workshop' Event: 'Third International Workshop Visualization Interaction Ontologies Linked Data'
Series: 'International Semantic Web Conference' Event: 

Series: 'International Workshop' Event: '2nd International Workshop Executable Modeling'
Series: 'International Workshop' Event: '3rd International Workshop Interplay Model Driven Component Based Software Engineering'
Series: 'International Workshop' Event: '16th International Workshop OCL Textual Modelling'
Series: 'International Workshop' Event: 'Second International Workshop Patterns Model Engineering Fifth International Workshop Verification Model Transformation'
Series: 'International Workshop' Event: 'Second International Workshop Patterns Model Engineering Fifth International Workshop Verification Model Transformation'
Series: 'International Workshop' Event: '25th International Workshop Concurrency Specification Programming'
Series: 'International Workshop Concurrency Specification Programming' Event: '25th International Workshop Concurrency Specification Programming'
Series: 'International Workshop' Event: 'Fourth International Workshop Teaching Analytics'
Series: 'Internationa

Series: 'International Workshop' Event: '2nd International Workshop Model Driven Engineering Component Based Software Systems'
Series: 'International Workshop' Event: '24th International Workshop Concurrency Specification Programming'
Series: 'International Workshop Concurrency Specification Programming' Event: '24th International Workshop Concurrency Specification Programming'
Series: 'International Workshop' Event: '2nd International Workshop Multi Level Modelling'
Series: 'International Workshop' Event: '1st International Workshop Executable Modeling'
Series: 'International Workshop' Event: 'International Workshop Modelling Automotive Software Engineering'
Series: 'International Workshop' Event: '22nd RCRCA International Workshop Experimental Evaluation Algorithms Solving Problems Combinatorial Explosion 2015 RCRA 2015'
Series: 'International Workshop' Event: '3rd International Workshop News Recommendation Analytics'
Series: 'International Workshop' Event: '5th International Worksho

Series: 'International Configuration Workshop' Event: '16th International Configuration Workshop'
Series: 'International Workshop' Event: '2nd International Workshop Eye Tracking Spatial Research'
Series: 'International Workshop' Event: '2nd International Workshop Ontologies Information Systems'
Series: 'International Workshop' Event: '8th International Workshop Modular Ontologies'
Series: 'International Workshop Modular Ontologies' Event: '8th International Workshop Modular Ontologies'
Series: 'International Workshop' Event: '7th International Workshop Information Logistics Knowledge Supply'
Series: 'International Workshop' Event: 'First International Workshop Decision Making Recommender Systems'
Series: 'International Workshop' Event: '28th Workshop Constraint Logic Programming WLP 2014 Proceedings 23rd International Workshop Functional Constraint Logic Programming'
Series: 'International Workshop' Event: '1st International Workshop Interactions Data Mining Natural Language Processin

Series: 'International Workshop' Event: 'Informal Proceedings 26th International Workshop Description Logics'
Series: 'International Workshop Description Logics' Event: 'Informal Proceedings 26th International Workshop Description Logics'
Series: 'International Workshop' Event: 'Informal Proceedings 2nd International Workshop OWL Reasoner Evaluation ORE 2013'
Series: 'International Workshop' Event: '2nd International Workshop Requirements Engineering Sustainable Systems'
Series: 'International Workshop' Event: 'International Workshop Definitions Ontologies 2013'
Series: 'International Workshop' Event: 'International Workshop Vaccine Drug Ontology Studies VDOS 2013'
Series: 'International Workshop' Event: 'International Workshop Modeling Business Environments ModBE 13'
Series: 'International Workshop' Event: 'International Workshop Biological Processes Petri Nets'
Series: 'International Workshop' Event: 'International Workshop Petri Nets Software Engineering PNSE 13'
Series: 'Internatio

Series: 'International Workshop' Event: 'Fourth International Workshop Model Based Architecting Construction Embedded Systems'
Series: 'International Workshop' Event: '6th International Workshop Models run time ACM IEEE 14th International Conference Model Driven Engineering Languages Systems'
Series: 'Interop Vlab It Workshop Pervasive Computing Networked Enterprises' Event: 'Fourth Interop Vlab It Workshop Pervasive Computing Networked Enterprises Revised Papers'
Series: 'International Workshop' Event: '1st International Workshop Semantic Digital Archives'
Series: 'International Workshop' Event: '1st International Workshop Collaborative Usage Development Models Visualizations held ECSCW 2011'
Series: 'International Workshop' Event: '1st International Workshop Automated Forensic Handwriting Analysis Satellite Workshop ICDAR 2011'
Series: 'International Workshop' Event: '5th International Workshop New Challenges Distributed Information Filtering Retrieval'
Series: 'International Worksho

Series: 'International Workshop' Event: 'SeMuDaTe 09 10th International Workshop Multimedia Metadata Community Semantic Multimedia Database Technologies SeMuDaTe 09'
Series: 'Latin American Workshop Non Monotonic Reasoning' Event: 'LANMR 09 Latin American Workshop Non Monotonic Reasoning 2009'
Series: 'International Workshop' Event: 'SSN09 International Workshop Semantic Sensor Networks 2009'
Series: 'International Workshop' Event: 'First International Workshop Living Web Making Web Diversity true asset'
Series: 'International Workshop' Event: 'International Workshop Ontology Dynamics IWOD 2009'
Series: 'International Workshop' Event: '5th International Workshop Semantic Web Enabled Software Engineering SWESE 2009'
Series: 'International Workshop Semantic Web' Event: '5th International Workshop Semantic Web Enabled Software Engineering SWESE 2009'
Series: 'International Workshop' Event: 'International Workshop Enabling Service Business Ecosystems ESBE 09'
Series: 'International Worksho

Series: 'International Workshop' Event: 'First International Workshop Web Dynamics'
Series: 'International Workshop' Event: '2000 International Workshop Description Logics DL2000'
Series: 'International Workshop Description Logics' Event: '2000 International Workshop Description Logics DL2000'
Series: 'International Workshop' Event: '1999 International Workshop Description Logics DL 99'
Series: 'International Workshop Description Logics' Event: '1999 International Workshop Description Logics DL 99'
Series: 'International Workshop' Event: '6th International Workshop Knowledge Representation meets Databases KRDB 99'
Series: 'Quality Information Communications Technology' Event: '3rd International Conference Quality Information Communications Technology'
Series: 'Practical Aspects Knowledge Management' Event: 'Second International Conference Practical Aspects Knowledge Management'
Series: 'International Workshop' Event: '1998 International Workshop Description Logics'
Series: 'Internation

In [34]:
series_titles.index("International Workshop")

115

In [52]:
"7th Italian Information Retrieval Workshop" in series_titles

False

In [21]:
event_titles = [event for event in event_titles if event not in matching_events]
len(event_titles)

2208

## N-grams

### Existing matches

In [22]:
series_matched_titles = matches_df["event_series"].tolist()
event_matched_titles = matches_df["event"].tolist()
n_grams = [3, 4, 5]
threshold_values = [1, 0.9, 0.8, 0.7, 0.6, 0.5]
# n_grams = [2]
# threshold_values = [0.1, 0.01, 0.001, 0.00001]

max_f1_score = 0
max_matches = 0
best_precision = 0
best_recall = 0
best_n_gram = 0
best_threshold = 0


for i in n_grams:
    n = i  # Number of words in each n-gram
    for j in threshold_values:
        threshold = j  # Minimum required similarity for a partial match
        true_positives = 0
        false_positives = 0
        false_negatives = 0
        partially_matched_events = []
        for event in event_matched_titles:
            # We need this dict because there can be a many to many mapping between event and event series
            matched_events_dict = {}
            matched_series = ""
            event_ngrams = set(ngrams(event.split(), n))
            for series in series_matched_titles:
                series_ngrams = set(ngrams(series.split(), n))
                '''There can be cases that series or events don't have 3 words'''
                similarity = 0
                if(len(event_ngrams.union(series_ngrams)) > 0):
                    similarity = 1 - jaccard_distance(event_ngrams, series_ngrams)

                if ((similarity >= threshold) and (event in matched_events_dict and similarity > 
                                                   matched_events_dict[event])) or (event not in matched_events_dict
                                                                                    and similarity >= threshold):
                    matched_events_dict[event] = similarity
                    matched_series = series
        
#             print("Partial match found:")
#             print(f"#####MATCHED_EVENT#####{event}")
#             print(f"######MATCHED_SERIES######{matched_series}")
#             print()
            if (matches_df.loc[matches_df['event'] == event, 'event_series'].values[0]) == matched_series:
                true_positives+=1
            elif matched_series == "":
                # We consider all the events that did not give out a match as the false negative set.
                false_negatives+=1
            else:
                false_positives+=1
                
            if series not in series_distinct:
                series_distinct.append(series)
#             print()
            partially_matched_events.append(event)

        print(f"Statistics for {n}-grams and threshold:{threshold}->")
        print()
        print("true positives: ", true_positives)
        print("false positives: ", false_positives)
        print("false negatives: ", false_negatives)
        precision = true_positives / (true_positives + false_positives)
        print("Precision: ", precision)
        recall = true_positives / (true_positives + false_negatives)
        print("Recall: ", recall)
        f1_score = 2 * (precision * recall) / (precision + recall)
        if(f1_score>max_f1_score):
            best_precision = precision
            best_recall = recall
            max_f1_score = f1_score
            max_matches = len(partially_matched_events)
            best_n_gram = n
            best_threshold = threshold
        print("F1-Score: ", f1_score)
        print("Number of partial matches: ", len(partially_matched_events))
        print()

print("Best Choice: ")
print(f"Statistics for {best_n_gram}-grams and threshold:{best_threshold}->")
print("Precision: ", best_precision)
print("Recall: ", best_recall)
print("F1-Score: ", max_f1_score)
print("Maximum number of partial matches: ", max_matches)

Statistics for 3-grams and threshold:1->

true positives:  23
false positives:  38
false negatives:  657
Precision:  0.3770491803278688
Recall:  0.033823529411764704
F1-Score:  0.06207827260458839
Number of partial matches:  718

Statistics for 3-grams and threshold:0.9->

true positives:  23
false positives:  38
false negatives:  657
Precision:  0.3770491803278688
Recall:  0.033823529411764704
F1-Score:  0.06207827260458839
Number of partial matches:  718

Statistics for 3-grams and threshold:0.8->

true positives:  63
false positives:  40
false negatives:  615
Precision:  0.6116504854368932
Recall:  0.09292035398230089
F1-Score:  0.16133162612035853
Number of partial matches:  718

Statistics for 3-grams and threshold:0.7->

true positives:  84
false positives:  44
false negatives:  590
Precision:  0.65625
Recall:  0.12462908011869436
F1-Score:  0.20947630922693267
Number of partial matches:  718

Statistics for 3-grams and threshold:0.6->

true positives:  158
false positives:  49
f

### Wikidata

In [23]:
n = 3  # Number of words in each n-gram

threshold = 0.5  # Minimum required similarity for a partial match
    
partially_matched_events = []
for event in event_titles:
    matched_events_dict = {}
    matched_series = ""
    event_ngrams = set(ngrams(event.split(), n))
    for series in series_titles:
        series_ngrams = set(ngrams(series.split(), n))
        '''There can be cases that series or events don't have 3 words'''
        if(len(event_ngrams.union(series_ngrams)) > 0):
            similarity = 1 - jaccard_distance(event_ngrams, series_ngrams)
            
        if ((similarity >= threshold) and (event in matched_events_dict and similarity > 
                                   matched_events_dict[event])) or (event not in matched_events_dict
                                                                    and similarity >= threshold):
            matched_events_dict[event] = similarity
            matched_series = series
    print("Partial match found:")
    print(f"#####EVENT#####{event}")
    print(f"######SERIES######{matched_series}")
#     if series not in series_distinct:
#         series_distinct.append(series)
    print()
    if(matched_series!=""):
        partially_matched_events.append(event)

Partial match found:
#####EVENT#####2nd International Conference Multilingual Digital Terminology Today MDTT 2023
######SERIES######

Partial match found:
#####EVENT#####35th International Conference Advanced Information Systems Engineering
######SERIES######

Partial match found:
#####EVENT#####Research Projects Exhibition Papers Presented 35th International Conference Advanced Information Systems Engineering CAiSE 2023
######SERIES######

Partial match found:
#####EVENT#####Workshops Work Progress Demos Doctoral Consortium IS EUD 2023
######SERIES######

Partial match found:
#####EVENT#####Modern Machine Learning Technologies Data Science Workshop MoMLeT DS 2023
######SERIES######

Partial match found:
#####EVENT#####International Symposium Securing NextGeneration Systems Using Future Artificial Intelligence Technologies
######SERIES######

Partial match found:
#####EVENT#####Technology Enhanced Learning Laboratories Workshop TELL 2023
######SERIES######

Partial match found:
#####EV

Partial match found:
#####EVENT#####Workshop information technology scientific computing framework X International Conference Information Telecommunication Technologies Mathematical Modeling High Tech Systems
######SERIES######

Partial match found:
#####EVENT#####Workshops EDBT ICDT 2020 Joint Conference
######SERIES######

Partial match found:
#####EVENT#####XV International Conference New Educational Strategies Modern Information Space
######SERIES######

Partial match found:
#####EVENT#####CrossMMLA practice Collecting annotating analyzing multimodal data across spaces
######SERIES######

Partial match found:
#####EVENT#####REFSQ 2020 Workshops Doctoral Symposium Live Studies Track Poster Track
######SERIES######

Partial match found:
#####EVENT#####AAAI 2020 Spring Symposium Combining Machine Learning Knowledge Engineering Practice AAAI MAKE 2020
######SERIES######

Partial match found:
#####EVENT#####AAAI 2020 Spring Symposium Combining Artificial Intelligence Machine Learning Ph

Partial match found:
#####EVENT#####5th Workshop Society Privacy Semantic Web Policy Technology
######SERIES######

Partial match found:
#####EVENT#####Second Workshop Semantic Web Technologies Internet Things
######SERIES######

Partial match found:
#####EVENT#####First Workshop Enabling Open Semantic Science
######SERIES######

Partial match found:
#####EVENT#####3rd Spring School Networks
######SERIES######

Partial match found:
#####EVENT#####3rd Ural Workshop Parallel Distributed Cloud Computing Young Scientists
######SERIES######

Partial match found:
#####EVENT#####NATO IST 152 Workshop Intelligent Autonomous Agents Cyber Defence Resilience
######SERIES######

Partial match found:
#####EVENT#####11th Turkish National Software Engineering Symposium
######SERIES######

Partial match found:
#####EVENT#####Second Workshop Mining Scientific Papers Computational Linguistics Bibliometrics CLBib 2017
######SERIES######

Partial match found:
#####EVENT#####Positive Gaming Workshop Gamifi

Partial match found:
#####EVENT#####CLEF 2016 Conference Labs Evaluation forum
######SERIES######

Partial match found:
#####EVENT#####26th International Conference Inductive Logic Programming
######SERIES######

Partial match found:
#####EVENT#####CBI 2016 Industrial Track
######SERIES######

Partial match found:
#####EVENT#####2nd Workshop Artificial Intelligence Internet Things
######SERIES######

Partial match found:
#####EVENT#####1st Workshop Ethics Design Intelligent Agents
######SERIES######

Partial match found:
#####EVENT#####Geospatial Sensor Webs Conference 2016
######SERIES######

Partial match found:
#####EVENT#####International Conference Mathematical Information Technologies
######SERIES######

Partial match found:
#####EVENT#####Joint International Conference Biological Ontology BioCreative
######SERIES######

Partial match found:
#####EVENT#####CICM 2016 Doctoral Program
######SERIES######

Partial match found:
#####EVENT#####Workshop Formal Mathematics Mathematicians

Partial match found:
#####EVENT#####first international Workshop Semantics Biodiversity
######SERIES######

Partial match found:
#####EVENT#####Workshop Social Media Linked Data Emergency Response
######SERIES######

Partial match found:
#####EVENT#####First Workshop Services Applications Linked APIs Data
######SERIES######

Partial match found:
#####EVENT#####2nd Workshop Artificial Intelligence meets Web Data AImWD2013
######SERIES######

Partial match found:
#####EVENT#####Framing Digital Curation Curriculum Conference
######SERIES######

Partial match found:
#####EVENT#####1st International IFIP Working Conference Value Driven Social Semantics Collective Intelligence
######SERIES######

Partial match found:
#####EVENT#####CHI2013 Workshop Replication HCI Research
######SERIES######

Partial match found:
#####EVENT#####13th Dutch Belgian Workshop Information Retrieval
######SERIES######

Partial match found:
#####EVENT#####24th Midwest Artificial Intelligence Cognitive Science Confe

Partial match found:
#####EVENT#####PhD Symposium Joint ICSOC ServiceWave 2009 Conference
######SERIES######

Partial match found:
#####EVENT#####EPK 2009 Geschäftsprozessmanagement mit Ereignisgesteuerten Prozessketten
######SERIES######

Partial match found:
#####EVENT#####UGS2009 1st Workshop User generated Services
######SERIES######

Partial match found:
#####EVENT#####TIA09 Workshop1 Du thème au terme Emergence et lexicalisation des connaissances
######SERIES######

Partial match found:
#####EVENT#####TIA09 Workshop2 Acquisition et modélisation de relations sémantiques
######SERIES######

Partial match found:
#####EVENT#####8th International Conference Terminology Artificial Intelligence
######SERIES######

Partial match found:
#####EVENT#####Second Workshop Agreement Technologies
######SERIES######

Partial match found:
#####EVENT#####ER 2009 PhD Colloquium
######SERIES######

Partial match found:
#####EVENT#####3rd International RuleML 2009 Challenge
######SERIES######

Partial

Partial match found:
#####EVENT#####OAS 03 Ontologies Agent Systems
######SERIES######

Partial match found:
#####EVENT#####CAiSE 03 Forum
######SERIES######

Partial match found:
#####EVENT#####CAiSE 03
######SERIES######

Partial match found:
#####EVENT#####PGLDB 2003 PGL DataBase Research Conference
######SERIES######

Partial match found:
#####EVENT#####Knowledge Management Philosophy
######SERIES######

Partial match found:
#####EVENT#####GWEM 2003 German Workshop Experience Management
######SERIES######

Partial match found:
#####EVENT#####WOW2003 Workshop Ontologie basiertes Wissensmanagement
######SERIES######

Partial match found:
#####EVENT#####Bildverarbeitung für die Medizin 2003
######SERIES######

Partial match found:
#####EVENT#####AWRE02 Seventh Australian Workshop Requirements Engineering
######SERIES######

Partial match found:
#####EVENT#####ADIS 2002 Apoyo la Decisión en Ingeniería del Software Decision Support Software Engineering
######SERIES######

Partial match 

In [24]:
print("Number of partial matches: ", len(partially_matched_events))

Number of partial matches:  9


In [25]:
event_titles = [event for event in event_titles if event not in partially_matched_events]
len(event_titles)

2199

### Existing matches

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
import numpy as np

series_matched_titles = matches_df["event_series"].tolist()
event_matched_titles = matches_df["event"].tolist()

vectorizer = TfidfVectorizer(stop_words = text.ENGLISH_STOP_WORDS)
array1_strings = event_matched_titles
array2_strings = series_matched_titles
tfidf_matrix = vectorizer.fit_transform(event_matched_titles + series_matched_titles)

# Calculate cosine similarity between array1 and array2
similarity_matrix = cosine_similarity(tfidf_matrix[:len(array1_strings)], tfidf_matrix[len(array1_strings):])


# Threshold for partial match
threshold_values = [0.5, 0.6, 0.7, 0.8, 0.9]
# threshold_values = [0.9]

# Find partial matches\
for threshold in threshold_values:
    matches = []
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    num_rows = len(similarity_matrix)
    num_cols = len(similarity_matrix[0])
    for row in range(num_rows):
        score = threshold
        new_row = -1
        new_col = -1
        for col in range(num_cols):
            element = similarity_matrix[row][col]
            if(element>=score):
                score = element
                new_row = row
                new_col = col
        if(new_row !=-1 and new_col !=-1):
            matches.append([new_row, new_col])
#     print(matches)          



    # matches = np.argwhere(similarity_matrix >= threshold)

    ctr = 0
    partially_matched_events = []

    # Print partial matches
    for match in matches:
        array1_index = match[0]
        array2_index = match[1]
#         print("Partial match found:")
#         print(f"#####EVENT#####{array1_strings[array1_index]}")
#         print(f"######SERIES######{array2_strings[array2_index]}")


        if (matches_df.loc[matches_df['event'] == array1_strings[array1_index], 'event_series'].values[0]) == array2_strings[array2_index]:
    #         if(matched_events_dict[array1_strings[array1_index]] >)
            true_positives+=1
        else:
            false_positives+=1

        if series not in series_distinct:
            series_distinct.append(series)
#         print()
        partially_matched_events.append(array1_strings[array1_index])

    # Series not matched to any event
#     false_negatives = len(series_titles) - len(series_distinct)
    false_negatives = len(event_matched_titles) - (true_positives + false_positives)
    
    print("Results with threshold: ", threshold)
    print("true positives: ", true_positives)
    print("false positives: ", false_positives)
    print("false negatives: ", false_negatives)
    precision = true_positives / (true_positives + false_positives)
    print("Precision: ", precision)
    recall = true_positives / (true_positives + false_negatives)
    print("Recall: ", recall)
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("F1-Score: ", f1_score)
    print("Number of partial matches: ", len(partially_matched_events))
    print()

Results with threshold:  0.5
true positives:  357
false positives:  91
false negatives:  270
Precision:  0.796875
Recall:  0.569377990430622
F1-Score:  0.6641860465116278
Number of partial matches:  448

Results with threshold:  0.6
true positives:  302
false positives:  68
false negatives:  348
Precision:  0.8162162162162162
Recall:  0.4646153846153846
F1-Score:  0.592156862745098
Number of partial matches:  370

Results with threshold:  0.7
true positives:  219
false positives:  61
false negatives:  438
Precision:  0.7821428571428571
Recall:  0.3333333333333333
F1-Score:  0.4674493062966915
Number of partial matches:  280

Results with threshold:  0.8
true positives:  149
false positives:  52
false negatives:  517
Precision:  0.7412935323383084
Recall:  0.22372372372372373
F1-Score:  0.3437139561707036
Number of partial matches:  201

Results with threshold:  0.9
true positives:  74
false positives:  45
false negatives:  599
Precision:  0.6218487394957983
Recall:  0.1099554234769688


### Wikidata

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
import numpy as np
    
vectorizer = TfidfVectorizer(stop_words = text.ENGLISH_STOP_WORDS)
array1_strings = event_titles
array2_strings = series_titles
tfidf_matrix = vectorizer.fit_transform(event_titles + series_titles)

# Calculate cosine similarity between array1 and array2
similarity_matrix = cosine_similarity(tfidf_matrix[:len(array1_strings)], tfidf_matrix[len(array1_strings):])

# Threshold for partial match
threshold = 0.5

# Find partial matches
matches = np.argwhere(similarity_matrix >= threshold)

ctr = 0
partially_matched_events = []
# Print partial matches
for match in matches:
    array1_index = match[0]
    array2_index = match[1]
#     print("Partial match found:")
#     print(f"#####EVENT#####{array1_strings[array1_index]}")
#     print(f"######SERIES######{array2_strings[array2_index]}")
    if series not in series_distinct:
        series_distinct.append(series)
#     print()
    partially_matched_events.append(array1_strings[array1_index])

In [28]:
print("Number of partial matches: ", len(partially_matched_events))

Number of partial matches:  100


In [29]:
event_titles = [event for event in event_titles if event not in partially_matched_events]
len(event_titles)

2105

## Trying different algorithms using feature vectors

In [30]:
matches_modified_df = matches_df.copy()

In [31]:
import pandas as pd
import nltk
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Step 2: Feature Extraction (TF-IDF)
def extract_features(texts):
    vectorizer = TfidfVectorizer()
    feature_vectors = vectorizer.fit_transform(texts)
    return feature_vectors, vectorizer

# Step 4: Training the Logistic Regression Model
def train_logistic_regression(features, labels):
    model = LogisticRegression()
    model.fit(features, labels)
    return model

def train_svm(features, labels):
    model = SVC(probability=True)
    model.fit(features, labels)
    return model

def train_random_forest(features, labels):
    model = RandomForestClassifier()
    model.fit(features, labels)
    return model

# Step 5: Matching
def perform_matching(new_text_pair, model, vectorizer):
    preprocessed_text_pair = [preprocess_text(text) for text in new_text_pair]
    feature_vectors = vectorizer.transform(preprocessed_text_pair)
    probabilities = model.predict_proba(feature_vectors)[:, 1]  # Get the probability of a match (class 1)
    return probabilities

# Step 3: Prepare Training Data
# texts: List of paired texts from Column1
# labels: List of corresponding labels from Column2 (1 or 0)
texts = matches_modified_df['event'].tolist()
labels = matches_modified_df['event_series'].tolist()

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)

# Extract features using TF-IDF
train_features, vectorizer = extract_features(X_train)


###LogisticRegression###
# Train the logistic regression model
model = train_logistic_regression(train_features, y_train)
# Evaluate the model (optional)
test_features = vectorizer.transform(X_test)
predictions = model.predict(test_features)
accuracy = accuracy_score(y_test, predictions)
print("Logistic Regression:")
print("Accuracy:", accuracy)

# Print predictions and actual values
results = pd.DataFrame({'Predictions': predictions, 'Actual Values': y_test})
# print(results)

# Calculate precision and recall
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
print("Precision:", precision)
print("Recall:", recall)




###SVM###
# Train the logistic regression model
model = train_svm(train_features, y_train)
# Evaluate the model (optional)
test_features = vectorizer.transform(X_test)
predictions = model.predict(test_features)
accuracy = accuracy_score(y_test, predictions)
print("SVM:")
print("Accuracy:", accuracy)

# Print predictions and actual values
results = pd.DataFrame({'Predictions': predictions, 'Actual Values': y_test})
# print(results)

# Calculate precision and recall
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
print("Precision:", precision)
print("Recall:", recall)





###RF###
# Train the logistic regression model
model = train_random_forest(train_features, y_train)
# Evaluate the model (optional)
test_features = vectorizer.transform(X_test)
predictions = model.predict(test_features)
accuracy = accuracy_score(y_test, predictions)
print("RF:")
print("Accuracy:", accuracy)

# Print predictions and actual values
results = pd.DataFrame({'Predictions': predictions, 'Actual Values': y_test})
# print(results)

# Calculate precision and recall
precision = precision_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')
print("Precision:", precision)
print("Recall:", recall)


Logistic Regression:
Accuracy: 0.35185185185185186
Precision: 0.23418606938992245
Recall: 0.35185185185185186


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM:
Accuracy: 0.41203703703703703
Precision: 0.36808278867102395
Recall: 0.41203703703703703


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RF:
Accuracy: 0.625
Precision: 0.5438400205761317
Recall: 0.625


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", patterns)

doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Angela Merkel
Barack Obama
Washington, D.C.
