In [1]:
import nltk
from nltk import ngrams
import json, pickle
import os
import spacy
import numpy as np
import pandas as pd
from nltk.metrics.distance import jaccard_distance
from spacy.matcher import PhraseMatcher

In [2]:
events_with_series_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "events_with_ordinal.json")
events_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "events_without_matches.json")
series_file = os.path.join(os.path.abspath("../eventseries/src/main/resources"), "event_series.json")

In [3]:
with open(events_with_series_file) as file:
    events_with_existing_series = json.load(file)
len(events_with_existing_series)

3525

In [4]:
events_with_existing_series = [event for event in events_with_existing_series if 'series' in event.keys()]
len(events_with_existing_series)

320

In [5]:
events_with_existing_series = [event for event in events_with_existing_series if 'title' in event.keys()]
len(events_with_existing_series)

279

### Since we have only 278 entries with titles that have existing matches, we are not treating this dataset as our training corpus. This leads to dealing with metrics of accuracy only, and not precision, recall and f1-score

In [6]:
with open(events_file) as file:
    events = json.load(file)
len(events)

3163

In [7]:
event_titles = [item['title'] for item in events if 'title' in item]
event_labels = [item['eventLabel'] for item in events if 'eventLabel' in item]

In [8]:
print("Event titles: ", len(event_titles))
print("Event labels: ", len(event_labels))

Event titles:  3121
Event labels:  3163


In [9]:
with open(series_file) as file:
    series = json.load(file)
    series_titles = [item["title"]["value"] for item in series["results"]["bindings"] if "title" in item]
len(series_titles)

119

### It has been usually observed that the event titles are larger than the series title
Example - 
EVENT - Sixth International Workshop on Computer Modeling and Intelligent Systems (CMIS 2023) <br>
EVENT_SERIES - International Workshop on Computer Modeling and Intelligent Systems <br>
<br>
EVENT - 5th International Conference on Recent Trends and Applications in Computer Science and Information Technology <br>
EVENT_SERIES - International Conference on Recent Trends and Applications in Computer Science and Information Technology

#### So we take the series as the phrases to be matched with the events - containment matches

In [10]:
# Capturing all the distinct series
series_distinct = []

In [11]:
event_titles

['2nd International Conference on Multilingual Digital Terminology Today (MDTT 2023)',
 'Workshop on Algorithms & Theories for the Analysis of Event Data and the International Workshop on Petri Nets for Twin Transition',
 'Third International Workshop on Artificial Intelligence and Intelligent Assistance for Legal Professionals in the Digital Workplace',
 'Second International Workshop on Agile Methods for Information Systems Engineering (Agil-ISE 2023)',
 '35th International Conference on Advanced Information Systems Engineering',
 'Research Projects Exhibition Papers Presented at the 35th International Conference on Advanced Information Systems Engineering (CAiSE 2023)',
 'Workshops, Work in Progress Demos and Doctoral Consortium at the IS-EUD 2023',
 'Modern Machine Learning Technologies and Data Science Workshop (MoMLeT&DS 2023)',
 'Second International Workshop on Linked Data-driven Resilience Research 2023',
 'First International Workshop on Semantic Web on Constrained Things',
 

In [12]:
len(series_titles)

119

In [13]:
matches_df = pd.read_json("/Users/ayan/Projects/KGLab/main/CEUR-WS-Event-Series--SS23/eventseries/src/main/resources/all_matches.json")
matches_df.dropna(inplace=True)
matches_df

Unnamed: 0,event,event_series
0,Tunisian-Algerian Joint Conference on Applied ...,Tunisian-Algerian Joint Conference on Applied ...
1,Computational Humanities Research Conference 2022,Conference on Computational Humanities Researc...
2,23rd Italian Conference on Theoretical Compute...,Italian Conference on Theoretical Computer Sci...
3,CLEF 2022 - Conference and Labs of the Evaluat...,Conference and Labs of the Evaluation Forum (C...
4,International Conference on Logic Programming ...,International Conference on Logic Programming ...
...,...,...
714,29th International Workshop on Description Logics,29th International Workshop on Description Logics
715,Design and Management of Data Warehouses,Design and Management of Data Warehouses
716,Workshop on Linked Data on the Web,Workshop on Linked Data on the Web
717,Workshop on Context-Aware Adaptation of Servic...,Workshop on Context-Aware Adaptation of Servic...


## Phrase Matching

In [15]:
import en_core_web_sm
nlp = en_core_web_sm.load()

### Existing matches

In [16]:
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# Only run nlp.make_doc to speed things up
series_titles = matches_df["event_series"].tolist() 
patterns = [nlp.make_doc(text) for text in series_titles]
event_titles = matches_df["event"].tolist()

matcher.add("Event_EventSeries_Matcher", patterns)

true_positives = 0
false_positives = 0
false_negatives = 0

matching_events = []
for event in event_titles:
    doc = nlp(event)    
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        if event not in matching_events:
            matching_events.append(event)
        if span.text not in series_distinct:
            series_distinct.append(span.text)
        if (matches_df.loc[matches_df['event'] == event, 'event_series'].values[0]) == span.text:
            true_positives+=1
        else:
            false_positives+=1
#         print(f"Series: '{span.text}' Event: '{event}'")

# We consider all the events that did not give out a match as the false negative set.
false_negatives = len(event_titles) - (true_positives + false_positives)

# print("true positives: ", true_positives)
# print("false positives: ", false_positives)
# print("false negatives: ", false_negatives)
precision = true_positives / (true_positives + false_positives)
print("Precision: ", precision)
recall = true_positives / (true_positives + false_negatives)
print("Recall: ", recall)
f1_score = 2 * (precision * recall) / (precision + recall)
print("F1-Score: ", f1_score)

print("Number of containment matches from event titles: ", len(matching_events))

Precision:  0.6406685236768802
Recall:  0.3904923599320883
F1-Score:  0.48523206751054854
Number of containment matches from event titles:  284


### Wikidata

In [17]:
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# Only run nlp.make_doc to speed things up
event_titles = [item['title'] for item in events if 'title' in item]
with open(series_file) as file:
    series = json.load(file)
    series_titles = [item["title"]["value"] for item in series["results"]["bindings"] if "title" in item]

patterns = [nlp.make_doc(text) for text in series_titles]
matcher.add("Event_EventSeries_Matcher", patterns)

matching_events = []
for event in event_titles:
    doc = nlp(event)    
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        if event not in matching_events:
            matching_events.append(event)
        if span.text not in series_distinct:
            series_distinct.append(span.text)
#         print(f"Series: '{span.text}' Event: '{event}'")
print("Number of containment matches from event titles: ", len(matching_events))

Number of containment matches from event titles:  142


In [18]:
len(event_titles)

3121

In [19]:
len(series_titles)

119

In [20]:
accuracy = (len(matching_events) / len(events))*100 
print(f"Accuracy = {accuracy}")

Accuracy = 4.489408789124249


In [21]:
event_without_matches = [event for event in events if event not in matching_events]
with open('events_without_containment_matches.json', 'w') as fp:
    json.dump(event_without_matches, fp)

In [22]:
event_titles = [event for event in event_titles if event not in matching_events]
len(event_titles)

2979

In [23]:
if '' in event_titles:
    print(True)

## N-grams

In [24]:
import nltk
from nltk.corpus import stopwords
# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Get the set of stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text_list):

    # Remove stopwords from each string in the list
    filtered_list = []
    for text in text_list:
        # Tokenize the string into individual words
        words = nltk.word_tokenize(text)
        # Remove stopwords from the list of words
        filtered_words = [word for word in words if word.lower() not in stop_words]
        # Join the filtered words back into a string
        filtered_text = ' '.join(filtered_words)
        # Add the filtered string to the filtered list
        filtered_list.append(filtered_text)
    return filtered_list

[nltk_data] Downloading package stopwords to /Users/ayan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ayan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Existing matches

In [25]:
series_titles = remove_stopwords(matches_df["event_series"].tolist())
matches_df["event_series"] = series_titles
event_titles = remove_stopwords(matches_df["event"].tolist())
matches_df["event"] = event_titles

In [26]:
matches_df["event_series"]

0      Tunisian-Algerian Joint Conference Applied Com...
1      Conference Computational Humanities Research (...
2      Italian Conference Theoretical Computer Scienc...
3              Conference Labs Evaluation Forum ( CLEF )
4      International Conference Logic Programming ( I...
                             ...                        
714       29th International Workshop Description Logics
715                    Design Management Data Warehouses
716                             Workshop Linked Data Web
717    Workshop Context-Aware Adaptation Service Fron...
718    Joint Workshop Interfaces Human Decision Makin...
Name: event_series, Length: 718, dtype: object

In [27]:
series_titles = matches_df["event_series"].tolist()
event_titles = matches_df["event"].tolist()
n_grams = [3, 4, 5]
threshold_values = [1, 0.9, 0.8, 0.7, 0.6, 0.5]
# n_grams = [2]
# threshold_values = [0.1, 0.01, 0.001, 0.00001]

max_f1_score = 0
max_matches = 0
best_precision = 0
best_recall = 0
best_n_gram = 0
best_threshold = 0


for i in n_grams:
    n = i  # Number of words in each n-gram
    for j in threshold_values:
        threshold = j  # Minimum required similarity for a partial match
        true_positives = 0
        false_positives = 0
        false_negatives = 0
        partially_matched_events = []
        for event in event_titles:
            # We need this dict because there can be a many to many mapping between event and event series
            matched_events_dict = {}
            matched_series = ""
            event_ngrams = set(ngrams(event.split(), n))
            for series in series_titles:
                series_ngrams = set(ngrams(series.split(), n))
                '''There can be cases that series or events don't have 3 words'''
                similarity = 0
                if(len(event_ngrams.union(series_ngrams)) > 0):
                    similarity = 1 - jaccard_distance(event_ngrams, series_ngrams)

                if ((similarity >= threshold) and (event in matched_events_dict and similarity > 
                                                   matched_events_dict[event])) or (event not in matched_events_dict
                                                                                    and similarity >= threshold):
                    matched_events_dict[event] = similarity
                    matched_series = series
        
#             print("Partial match found:")
#             print(f"#####MATCHED_EVENT#####{event}")
#             print(f"######MATCHED_SERIES######{matched_series}")
#             print()
            if (matches_df.loc[matches_df['event'] == event, 'event_series'].values[0]) == matched_series:
                true_positives+=1
            elif matched_series == "":
                # We consider all the events that did not give out a match as the false negative set.
                false_negatives+=1
            else:
                false_positives+=1
                
            if series not in series_distinct:
                series_distinct.append(series)
#             print()
            partially_matched_events.append(event)

        print(f"Statistics for {n}-grams and threshold:{threshold}->")
        print()
        print("true positives: ", true_positives)
        print("false positives: ", false_positives)
        print("false negatives: ", false_negatives)
        precision = true_positives / (true_positives + false_positives)
        print("Precision: ", precision)
        recall = true_positives / (true_positives + false_negatives)
        print("Recall: ", recall)
        f1_score = 2 * (precision * recall) / (precision + recall)
        if(f1_score>max_f1_score):
            best_precision = precision
            best_recall = recall
            max_f1_score = f1_score
            max_matches = len(partially_matched_events)
            best_n_gram = n
            best_threshold = threshold
        print("F1-Score: ", f1_score)
        print("Number of partial matches: ", len(partially_matched_events))
        print()

print("Best Choice: ")
print(f"Statistics for {best_n_gram}-grams and threshold:{best_threshold}->")
print("Precision: ", best_precision)
print("Recall: ", best_recall)
print("F1-Score: ", max_f1_score)
print("Maximum number of partial matches: ", max_matches)

Statistics for 3-grams and threshold:1->

true positives:  23
false positives:  38
false negatives:  657
Precision:  0.3770491803278688
Recall:  0.033823529411764704
F1-Score:  0.06207827260458839
Number of partial matches:  718

Statistics for 3-grams and threshold:0.9->

true positives:  23
false positives:  38
false negatives:  657
Precision:  0.3770491803278688
Recall:  0.033823529411764704
F1-Score:  0.06207827260458839
Number of partial matches:  718

Statistics for 3-grams and threshold:0.8->

true positives:  55
false positives:  40
false negatives:  623
Precision:  0.5789473684210527
Recall:  0.08112094395280237
F1-Score:  0.1423027166882277
Number of partial matches:  718

Statistics for 3-grams and threshold:0.7->

true positives:  71
false positives:  44
false negatives:  603
Precision:  0.6173913043478261
Recall:  0.10534124629080119
F1-Score:  0.1799746514575412
Number of partial matches:  718

Statistics for 3-grams and threshold:0.6->

true positives:  120
false positiv

In [28]:
len(event_titles)

718

In [29]:
event_ngrams

{('Human', 'Decision', 'Making', 'Recommender', 'Systems'),
 ('Interfaces', 'Human', 'Decision', 'Making', 'Recommender'),
 ('Joint', 'Workshop', 'Interfaces', 'Human', 'Decision'),
 ('Workshop', 'Interfaces', 'Human', 'Decision', 'Making')}

In [30]:
series_ngrams

{('Human', 'Decision', 'Making', 'Recommender', 'Systems'),
 ('Interfaces', 'Human', 'Decision', 'Making', 'Recommender'),
 ('Joint', 'Workshop', 'Interfaces', 'Human', 'Decision'),
 ('Workshop', 'Interfaces', 'Human', 'Decision', 'Making')}

In [31]:
event_ngrams.union(series_ngrams)

{('Human', 'Decision', 'Making', 'Recommender', 'Systems'),
 ('Interfaces', 'Human', 'Decision', 'Making', 'Recommender'),
 ('Joint', 'Workshop', 'Interfaces', 'Human', 'Decision'),
 ('Workshop', 'Interfaces', 'Human', 'Decision', 'Making')}

In [32]:
similarity = 1 - jaccard_distance(event_ngrams, series_ngrams)

In [33]:
similarity

1.0

In [34]:
event_ngrams&series_ngrams

{('Human', 'Decision', 'Making', 'Recommender', 'Systems'),
 ('Interfaces', 'Human', 'Decision', 'Making', 'Recommender'),
 ('Joint', 'Workshop', 'Interfaces', 'Human', 'Decision'),
 ('Workshop', 'Interfaces', 'Human', 'Decision', 'Making')}

In [35]:
jaccard_distance(event_ngrams, series_ngrams)

0.0

In [36]:
len(event_ngrams&series_ngrams)/ len(event_ngrams.union(series_ngrams))

1.0

In [37]:
matches_df[matches_df['event'] == 'Eleventh Conference on Semantic Technology for Intelligence, Defense, and Security']

Unnamed: 0,event,event_series


In [38]:
len(matched_events_dict)

1

### Wikidata

In [39]:
n = 3  # Number of words in each n-gram

threshold = 0.5  # Minimum required similarity for a partial match

# event_titles = [item['title'] for item in events if 'title' in item]
# with open(series_file) as file:
#     series = json.load(file)
#     series_titles = [item["title"]["value"] for item in series["results"]["bindings"] if "title" in item]
    
partially_matched_events = []
for event in event_titles:
    matched_events_dict = {}
    matched_series = ""
    event_ngrams = set(ngrams(event.split(), n))
    for series in series_titles:
        series_ngrams = set(ngrams(series.split(), n))
        '''There can be cases that series or events don't have 3 words'''
        if(len(event_ngrams.union(series_ngrams)) > 0):
            similarity = 1 - jaccard_distance(event_ngrams, series_ngrams)
            
        if ((similarity >= threshold) and (event in matched_events_dict and similarity > 
                                   matched_events_dict[event])) or (event not in matched_events_dict
                                                                    and similarity >= threshold):
            matched_events_dict[event] = similarity
            matched_series = series
    print("Partial match found:")
    print(f"#####EVENT#####{event}")
    print(f"######SERIES######{matched_series}")
#     if series not in series_distinct:
#         series_distinct.append(series)
    print()
    if(matched_series!=""):
        partially_matched_events.append(event)

Partial match found:
#####EVENT#####Tunisian-Algerian Joint Conference Applied Computing ( TACC 2022 )
######SERIES######Tunisian-Algerian Joint Conference Applied Computing ( TACC )

Partial match found:
#####EVENT#####Computational Humanities Research Conference 2022
######SERIES######

Partial match found:
#####EVENT#####23rd Italian Conference Theoretical Computer Science
######SERIES######Italian Conference Theoretical Computer Science

Partial match found:
#####EVENT#####CLEF 2022 - Conference Labs Evaluation Forum
######SERIES######

Partial match found:
#####EVENT#####International Conference Logic Programming 2022 Workshops
######SERIES######

Partial match found:
#####EVENT#####2nd Joint Conference Information Retrieval Communities Europe ( CIRCLE 2022 )
######SERIES######Joint Conference Information Retrieval Communities Europe ( CIRCLE )

Partial match found:
#####EVENT#####Eighth Italian Conference Computational Linguistics
######SERIES######Italian Conference Computationa

Partial match found:
#####EVENT#####AI4Narratives — Workshop Artificial Intelligence Narratives conjunction 29th International Joint Conference Artificial Intelligence 17th Pacific Rim International Conference Artificial Intelligence
######SERIES######

Partial match found:
#####EVENT#####XX International Scientific Practical Conference `` Information Technologies Security '' ( 2020 )
######SERIES######

Partial match found:
#####EVENT#####12th Majorov International Conference Software Engineering Computer Systems ( MICSECS 2020 )
######SERIES######Majorov International Conference Software Engineering Computer Systems ( MICSECS )

Partial match found:
#####EVENT#####28th Irish Conference Artificial Intelligence Cognitive Science
######SERIES######Irish Conference Artificial Intelligence Cognitive Science ( AICS )

Partial match found:
#####EVENT#####7th International Conference `` Information Technology Interactions '' ( & I-2020 )
######SERIES######International Conference `` Informat

Partial match found:
#####EVENT#####Digital Humanities Nordic Countries 3rd Conference
######SERIES######

Partial match found:
#####EVENT#####Fourth Italian Conference Computational Linguistics ( CLiC-it 2017 )
######SERIES######Fourth Italian Conference Computational Linguistics ( CLiC-it 2017 )

Partial match found:
#####EVENT#####25th Irish Conference Artificial Intelligence Cognitive Science
######SERIES######Irish Conference Artificial Intelligence Cognitive Science ( AICS )

Partial match found:
#####EVENT#####7th International Symposium Data-driven Process Discovery Analysis ( SIMPDA 2017 )
######SERIES######

Partial match found:
#####EVENT#####XVII International Scientific Practical Conference Information Technologies Security ( 2017 )
######SERIES######

Partial match found:
#####EVENT#####Doctoral Consortium Industry Track Papers presented 10th IFIP WG 8.1 Working Conference Practice Enterprise Modelling
######SERIES######

Partial match found:
#####EVENT#####Second Confere

Partial match found:
#####EVENT#####XVI All-Russian Scientific Conference `` Digital libraries : Advanced Methods Technologies , Digital Collections ''
######SERIES######

Partial match found:
#####EVENT#####Poster Proceedings 8th ACM Conference Recommender Systems
######SERIES######

Partial match found:
#####EVENT#####Doctoral Symposium MODELS'14
######SERIES######

Partial match found:
#####EVENT#####MODELS Educators Symposium
######SERIES######

Partial match found:
#####EVENT#####main track 14th Conference Information Technologies – Applications Theory ( ITAT 2014 ) , selected papers Znalosti 2014
######SERIES######

Partial match found:
#####EVENT#####15th Italian Conference Theoretical Computer Science
######SERIES######15th Italian Conference Theoretical Computer Science

Partial match found:
#####EVENT#####CLEF 2014 Conference
######SERIES######

Partial match found:
#####EVENT#####8th Turkish National Software Engineering Symposium
######SERIES######

Partial match found:
###

Partial match found:
#####EVENT#####Third Conference Digital Curation Technologies ( Qurator 2022 )
######SERIES######

Partial match found:
#####EVENT#####37th Italian Conference Computational Logic
######SERIES######Italian Conference Computational Logic

Partial match found:
#####EVENT#####Sixteenth International Conference Concept Lattices Applications ( CLA 2022 )
######SERIES######

Partial match found:
#####EVENT#####1st International Workshop Knowledge Graph Generation Text
######SERIES######Workshop Knowledge Graph Generation Text

Partial match found:
#####EVENT#####1st International Workshop Modular Knowledge
######SERIES######

Partial match found:
#####EVENT#####3rd International Workshop Knowledge Graph Construction ( KGCW 2022 )
######SERIES######

Partial match found:
#####EVENT#####13th International SWAT4HCLS conference
######SERIES######

Partial match found:
#####EVENT#####Deep Learning Knowledge Graphs 2021
######SERIES######Workshop Deep Learning Knowledge Graphs


Partial match found:
#####EVENT#####9th PhD Symposium Future Directions Information Access co-located 12th European Summer School Information Retrieval ( ESSIR 2019 ) , Milan , Italy , July 17th - - 18th , 2019
######SERIES######

Partial match found:
#####EVENT#####1st International Workshop Information , Computation , Control Systems Distributed Environments , ICCS-DE 2019 , Irkutsk , Russia , July 8-9 , 2019
######SERIES######

Partial match found:
#####EVENT#####Seminar Series Advanced Techniques & Tools Software Evolution ( SATTOSE 2019 ) , Bolzano , Italy , July 8-10 Day , 2019
######SERIES######

Partial match found:
#####EVENT#####20th Workshop `` Objects Agents '' , Parma , Italy , June 26th-28th , 2019
######SERIES######

Partial match found:
#####EVENT#####34th Italian Conference Computational Logic
######SERIES######Italian Conference Computational Logic

Partial match found:
#####EVENT#####4th Swiss Text Analytics Conference , SwissText 2019 , Winterthur , Switzerland , Ju

Partial match found:
#####EVENT#####9th Workshop Linked Data Web
######SERIES######Workshop Linked Data Web

Partial match found:
#####EVENT#####6th Workshop 'Making Sense Microposts ' co-located 25th International World Wide Web Conference ( WWW 2016 ) , Montréal , Canada , April 11 , 2016
######SERIES######

Partial match found:
#####EVENT#####5th International Workshop Bidirectional Transformations , Bx 2016 , co-located European Joint Conferences Theory Practice Software , ETAPS 2016 , Eindhoven , Netherlands , April 8 , 2016
######SERIES######

Partial match found:
#####EVENT#####1st Chinese Conference Logic Argumentation ( CLAR 2016 ) , Hangzhou , China , April 2-3 , 2016
######SERIES######

Partial match found:
#####EVENT#####Third Workshop Bibliometric-enhanced Information Retrieval co-located 38th European Conference Information Retrieval ( ECIR 2016 ) , Padova , Italy , March 20 , 2016
######SERIES######

Partial match found:
#####EVENT#####Joint REFSQ-2016 Workshops , Doctor

Partial match found:
#####EVENT#####10th CURAC ( Computer- und Roboterassistierte Chirurgie ) Annual Meeting
######SERIES######CURAC ( Computer- und Roboterassistierte Chirurgie ) Annual Meeting

Partial match found:
#####EVENT#####5th International Conference Information Communication Technologies Sustainable Agri-production Environment ( HAICTA 2011 ) , Skiathos , Greece , September 8-11 , 2011
######SERIES######

Partial match found:
#####EVENT#####First International Workshop Searching Integrating New Web Data Sources - Large Data Search , Seattle , WA , USA , September 2 , 2011
######SERIES######

Partial match found:
#####EVENT#####26th Italian Conference Computational Logic
######SERIES######Italian Conference Computational Logic

Partial match found:
#####EVENT#####5th International * Workshop 2011
######SERIES######

Partial match found:
#####EVENT#####3rd Canadian Semantic Web Symposium ( CSWS2011 ) , Vancouver , British Columbia , Canada , August 5 , 2011
######SERIES######


Partial match found:
#####EVENT#####8th Seminar Advanced Techniques Tools Software Evolution
######SERIES######8th Seminar Advanced Techniques Tools Software Evolution

Partial match found:
#####EVENT#####Seventh Seminar Advanced Techniques Tools Software Evolution
######SERIES######Seventh Seminar Advanced Techniques Tools Software Evolution

Partial match found:
#####EVENT#####32nd International Workshop Description Logics
######SERIES######32nd International Workshop Description Logics

Partial match found:
#####EVENT#####30th International Workshop Description Logics
######SERIES######30th International Workshop Description Logics

Partial match found:
#####EVENT#####29th International Workshop Description Logics
######SERIES######29th International Workshop Description Logics

Partial match found:
#####EVENT#####Design Management Data Warehouses
######SERIES######Design Management Data Warehouses

Partial match found:
#####EVENT#####Workshop Linked Data Web
######SERIES######Works

In [40]:
len(partially_matched_events)

210

In [41]:
len(event_titles)

718

In [42]:
len(series_titles)

718

In [43]:
print("Number of partial matches: ", len(partially_matched_events))

Number of partial matches:  210


In [44]:
accuracy = (len(partially_matched_events) / len(event_titles))*100 
print(f"Accuracy = {accuracy}")

Accuracy = 29.247910863509752


In [45]:
event_titles = [event for event in event_titles if event not in partially_matched_events]
len(event_titles)

508

### Existing matches

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
import numpy as np

series_titles = matches_df["event_series"].tolist()
event_titles = matches_df["event"].tolist()

vectorizer = TfidfVectorizer(stop_words = text.ENGLISH_STOP_WORDS)
array1_strings = event_titles
array2_strings = series_titles
tfidf_matrix = vectorizer.fit_transform(event_titles + series_titles)

# Calculate cosine similarity between array1 and array2
similarity_matrix = cosine_similarity(tfidf_matrix[:len(array1_strings)], tfidf_matrix[len(array1_strings):])


# Threshold for partial match
threshold_values = [0.5, 0.6, 0.7, 0.8, 0.9]
# threshold_values = [0.9]

# Find partial matches\
for threshold in threshold_values:
    matches = []
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    num_rows = len(similarity_matrix)
    num_cols = len(similarity_matrix[0])
    for row in range(num_rows):
        score = threshold
        new_row = -1
        new_col = -1
        for col in range(num_cols):
            element = similarity_matrix[row][col]
            if(element>=score):
                score = element
                new_row = row
                new_col = col
        if(new_row !=-1 and new_col !=-1):
            matches.append([new_row, new_col])
#     print(matches)          



    # matches = np.argwhere(similarity_matrix >= threshold)

    ctr = 0
    partially_matched_events = []

    # Print partial matches
    for match in matches:
        array1_index = match[0]
        array2_index = match[1]
#         print("Partial match found:")
#         print(f"#####EVENT#####{array1_strings[array1_index]}")
#         print(f"######SERIES######{array2_strings[array2_index]}")


        if (matches_df.loc[matches_df['event'] == array1_strings[array1_index], 'event_series'].values[0]) == array2_strings[array2_index]:
    #         if(matched_events_dict[array1_strings[array1_index]] >)
            true_positives+=1
        else:
            false_positives+=1

        if series not in series_distinct:
            series_distinct.append(series)
#         print()
        partially_matched_events.append(array1_strings[array1_index])

    # Series not matched to any event
#     false_negatives = len(series_titles) - len(series_distinct)
    false_negatives = len(event_titles) - (true_positives + false_positives)
    
    print("Results with threshold: ", threshold)
    print("true positives: ", true_positives)
    print("false positives: ", false_positives)
    print("false negatives: ", false_negatives)
    precision = true_positives / (true_positives + false_positives)
    print("Precision: ", precision)
    recall = true_positives / (true_positives + false_negatives)
    print("Recall: ", recall)
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("F1-Score: ", f1_score)
    print("Number of partial matches: ", len(partially_matched_events))
    print()

Results with threshold:  0.5
true positives:  357
false positives:  91
false negatives:  270
Precision:  0.796875
Recall:  0.569377990430622
F1-Score:  0.6641860465116278
Number of partial matches:  448

Results with threshold:  0.6
true positives:  302
false positives:  68
false negatives:  348
Precision:  0.8162162162162162
Recall:  0.4646153846153846
F1-Score:  0.592156862745098
Number of partial matches:  370

Results with threshold:  0.7
true positives:  219
false positives:  61
false negatives:  438
Precision:  0.7821428571428571
Recall:  0.3333333333333333
F1-Score:  0.4674493062966915
Number of partial matches:  280

Results with threshold:  0.8
true positives:  149
false positives:  52
false negatives:  517
Precision:  0.7412935323383084
Recall:  0.22372372372372373
F1-Score:  0.3437139561707036
Number of partial matches:  201

Results with threshold:  0.9
true positives:  74
false positives:  45
false negatives:  599
Precision:  0.6218487394957983
Recall:  0.1099554234769688


### Wikidata

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text
import numpy as np

# event_titles = [item['title'] for item in events if 'title' in item]
# with open(series_file) as file:
#     series = json.load(file)
#     series_titles = [item["title"]["value"] for item in series["results"]["bindings"] if "title" in item]
    
vectorizer = TfidfVectorizer(stop_words = text.ENGLISH_STOP_WORDS)
array1_strings = event_titles
array2_strings = series_titles
tfidf_matrix = vectorizer.fit_transform(event_titles + series_titles)

# Calculate cosine similarity between array1 and array2
similarity_matrix = cosine_similarity(tfidf_matrix[:len(array1_strings)], tfidf_matrix[len(array1_strings):])

# Threshold for partial match
threshold = 0.5

# Find partial matches
matches = np.argwhere(similarity_matrix >= threshold)

ctr = 0
partially_matched_events = []
# Print partial matches
for match in matches:
    array1_index = match[0]
    array2_index = match[1]
#     print("Partial match found:")
#     print(f"#####EVENT#####{array1_strings[array1_index]}")
#     print(f"######SERIES######{array2_strings[array2_index]}")
    if series not in series_distinct:
        series_distinct.append(series)
#     print()
    partially_matched_events.append(array1_strings[array1_index])

In [None]:
print("Number of partial matches: ", len(partially_matched_events))

In [None]:
accuracy = (len(partially_matched_events) / len(event_titles))*100 
print(f"Accuracy = {accuracy}")

In [None]:
len(series_distinct)

In [None]:
dblp_series_df = pd.read_json("../eventseries/src/main/resources/dblp_event_series_names.json")
dblp_series_df

## Import DBLP entries

In [None]:
from eventseries.src.main

In [None]:
filename = "/Users/ayan/Projects/KGLab/main/CEUR-WS-Event-Series--SS23/eventseries/src/main/resources/dblp_event_series.pickle"
print(filename)
try:
    with open(filename, 'rb') as file:
        entries = pickle.load(file)
except (FileNotFoundError, pickle.UnpicklingError):
    print("Not found")

In [None]:
car_pickle = open(filename, "rb")
car_contents = pickle.load(car_pickle)
print(car_contents)

In [None]:
import sklearn
print(sklearn.__version__)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Assuming you have a dataset with event titles and corresponding event series
data = {
    'EventTitle': ['Summer Music Festival', 'Winter Film Festival', 'Spring Art Exhibition'],
    'EventSeries': ['Music Festival', 'Film Festival', 'Art Exhibition']
}

df = pd.DataFrame(data)

# Perform one-hot encoding on event titles
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['EventTitle'])

# Target variable: Event Series
y = df['EventSeries']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict event series for test data
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

feature_names = vectorizer.get_feature_names()
df_train = pd.DataFrame(X_train.toarray(), columns=feature_names)


In [None]:
df_train

In [None]:
X_test.val