In [2]:
import pandas as pd

In [3]:
pip install rdflib

Note: you may need to restart the kernel to use updated packages.


In [3]:
from rdflib import Graph, URIRef
from rdflib.namespace import OWL
from difflib import SequenceMatcher

def similarity(a, b):
    """Return the case-insensitive similarity ratio of two strings.

    Both arguments are lower-cased and compared with
    ``difflib.SequenceMatcher`` (``isjunk=None``).

    Args:
        a: First string.
        b: Second string.

    Returns:
        The matcher's ``ratio()``: a float between 0.0 and 1.0, where 1.0
        means the lower-cased strings are identical.
    """
    left = a.lower()
    right = b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

# Minimum SequenceMatcher ratio at which an IEEE title and a DBLP title are
# treated as the same publication.
TITLE_SIMILARITY_THRESHOLD = 0.92

df = pd.read_csv("Query-M2V2.csv")


ieee_title_pred = "http://www.ieeeLOD.org/2024/01/scientific/data#title"
dblp_title_pred = "https://dblp.org/rdf/schema#title"

# Keep only the title triples of each source. Rows with a missing subject or
# title are dropped up front: str(NaN) is "nan", so two missing titles would
# otherwise compare as identical and yield a bogus owl:sameAs link.
ieee_titles = (
    df[df['p'] == ieee_title_pred][['s', 'o']]
    .rename(columns={'s': 'ieee_uri', 'o': 'ieee_title'})
    .dropna(subset=['ieee_uri', 'ieee_title'])
)
dblp_titles = (
    df[df['p'] == dblp_title_pred][['s', 'o']]
    .rename(columns={'s': 'dblp_uri', 'o': 'dblp_title'})
    .dropna(subset=['dblp_uri', 'dblp_title'])
)

g = Graph()
g.bind("owl", OWL)

# Materialise the DBLP side once as plain (uri, title) string pairs so the
# inner loop does not rebuild a pandas Series per row the way iterrows() does.
dblp_records = [
    (str(uri), str(title))
    for uri, title in zip(dblp_titles['dblp_uri'], dblp_titles['dblp_title'])
]

# Compare every IEEE title with every DBLP title (all-pairs, so the cost grows
# with len(ieee_titles) * len(dblp_titles)) and link the pairs that clear the
# threshold.
for ieee_uri, ieee_title in zip(ieee_titles['ieee_uri'], ieee_titles['ieee_title']):
    ieee_title = str(ieee_title)
    for dblp_uri, dblp_title in dblp_records:
        if similarity(ieee_title, dblp_title) >= TITLE_SIMILARITY_THRESHOLD:
            g.add((URIRef(dblp_uri), OWL.sameAs, URIRef(str(ieee_uri))))

g.serialize(destination="sameAs-title.ttl", format="turtle")
print("✅ File saved as sameAs-title.ttl")

✅ File saved as sameAs-title.ttl


In [None]:
import pandas as pd
from difflib import SequenceMatcher


df = pd.read_csv("Query-M2V2.csv")


hasName_predicate = "https://w3id.org/SWLD-course-UT/2025/data#hasName"
ieee_marker = "ieeeLOD"
dblp_marker = "dblp"

# Filter IEEE and DBLP authors.
# The markers are plain substrings, so they are matched literally
# (regex=False). na=False makes a missing subject count as "no match";
# without it str.contains yields NaN for such rows and the boolean indexing
# below raises.
has_name = df['p'] == hasName_predicate
is_ieee = df['s'].str.contains(ieee_marker, regex=False, na=False)
is_dblp = df['s'].str.contains(dblp_marker, regex=False, na=False)

# Rows whose name cell is empty are dropped: there is nothing to compare, and
# a NaN name cannot be lower-cased by the similarity function.
ieee_authors = (
    df[is_ieee & has_name][['s', 'o']]
    .rename(columns={'s': 'ieee_uri', 'o': 'ieee_author'})
    .dropna(subset=['ieee_author'])
)
dblp_authors = (
    df[is_dblp & has_name][['s', 'o']]
    .rename(columns={'s': 'dblp_uri', 'o': 'dblp_author'})
    .dropna(subset=['dblp_author'])
)


# Every IEEE author paired with every DBLP author. how='cross' builds the
# Cartesian product directly, so no throw-away 'key' column has to be added to
# the two frames and dropped again.
all_pairs = pd.merge(ieee_authors, dblp_authors, how='cross')


def similarity(row):
    """Return the case-insensitive similarity of the two author names in ``row``.

    NOTE(review): this shares its name with the two-argument
    ``similarity(a, b)`` defined in the title-matching cell and replaces it in
    the kernel once this cell has run.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing the
            entries ``'ieee_author'`` and ``'dblp_author'``.

    Returns:
        The ``difflib.SequenceMatcher`` ratio (float between 0.0 and 1.0) of
        the lower-cased names, or 0.0 when either entry is not a string, such
        as the NaN pandas stores for an empty CSV cell. Previously such a row
        raised ``AttributeError`` on ``.lower()``.
    """
    ieee_name = row['ieee_author']
    dblp_name = row['dblp_author']
    # A missing or non-text name cannot match anything.
    if not isinstance(ieee_name, str) or not isinstance(dblp_name, str):
        return 0.0
    return SequenceMatcher(None, ieee_name.lower(), dblp_name.lower()).ratio()


# Score every candidate author pair row by row.
all_pairs['Similarity'] = all_pairs.apply(similarity, axis=1)

# Rank the pairs from most to least similar and keep the first 100 for review.
ranked_pairs = all_pairs.sort_values(by='Similarity', ascending=False)
top_matches = ranked_pairs.head(100)

report_columns = ['ieee_uri', 'dblp_uri', 'ieee_author', 'dblp_author', 'Similarity']
print(top_matches[report_columns])
