In [2]:
from rdflib import Graph, URIRef, Namespace, Literal, XSD, RDF
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from tqdm.autonotebook import tqdm
import pandas as pd
from copy import deepcopy
import numpy
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
from rapidfuzz import process, fuzz, distance

import torch
from transformers import BertTokenizer
from transformers import BertModel
from sklearn.metrics.pairwise import cosine_similarity


  from tqdm.autonotebook import tqdm


In [3]:
# --- Inputs -----------------------------------------------------------------
# CSV files holding the per-predicate inverse-functionality scores.
GRAPH_1_INV_FUNC_PATH = ""
GRAPH_2_INV_FUNC_PATH = ""

# Locations of the two RDF graphs to align.
GRAPH_1 = ""
GRAPH_2 = ""
# Alternative for graph 2: a SPARQL endpoint such as
# "http://localhost:9999/bigdata/sparql"

# --- Column names of the inverse-functionality CSVs ---------------------------
SUBJECT = "subject"
PREDICATE = "predicate"
INVERSE_FUNCTIONALITY = "inverse_functionality"

In [4]:
# Inverse-functionality table of graph 1, read from the configured CSV.
graph_1_inv_func_df = pd.read_csv(GRAPH_1_INV_FUNC_PATH)

# Subset of predicates whose inverse functionality exceeds 0.5.
threshold_graph_1_inv_func_df = graph_1_inv_func_df.loc[
    lambda scores: scores[INVERSE_FUNCTIONALITY] > 0.5
]

# Parse the RDF data of graph 1 into an in-memory rdflib graph.
graph_1 = Graph()
graph_1.parse(GRAPH_1)

<Graph identifier=Ne66a7f22697e4cbeb53ef1fef7dc3798 (<class 'rdflib.graph.Graph'>)>

In [5]:
# Inverse-functionality table of graph 2, read from the configured CSV.
graph_2_inv_func_df = pd.read_csv(GRAPH_2_INV_FUNC_PATH)

# Subset of predicates whose inverse functionality exceeds 0.5.
threshold_graph_2_inv_func_df = graph_2_inv_func_df.loc[
    lambda scores: scores[INVERSE_FUNCTIONALITY] > 0.5
]

# Parse the RDF data of graph 2 into an in-memory rdflib graph.
# (Alternative kept for reference: Graph("SPARQLStore") followed by
# graph_2.open(GRAPH_2) to query a remote endpoint instead.)
graph_2 = Graph()
graph_2.parse(GRAPH_2)

<Graph identifier=N039f8d41798747f69abc438b7b0c6828 (<class 'rdflib.graph.Graph'>)>

In [6]:
def get_label_graph(graph, predicate_df):
    """Return the rdfs:label of every predicate listed in ``predicate_df``.

    Args:
        graph: graph object exposing ``objects(subject, predicate)``.
        predicate_df: DataFrame with a ``'predicate'`` column of URI strings.

    Returns:
        A list with one entry per row of ``predicate_df`` (in row order): the
        first label found for the predicate, as ``str``, or ``""`` when the
        graph holds no label for it.
    """
    rdfs_label = URIRef("http://www.w3.org/2000/01/rdf-schema#label")

    def first_label(predicate_uri):
        # Only the first matching object is used; "" is the fallback.
        matches = graph.objects(URIRef(predicate_uri), rdfs_label)
        return str(next(matches, ""))

    rows = tqdm(predicate_df.iterrows(), total=len(predicate_df))
    return [first_label(row['predicate']) for _, row in rows]

In [7]:
# Look up the rdfs:label of each graph 1 predicate ("" when none is found).
predicate_1_list = get_label_graph(graph_1, graph_1_inv_func_df) 

  0%|          | 0/342 [00:00<?, ?it/s]

In [8]:
# Attach the looked-up labels as a new 'labels' column (the list is in row order).
graph_1_inv_func_df['labels'] = predicate_1_list

In [9]:
# Display the graph 1 frame, now including the 'labels' column.
graph_1_inv_func_df

Unnamed: 0.1,Unnamed: 0,predicate,inverse_functionality,labels
0,0,http://dbkwik.webdatacommons.org/ontology/wiki...,0.064058,
1,1,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,0.001296,
2,2,http://purl.org/dc/terms/subject,0.022274,
3,3,http://dbkwik.webdatacommons.org/ontology/wiki...,0.978144,
4,4,http://xmlns.com/foaf/0.1/thumbnail,1.000000,
...,...,...,...,...
337,337,http://dbkwik.webdatacommons.org/memory-alpha....,1.000000,Packs/Box
338,338,http://dbkwik.webdatacommons.org/memory-alpha....,1.000000,
339,339,http://dbkwik.webdatacommons.org/memory-alpha....,1.000000,nArc3PartNumber
340,340,http://dbkwik.webdatacommons.org/memory-alpha....,1.000000,Written


In [10]:
# Look up the rdfs:label of each graph 2 predicate ("" when none is found).
predicate_2_list = get_label_graph(graph_2, graph_2_inv_func_df) 

  0%|          | 0/218 [00:00<?, ?it/s]

In [11]:
# Attach the looked-up labels as a new 'labels' column (the list is in row order).
graph_2_inv_func_df['labels'] = predicate_2_list

In [12]:
# Display the graph 2 frame, now including the 'labels' column.
graph_2_inv_func_df

Unnamed: 0.1,Unnamed: 0,predicate,inverse_functionality,labels
0,0,http://purl.org/dc/elements/1.1/rights,0.500000,
1,1,http://dbkwik.webdatacommons.org/ontology/wiki...,0.123107,
2,2,http://www.w3.org/2000/01/rdf-schema#comment,0.999063,
3,3,http://www.w3.org/2004/02/skos/core#altLabel,1.000000,
4,4,http://dbkwik.webdatacommons.org/ontology/thum...,0.288348,
...,...,...,...,...
213,213,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,skin
214,214,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,assigned
215,215,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,y
216,216,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,duration


In [13]:
# Distinct predicate URIs of graph 1.
graph_1_predicate_set = set(graph_1_inv_func_df['predicate'])

In [14]:
# Distinct predicate URIs of graph 2.
graph_2_predicate_set = set(graph_2_inv_func_df['predicate'])

In [15]:
# Nested mapping filled by the matching stages below:
# graph 1 predicate URI -> {graph 2 predicate URI -> match probability}.
predicate_prob_dict = dict()

In [16]:
# Stage 1: a URI present in both graphs is matched to itself with probability 1.0.
shared_predicate_uris = (
    uri for uri in graph_1_predicate_set if uri in graph_2_predicate_set
)
for predicate_uri in shared_predicate_uris:
    predicate_prob_dict.setdefault(predicate_uri, {})[predicate_uri] = 1.0

In [17]:
# Index graph 2 predicates by their raw label: label -> [predicate URI, ...].
# Keys appear in order of first occurrence; each list keeps the frame's row order.
graph_2_predicate_labels_dict = {}
for label, predicate_2 in zip(graph_2_inv_func_df['labels'], graph_2_inv_func_df['predicate']):
    graph_2_predicate_labels_dict.setdefault(label, []).append(predicate_2)

In [18]:
# Predicates without a label must not be matched to each other via the empty
# string. pop() with a default is used instead of `del` so the cell does not
# raise KeyError when every graph 2 predicate has a label.
graph_2_predicate_labels_dict.pop('', None)

In [19]:
# Stage 2: identical raw labels -> probability 0.9, unless the pair was already
# scored by an earlier stage.
for predicate_1, label in zip(graph_1_inv_func_df['predicate'], graph_1_inv_func_df['labels']):
    for predicate_2 in graph_2_predicate_labels_dict.get(label, []):
        if predicate_2 not in predicate_prob_dict.get(predicate_1, {}):
            predicate_prob_dict.setdefault(predicate_1, {})[predicate_2] = 0.9

In [20]:
def normalize_string(s):
    """Normalize a label into lowercase, space-separated words.

    Steps, in order:
      1. underscores become spaces,
      2. all remaining ``string.punctuation`` characters are removed,
      3. camelCase / acronym / letter-digit boundaries are split with a space,
      4. whitespace is collapsed and the result is lowercased.

    Args:
        s: the label text.

    Returns:
        The normalized string (``""`` for an empty input).
    """
    # Underscores are word separators. This must happen BEFORE punctuation is
    # stripped: string.punctuation contains '_', so doing it afterwards would
    # glue the words together ("foo_bar" -> "foobar").
    s = s.replace('_', ' ')
    # remove punctuation
    s = s.translate(str.maketrans('', '', string.punctuation))
    # split at lowercase->uppercase boundaries ("partNumber" -> "part Number")
    s = re.sub(r'([a-z])([A-Z])', r'\1 \2', s)
    # split an acronym from the following capitalized word ("HTTPServer" -> "HTTP Server")
    s = re.sub(r'([A-Z])([A-Z][a-z])', r'\1 \2', s)
    # split between letters and digits, in both directions
    s = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', s)
    s = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', s)
    # collapse runs of whitespace (also trims both ends), then lowercase
    return ' '.join(s.split()).lower()

In [21]:
# Add a 'normalized label' column derived from the raw labels of graph 1.
graph_1_inv_func_df['normalized label'] = graph_1_inv_func_df['labels'].apply(normalize_string)


In [22]:
# Add a 'normalized label' column derived from the raw labels of graph 2.
graph_2_inv_func_df['normalized label'] = graph_2_inv_func_df['labels'].apply(normalize_string)


In [23]:
# Display the graph 2 frame to inspect the new 'normalized label' column.
graph_2_inv_func_df

Unnamed: 0.1,Unnamed: 0,predicate,inverse_functionality,labels,normalized label
0,0,http://purl.org/dc/elements/1.1/rights,0.500000,,
1,1,http://dbkwik.webdatacommons.org/ontology/wiki...,0.123107,,
2,2,http://www.w3.org/2000/01/rdf-schema#comment,0.999063,,
3,3,http://www.w3.org/2004/02/skos/core#altLabel,1.000000,,
4,4,http://dbkwik.webdatacommons.org/ontology/thum...,0.288348,,
...,...,...,...,...,...
213,213,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,skin,skin
214,214,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,assigned,assigned
215,215,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,y,y
216,216,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,duration,duration


In [24]:
LABEL = 'normalized label'

# Index graph 2 predicates by normalized label in a single pass:
# label -> [predicate URI, ...] (keys in order of first occurrence).
graph_2_predicate_labels_dict = {}
for label, predicate_2 in zip(graph_2_inv_func_df[LABEL], graph_2_inv_func_df['predicate']):
    graph_2_predicate_labels_dict.setdefault(label, []).append(predicate_2)
# Unlabelled predicates must not match each other. pop() with a default avoids
# the KeyError that `del` raises when no label is empty.
graph_2_predicate_labels_dict.pop('', None)

# Stage 3: identical normalized labels -> probability 0.8, unless the pair was
# already scored by an earlier (stricter) stage.
for _, row in graph_1_inv_func_df.iterrows():
    label = row[LABEL]
    predicate_1 = row['predicate']
    for predicate_2 in graph_2_predicate_labels_dict.get(label, []):
        if predicate_2 not in predicate_prob_dict.get(predicate_1, {}):
            predicate_prob_dict.setdefault(predicate_1, {})[predicate_2] = 0.8
            
            

In [25]:
# Fetch the NLTK resources this notebook relies on (already-present packages
# are reported as up-to-date and not downloaded again):
#  - 'stopwords': the English stop word list used right below;
#  - 'punkt': tokenizer models required by word_tokenize() and by
#    nltk.data.load('tokenizers/punkt/english.pickle') in later cells.
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english')) # set of English stop words

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yamamotobikutorueiichi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
def remove_stopwords(s):
    """Drop every whitespace-separated word of ``s`` found in ``stop_words``.

    The membership test is done on the lowercased word; surviving words keep
    their original casing and are re-joined with single spaces.
    """
    return ' '.join(token for token in s.split() if token.lower() not in stop_words)

In [27]:
# Add a 'stopword label' column: the normalized label of graph 1 without English stop words.
graph_1_inv_func_df['stopword label'] = graph_1_inv_func_df['normalized label'].apply(remove_stopwords)


In [28]:
# Add a 'stopword label' column: the normalized label of graph 2 without English stop words.
graph_2_inv_func_df['stopword label'] = graph_2_inv_func_df['normalized label'].apply(remove_stopwords)


In [29]:
# Display the graph 2 frame to inspect the new 'stopword label' column.
graph_2_inv_func_df

Unnamed: 0.1,Unnamed: 0,predicate,inverse_functionality,labels,normalized label,stopword label
0,0,http://purl.org/dc/elements/1.1/rights,0.500000,,,
1,1,http://dbkwik.webdatacommons.org/ontology/wiki...,0.123107,,,
2,2,http://www.w3.org/2000/01/rdf-schema#comment,0.999063,,,
3,3,http://www.w3.org/2004/02/skos/core#altLabel,1.000000,,,
4,4,http://dbkwik.webdatacommons.org/ontology/thum...,0.288348,,,
...,...,...,...,...,...,...
213,213,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,skin,skin,skin
214,214,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,assigned,assigned,assigned
215,215,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,y,y,
216,216,http://dbkwik.webdatacommons.org/stexpanded.wi...,1.000000,duration,duration,duration


In [30]:
LABEL = 'stopword label'

# Index graph 2 predicates by stopword-filtered label in a single pass:
# label -> [predicate URI, ...] (keys in order of first occurrence).
graph_2_predicate_labels_dict = {}
for label, predicate_2 in zip(graph_2_inv_func_df[LABEL], graph_2_inv_func_df['predicate']):
    graph_2_predicate_labels_dict.setdefault(label, []).append(predicate_2)
# Labels that are empty (or became empty after stop word removal) must not
# match each other. pop() with a default avoids the KeyError that `del`
# raises when no label is empty.
graph_2_predicate_labels_dict.pop('', None)

# Stage 4: identical stopword-filtered labels -> probability 0.7, unless the
# pair was already scored by an earlier (stricter) stage.
for _, row in graph_1_inv_func_df.iterrows():
    label = row[LABEL]
    predicate_1 = row['predicate']
    for predicate_2 in graph_2_predicate_labels_dict.get(label, []):
        if predicate_2 not in predicate_prob_dict.get(predicate_1, {}):
            predicate_prob_dict.setdefault(predicate_1, {})[predicate_2] = 0.7
            
            

In [31]:
def remove_corpus_specific_stopword(graph_df, threshold=0.2):
    """Add a 'corpus stopword label' column without corpus-specific stop words.

    A token is a corpus-specific stop word when it occurs in more than
    ``threshold`` of the rows of ``graph_df`` (each row's 'stopword label' is
    one document; a token counts once per document). Counting and filtering
    both use the lowercased token, so casing cannot make a frequent token
    escape removal.

    Args:
        graph_df: DataFrame with a 'stopword label' column of strings. It is
            modified in place: the 'corpus stopword label' column is
            added or overwritten.
        threshold: document-frequency ratio above which a token is removed.

    Returns:
        None. The detected stop word set is printed.
    """
    # Tokenize every label once and reuse the tokens for counting and filtering.
    tokenized_labels = [word_tokenize(label) for label in graph_df['stopword label']]

    doc_freq = defaultdict(int)
    for words in tokenized_labels:
        # set(): count a token at most once per document
        for word in {token.lower() for token in words}:
            doc_freq[word] += 1

    # NOTE: rows with an empty label are included in the denominator.
    total_docs = len(graph_df)
    corpus_stopwords = {
        word for word, freq in doc_freq.items() if freq / total_docs > threshold
    }
    print(corpus_stopwords)

    graph_df['corpus stopword label'] = [
        ' '.join(word for word in words if word.lower() not in corpus_stopwords)
        for words in tokenized_labels
    ]

In [32]:
# Add the 'corpus stopword label' column to the graph 1 frame (prints the detected stop words).
remove_corpus_specific_stopword(graph_1_inv_func_df)

set()


In [33]:
# Add the 'corpus stopword label' column to the graph 2 frame (prints the detected stop words).
remove_corpus_specific_stopword(graph_2_inv_func_df)

set()


In [34]:
LABEL = 'corpus stopword label'

# Labelled graph 2 predicates, collected once instead of on every outer iteration.
graph_2_labelled = [
    (row_2['predicate'], row_2[LABEL])
    for _, row_2 in graph_2_inv_func_df.iterrows()
    if row_2[LABEL]
]

# Stage 5: fuzzy string similarity, scaled into [0, 0.6]. Only pairs that no
# earlier stage has scored receive a value.
for _, row in graph_1_inv_func_df.iterrows():
    label_1 = row[LABEL]
    predicate_1 = row['predicate']
    if not label_1:
        continue

    for predicate_2, label_2 in graph_2_labelled:
        if predicate_2 in predicate_prob_dict.get(predicate_1, {}):
            continue
        sim = fuzz.WRatio(label_1, label_2) / 100
        prob = 0.6 * sim
        predicate_prob_dict.setdefault(predicate_1, {})[predicate_2] = prob

In [35]:
# Punkt sentence splitter; embed_long_sentence uses it to split a text into sentences.
nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Tokenizer and encoder of the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
def embed_long_sentence(long_sentence, max_token=140):
    """Embed each sentence of a text with the module-level BERT ``model``.

    The text is split into sentences with ``nltk_tokenizer``; every sentence
    is encoded with ``tokenizer`` (padded to, and cut at, ``max_token``
    tokens) and the model's last hidden states are mean-pooled over the
    non-padding tokens.

    Args:
        long_sentence: text to embed. It must yield at least one sentence;
            an empty sentence list cannot be stacked into a batch.
        max_token: number of token positions kept per sentence.

    Returns:
        A numpy array with one row per sentence, holding the mean-pooled
        embedding of that sentence.
    """
    sentences = nltk_tokenizer.tokenize(long_sentence)

    input_ids, attention_masks = [], []
    for sentence in sentences:
        encoded = tokenizer.encode_plus(sentence, return_tensors="pt", max_length=max_token, padding='max_length')
        # encode_plus returns a batch of one; keep the single row and cut
        # over-long sentences so that all rows can be stacked.
        input_ids.append(encoded['input_ids'][0][:max_token])
        attention_masks.append(encoded['attention_mask'][0][:max_token])

    tokens = {
        'input_ids': torch.stack(input_ids),
        'attention_mask': torch.stack(attention_masks),
    }

    # Pure inference: disable autograd so no computation graph is built for
    # the forward pass.
    with torch.no_grad():
        outputs = model(**tokens)
    embeddings = outputs.last_hidden_state

    # Broadcast the attention mask over the hidden dimension so padding
    # positions contribute nothing to the sum.
    mask = tokens['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float()
    summed = torch.sum(embeddings * mask, 1)
    # clamp guards against division by zero for an all-padding row
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)
    mean_pooled = summed / summed_mask

    return mean_pooled.detach().cpu().numpy()

In [37]:
def get_label_vec_dict(df):
    """Embed every distinct, non-empty 'corpus stopword label' of ``df``.

    Returns:
        dict mapping each label to the value returned by
        ``embed_long_sentence`` for it. Empty labels are skipped.
    """
    distinct_labels = set(df['corpus stopword label'])
    return {
        label: embed_long_sentence(label)
        for label in tqdm(distinct_labels)
        if label
    }

In [38]:
# Pre-compute the embeddings of all distinct graph 2 labels so the matching loop can reuse them.
graph_2_label_vec_dict = get_label_vec_dict(graph_2_inv_func_df)

  0%|          | 0/194 [00:00<?, ?it/s]

In [39]:
LABEL = 'corpus stopword label'

# Stage 6: BERT cosine similarity, scaled into [0, 0.5]. It can only raise the
# probability of pairs whose current score is at most 0.5.
for _, row in tqdm(graph_1_inv_func_df.iterrows(), total=len(graph_1_inv_func_df)):
    label_1 = row[LABEL]
    predicate_1 = row['predicate']
    if not label_1:
        continue
    vec_1 = embed_long_sentence(label_1)

    for _, row_2 in graph_2_inv_func_df.iterrows():
        predicate_2 = row_2['predicate']
        label_2 = row_2[LABEL]

        if not label_2:
            continue

        # Read with .get(): a pair that no earlier stage has scored counts as
        # 0.0 instead of raising KeyError, so this cell does not depend on the
        # fuzzy-matching cell having populated every pair.
        current_prob = predicate_prob_dict.get(predicate_1, {}).get(predicate_2, 0.0)
        if current_prob > 0.5:
            continue

        vec_2 = graph_2_label_vec_dict[label_2]

        # Both sides may hold several sentence vectors; use the best-matching pair.
        similarities = cosine_similarity(vec_1, vec_2)
        max_similarity = similarities.max()
        prob = 0.5 * max_similarity

        if prob > current_prob:
            predicate_prob_dict.setdefault(predicate_1, {})[predicate_2] = prob

  0%|          | 0/342 [00:00<?, ?it/s]

In [42]:
# Report, for every graph 1 predicate, its most probable graph 2 counterpart.
for pred_1, pred_1_dict in predicate_prob_dict.items():
    highest_sim_predicate = max(pred_1_dict, key=pred_1_dict.get)
    print(pred_1, highest_sim_predicate, pred_1_dict[highest_sim_predicate], '____', sep='\n')

http://dbkwik.webdatacommons.org/ontology/abstract
http://dbkwik.webdatacommons.org/ontology/abstract
1.0
____
http://xmlns.com/foaf/0.1/depiction
http://xmlns.com/foaf/0.1/depiction
1.0
____
http://purl.org/dc/terms/subject
http://purl.org/dc/terms/subject
1.0
____
http://xmlns.com/foaf/0.1/thumbnail
http://xmlns.com/foaf/0.1/thumbnail
1.0
____
http://www.w3.org/2000/01/rdf-schema#comment
http://www.w3.org/2000/01/rdf-schema#comment
1.0
____
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
1.0
____
http://www.w3.org/2004/02/skos/core#broader
http://www.w3.org/2004/02/skos/core#broader
1.0
____
http://dbkwik.webdatacommons.org/ontology/wikiPageDisambiguates
http://dbkwik.webdatacommons.org/ontology/wikiPageDisambiguates
1.0
____
http://www.w3.org/2000/01/rdf-schema#label
http://www.w3.org/2000/01/rdf-schema#label
1.0
____
http://dbkwik.webdatacommons.org/ontology/wikiPageWikiLinkText
http://dbkwik.webdatacommons.org/ontology/wikiPageWikiLi