In [1]:
from rdflib import Graph, URIRef, Namespace, Literal, XSD, RDF
from rdflib.plugins.stores.sparqlstore import SPARQLStore
from tqdm.autonotebook import tqdm
import pandas as pd
from copy import deepcopy
import numpy
from sklearn.metrics.pairwise import cosine_similarity
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import defaultdict
from rapidfuzz import process, fuzz, distance
import math

import torch
from transformers import BertTokenizer
from transformers import BertModel
from sklearn.metrics.pairwise import cosine_similarity

import concurrent.futures

  from tqdm.autonotebook import tqdm


In [2]:
GRAPH_1 = ""
GRAPH_1_INV_FUNC_PATH = ''

GRAPH_2 = ""
GRAPH_2_INV_FUNC_PATH = ''

#GRAPH_2 = "http://localhost:9999/bigdata/sparql"
SUBJECT = 'subject'
PREDICATE = 'predicate'
INVERSE_FUNCTIONALITY = 'inverse_functionality'
INVERSABILITY = 'inversability'
MAX_LENGTH_FULL_MATCH = 10000
LABEL_PREDICATE = "http://www.w3.org/2000/01/rdf-schema#label"
FILE_FOLDER = "ma-mb/"
GRAPH_1_ABSTRACT = "http://dbkwik.webdatacommons.org/ontology/abstract"
GRAPH_2_ABSTRACT = "http://dbkwik.webdatacommons.org/ontology/abstract"

In [3]:
graph_1_inv_func_df = pd.read_csv(GRAPH_1_INV_FUNC_PATH)

#graph_1_predicate_list = threshold_graph_1_inv_func_df[PREDICATE].tolist()

graph_1 = Graph()
graph_1.parse(GRAPH_1)

<Graph identifier=N0509029e4adf41a6a19566f98c44e2b6 (<class 'rdflib.graph.Graph'>)>

In [4]:
graph_2_inv_func_df = pd.read_csv(GRAPH_2_INV_FUNC_PATH)

#graph_2_predicate_list = threshold_graph_2_inv_func_df[PREDICATE].tolist()

graph_2 = Graph()
graph_2.parse(GRAPH_2)
#graph_2 = Graph("SPARQLStore")
#graph_2.open(GRAPH_2)

<Graph identifier=N45e672671d1e4b05b505d8cc933e217e (<class 'rdflib.graph.Graph'>)>

In [5]:
def get_label_graph(graph, predicate_df, uri_column):
    predicate_label_list = list()
    LABEL = "http://www.w3.org/2000/01/rdf-schema#label"
    label_predicate = URIRef(LABEL)
    for index, row in tqdm(predicate_df.iterrows(), total=len(predicate_df)):
        search_uriref = URIRef(row[uri_column])
        label = str(next(graph.objects(search_uriref, label_predicate), ""))
        predicate_label_list.append(label)
    return predicate_label_list

In [6]:
def calculate_similarity_using_label(df_1, df_2, uri_column):
    label_list_1 = get_label_graph(graph_1, df_1, uri_column)
    df_1['label'] = label_list_1
    
    label_list_2 = get_label_graph(graph_2, df_2, uri_column)
    df_2['label'] = label_list_2
    
    uri_set_1 = set(df_1[uri_column])
    uri_set_2 = set(df_2[uri_column])
    
    prob_dict = dict()
    
    for uri_1 in uri_set_1:
        if uri_1 in uri_set_2:
            prob_dict.setdefault(uri_1, {})
            prob_dict[uri_1][uri_1] = 1.0
    
    return prob_dict

In [7]:
def match_labels(prob_dict, df_1, df_2, label_column, max_score, uri_column):
    df_1_label_dict = {label: list(df_1.loc[df_1[label_column] == label, uri_column])
           for label in df_1[label_column].unique()}
    
    if '' in df_1_label_dict:
        del df_1_label_dict['']
    
    for _, row in tqdm(df_2.iterrows(), total=len(df_2), desc="match labels"):
        label = row[label_column]
        uri_2 = row[uri_column]
        if label in df_1_label_dict:
            for uri_1 in df_1_label_dict[label]:
                if uri_1 in prob_dict and uri_2 in prob_dict[uri_1]:
                    continue
                prob_dict.setdefault(uri_1, {})[uri_2] = max_score

In [8]:
def normalize_string(s):
    # remove parenthesis
    s = re.sub("[\(\[].*?[\)\]]", "", s)
    # remove punctuation
    s = s.translate(str.maketrans('', '', string.punctuation))
    # split the string at uppercase letters and digits and join with spaces
    s = re.sub(r'([a-z])([A-Z])', r'\1 \2', s)
    s = re.sub(r'([A-Z])([A-Z][a-z])', r'\1 \2', s)
    s = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', s)
    s = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', s)
    # replace underscores with spaces
    s = s.replace('_', ' ')
    # remove multiple consecutive spaces and leading/trailing spaces
    s = ' '.join(s.split()).strip()
    # convert to lowercase and return
    return s.lower()

In [9]:
nltk.download('stopwords') # download stop words list

stop_words = set(stopwords.words('english')) # set of English stop words

def remove_stopwords(s):
    words = s.split() # split text into individual words

    filtered_words = [word for word in words if not word.lower() in stop_words] # remove stop words

    filtered_text = ' '.join(filtered_words)
    return filtered_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yamamotobikutorueiichi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def remove_corpus_specific_stopword(graph_df):
    doc_freq = defaultdict(int)

    for _, row in graph_df.iterrows():
        doc = row['stopword label']
        words = set(word_tokenize(doc))
        for word in words:
            doc_freq[word] += 1
            
    total_docs = len(graph_df)
    corpus_stopwords = set()
    for word, freq in doc_freq.items():
        if freq / total_docs > 0.2: # word appears in more than 20% of documents
            corpus_stopwords.add(word)    
    label_list = list()
    for _, row in graph_df.iterrows():
        words = word_tokenize(row['stopword label'])
        filtered_words = [word for word in words if not word.lower() in corpus_stopwords]
        filtered_text = ' '.join(filtered_words)
        label_list.append(filtered_text)
    
    graph_df['corpus stopword label'] = label_list

In [11]:
nltk_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
def embed_long_sentence(long_sentence):
    sentences = nltk_tokenizer.tokenize(long_sentence)
    
    tokens = {'input_ids': [], 'attention_mask': []}

    max_token = 140
    for sentence in sentences:
        new_tokens = tokenizer.encode_plus(sentence, return_tensors="pt", max_length=max_token, padding='max_length')
        tokens['input_ids'].append(new_tokens['input_ids'][0][:max_token])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0][:max_token])

    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    
    outputs = model(**tokens)
    
    embeddings = outputs.last_hidden_state
    
    attention_mask = tokens['attention_mask']
    
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

    masked_embeddings = embeddings * mask
    
    summed = torch.sum(masked_embeddings, 1)

    summed_mask = torch.clamp(mask.sum(1), min=1e-9)

    mean_pooled = summed / summed_mask
    
    return mean_pooled.detach().numpy()

In [13]:
def get_label_vec_dict(df):
    LABEL = 'corpus stopword label'

    df_label_set = set(df[LABEL])
    df_label_vec_dict = dict()

    for label in tqdm(df_label_set):
        if not label:
            continue

        df_label_vec_dict[label] = embed_long_sentence(label)
    return df_label_vec_dict

In [14]:
def apply_match_label(prob_dict, df_1, df_2, uri_column, label_to_apply, new_label, max_score):
    df_1[new_label] = df_1[label_to_apply].apply(normalize_string)
    df_2[new_label] = df_2[label_to_apply].apply(normalize_string)
    
    match_labels(prob_dict, df_1, df_2, new_label, max_score, uri_column)

In [15]:
def calculate_prob_dict(df_1, df_2, uri_column):
    prob_dict = calculate_similarity_using_label(df_1, df_2, uri_column)
    
    match_labels(prob_dict, df_1, df_2, 'label', 0.9, uri_column)

    apply_match_label(prob_dict, df_1, df_2, uri_column, 'label', 'normalized label', 0.8)
    apply_match_label(prob_dict, df_1, df_2, uri_column, 'normalized label', 'stopword label', 0.7)

    remove_corpus_specific_stopword(df_1)
    remove_corpus_specific_stopword(df_2)
    
    # Fuzzy and BERT match only for list with less than MAX_LENGTH_FULL_MATCH
    if len(df_1) > MAX_LENGTH_FULL_MATCH or len(df_2) > MAX_LENGTH_FULL_MATCH:
        return prob_dict

    # Match using fuzzy string
    LABEL = 'corpus stopword label'
    for _, row in tqdm(df_1.iterrows(), total=len(df_1), desc='Corpus stopword match'):
        label_1 = row[LABEL]
        uri_1 = row[uri_column]
        if not label_1:
            continue

        for _, row_2 in df_2.iterrows():
            label_2 = row_2[LABEL]
            uri_2 = row_2[uri_column]
            if not label_2:
                continue
            if not uri_1 in prob_dict.keys() or not uri_2 in prob_dict[uri_1]:
                sim = fuzz.WRatio(label_1, label_2) / 100
                prob = 0.6 * sim
                prob_dict.setdefault(uri_1, {})[uri_2] = prob           
    
    # Match using BERT
    graph_1_label_vec_dict = get_label_vec_dict(df_1)
    
    LABEL = 'corpus stopword label'
    for _, row in tqdm(df_2.iterrows(), total=len(df_2)):
        label_2 = row[LABEL]
        uri_2 = row[uri_column]
        if not label_2:
            continue
        vec_2 = embed_long_sentence(label_2)

        for _, row_1 in df_1.iterrows():
            uri_1 = row_1[uri_column]
            label_1 = row_1[LABEL]

            if not label_1:
                continue

            if prob_dict[uri_1][uri_2] > 0.5:
                continue

            vec_1 = graph_1_label_vec_dict[row_1[LABEL]]

            similarities = cosine_similarity(vec_1, vec_2)
            max_similarity = max(map(max, similarities))
            prob = 0.5 * max_similarity

            if prob > prob_dict[uri_1][uri_2]:
                prob_dict[uri_1][uri_2] = prob
    
    return prob_dict

In [16]:
predicate_prob_dict = calculate_prob_dict(graph_1_inv_func_df, graph_2_inv_func_df, 'predicate')

  0%|          | 0/342 [00:00<?, ?it/s]

  0%|          | 0/440 [00:00<?, ?it/s]

match labels:   0%|          | 0/440 [00:00<?, ?it/s]

match labels:   0%|          | 0/440 [00:00<?, ?it/s]

match labels:   0%|          | 0/440 [00:00<?, ?it/s]

Corpus stopword match:   0%|          | 0/342 [00:00<?, ?it/s]

  0%|          | 0/321 [00:00<?, ?it/s]

  0%|          | 0/440 [00:00<?, ?it/s]

In [17]:
predicate_sem_prob_list = list()

for pred_1, pred_2_dict in predicate_prob_dict.items():
    for pred_2 in pred_2_dict:
        predicate_sem_prob_list.append({
            'p1': pred_1,
            'p2': pred_2,
            'sim': pred_2_dict[pred_2]
        })

In [18]:
predicate_sem_prob_df = pd.DataFrame(predicate_sem_prob_list)
predicate_sem_prob_df.to_csv(FILE_FOLDER + 'predicate_sem_prob.csv')

In [19]:
threshold_graph_1_inv_func_df = graph_1_inv_func_df[graph_1_inv_func_df[INVERSABILITY] > 0.25]
threshold_graph_2_inv_func_df = graph_2_inv_func_df[graph_2_inv_func_df[INVERSABILITY] > 0.25]


In [20]:
def get_graph_classes(graph):
    query = """
        select * {?s a owl:Class.}
        """
    class_list = list()
    
    for res in graph.query(query):
        class_list.append(res[0])
    return class_list

In [21]:
graph_1_class_list = get_graph_classes(graph_1)
graph_2_class_list = get_graph_classes(graph_2)

class_1_df = pd.DataFrame()
class_1_df['class'] = graph_1_class_list
class_2_df = pd.DataFrame()
class_2_df['class'] = graph_2_class_list

class_sem_prob_dict = calculate_prob_dict(class_1_df, class_2_df, 'class')

  0%|          | 0/181 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

match labels:   0%|          | 0/240 [00:00<?, ?it/s]

match labels:   0%|          | 0/240 [00:00<?, ?it/s]

match labels:   0%|          | 0/240 [00:00<?, ?it/s]

Corpus stopword match:   0%|          | 0/181 [00:00<?, ?it/s]

  0%|          | 0/179 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

In [22]:
class_sem_prob_list = list()

for class_1, class_2_dict in class_sem_prob_dict.items():
    for class_2 in class_2_dict:
        class_sem_prob_list.append({
            'c1': class_1,
            'c2': class_2,
            'sim': class_2_dict[class_2]
        })

In [23]:
class_sem_prob_df = pd.DataFrame(class_sem_prob_list)
class_sem_prob_df.to_csv(FILE_FOLDER + 'class_sem_prob.csv')

In [24]:
def get_graph_entities(graph):
    query = """
        SELECT DISTINCT ?s
        WHERE {
          ?s <http://www.w3.org/2000/01/rdf-schema#label> ?o.
        }"""
    entity_list = list()
    
    for res in graph.query(query):
        entity_list.append(str(res[0]))
    return entity_list

In [25]:
graph_1_entity_list = get_graph_entities(graph_1)
graph_2_entity_list = get_graph_entities(graph_2)

entity_1_df = pd.DataFrame()
entity_1_df['entity'] = graph_1_entity_list
entity_2_df = pd.DataFrame()
entity_2_df['entity'] = graph_2_entity_list

entity_sem_prob_dict = calculate_prob_dict(entity_1_df, entity_2_df, 'entity')

  0%|          | 0/49959 [00:00<?, ?it/s]

  0%|          | 0/57700 [00:00<?, ?it/s]

match labels:   0%|          | 0/57700 [00:00<?, ?it/s]

match labels:   0%|          | 0/57700 [00:00<?, ?it/s]

match labels:   0%|          | 0/57700 [00:00<?, ?it/s]

In [26]:
entity_sem_prob_list = list()

for entity_1 in entity_sem_prob_dict.keys():
    for entity_2 in entity_sem_prob_dict[entity_1].keys():
        entity_sem_prob_list.append({
            'e1': entity_1,
            'e2': entity_2,
            'sim': entity_sem_prob_dict[entity_1][entity_2]
        })

In [27]:
entity_sem_prob_df = pd.DataFrame(entity_sem_prob_list)

In [28]:
entity_sem_prob_df.to_csv(FILE_FOLDER + 'entity_sem_prob.csv')

In [29]:
threshold_graph_1_inv_func_df = threshold_graph_1_inv_func_df[threshold_graph_1_inv_func_df['predicate'].str.contains(LABEL_PREDICATE) == False]
threshold_graph_2_inv_func_df = threshold_graph_2_inv_func_df[threshold_graph_2_inv_func_df['predicate'].str.contains(LABEL_PREDICATE) == False]





In [30]:
threshold_graph_1_inv_func_df

Unnamed: 0.1,Unnamed: 0,predicate,functionality,inverse_functionality,inversability,label,normalized label,stopword label,corpus stopword label
1,1,http://purl.org/dc/elements/1.1/rights,1.000000,0.500000,0.500000,,,,
8,8,http://www.w3.org/2004/02/skos/core#altLabel,0.704954,1.000000,0.704954,,,,
9,9,http://dbkwik.webdatacommons.org/memory-alpha....,0.918749,0.983028,0.903156,Name,name,name,name
10,10,http://dbkwik.webdatacommons.org/memory-alpha....,0.994955,0.421982,0.419853,Place of birth,place of birth,place of birth,place of birth
11,11,http://dbkwik.webdatacommons.org/ontology/wiki...,0.437566,0.875140,0.382932,,,,
...,...,...,...,...,...,...,...,...,...
337,337,http://dbkwik.webdatacommons.org/memory-alpha....,1.000000,1.000000,1.000000,UPC,upc,upc,upc
338,338,http://dbkwik.webdatacommons.org/memory-alpha....,1.000000,1.000000,1.000000,image3cap,image 3 cap,image 3 cap,image 3 cap
339,339,http://dbkwik.webdatacommons.org/memory-alpha....,1.000000,1.000000,1.000000,Volume,volume,volume,volume
340,340,http://dbkwik.webdatacommons.org/memory-alpha....,1.000000,1.000000,1.000000,FSK,fsk,fsk,fsk


In [31]:
literal_y1_dict = dict()
used_p1_dict = dict()
graph_1_predicate_list = list()

for index, row in tqdm(threshold_graph_1_inv_func_df.iterrows(), total=len(threshold_graph_1_inv_func_df)):
    predicate = row[PREDICATE]
    predicate_uri = URIRef(predicate)

    if sum(1 for _ in graph_1.subject_objects(predicate_uri)) < 5:
        continue
    
    graph_1_predicate_list.append(predicate)
    
    #for s, p, o in graph_1.triples((None, predicate_uri, None)):
    #    if type(o) != Literal or not isinstance(o, Literal) or not (o.datatype is None or o.datatype == XSD.string or o.datatype == RDF.langString):
    #        continue
    for s, p, o in graph_1.triples((None, predicate_uri, None)):

        if type(o) != Literal or not isinstance(o, Literal) or not (o.datatype is None or o.datatype == XSD.string or o.datatype == RDF.langString):
            continue
        
        #if type(o) != Literal or not isinstance(o, Literal):
        #    continue
        o_value = str(o)
        if o_value not in literal_y1_dict.keys():
            literal_y1_dict[o_value] = list()
        y1_value_list = literal_y1_dict[o_value]
        y1_value_list.append({
            SUBJECT: s,
            PREDICATE: predicate_uri
        })

  0%|          | 0/256 [00:00<?, ?it/s]

In [32]:
y2_matches = dict()
graph_2_predicate_list = list()

for index, row in tqdm(threshold_graph_2_inv_func_df.iterrows(), total=len(threshold_graph_2_inv_func_df)):
    predicate = row[PREDICATE]
    predicate_uri = URIRef(predicate)
    
    if sum(1 for _ in graph_2.subject_objects(predicate_uri)) < 5:
        continue
        
    graph_2_predicate_list.append(predicate)
    
    for s, o in graph_2.subject_objects(predicate_uri):
        if type(o) != Literal or not isinstance(o, Literal) or not (o.datatype is None or o.datatype == XSD.string or o.datatype == RDF.langString):
            continue
        
        #if type(o) != Literal or not isinstance(o, Literal):
        #    continue
        
        # Check if exist match
        o_value = str(o)
        if o_value not in literal_y1_dict.keys():
            continue
            
        if o_value not in y2_matches.keys():
            y2_matches[o_value] = list()
        y2_match_list = y2_matches[o_value]
        y2_match_list.append({
            SUBJECT: s,
            PREDICATE: predicate_uri
        })

  0%|          | 0/315 [00:00<?, ?it/s]

In [33]:
def get_predicate_prob(predicate_pair_dict, p1, p2):
    if p1 not in predicate_pair_dict.keys():
        predicate_pair_dict[p1] = dict()
    if p2 not in predicate_pair_dict[p1].keys():
        predicate_pair_dict[p1][p2] = 0.1

    return predicate_pair_dict[p1][p2]

In [34]:
inv_func_1_dict = dict()

for index, row in tqdm(threshold_graph_1_inv_func_df.iterrows(), total=len(threshold_graph_1_inv_func_df)):
    inv_func_1_dict[row['predicate']] = row[INVERSABILITY]

  0%|          | 0/256 [00:00<?, ?it/s]

In [35]:
inv_func_2_dict = dict()

for index, row in tqdm(threshold_graph_2_inv_func_df.iterrows(), total=len(threshold_graph_2_inv_func_df)):
    inv_func_2_dict[row['predicate']] = row[INVERSABILITY]

  0%|          | 0/315 [00:00<?, ?it/s]

In [36]:
def save_predicate_pair(predicate_pair_counter, x1, p1, y1, x2, p2, y2):
    predicate_pair_counter.append({
        'x1': x1,
        'p1': p1,
        'y1': y1,
        'x2': x2,
        'p2': p2,
        'y2': y2,
    })

In [37]:
def update_prob_entities(entity_pair_match_dict, x1, x2, p1, p2, prob_y):
    if x1 not in entity_pair_match_dict.keys():
        entity_pair_match_dict[x1] = dict()
    if x2 not in entity_pair_match_dict[x1].keys():
        entity_pair_match_dict[x1][x2] = 1
    #if x1 == 'http://dbkwik.webdatacommons.org/finalfantasy./resource/Toad_(Final_Fantasy_V_enemy)':
    #    print(len(entity_pair_match_dict[x1]))
    
    y1_pred = str(p1)
    y2_pred = str(p2)
    pred_prob_1_2 = get_predicate_prob(predicate_pair_dict_1, y1_pred, y2_pred)
    pred_prob_2_1 = get_predicate_prob(predicate_pair_dict_2, y2_pred, y1_pred)

    new_factor = (1-pred_prob_1_2*inv_func_1_dict[y1_pred] * prob_y) * (1-pred_prob_2_1*inv_func_2_dict[y2_pred] * prob_y)

    entity_pair_match_dict[x1][x2] *= new_factor

    return 1 - entity_pair_match_dict[x1][x2]

In [38]:
def calculate_entity_prob_using_attribute(entity_triple_match):
    for o_value, y2_match_list in tqdm(y2_matches.items(), desc='Attribute loop', leave=False):
        y1_match_list = literal_y1_dict[o_value]

        for y1_match in y1_match_list:
            y1_subject = y1_match['subject']

            for y2_match in y2_match_list:
                y2_subject = y2_match['subject']


                y1_pred = str(y1_match['predicate'])
                y2_pred = str(y2_match['predicate'])
                sim = update_prob_entities(entity_triple_match, str(y1_subject), str(y2_subject),
                                     y1_pred, y2_pred, 1)

                if sim > 0.1:
                    save_predicate_pair(predicate_pair_counter, str(y1_subject), y1_pred, o_value, str(y2_subject),
                                        y2_pred, o_value)
                

In [39]:
def get_inbound_neighbors(graph, predicate_list, y, loaded_neighbors_dict):
    if not y:
        return []
    
    if y in loaded_neighbors_dict.keys():
        return loaded_neighbors_dict[y]
    
    y_url = y
    if type(y_url) != URIRef:
        y_url = URIRef(y)
    
    attempts = 0
    while attempts < 10:
        try:
            neighbor_list = list()
            for s, p in graph.subject_predicates(y_url):
                if str(p) in predicate_list:
                    neighbor_list.append({
                        'p': p,
                        's': s
                    })
            return neighbor_list
        except Exception as e:
            print(e)
            attempts += 1
    loaded_neighbors_dict[y] = neighbor_list
    return neighbor_list

In [40]:
def update_prob_entity_neighbors(entity_triple_match, y1_neighbors,
                                 y2_neighbors, y_sim, y1, y2):
    for x1 in y1_neighbors:
        for x2 in y2_neighbors:
            x1_str = str(x1['s'])
            x2_str = str(x2['s'])
            p1_str = str(x1['p'])
            p2_str = str(x2['p'])
            sim = update_prob_entities(entity_triple_match, x1_str, x2_str, p1_str, str(x2['p']),
                                 y_sim)
            if sim > 0.1:
                save_predicate_pair(predicate_pair_counter, x1_str, p1_str, str(y1), x2_str, p2_str, str(y2))

In [41]:
def calculate_entity_prob_using_neighbors(entity_triple_match, entity_pair_dict, loaded_neighbors_dict):
    y1_keys = list(entity_pair_dict.keys())
    for y1 in tqdm(y1_keys, desc='neighbor loop', leave=False):
        y1_neighbors = get_inbound_neighbors(graph_1, graph_1_predicate_list, y1, loaded_neighbors_dict)
        y2_keys = list(entity_pair_dict[y1].keys())
        for y2 in y2_keys:
            y_sim = entity_pair_dict[y1][y2]
            if y_sim < 0.5:
                continue
            
            y2_neighbors = get_inbound_neighbors(graph_2, graph_2_predicate_list, y2, loaded_neighbors_dict)
            update_prob_entity_neighbors(entity_triple_match, y1_neighbors, y2_neighbors, y_sim, y1, y2)
    

In [42]:
def get_prob_entity_pair(entity_pair_match_dict, s1, s2):
    if s1 in entity_pair_match_dict.keys() and s2 in entity_pair_match_dict[s1].keys():
        return 1 - entity_pair_match_dict[s1][s2]
    return 0

In [43]:
def get_set_of_triples_by_predicate(predicate_pair_counter_df, reversed_bool):
    if reversed_bool:
        return predicate_pair_counter_df[['x2', 'p2', 'y2']].drop_duplicates()
    else:
        return predicate_pair_counter_df[['x1', 'p1', 'y1']].drop_duplicates()

In [44]:
def get_columns(reversed_bool):
    if reversed_bool:
        return {
            'x1': 'x2',
            'p1': 'p2',
            'y1': 'y2',
            'x2': 'x1',
            'p2': 'p1',
            'y2': 'y1'
        }
    else:
        return {
            'x1': 'x1',
            'p1': 'p1',
            'y1': 'y1',
            'x2': 'x2',
            'p2': 'p2',
            'y2': 'y2'
        }

In [45]:
def calculate_product(matched_row, entity_pair_match_dict):
    x_sim = 1 - entity_pair_match_dict[matched_row['x1']][matched_row['x2']]
    if matched_row['y1'] == matched_row['y2']:
        y_sim = 1
    else:
        y_sim = entity_pair_match_dict[matched_row['y1']][matched_row['y2']]
    return 1 - x_sim * y_sim

In [46]:
def calculate_sum_by_triple(indexed_df, triple_row):
    matched_triples = indexed_df.join(triple_row)
    return 1.0 - numpy.product(matched_triples['product_element'])

In [47]:
def calculate_upper_sum(entity_pair_match_dict, predicate_pair_counter_df, unique_triples, p1, p2, reversed_bool):
    columns = get_columns(reversed_bool)
    
    filtered_by_ps_df = predicate_pair_counter_df[predicate_pair_counter_df[columns['p2']].isin([p2])]
    
    grouped_df = filtered_by_ps_df.groupby([columns['x1'], columns['y1']])
    product = grouped_df['product_element'].prod()
    return (1 - product).sum()

In [48]:
def calculate_lower_sum(entity_pair_match_dict, predicate_pair_counter_df, unique_triples, p1, reversed_bool):
    columns = get_columns(reversed_bool)
    
    grouped_df = predicate_pair_counter_df.groupby([columns['x1'], columns['y1']])
    product = grouped_df['product_element'].prod()
    
    return (1 - product).sum()

In [49]:
def calculate_x_sim(x1, x2, entity_pair_match_dict, reversed_bool):
    if reversed_bool:
        return 1 - entity_pair_match_dict[x2][x1]
    return 1 - entity_pair_match_dict[x1][x2]

In [50]:
def calculate_y_sim(y1, y2, entity_pair_match_dict, reversed_bool):
    if y1 == y2:
        return 1
    else:
        if reversed_bool:
            if y2 in entity_pair_match_dict.keys() and y1 in entity_pair_match_dict[y2].keys():
                return 1 - entity_pair_match_dict[y2][y1]
        if y1 in entity_pair_match_dict.keys() and y2 in entity_pair_match_dict[y1].keys():
            return 1 - entity_pair_match_dict[y1][y2]
    return 0

In [51]:
def calculate_sub_predicates(predicate_sub_relation, predicate_pair_dict, predicate_pair_counter, entity_pair_match_dict, reversed_bool):
    columns = get_columns(reversed_bool)
    predicate_pair_counter_df = pd.DataFrame(predicate_pair_counter)
    
    predicate_pair_counter_df['x_sim'] = predicate_pair_counter_df.apply(lambda x: calculate_x_sim(x[columns["x1"]], x[columns["x2"]], entity_pair_match_dict, reversed_bool), axis=1)
    predicate_pair_counter_df['y_sim'] = predicate_pair_counter_df.apply(lambda x: calculate_y_sim(x[columns["y1"]], x[columns["y2"]], entity_pair_match_dict, reversed_bool), axis=1)

    predicate_pair_counter_df['product_element'] = predicate_pair_counter_df.apply(lambda x: 1.0 - x['x_sim'] * x['y_sim'], axis=1)

    for p1 in tqdm(predicate_pair_dict.keys(), desc='Predicate loop', leave=False):
        filtered_counter = predicate_pair_counter_df[predicate_pair_counter_df[columns['p1']].isin([p1])]
        if len(filtered_counter) < 10:
            for p2 in predicate_pair_dict[p1].keys():
                predicate_sub_relation.setdefault(p1, {})[p2] = 0.0
        unique_triples = get_set_of_triples_by_predicate(filtered_counter, reversed_bool)
        for p2 in predicate_pair_dict[p1].keys():
            predicate_sub_relation.setdefault(p1, {})[p2] = calculate_upper_sum(entity_pair_match_dict, filtered_counter, unique_triples, p1,
                                                              p2, reversed_bool)
        
        lower_sum = calculate_lower_sum(entity_pair_match_dict, filtered_counter, unique_triples, p1, reversed_bool)
        if lower_sum == 0.0:
            continue
        for p2 in predicate_sub_relation[p1].keys():
            predicate_sub_relation[p1][p2] /= lower_sum
            
            

In [52]:
def get_subject_object(graph, predicate):
    attempts = 0
    while attempts < 10:
        try:
            s_o_list = list()
            for s, o in graph.subject_objects(URIRef(predicate)):
                s_o_list.append({
                        'subject': s,
                        'object': o
                    })
            return s_o_list
        except Exception as e:
            print(e)
            attempts += 1
    
    return s_o_list

In [53]:
def check_if_string(y):
    return type(y) == Literal and y.value and type(y.value) == str

In [54]:
def calculate_sub_pred_product(entity_pair_match_dict, x1, x2, y1, y2):
    if check_if_string(y1) and check_if_string(y2):
        if y1.value != y2.value:
            return 0.0
        y_value = y1.value
        if y_value not in literal_y1_dict.keys() or y_value not in y2_matches.keys():
            return 0.0
        return entity_pair_match_dict[str(x1)][str(x2)]
    return 0.0

In [55]:
def save_entity_pairs(i, entity_pair_dict):
    entity_pair_sim_list = list()
    for y1 in entity_pair_dict.keys():
        for y2 in entity_pair_dict[y1].keys():
            entity_pair_sim_list.append({
                "e1": y1,
                "e2": y2,
                "sim": entity_pair_dict[y1][y2]
            })
    entity_sim_df = pd.DataFrame(entity_pair_sim_list)
    entity_sim_df.to_csv(FILE_FOLDER + "entity_sim_" + str(i) + ".csv")

In [56]:
def save_predicate_pairs_into_df(i, predicate_pair_dict, name):
    predicate_pair_list = list()
    for p1 in predicate_pair_dict.keys():
        for p2 in predicate_pair_dict[p1].keys():
            predicate_pair_list.append({
                "p1": p1,
                "p2": p2,
                "sim": predicate_pair_dict[p1][p2]
            })
    predicate_pair_df = pd.DataFrame(predicate_pair_list)
    predicate_pair_df.to_csv(name + str(i) + ".csv")

In [57]:
def pre_calc_pred_pair_dict(predicate_prob_dict, pred_sub_relation, reversed_bool):
    new_predicate_pair_dict = dict()
    
    for pred_1, pred_2_dict in pred_sub_relation.items():
        for pred_2 in pred_2_dict:
            if reversed_bool:
                if pred_1 not in new_predicate_pair_dict.get(pred_2, {}):
                    new_predicate_pair_dict.setdefault(pred_2, {})[pred_1] = 0
                new_predicate_pair_dict[pred_2][pred_1] += 0.5 * pred_sub_relation[pred_1][pred_2]
            else:
                if pred_2 not in new_predicate_pair_dict.get(pred_1, {}):
                    new_predicate_pair_dict.setdefault(pred_1, {})[pred_2] = 0
                new_predicate_pair_dict[pred_1][pred_2] += 0.5 * pred_sub_relation[pred_1][pred_2]

    for pred_1 in predicate_prob_dict:
        for pred_2 in predicate_prob_dict[pred_1]:
            if reversed_bool:
                if pred_1 not in new_predicate_pair_dict.get(pred_2, {}):
                    new_predicate_pair_dict.setdefault(pred_2, {})[pred_1] = 0
                new_predicate_pair_dict[pred_2][pred_1] += 0.5 * predicate_prob_dict[pred_1][pred_2]
            else:
                if pred_2 not in new_predicate_pair_dict.get(pred_1, {}):
                    new_predicate_pair_dict.setdefault(pred_1, {})[pred_2] = 0
                new_predicate_pair_dict[pred_1][pred_2] += 0.5 * predicate_prob_dict[pred_1][pred_2]

    return new_predicate_pair_dict

In [58]:
def get_entity_vec(graph, entity):
    if entity in label_dict:
        return label_dict[entity]
    predicate = URIRef(LABEL_PREDICATE)
    entity_uri = URIRef(entity)
    
    label = str(next(graph.objects(entity_uri, predicate), ""))
    
    if not label:
        return None
    
    vec = embed_long_sentence(label)
    label_dict[entity] = vec
    
    return vec

In [59]:
def entity_label_bert_similarity(graph_1, graph_2, entity_1, entity_2):
    vec_1 = get_entity_vec(graph_1, entity_1)
    vec_2 = get_entity_vec(graph_2, entity_2)
    
    if vec_1 is None or vec_2 is None:
        return 0.0
    
    similarities = cosine_similarity(vec_1, vec_2)
    return similarities.mean()

In [60]:
def pre_calc_entity_pair_dict(entity_sem_prob_dict, entity_triple_match):
    entity_prob_dict = dict()
    
    for entity_1, entity_2_dict in entity_sem_prob_dict.items():
        for entity_2 in entity_2_dict:
            entity_prob_dict.setdefault(entity_1, {})[entity_2] = 0.5 * entity_sem_prob_dict[entity_1][entity_2]
    
    for entity_1, entity_2_dict in tqdm(entity_triple_match.items(), desc='entity_pair', leave=False):
        for entity_2 in entity_2_dict:
            if entity_2 not in entity_prob_dict.setdefault(entity_1, {}):
                if 1 - entity_triple_match[entity_1][entity_2] < 0.5:
                    entity_prob_dict.setdefault(entity_1, {})[entity_2] = 0
                else:
                    bert_sim = 0.6 * entity_label_bert_similarity(graph_1, graph_2, entity_1, entity_2)
                    entity_sem_prob_dict.setdefault(entity_1, {})[entity_2] = bert_sim
                    entity_prob_dict.setdefault(entity_1, {})[entity_2] = 0.5 * bert_sim
            entity_prob_dict.setdefault(entity_1, {})[entity_2] += 0.5 * (1 - entity_triple_match[entity_1][entity_2])
            
    return entity_prob_dict

In [62]:
loaded_neighbors_dict = dict()
pred_sub_relation_1 = dict()
pred_sub_relation_2 = dict()
entity_triple_match = dict()
label_dict = dict()
entity_pair_dict = pre_calc_entity_pair_dict(entity_sem_prob_dict, entity_triple_match)

for i in tqdm(range(10), desc='Main loop'):
    predicate_pair_dict_1 =pre_calc_pred_pair_dict(predicate_prob_dict, pred_sub_relation_1, False)
    predicate_pair_dict_2 = pre_calc_pred_pair_dict(predicate_prob_dict, pred_sub_relation_2, True)
    predicate_pair_counter = list()
    
    save_predicate_pairs_into_df(i, predicate_pair_dict_1, FILE_FOLDER + "predicate_pair_1_")
    save_predicate_pairs_into_df(i, predicate_pair_dict_2, FILE_FOLDER + "predicate_pair_2_")
    
    calculate_entity_prob_using_attribute(entity_triple_match)     
    
    calculate_entity_prob_using_neighbors(entity_triple_match, entity_pair_dict, loaded_neighbors_dict)
    
    entity_pair_dict = pre_calc_entity_pair_dict(entity_sem_prob_dict, entity_triple_match)
    entity_triple_match = dict()
    save_entity_pairs(i, entity_pair_dict)
    
    #calculate_prob_predicates(predicate_pair_dict, new_entity_pair_match_dict)
    calculate_sub_predicates(pred_sub_relation_1, predicate_pair_dict_1, predicate_pair_counter, entity_pair_dict, False)
    calculate_sub_predicates(pred_sub_relation_2, predicate_pair_dict_2, predicate_pair_counter, entity_pair_dict, True)


entity_pair: 0it [00:00, ?it/s]

Main loop:   0%|          | 0/10 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/13935 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/29766 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/435 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/30603 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/31456 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/32239 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/32403 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/33171 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/32426 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/33193 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/32438 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/33205 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/32448 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/33215 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/32456 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/33223 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/32461 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/33227 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/32464 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

Attribute loop:   0%|          | 0/38512 [00:00<?, ?it/s]

neighbor loop:   0%|          | 0/33229 [00:00<?, ?it/s]

entity_pair:   0%|          | 0/32468 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/339 [00:00<?, ?it/s]

Predicate loop:   0%|          | 0/759 [00:00<?, ?it/s]

In [63]:
def get_abstract_vector(hidden_state_dict, graph, entity, abstract_predicate):
    if entity in hidden_state_dict.keys():
        return hidden_state_dict[entity]
    
    triple = (URIRef(entity), abstract_predicate, None)
    
    if not triple in graph:
        return None
    
    objects = graph.objects(URIRef(entity), abstract_predicate)
    abstract = next(objects, '')
    
    if not abstract:
        return None
    
    embed = embed_long_sentence(abstract)
    hidden_state_dict[entity] = embed
    
    return embed

In [64]:
def calculate_abstract_similarity(entity_1, entity_2):
    abstract_predicate_1 = URIRef(GRAPH_1_ABSTRACT)
    abstract_predicate_2 = URIRef(GRAPH_2_ABSTRACT)
    
    vec_1 = get_abstract_vector(vector_dict, graph_1, entity_1, abstract_predicate_1)
    vec_2 = get_abstract_vector(vector_dict, graph_2, entity_2, abstract_predicate_2)
    
    if vec_1 is None or vec_2 is None:
        return 0.0

    similarities = cosine_similarity(vec_1, vec_2)
    return max(map(max, similarities))

In [65]:
query = """
        SELECT ?instance
    WHERE {
      ?instance rdf:type ?type .
      ?type a owl:Class.
    }
        """

instance_1_list = list()
for res in graph_1.query(query):
    instance_1_list.append(str(res[0]))

In [66]:
query = """
        SELECT ?instance
    WHERE {
      ?instance rdf:type ?type .
      ?type a owl:Class.
    }
        """

instance_2_list = list()
for res in graph_2.query(query):
    instance_2_list.append(str(res[0]))

In [67]:
vector_dict = dict()

final_entity_sim_list = list()

for entity_1, entity_2_dict in tqdm(entity_pair_dict.items()):
    
    if entity_1 not in instance_1_list:
        continue
    for entity_2 in entity_2_dict:
        if entity_2 not in instance_2_list:
            continue
        abstract_sim = 0.0
        if entity_2_dict[entity_2] > 0.6:
            abstract_sim = calculate_abstract_similarity(entity_1, entity_2)
            
        final_sim = (2/3) * entity_2_dict[entity_2] + (1/3) * abstract_sim
        
        final_entity_sim_list.append({
            'e1': entity_1,
            'e2': entity_2,
            'sim': final_sim
        })

  0%|          | 0/33233 [00:00<?, ?it/s]

In [68]:
final_entity_sim_df = pd.DataFrame(final_entity_sim_list)

In [69]:
final_entity_sim_df.to_csv(FILE_FOLDER + "final_entity_sim.csv")

In [114]:
entity_pair_dict

{'http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Ten_Forward': {'http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Ten_Forward': 0.8762284603214602},
 'http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Category:Ten_Forward': {'http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Ten_Forward': 0.6894521425681279},
 'http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Arabic': {'http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Arabic': 0.8762284603214602},
 'http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Xenophobia': {'http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Xenophobia': 0.9342980966506069},
 'http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Pregnancy': {'http://dbkwik.webdatacommons.org/stexpanded.wikia.com/resource/Pregnancy': 0.9342980966506069},
 'http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/resource/Spock': {'http://dbkwik.webdatacomm

In [69]:
def get_entity_of_class(graph, type_class):
    TYPE_PREDICATE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
    predicate = URIRef(TYPE_PREDICATE)
    
    object_class = URIRef(type_class)
    
    subject_list = list()
    for subject in graph.subjects(predicate, object_class):
        subject_list.append(str(subject))
    return subject_list

In [88]:
graph_1_class_list = get_graph_classes(graph_1)
graph_2_class_list = get_graph_classes(graph_2)


In [89]:
def calculate_sub_relation(subject_list_1, subject_list_2):
    sum_pair = 0.0
    for subject_1 in subject_list_1:
        total_subject = 1.0
        for subject_2 in subject_list_2:
            if subject_1 in entity_pair_dict.keys() and subject_2 in entity_pair_dict[subject_1].keys():
                total_subject *= 1 - entity_pair_dict[subject_1][subject_2]
        
        sum_pair += 1.0 - total_subject
    
    return sum_pair

In [90]:
class_sub_relation_list = list()

for type_class_1 in tqdm(graph_1_class_list, desc='type 1'):
    subject_list_1 = get_entity_of_class(graph_1, type_class_1)
    
    total_subjects_1 = len(subject_list_1)
    if total_subjects_1 == 0:
        continue
    for type_class_2 in tqdm(graph_2_class_list, desc='type 2', leave=False):
        subject_list_2 = get_entity_of_class(graph_2, type_class_2)
        
        total_sum_relation = calculate_sub_relation(subject_list_1, subject_list_2)
        
        class_sub_relation_list.append({
            'class1': type_class_1,
            'class2': type_class_2,
            'subclass': total_sum_relation / total_subjects_1
        })

type 1:   0%|          | 0/181 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

type 2:   0%|          | 0/283 [00:00<?, ?it/s]

In [91]:
sub_relation_df = pd.DataFrame(class_sub_relation_list)

In [92]:
pd.set_option('display.max_colwidth', None)


In [94]:
sub_relation_df[sub_relation_df['subclass'] > 0.5].sort_values('subclass', ascending=False)

Unnamed: 0,class1,class2,subclass
43834,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/sidebar_species_xindi,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/battle_data,0.998141
31486,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/novel_nav,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/fanfic_episode,0.992436
42523,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/sidebar_film,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/fanfic_episode,0.9068
43651,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/sidebar_species_xindi,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/character,0.883932
12286,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/sidebar_year,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/yearbox,0.877747
15317,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/decade_nav,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/timeline,0.876883
40358,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/omid,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/ds9,0.876228
43697,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/sidebar_species_xindi,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/ent,0.876228
42453,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/sidebar_film,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/fanfilm_episode,0.837127
18795,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/timeline_nav,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/yearbox,0.793217


In [68]:
pd.set_option('display.max_colwidth', None)


In [96]:
class_list = list()

for class_1, class_2_dict in class_sem_prob_dict.items():
    for class_2 in class_2_dict:
        class_list.append({'class_1': class_1,
                          'class_2': class_2,
                          'sim': class_2_dict[class_2]})

In [97]:
class_df = pd.DataFrame(class_list)

In [99]:
one_one_class_df = class_df.sort_values('sim', ascending=False).drop_duplicates('class_1').drop_duplicates('class_2')

In [100]:
one_one_class_df[one_one_class_df['sim'] > 0.5]

Unnamed: 0,class_1,class_2,sim
0,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/wikipedia,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/wikipedia,0.9
1692,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/%27,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/%27,0.9
2539,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/tas,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/tas,0.9
846,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/m,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/m,0.9
1410,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/tos,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/tos,0.9
282,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/voy,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/voy,0.9
4513,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/mb,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/mb,0.9
564,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/w,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/w,0.9
4231,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/e,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/e,0.9
3949,http://dbkwik.webdatacommons.org/memory-alpha.wikia.com/class/sidebar_species,http://dbkwik.webdatacommons.org/stexpanded.wikia.com/class/sidebar_species,0.9


In [101]:
len(one_one_class_df[one_one_class_df['sim'] > 0.5])

60

In [102]:
def get_graph_classes(graph):
    query = """
        select * {?s a rdfs:Resource.}
        """
    class_list = list()
    
    for res in graph.query(query):
        class_list.append(res[0])
    return class_list