In [1]:
from transformers import AutoTokenizer, AutoModel
import dill
from transformers import BertTokenizer, BertModel
import struct
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import itertools
import numpy as np
from om.match import onts, aligns, Step, Runner
from om.ont import get_n, tokenize
from rdflib import Graph
from rdflib.term import URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS, OWL
from termcolor import colored
from py_stringmatching import Levenshtein, SoftTfIdf, JaroWinkler
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from sklearn.model_selection import KFold
from utils import gn, pad_encode, metrics
from datasets import build_dataset1, build_dataset2
from models import Finbank, Model1, Model2
from property_matching import is_property, PropertyMatcher
from sentence_transformers import SentenceTransformer

torch.manual_seed(0)
random.seed(0)
np.random.seed(0)

In [2]:
l, x, y = build_dataset1('/projets/melodi/gsantoss/data/yago/yago-class.nt',
                         '/projets/melodi/gsantoss/data/yago/yago-schema.nt',
                         '/projets/melodi/gsantoss/data/semclass1.pyo')



tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
x1 = tokenizer(x, return_tensors='pt', padding=True)
x1_ids = x1['input_ids']
x1_attention_mask = x1['attention_mask']
vocab = set(itertools.chain(*(list(map(str.split, l)) + list(map(str.split, x)))))
word_index = {q: (i + 1) for i, q in enumerate(vocab)}
l2 = pad_encode(l, word_index)
x2 = pad_encode(x, word_index)

print(l2.shape, x2.shape)
dataset = list(zip(x1_ids, x1_attention_mask, l2, x2, y))
print(len(dataset))


10148
torch.Size([8063, 9]) torch.Size([8063, 70])
8063


In [3]:
kf = KFold(n_splits=10)

crit = nn.NLLLoss()

model1 = Model1(6)
model1.cuda(0)

optimizer1 = optim.Adam(model1.parameters(), lr=0.00003)

for x1i, x1a, x2l, x2x, y in DataLoader(dataset, batch_size=32, shuffle=True):
    optimizer1.zero_grad()

    out1 = model1(x1i.cuda(0), x1a.cuda(0)).cpu()

    l1 = crit(out1, y)
    l1.backward()


    optimizer1.step()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
lev = Levenshtein()

In [5]:
def match_class(o1, o2):
    correct = 0
    pred = 0
    total = 0
    iterations = 0
    for r, k1, k2 in onts(o1, o2):

        print('-' * 100)
        print(k1.split('/')[-1], k2.split('/')[-1])

        o1 = Graph().parse(k1)
        o2 = Graph().parse(k2)

        als = set(aligns(r))

        pa = set()

        for a1, a2 in als:

            if not is_property(a1, o1) and not is_property(a2, o2):
                total += 1
                pa.add((a1, a2))

                print(colored('#', 'blue'), get_n(a1, o1), colored('<>', 'green'), get_n(a2, o2))

        p = set()
        oi = 0
        for step in range(1):

            for e1 in set(o1.subjects()):
                if is_property(e1, o1) or type(e1) is BNode:
                    continue
                for e2 in set(o2.subjects()):
                    if is_property(e2, o2) or type(e2) is BNode:
                        continue

                    oi += 1
                    iterations += 1
                    if lev.get_sim_score(get_n(e1, o1).lower(), get_n(e2, o2).lower()) > 0.85:
                        p.add((e1, e2))

        pred += len(p)
        correct += len(pa.intersection(p))

        for a1, a2 in pa.intersection(p):
            print(colored('✓', 'green'), get_n(a1, o1), get_n(a2, o2))

        for a1, a2 in p.difference(pa):
            print(colored('X', 'red'), get_n(a1, o1), colored('<>', 'green'), get_n(a2, o2))

        print('ontology iterations:', oi)

    print(f'iterations: {iterations}, {metrics(correct, pred, total)}')


match_class('/projets/melodi/gsantoss/data/conference', '/projets/melodi/gsantoss/data/reference')

----------------------------------------------------------------------------------------------------
cmt.owl Conference.owl
[34m#[0m PaperAbstract [32m<>[0m Abstract
[34m#[0m Conference [32m<>[0m Conference
[34m#[0m Conference [32m<>[0m Conference_volume
[34m#[0m Review [32m<>[0m Review
[34m#[0m Document [32m<>[0m Conference_document
[34m#[0m ProgramCommittee [32m<>[0m Program_committee
[34m#[0m Preference [32m<>[0m Review_preference
[34m#[0m Author [32m<>[0m Regular_author
[34m#[0m SubjectArea [32m<>[0m Topic
[34m#[0m Co-author [32m<>[0m Contribution_co-author
[34m#[0m Person [32m<>[0m Person
[34m#[0m Chairman [32m<>[0m Chair
[32m✓[0m Person Person
[32m✓[0m Conference Conference
[32m✓[0m ProgramCommittee Program_committee
[32m✓[0m Review Review
[31mX[0m Paper [32m<>[0m Paper
[31mX[0m Reviewer [32m<>[0m Reviewer
ontology iterations: 1800
--------------------------------------------------------------------------------------

In [9]:
def match_classes_with_model_filter(o1, o2, tokenizer, model):
    correct = 0
    pred = 0
    total = 0
    iterations = 0
    for r, k1, k2 in onts(o1, o2):


        print('-' * 100)
        print(k1.split('/')[-1], k2.split('/')[-1])

        o1 = Graph().parse(k1)
        o2 = Graph().parse(k2)

        als = set(aligns(r))

        pa = set()

        for a1, a2 in als:

            if not is_property(a1, o1) and not is_property(a2, o2):
                total += 1
                pa.add((a1, a2))

                print(colored('#', 'blue'), get_n(a1, o1), colored('<>', 'green'), get_n(a2, o2))

        p1 = []

        for e1 in set(o1.subjects()):
            if is_property(e1, o1) or type(e1) is BNode:
                continue

            p1.append(e1)

        p2 = []

        for e2 in set(o2.subjects()):
            if is_property(e2, o2) or type(e2) is BNode:
                continue

            p2.append(e2)

        p1d = [' '.join(map(str.lower, tokenize(get_n(x, o1)))) for x in p1]
        p2d = [' '.join(map(str.lower, tokenize(get_n(x, o2)))) for x in p2]

        tk = tokenizer(p1d, return_tensors='pt', padding=True)

        idx = tk['input_ids']
        atn = tk['attention_mask']

        with torch.no_grad():
            out1 = model(idx.cuda(0), atn.cuda(0)).exp().cpu()

        tk = tokenizer(p2d, return_tensors='pt', padding=True)

        idx = tk['input_ids']
        atn = tk['attention_mask']

        with torch.no_grad():
            out2 = model(idx.cuda(0), atn.cuda(0)).exp().cpu()

        cl1 = out1.argmax(dim=1)
        cl2 = out2.argmax(dim=1)
        sim = cl1.unsqueeze(1) == cl2.unsqueeze(0)

        nz = list(map(lambda x: (p1[x[0].item()], p2[x[1].item()]), sim.nonzero()))

        p = set()
        oi = 0
        for step in range(1):

            for e1, e2 in nz:

                oi += 1
                iterations += 1
                if lev.get_sim_score(get_n(e1, o1).lower(), get_n(e2, o2).lower()) > 0.85:
                    p.add((e1, e2))

        pred += len(p)
        correct += len(pa.intersection(p))

        for a1, a2 in pa.intersection(p):
            print(colored('✓', 'green'), get_n(a1, o1), get_n(a2, o2))

        for a1, a2 in p.difference(pa):
            print(colored('X', 'red'), get_n(a1, o1), colored('<>', 'green'), get_n(a2, o2))

        print('ontology iterations:', oi)

    print(f'iterations: {iterations}, {metrics(correct, pred, total)}')


match_classes_with_model_filter('/projets/melodi/gsantoss/data/conference', '/projets/melodi/gsantoss/data/reference', tokenizer, model1)

----------------------------------------------------------------------------------------------------
cmt.owl Conference.owl
[34m#[0m PaperAbstract [32m<>[0m Abstract
[34m#[0m Conference [32m<>[0m Conference
[34m#[0m Conference [32m<>[0m Conference_volume
[34m#[0m Review [32m<>[0m Review
[34m#[0m Document [32m<>[0m Conference_document
[34m#[0m ProgramCommittee [32m<>[0m Program_committee
[34m#[0m Preference [32m<>[0m Review_preference
[34m#[0m Author [32m<>[0m Regular_author
[34m#[0m SubjectArea [32m<>[0m Topic
[34m#[0m Co-author [32m<>[0m Contribution_co-author
[34m#[0m Person [32m<>[0m Person
[34m#[0m Chairman [32m<>[0m Chair
[32m✓[0m Person Person
[32m✓[0m Conference Conference
[32m✓[0m ProgramCommittee Program_committee
[32m✓[0m Review Review
[31mX[0m Paper [32m<>[0m Paper
[31mX[0m Reviewer [32m<>[0m Reviewer
ontology iterations: 430
---------------------------------------------------------------------------------------

In [10]:

wm = Finbank('/projets/melodi/gsantoss/data/embeddings/fb.txt', '/projets/melodi/gsantoss/data/embeddings/fbe.bin')
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

property_matcher = PropertyMatcher(wm, model)

In [11]:
property_matcher.match_with_model_filter('/projets/melodi/gsantoss/data/conference', '/projets/melodi/gsantoss/data/reference', tokenizer, model1)


----------------------------------------------------------------------------------------------------
cmt.owl Conference.owl
[34m#[0m Reviewer assignExternalReviewer ExternalReviewer [32m<>[0m Reviewer invites_co-reviewers Reviewer
[34m#[0m Person email string [32m<>[0m Person has_an_email string
[34m#[0m ExternalReviewer assignedByReviewer Reviewer [32m<>[0m Reviewer invited_by Reviewer
[32m✓[0m email has_an_email
ontology iterations: 1608
----------------------------------------------------------------------------------------------------
confOf.owl iasted.owl
ontology iterations: 348
----------------------------------------------------------------------------------------------------
edas.owl sigkdd.owl
[34m#[0m Conference startDate dateTime [32m<>[0m Conference Start_of_conference dateTime
[34m#[0m Conference hasName string [32m<>[0m Conference Name_of_conference string
[34m#[0m Sponsorship hasCostAmount int [32m<>[0m Registration_fee Price int
[34m#[0m Con

In [12]:
c, x, y = build_dataset2('/projets/melodi/gsantoss/data/dataset2.csv')
x1 = tokenizer(list(x), return_tensors='pt', padding=True)
x1_ids = x1['input_ids']
x1_attention_mask = x1['attention_mask']
print(x1_ids.shape)
vocab = set(itertools.chain(*(list(map(str.split, c)) + list(map(str.split, x)))))

word_index = {q: (i + 1) for i, q in enumerate(vocab)}
print(len(word_index))
l2 = pad_encode(c, word_index)
x2 = pad_encode(x, word_index)

print(l2.shape, x2.shape)
dataset = list(zip(x1_ids, x1_attention_mask, l2, x2, y))
print(len(dataset))



4035
torch.Size([20175, 18])
30990
torch.Size([20175, 66]) torch.Size([20175, 6])
20175


In [13]:
kf = KFold(n_splits=10)

crit = nn.NLLLoss()

model1 = Model1(5)
model1.cuda(0)
optimizer1 = optim.Adam(model1.parameters(), lr=0.00003)

for x1i, x1a, x2l, x2x, y in DataLoader(dataset, batch_size=32, shuffle=True):
    optimizer1.zero_grad()

    out1 = model1(x1i.cuda(0), x1a.cuda(0)).cpu()

    l1 = crit(out1, y)
    l1.backward()

    optimizer1.step()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
match_classes_with_model_filter('/projets/melodi/gsantoss/data/conference', '/projets/melodi/gsantoss/data/reference', tokenizer, model1)

----------------------------------------------------------------------------------------------------
cmt.owl Conference.owl
[34m#[0m PaperAbstract [32m<>[0m Abstract
[34m#[0m Conference [32m<>[0m Conference
[34m#[0m Conference [32m<>[0m Conference_volume
[34m#[0m Review [32m<>[0m Review
[34m#[0m Document [32m<>[0m Conference_document
[34m#[0m ProgramCommittee [32m<>[0m Program_committee
[34m#[0m Preference [32m<>[0m Review_preference
[34m#[0m Author [32m<>[0m Regular_author
[34m#[0m SubjectArea [32m<>[0m Topic
[34m#[0m Co-author [32m<>[0m Contribution_co-author
[34m#[0m Person [32m<>[0m Person
[34m#[0m Chairman [32m<>[0m Chair
[32m✓[0m Person Person
[32m✓[0m Conference Conference
[32m✓[0m ProgramCommittee Program_committee
[32m✓[0m Review Review
[31mX[0m Paper [32m<>[0m Paper
[31mX[0m Reviewer [32m<>[0m Reviewer
ontology iterations: 592
---------------------------------------------------------------------------------------

In [15]:
property_matcher.match_with_model_filter('/projets/melodi/gsantoss/data/conference', '/projets/melodi/gsantoss/data/reference', tokenizer, model1)

----------------------------------------------------------------------------------------------------
cmt.owl Conference.owl
[34m#[0m Reviewer assignExternalReviewer ExternalReviewer [32m<>[0m Reviewer invites_co-reviewers Reviewer
[34m#[0m Person email string [32m<>[0m Person has_an_email string
[34m#[0m ExternalReviewer assignedByReviewer Reviewer [32m<>[0m Reviewer invited_by Reviewer
[32m✓[0m email has_an_email
ontology iterations: 1990
----------------------------------------------------------------------------------------------------
confOf.owl iasted.owl
ontology iterations: 776
----------------------------------------------------------------------------------------------------
edas.owl sigkdd.owl
[34m#[0m Conference startDate dateTime [32m<>[0m Conference Start_of_conference dateTime
[34m#[0m Conference hasName string [32m<>[0m Conference Name_of_conference string
[34m#[0m Sponsorship hasCostAmount int [32m<>[0m Registration_fee Price int
[34m#[0m Con