In [None]:
import torch

# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Only query the GPU name when CUDA is usable: get_device_name(0) raises on a
# CPU-only runtime, which would abort the cell before `device` is defined.
if torch.cuda.is_available():
  print(torch.cuda.get_device_name(0))
print('Torch', torch.__version__, 'CUDA', torch.version.cuda)
print(device)

Tesla T4
Torch 1.12.0+cu113 CUDA 11.3
cuda


Install dependencies and import libraries

In [None]:
# Install the `transformers` and `datasets` packages imported by the cells below.
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, DistilBertTokenizerFast
from datasets import load_dataset
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

Preprocess data

In [None]:
# Folder holding the property mapping CSV and the web tables; later cells
# build their paths by appending to it, so the trailing slash matters.
base_path = 'drive/MyDrive/Colab Notebooks/ontology-matching/'

# The CSV is read without a header row (header=None); the three column names
# are supplied explicitly instead.
property_mapping = pd.read_csv(
    base_path + "gs_property.csv",
    header=None,
    names=['column', 'property', 'match'],
)
property_mapping

Unnamed: 0,column,property,match
0,68779923_1_3240042497463101224.csv~Col4,http://dbpedia.org/ontology/governmentType,True
1,10630177_0_4831842476649004753.csv~Col2,http://www.w3.org/2000/01/rdf-schema#label,True
2,78891639_0_3299957631631122948.csv~Col3,http://dbpedia.org/ontology/populationTotal,True
3,25404227_0_2240631045609013057.csv~Col3,http://dbpedia.org/ontology/director,True
4,71840765_0_6664391841933033844.csv~Col2,http://dbpedia.org/ontology/elevation,True
...,...,...,...
327,47709681_0_4437772923903322343.csv~Col0,http://www.w3.org/2000/01/rdf-schema#label,True
328,86627271_6_2239821927452848323.csv~Col2,http://dbpedia.org/ontology/areaTotal,True
329,21245481_0_8730460088443117515.csv~Col1,http://www.w3.org/2000/01/rdf-schema#label,True
330,55027702_0_628532586316851176.csv~Col2,http://dbpedia.org/ontology/elevation,True


In [None]:
import json

class PropertyIndex:
  """Two-way mapping between property names and dense integer ids.

  Ids are handed out in first-seen order starting at 0. The index can be
  written to, and restored from, a three-line text file (see `persist`).
  """

  def __init__(self, persisted_file=None) -> None:
    """Create an empty index, or restore one from `persisted_file` if given."""
    if persisted_file is None:
      self.property_ids = {}    # property name -> id
      self.property_names = {}  # id -> property name
      self.counter = 0          # next id to hand out
    else:
      self.load(persisted_file)

  def get_property_id(self, column_name):
    """Return the id registered for `column_name`, assigning a new one if unseen.

    Args:
      column_name: Hashable key to look up (the notebook passes property URIs).

    Returns:
      The integer id associated with `column_name`.
    """
    # Key on the argument itself rather than on any outer-scope name, so the
    # result depends only on what the caller passed in.
    if column_name not in self.property_ids:
      self.property_ids[column_name] = self.counter
      self.property_names[self.counter] = column_name
      self.counter += 1
    return self.property_ids[column_name]

  def get_property_name(self, id):
    """Return the name registered under integer `id`, or None if unknown."""
    return self.property_names.get(id)

  def num_labels(self):
    """Return the label count reported to the classifier (`counter + 1`).

    NOTE(review): ids run from 0 to counter - 1, so this is one more than the
    number of registered properties. The value is left unchanged because it
    determines the size of the classification head of models that may already
    have been trained and saved with it -- confirm before changing.
    """
    return self.counter + 1

  def load(self, persisted_file):
    """Restore the index from a file written by `persist`.

    Args:
      persisted_file: Path of the three-line text file to read.
    """
    with open(persisted_file, 'r') as f:
      # Each line is a JSON document held in a string, hence json.loads
      # (json.load expects a file object, not a str).
      self.property_ids = json.loads(f.readline())
      # JSON object keys are always strings; convert them back to the integer
      # ids so get_property_name(<int>) works after a reload.
      self.property_names = {
          int(key): name for key, name in json.loads(f.readline()).items()
      }
      self.counter = int(f.readline())

  def persist(self, filename):
    """Write the index to `filename` as three lines.

    Line 1 is the name->id mapping as JSON, line 2 the id->name mapping as
    JSON (ids become string keys), line 3 the counter.
    """
    id_mapping = json.dumps(self.property_ids)
    name_mapping = json.dumps(self.property_names)
    with open(filename, "w") as f:
      f.write(id_mapping + "\n")
      f.write(name_mapping + "\n")
      f.write(str(self.counter) + "\n")


In [None]:
def extract_filename_and_col_index(str):
  """Split a column reference into its file name and column index.

  Args:
    str: Reference of the form ``"<filename>~Col<N>"``, for example
      ``"68779923_1_3240042497463101224.csv~Col4"``. The parameter name
      shadows the builtin ``str``; it is kept so existing callers still work.

  Returns:
    Tuple ``(filename, N)`` with ``N`` converted to ``int``.

  Raises:
    ValueError: If there is no ``~`` separator, or the text after the first
      three characters of the column part is not an integer.
  """
  # Split on the LAST '~' only, so a file name that itself contains '~' is
  # still parsed instead of failing to unpack.
  filename, col = str.rsplit('~', 1)
  # Skip the three-character "Col" prefix and parse the remaining digits.
  return filename, int(col[3:])

In [None]:
property_index = PropertyIndex()

# Collect one frame per mapped column and concatenate once after the loop;
# calling pd.concat inside the loop re-copies all earlier rows every time.
column_frames = []

# NOTE(review): the loop variable `property` shadows the builtin of the same
# name. The module-level name is kept because other cells may read it.
for column, property in zip(property_mapping['column'], property_mapping['property']):
  property_id = property_index.get_property_id(property)

  filename, col_index = extract_filename_and_col_index(column)
  # dtype=str keeps every cell as text so the values reach the tokenizer as
  # strings; missing cells are still NaN and are dropped below.
  webtable = pd.read_csv(base_path + "webtables/" + filename, header=None, dtype=str)
  df = webtable.iloc[:, col_index].rename("value").to_frame()
  # Every value of this column carries the same label.
  df['property_id'] = int(property_id)
  column_frames.append(df)

if column_frames:
  values = pd.concat(column_frames, ignore_index=True)
else:
  # No mapped columns: fall back to an empty, correctly typed frame.
  values = pd.DataFrame({'value': pd.Series(dtype='str'), 'property_id': pd.Series(dtype='int')})

values = values.dropna()

all_texts = values["value"].tolist()
all_labels = values['property_id'].tolist()

# Fixed random_state so the 80/20 split is the same on every run.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts, all_labels, test_size=.2, random_state=42
)
print(f"Train size: {len(train_texts)}, Test size: {len(test_texts)}")

# Save the name<->id mapping next to the data.
property_index.persist(base_path + "property_index.txt")

Train size: 33041, Test size: 8261


Create a custom dataset

In [None]:
class WebtableDataset(Dataset):
  """Torch dataset that pairs per-field encodings with a parallel label list."""

  def __init__(self, encodings, labels, name):
    """Keep references to the encodings, the labels and a name for the split.

    Args:
      encodings: Mapping from field name to an indexable sequence with one
        entry per sample.
      labels: Indexable sequence with one label per sample.
      name: Identifier stored on the instance as `name`.
    """
    super().__init__()
    self.encodings = encodings
    self.labels = labels
    self.name = name

  def __len__(self):
    """Number of samples, taken from the label sequence."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return sample `idx` as a dict of tensors, including a 'labels' entry."""
    sample = {}
    for field, column in self.encodings.items():
      sample[field] = torch.tensor(column[idx])
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample


Load the DistilBERT tokenizer and model

In [None]:
# Load the pretrained tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.20.1",
  "vocab_size": 30522
}

loading file https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/0e1bbfda7f63a99bb52e3915dcf10

In [None]:

# Both splits are tokenised with the same options: truncate over-long inputs
# and pad within each split.
tokenize_kwargs = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

# Wrap encodings and labels so they can be handed to the Trainer.
training_data = WebtableDataset(encodings=train_encodings, labels=train_labels, name='train')
test_data = WebtableDataset(encodings=test_encodings, labels=test_labels, name='test')

In [None]:
def compute_metrics(pred):
  """Compute accuracy for an evaluation result.

  Args:
    pred: Object exposing `label_ids` (reference labels) and `predictions`
      (scores whose last axis is reduced with argmax to pick a class).

  Returns:
    Dict with a single 'accuracy' entry.
  """
  predicted_classes = pred.predictions.argmax(-1)
  # Accuracy comes from sklearn's accuracy_score.
  return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

In [None]:
def preprocess_function(text):
    """Tokenise `text` with the notebook's tokenizer, truncating long inputs."""
    encoded = tokenizer(text, truncation=True)
    return encoded

# Padding collator built from the tokenizer.
# NOTE(review): it is not passed to the Trainer below, so it is unused here.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Sequence-classification model whose label count comes from the property index.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=property_index.num_labels(),
)
# The former `model = model` self-assignment was a no-op and has been removed;
# the Trainer places the model on its training device itself.

training_args = TrainingArguments(
    output_dir=base_path + "results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    evaluation_strategy="steps"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics
)

# Keep the training summary so a later cell can display it.
results = trainer.train()

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
 

Step,Training Loss,Validation Loss,Accuracy
500,2.3103,1.470279,0.673889
1000,1.2808,1.011921,0.73623
1500,0.9652,0.854922,0.764677
2000,0.8203,0.746403,0.78453
2500,0.6784,0.704161,0.79034
3000,0.645,0.667168,0.79494
3500,0.611,0.636555,0.804261
4000,0.5812,0.622814,0.808498
4500,0.5237,0.602059,0.814913
5000,0.4829,0.595967,0.81334


***** Running Evaluation *****
  Num examples = 8261
  Batch size = 16
Saving model checkpoint to drive/MyDrive/Colab Notebooks/ontology-matching/results/checkpoint-500
Configuration saved in drive/MyDrive/Colab Notebooks/ontology-matching/results/checkpoint-500/config.json
Model weights saved in drive/MyDrive/Colab Notebooks/ontology-matching/results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8261
  Batch size = 16
Saving model checkpoint to drive/MyDrive/Colab Notebooks/ontology-matching/results/checkpoint-1000
Configuration saved in drive/MyDrive/Colab Notebooks/ontology-matching/results/checkpoint-1000/config.json
Model weights saved in drive/MyDrive/Colab Notebooks/ontology-matching/results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8261
  Batch size = 16
Saving model checkpoint to drive/MyDrive/Colab Notebooks/ontology-matching/results/checkpoint-1500
Configuration saved in drive/MyDrive/Colab Notebooks/

In [None]:
# Run the trainer's evaluation and display the returned metrics dict.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 8261
  Batch size = 16


{'epoch': 5.0,
 'eval_accuracy': 0.819392325384336,
 'eval_loss': 0.5745163559913635,
 'eval_runtime': 7.7766,
 'eval_samples_per_second': 1062.293,
 'eval_steps_per_second': 66.482}

In [None]:
# Display the value returned by trainer.train().
results

TrainOutput(global_step=10330, training_loss=0.6439428257457516, metrics={'train_runtime': 1367.5231, 'train_samples_per_second': 120.806, 'train_steps_per_second': 7.554, 'total_flos': 2482703496941040.0, 'train_loss': 0.6439428257457516, 'epoch': 5.0})

In [None]:
def get_prediction(text, top_k=3):
    """Return the most probable properties for `text`.

    Args:
        text: Input string to classify. If a list is passed, only the
            predictions for its first element are returned.
        top_k: Maximum number of candidates to return (default 3). It is
            clamped to the number of model outputs.

    Returns:
        List of ``{'key': <property name>, 'prob': <probability>}`` dicts,
        ordered from most to least probable.

    Note:
        The function does not switch the model to eval mode; call
        ``model.eval()`` beforehand.
    """
    # Send the inputs to the device the model actually lives on, so the call
    # works regardless of where the model was placed.
    model_device = next(model.parameters()).device
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(model_device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Softmax over the class axis turns the logits into probabilities.
    probs = outputs[0].softmax(1)
    # topk raises if k exceeds the number of classes, so clamp it.
    k = min(top_k, probs.shape[1])
    top_probs, top_idxs = probs.topk(k)
    # Unwrap the batch dimension (first input only).
    top_idxs = top_idxs.tolist()[0]
    top_probs = top_probs.tolist()[0]
    return [
        {'key': property_index.get_property_name(label_id), 'prob': prob}
        for label_id, prob in zip(top_idxs, top_probs)
    ]


In [None]:
# Put the model in evaluation mode before running the sample prediction.
model.eval()

# Show the predictions returned for a sample input string.
get_prediction("germany")

[{'key': 'http://dbpedia.org/ontology/country', 'prob': 0.9325388073921204},
 {'key': 'http://www.w3.org/2000/01/rdf-schema#label',
  'prob': 0.04996142536401749},
 {'key': 'http://dbpedia.org/ontology/collectionSize',
  'prob': 0.004149576183408499}]

In [None]:
# Model and tokenizer are written to the same folder under base_path.
model_path = f"{base_path}ontology-matching-base-uncased"

model.save_pretrained(model_path)
# Last expression of the cell: its return value is shown as the cell output.
tokenizer.save_pretrained(model_path)

Configuration saved in drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/config.json
Model weights saved in drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/tokenizer_config.json
Special tokens file saved in drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/special_tokens_map.json


('drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/tokenizer_config.json',
 'drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/special_tokens_map.json',
 'drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/vocab.txt',
 'drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/added_tokens.json',
 'drive/MyDrive/Colab Notebooks/ontology-matching/ontology-matching-base-uncased/tokenizer.json')