In [1]:
!pip install transformers==3.0.2 -q

[K     |████████████████████████████████| 778kB 15.9MB/s 
[K     |████████████████████████████████| 3.0MB 55.2MB/s 
[K     |████████████████████████████████| 1.2MB 51.6MB/s 
[K     |████████████████████████████████| 890kB 59.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import os
from google.colab import files
import random
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
import pickle
import numpy as np
from sklearn import metrics
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd drive/MyDrive/WSD-projekt

/content/drive/MyDrive/WSD-projekt


In [8]:
ls

[0m[01;34mpoleval20-wsd-master[0m/  [01;34mtest_gold_standard[0m/   wsddata.pkl
[01;34mresults[0m/               train_valid_data.pkl  WSD_polbert_model3.bin


In [None]:
!mkdir results
!mkdir results/kpwr
!mkdir results/sherlock

mkdir: cannot create directory ‘drive/MyDrive/WSD-projekt/results’: File exists
mkdir: cannot create directory ‘drive/MyDrive/WSD-projekt/results/kpwr’: File exists
mkdir: cannot create directory ‘drive/MyDrive/WSD-projekt/results/sherlock’: File exists


Training data

In [None]:
ls poleval20-wsd-master/

[0m[01;34mdata[0m/  README.md  [01;34mtestdata[0m/


In [9]:
path_lemmas = 'poleval20-wsd-master/data/lemmas.txt'
path_synsets = 'poleval20-wsd-master/data/synsets.txt'
path_lexicalunits = 'poleval20-wsd-master/data/lexicalunits.txt'
path_synsets_examples = 'poleval20-wsd-master/data/synset_defs_examples.txt'

lemmas_df = pd.read_csv(path_lemmas, names=['lemma', 'POS'])
synsets_df = pd.read_csv(path_synsets, names=['synset'])
lexicalunits_df = pd.read_csv(path_lexicalunits, sep='\t', names=['lemma_idx', 'synset_idx'])
synsets_examples_df = pd.read_csv(path_synsets_examples, sep='\t', names=['synset', 'example'])

lemmas_df['lemma_idx'] = lemmas_df.index
lemmas_df['lemma_idx'] = lemmas_df['lemma_idx'].apply(lambda i: i+1)

synsets_df['synset_idx'] = synsets_df.index
synsets_df['synset_idx'] = synsets_df['synset_idx'].apply(lambda i: i+1)
synsets_df['synset'] = synsets_df['synset'].apply(lambda i: "s"+str(i))

full_df = pd.merge(lexicalunits_df, lemmas_df, how='left', on='lemma_idx')
full_df = pd.merge(full_df, synsets_df, how='left', on='synset_idx')
full_df = pd.merge(full_df, synsets_examples_df, how='left', on='synset')

full_df = full_df.drop(['lemma_idx', 'synset_idx'], axis=1)

full_df.head(30)

Unnamed: 0,lemma,POS,synset,example
0,absolutny,adj,s238698,"**absolutystyczny**, oparty na zasadach, idei ..."
1,absolutny,adj,s238698,Różnica między monarchią **absolutną** a despo...
2,absolutny,adj,s238698,Księstwo oficjalnie było de jure od 1905 monar...
3,absolutny,adj,s238698,Aby zaprowadzić rządy **absolutne** i despotyc...
4,absolutny,adj,s238698,"Jego nowa, **absolutystyczna** polityka, połąc..."
5,absolutny,adj,s238698,Armia jest zabawką w rękach władcy **absolutne...
6,absolutny,adj,s238698,Po zniknięciu z Europy **absolutystycznych** w...
7,absolutny,adj,s238698,"oparty na zasadach, idei absolutyzmu, uzsadnio..."
8,absolwent,noun,s12,
9,abstrakcyjny,adj,s103631,"Był to dość **abstrakcyjny** pomysł, zwłaszcza..."


In [None]:
full_df[full_df.synset == "s445000"]

Unnamed: 0,lemma,POS,synset,example
601922,brukwiowy,adj,s445000,W powietrzu unosił się ciężki zapach **brukwio...
601923,brukwiowy,adj,s445000,"Może to i dziwne, ale czasem z nostalgią wspom..."
601924,brukwiowy,adj,s445000,związany z brukwią - warzywem.
601925,brukwiowy,adj,s445000,Pozbierała szybko **brukwiowe** łupiny i wrzuc...
601927,brukwiany,adj,s445000,W powietrzu unosił się ciężki zapach **brukwio...
601928,brukwiany,adj,s445000,"Może to i dziwne, ale czasem z nostalgią wspom..."
601929,brukwiany,adj,s445000,związany z brukwią - warzywem.
601930,brukwiany,adj,s445000,Pozbierała szybko **brukwiowe** łupiny i wrzuc...


In [None]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729174 entries, 0 to 729173
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   lemma    729168 non-null  object
 1   POS      729174 non-null  object
 2   synset   729174 non-null  object
 3   example  396035 non-null  object
dtypes: object(4)
memory usage: 27.8+ MB


In [11]:
lemma_synset_df = full_df[['lemma', 'synset']].drop_duplicates().reset_index(drop=True)
len(lemma_synset_df)

505229

In [12]:
lemma_synset_pairs_df = full_df[['lemma', 'synset']].drop_duplicates().reset_index(drop=True)
len(lemma_synset_pairs_df)

505229

In [None]:
lemma_synset_pairs_df.head()

Unnamed: 0,lemma,synset
0,absolutny,s238698
1,absolwent,s12
2,abstrakcyjny,s103631
3,adherent,s4450
4,administracja,s14


In [None]:
lemma_synset_df['n_synset_per_lemma'] = lemma_synset_df.groupby('lemma')['lemma'].transform('count')

In [None]:
lemma_synset_df[lemma_synset_df.lemma == "absolutny"]

Unnamed: 0,lemma,synset,n_synset_per_lemma
0,absolutny,s238698,6.0
13910,absolutny,s442011,6.0
13911,absolutny,s9107,6.0
13912,absolutny,s9681,6.0
428126,absolutny,s442023,6.0
499007,absolutny,s7073629,6.0


In [None]:
non_missing_examples_df = full_df[full_df['example'].notna()]
len(non_missing_examples_df)

396035

In [None]:
non_missing_examples_lemma_synset_df = non_missing_examples_df[['lemma', 'synset']].drop_duplicates().reset_index(drop=True)
len(non_missing_examples_lemma_synset_df)

172168

In [None]:
non_missing_examples_lemma_synset_df['n_synset_per_lemma_with_example'] = non_missing_examples_lemma_synset_df.groupby('lemma')['lemma'].transform('count')

In [None]:
lemma_synset_df = lemma_synset_df[['lemma', 'n_synset_per_lemma']].drop_duplicates().reset_index(drop=True)
non_missing_examples_lemma_synset_df = non_missing_examples_lemma_synset_df[['lemma', 'n_synset_per_lemma_with_example']].drop_duplicates().reset_index(drop=True)
lemmas_with_synset_counts_df = pd.merge(lemma_synset_df, non_missing_examples_lemma_synset_df, how='left', on='lemma')
lemmas_with_synset_counts_df['n_synset_per_lemma_with_example'] = lemmas_with_synset_counts_df['n_synset_per_lemma_with_example'].fillna(0)
lemmas_with_synset_counts_df['n_synset_per_lemma_with_example'] = lemmas_with_synset_counts_df['n_synset_per_lemma_with_example'].astype(int)

In [None]:
lemmas_with_synset_counts_df.head()

Unnamed: 0,lemma,n_synset_per_lemma,n_synset_per_lemma_with_example
0,absolutny,6.0,6
1,absolwent,1.0,0
2,abstrakcyjny,6.0,6
3,adherent,3.0,0
4,administracja,4.0,4


In [None]:
def concat_columns(row):
  a, b = str(row[1])[0], str(row[2])
  return a + '_' + b

lemmas_with_synset_counts_df['n_syns_per_lemma_all_with_example'] = lemmas_with_synset_counts_df.apply(concat_columns, axis=1)

In [None]:
lemmas_with_synset_counts_df.head()

Unnamed: 0,lemma,n_synset_per_lemma,n_synset_per_lemma_with_example,n_syns_per_lemma_all_with_example
0,absolutny,6.0,6,6_6
1,absolwent,1.0,0,1_0
2,abstrakcyjny,6.0,6,6_6
3,adherent,3.0,0,3_0
4,administracja,4.0,4,4_4


In [None]:
non_missing_examples_df = pd.merge(non_missing_examples_df, 
                                   lemmas_with_synset_counts_df[['lemma', 'n_synset_per_lemma_with_example']], 
                                   how='left', 
                                   on='lemma')

non_missing_examples_df.head()

Unnamed: 0,lemma,POS,synset,example,has_underscore,has_whitespace,n_synset_per_lemma_with_example
0,absolutny,adj,s238698,"**absolutystyczny**, oparty na zasadach, idei ...",False,False,6
1,absolutny,adj,s238698,Różnica między monarchią **absolutną** a despo...,False,False,6
2,absolutny,adj,s238698,Księstwo oficjalnie było de jure od 1905 monar...,False,False,6
3,absolutny,adj,s238698,Aby zaprowadzić rządy **absolutne** i despotyc...,False,False,6
4,absolutny,adj,s238698,"Jego nowa, **absolutystyczna** polityka, połąc...",False,False,6


In [None]:
training_df = non_missing_examples_df[non_missing_examples_df['n_synset_per_lemma_with_example'] > 1]
training_df = training_df.reset_index(drop=True)
training_df.head()

Unnamed: 0,lemma,POS,synset,example,has_underscore,has_whitespace,n_synset_per_lemma_with_example
0,absolutny,adj,s238698,"**absolutystyczny**, oparty na zasadach, idei ...",False,False,6
1,absolutny,adj,s238698,Różnica między monarchią **absolutną** a despo...,False,False,6
2,absolutny,adj,s238698,Księstwo oficjalnie było de jure od 1905 monar...,False,False,6
3,absolutny,adj,s238698,Aby zaprowadzić rządy **absolutne** i despotyc...,False,False,6
4,absolutny,adj,s238698,"Jego nowa, **absolutystyczna** polityka, połąc...",False,False,6


In [None]:
examples = []

for i in tqdm(range(len(training_df))):
  lemma, synset, example = training_df['lemma'].loc[i], training_df['synset'].loc[i], training_df['example'].loc[i] #lista wszystkich indeksów wierszy z tymi samymi synsetami
  trues = training_df.index[training_df['synset'] == synset].tolist() #lista wszystkich indeksów wierszy z tymi samymi synsetami z wykluczeniem obecnego indeksu w pętli
  trues = [x for x in trues if x != i] #lista wszystkich indeksów wierszy z tymi samymi lematami
  falses = training_df.index[training_df['lemma'] == lemma].tolist() #lista wszystkich indeksów wierszy z tymi samymi lematami z wykluczeniem obecnego indeksu w pętli
  falses = [x for x in falses if x not in trues and x != i] 
  if len(trues) > 0:
      rp = random.choice(trues)
      lp, ep = training_df['lemma'].loc[rp], training_df['example'].loc[rp]
      examples.append({
          'l1': lemma,
          'e1': example,
          'l2': lp,
          'e2': ep,
          'label': True #wiersze dla których dwa example mówią o tym samym synsecie
      })
  if len(falses) > 0: 
      rn = random.choice(falses)
      ln, en = training_df['lemma'].loc[rn], training_df['example'].loc[rn]
      examples.append({
          'l1': lemma,
          'e1': example,
          'l2': ln,
          'e2': en,
          'label': False #wiersze dla których dwa example mówią o innym synsecie
      })

HBox(children=(FloatProgress(value=0.0, max=233238.0), HTML(value='')))




In [None]:
examples[30:40]

[{'e1': 'siedziba **kierownictwa**.',
  'e2': 'siedziba **kierownictwa**.',
  'l1': 'administracja',
  'l2': 'dyrekcja',
  'label': True},
 {'e1': 'siedziba **kierownictwa**.',
  'e2': 'zarządzanie.',
  'l1': 'administracja',
  'l2': 'administracja',
  'label': False},
 {'e1': 'siedziba **dyrekcji**.',
  'e2': 'siedziba **zarządu**.',
  'l1': 'administracja',
  'l2': 'administracja',
  'label': True},
 {'e1': 'siedziba **dyrekcji**.',
  'e2': 'Po spotkaniu ze swoją **administracją**, prezydent zapowiedział podpisanie ustawy.',
  'l1': 'administracja',
  'l2': 'administracja',
  'label': False},
 {'e1': 'związany z zapleczem organizacyjnym przedsiębiorstwa.',
  'e2': 'Zosia została pracownikiem **administracyjnym**.',
  'l1': 'administracyjny',
  'l2': 'administracyjny',
  'label': True},
 {'e1': 'związany z zapleczem organizacyjnym przedsiębiorstwa.',
  'e2': 'związany z zarządzeniem czymś.',
  'l1': 'administracyjny',
  'l2': 'administracyjny',
  'label': False},
 {'e1': 'Zosia został

In [None]:
l = [training_df, examples]
with open('wsddata.pkl', 'wb') as fp:
    pickle.dump(l, fp)

Model

In [13]:
def create_tuple(example):
  l1, l2, e1, e2 = example['l1'], example['l2'], example['e1'], example['e2'] 
  return (l1 +': ' + e1, l2 + ': ' + e2)

In [14]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [15]:
class EarlyStopping:
    def __init__(self, patience=7, delta=0.0001):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        self.val_score = -np.Inf
            
    def __call__(self, epoch_score, model, model_path):
      
        score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(
                "EarlyStopping counter: {} out of {}".format(
                    self.counter, self.patience
                )
            )
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
            print(
                "Validation score improved ({} --> {}). Saving model!".format(
                    self.val_score, epoch_score
                )
            )
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [16]:
pretrained_model = 'dkleczek/bert-base-polish-uncased-v1'
LR = 5e-5
WARMUP_STEPS = 8
MODEL_PATH = "WSD_polbert_model3.bin"
TOKENIZER = BertTokenizer.from_pretrained(pretrained_model)

MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 4

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=494801.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




In [17]:
class PolbertWSD(nn.Module):
    def __init__(self, pretrained_polbert_path):
        super(PolbertWSD, self).__init__()
        self.pretrained_polbert_path = pretrained_polbert_path
        self.polbert = BertModel.from_pretrained(self.pretrained_polbert_path)
        self.dropout = nn.Dropout(0.25)
        self.ranking_linear = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, pooled_output = self.polbert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        do = self.dropout(pooled_output)
        return self.ranking_linear(do)

In [18]:
class PrepareTrainingDataset:
    def __init__(self, examples, max_length, tokenizer):
        self.examples = examples
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        tpl = create_tuple(self.examples[item])
        tgt = int(self.examples[item]['label'])
        enc = self.tokenizer.encode_plus(tpl[0], tpl[1], padding='max_length', truncation='longest_first', max_length=self.max_length)      
        return {
            "ids": torch.tensor(enc.input_ids, dtype=torch.long),
            "mask": torch.tensor(enc.attention_mask, dtype=torch.long),
            "token_type_ids": torch.tensor(enc.token_type_ids, dtype=torch.long),
            "target": torch.tensor(tgt, dtype=torch.float) 
        }

In [19]:
class PrepareTestDataset:
    def __init__(self, examples, max_length, tokenizer):
        self.examples = examples
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        tpl = create_tuple(self.examples[item])
        enc = self.tokenizer.encode_plus(tpl[0], tpl[1], padding='max_length', truncation='longest_first', max_length=self.max_length)      
        return {
            "ids": torch.tensor(enc.input_ids, dtype=torch.long),
            "mask": torch.tensor(enc.attention_mask, dtype=torch.long),
            "token_type_ids": torch.tensor(enc.token_type_ids, dtype=torch.long)
        }        

In [20]:
def loss_function(logits, targets):
    return nn.BCEWithLogitsLoss()(logits, targets.view(-1, 1))

In [21]:
def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    losses = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader), mininterval=1.)

    for bi, d in enumerate(tk0):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]
        targets = d["target"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg)

def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    fin_loss = []

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader), mininterval=1.)
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]
            targets = d["target"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = loss_function(outputs, targets)

            fin_targets.append(targets.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())
            fin_loss.append(loss.cpu().detach().numpy())

    return np.concatenate(fin_outputs, axis=0), np.concatenate(fin_targets, axis=0), np.average(fin_loss)

def final_eval(data_loader, model, device):
    model.eval()
    fin_outputs = []

    with torch.no_grad():
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            fin_outputs.append(outputs.cpu().detach().numpy())

    return np.concatenate(fin_outputs, axis=0)

In [22]:
def run(train, valid, pretrained_model):

    train_dataset = PrepareTrainingDataset(
        examples=train,   
        max_length=MAX_LEN,
        tokenizer=TOKENIZER
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4
    )
    valid_dataset = PrepareTrainingDataset(
        examples=valid,   
        max_length=MAX_LEN,
        tokenizer=TOKENIZER
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        shuffle=False,
        num_workers=2
    )

    device = 'cuda'
    num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
    model = PolbertWSD(pretrained_model).to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if (any(nd in n for nd in no_decay))], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=LR)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=WARMUP_STEPS, 
        num_training_steps=num_train_steps
    )

    es = EarlyStopping(patience=4)
    print(f"Training is starting")

    for epoch in range(EPOCHS):
        train_loop_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        outputs, targets, vloss = eval_loop_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"epoch = {epoch}, accuracy = {accuracy}, valid_loss = {vloss}")
        es(accuracy, model, model_path=MODEL_PATH)
        if es.early_stop:
            print("Early stopping")
            break
        

In [23]:
def evaluate(examples, model):

    test_dataset = PrepareTestDataset(
        examples=examples,   
        max_length=MAX_LEN,
        tokenizer=TOKENIZER
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=TEST_BATCH_SIZE,
        shuffle=False,
        num_workers=2
    )

    device = 'cuda'
    outputs = final_eval(test_data_loader, model, device)

    return outputs

Load data

In [24]:
with open('wsddata.pkl', 'rb') as fp:
    [training_df, examples] = pickle.load(fp)

In [None]:
from numpy.random import shuffle
shuffle(examples)

train = examples[:-20000]
valid = examples[-20000:]

In [None]:
train_dataset = PrepareTrainingDataset(
        examples=train,   
        max_length=MAX_LEN,
        tokenizer=TOKENIZER
    )

In [None]:
train_dataset[1]

{'ids': tensor([    2,  2908, 42488,  2170,    30,  6771, 46284,    14,    14,  2908,
         42488,  2231,    14,    14,  3003, 26091,  3176, 51248,  3650,    16,
          2827,  2847,    68,  2891,  8090,  2932, 12468,  2537,  1015,  2243,
          1911,  3898,  1008,  2029,  1926,    16,  2301,  4653,     4,  2908,
         42488,  8599,  9219,    30,    57, 27839,    30,  2899,    16, 34357,
          1919, 25470,  1895,  2908, 42488,  1021,    16, 32624,  2243, 22177,
            65, 11806,    18,     4]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
 'target': tensor(1.),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
pretrained_model

'dkleczek/bert-base-polish-uncased-v1'

Model running and saving

In [25]:
torch.cuda.is_available()

True

In [None]:
run(train, valid, pretrained_model)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=459.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=531146902.0, style=ProgressStyle(descri…


Training is starting


HBox(children=(FloatProgress(value=0.0, max=13234.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


epoch = 0, accuracy = 0.82665, valid_loss = 0.35719043016433716
Validation score improved (-inf --> 0.82665). Saving model!


HBox(children=(FloatProgress(value=0.0, max=13234.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


epoch = 1, accuracy = 0.86325, valid_loss = 0.3126107156276703
Validation score improved (0.82665 --> 0.86325). Saving model!


HBox(children=(FloatProgress(value=0.0, max=13234.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


epoch = 2, accuracy = 0.8797, valid_loss = 0.3362606465816498
Validation score improved (0.86325 --> 0.8797). Saving model!


HBox(children=(FloatProgress(value=0.0, max=13234.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1250.0), HTML(value='')))


epoch = 3, accuracy = 0.886, valid_loss = 0.46046873927116394
Validation score improved (0.8797 --> 0.886). Saving model!


In [None]:
train_valid = [train, valid]
with open('train_valid_data.pkl', 'wb') as fp:
    pickle.dump(train_valid, fp)

Disambiguation process

In [26]:
def disambiguate(lemma, row, df_conll, lemma_synset_pairs_df, synsets_examples_df, tokenizer, model):
  orth = df_conll['ORTH'].loc[row] #właściwa forma słowa 
  sent = df_conll['SENT'].loc[row] #które to zdanie

  words = df_conll[df_conll['SENT'] == sent]['ORTH'].tolist() #lista wszystkich słów w zdaniu
  words = ['**' + x + '**' if x == orth else x for x in words] #zmienna orth **(obecny wyraz)**
  sent1 = ' '.join(words) #całe zdanie z jednym wyrazem **.**
  # find all synsets corresponding to lemma
  synsets = lemma_synset_pairs_df['synset'][lemma_synset_pairs_df['lemma'] == lemma].tolist() #wszystkie możliwe synsety dla lematu obecnego wyrazu 
  # find all examples per synset
  scores = [] 
  for x in synsets: #x = każdy synset możliwy
    exmpls = synsets_examples_df[synsets_examples_df['synset'] == x]['example'].tolist() #ściągamy wszystkie zdania z examples dla danego synsetu w pętli
    if len(exmpls) == 0:  #dla synsetu który nie posiada żadnego example dajemy score -999
      scores.append(-999)
      continue #przejdź do kolejnego synsetu
    examples = []
    for ex in exmpls: #dla kazdego example z synsetu w pętli
        examples.append({
          'l1': lemma,
          'e1': sent1,
          'l2': lemma,
          'e2': ex 
        })
    scrs = evaluate(examples, model) #w examples mamy tyle elementów ile jest examples dla danego synset, różniące się tylko tym example -> lista scorów
    score = np.max(scrs) #maksymalny score dla wszystkich example danego synsetu
    scores.append(score) 
  best_synset = synsets[np.argmax(scores)] #w zbiorze scores mamy score dla każdego z synsetów ; wybieramy ze zbioru synsets element o tym samym indeksie co największy score w scores
  return best_synset #synset

In [27]:
def check_if_mwe(lemma, mwe_candidate, lemma_synset_pairs_df): #funkcja sprawdzająca czy mamy do czynienia z multi-word-expression
  is_mwe = False
  cand_lemma = lemma + ' ' + mwe_candidate
  if len(lemma_synset_pairs_df[lemma_synset_pairs_df['lemma'] == cand_lemma]) > 0:
    lemma = cand_lemma
    is_mwe = True
  return is_mwe, lemma

In [28]:
def process_conll(path):
    df = pd.read_csv(path, sep='\t')
    tknids = df.TOKEN_ID.to_list()
    sntids = []
    s = -1
    for tknid in tknids:
      if tknid == 0: s += 1
      sntids.append(s)
    df['SENT'] = sntids
    df['MWE_FLAG'] = -1
    df['WN_ID'] = '_'
    for i in range(len(df)):
        # if flagged as MWE, copy synset:
        flag = df['MWE_FLAG'].loc[i]
        if flag > -1:
          df['WN_ID'].loc[i] = df['WN_ID'].loc[flag]
          continue

        # get lemma
        lemma = df['LEMMA'].loc[i]
        # check if MWE -> if yes, update lemma, write flag to another column ("MWE_FLAG")
        # for now the simplest case only: znajdować się
        if i < (len(df) - 1):
          mwe_candidate = df['LEMMA'].loc[i+1]                                #kolejny lemat po i
          mwe_flag, lemma = check_if_mwe(lemma, mwe_candidate, lemma_synset_pairs_df)               #obecny lemat, kolejny lemat, zbiór synsetów
          if mwe_flag == True: df['MWE_FLAG'].loc[i+1] = i

        # get synset candidates
        synsets = lemma_synset_pairs_df['synset'][lemma_synset_pairs_df['lemma'] == lemma].tolist()
        # if one candidate, update with it
        if len(synsets) == 1: df['WN_ID'].loc[i] = synsets[0] 
        # if zero candidate, update with underscore == do nothing
        # if multiple candidates, disambiguate
        if len(synsets) > 1: df['WN_ID'].loc[i] = disambiguate(lemma, i, df, lemma_synset_pairs_df, synsets_examples_df, TOKENIZER, model)

    #remove unnecessary columns (e.g. MWE_FLAG etc.)
    df = df[['ORDER_ID', 'TOKEN_ID', 'ORTH', 'LEMMA', 'CTAG', 'FROM', 'TO', 'WN_ID']]
    [folder, fname] = path.split('/')[-2:]
    #save csv in conll format in new folder
    rpath = 'results/' + folder + '/' + fname
    df.to_csv(rpath, sep='\t', index=False)

In [29]:
ls

[0m[01;34mpoleval20-wsd-master[0m/  [01;34mtest_gold_standard[0m/   wsddata.pkl
[01;34mresults[0m/               train_valid_data.pkl  WSD_polbert_model3.bin


In [30]:
model = PolbertWSD(pretrained_model)
model.load_state_dict(torch.load('WSD_polbert_model3.bin'))
model.to('cuda')
model.eval();

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=459.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=531146902.0, style=ProgressStyle(descri…




In [None]:
pathlist = list(Path('poleval20-wsd-master/testdata/testdata').glob('**/*.conll'))
pathlist_str = [str(x) for x in pathlist]
pathlist_done = list(Path('results').glob('**/*.conll'))
pathlist_done_str = ["poleval20-wsd-master/testdata/testdata" + str(x)[7:] for x in pathlist_done]

pathlist_to_do = list(set(pathlist_str) - set(pathlist_done_str))

for path in tqdm(pathlist_to_do):
     process_conll(path)

HBox(children=(FloatProgress(value=0.0, max=66.0), HTML(value='')))

In [None]:
!head -n 100 results/kpwr/00099883.conll

ORDER_ID	TOKEN_ID	ORTH	LEMMA	CTAG	FROM	TO	WN_ID
0	0	Toronto	Toronto	subst:sg:nom:n	0	6	_
1	1	Dominion	dominion	subst:sg:nom:n	7	14	s362230
2	2	Centre	centre	subst:sg:nom:n	15	20	s370532
3	0	Toronto	Toronto	subst:sg:nom:n	21	27	_
4	1	Dominion	dominion	subst:sg:nom:n	28	35	s362230
5	2	Centre	centre	subst:sg:nom:n	36	41	s370532
6	3	-	-	interp	42	42	_
7	4	kompleks	kompleks	subst:sg:nom:m3	43	50	s102644
8	5	handlowo	handlowo	adv:pos	51	58	s453527
9	6	-	-	interp	59	59	_
10	7	kulturalny	kulturalny	adj:sg:nom:m1:pos	60	69	s105670
11	8	w	w	prep:loc:nwok	70	70	s7974
12	9	kanadyjskim	kanadyjski	adj:sg:loc:n:pos	71	81	s104026
13	10	mieście	miasto	subst:sg:loc:n	82	88	s4845
14	11	Toronto	Toronto	subst:sg:nom:n	89	95	_
15	12	,	,	interp	96	96	_
16	13	w	w	prep:loc:nwok	97	97	s7974
17	14	Financial	financial	subst:sg:loc:n	98	106	s279186
18	15	District	district	subst:sg:loc:n	107	114	s331239
19	16	.	.	interp	115	115	_
20	0	Składa	składać	fin:sg:ter:imperf	116	121	s65290
21	1	się	się	qub	122	124	s65290
2

Test data metrics

In [None]:
path_kpwr = 'test_gold_standard/kpwr/'
path_results = 'results/kpwr/'
files_kpwr = os.listdir(path_results)
full_kpwr_results_df = pd.DataFrame()

for file_name in files_kpwr:
  result_file_name = path_results + file_name
  file_name = path_kpwr + file_name
  df = pd.read_csv(file_name, sep = '\t')[["ORDER_ID", "WN_ID"]]
  result_df = pd.read_csv(result_file_name, sep = '\t')
  df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
  result_df = pd.merge(result_df, df, how='inner', on='ORDER_ID')
  full_kpwr_results_df = full_kpwr_results_df.append(result_df)

In [None]:
full_kpwr_results_df['correct_classif'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model'] = full_kpwr_results_df['WN_ID'] != "_"
full_kpwr_results_df['if_annotated_in_test_data'] = full_kpwr_results_df['WN_ID_ACTUAL'] != "_"

correctly_predicted_senses_count = sum(full_kpwr_results_df['correct_classif'])
count_decisions_made_by_model = sum(full_kpwr_results_df['if_decision_made_by_model'])
count_annotated_in_test_data = sum(full_kpwr_results_df['if_annotated_in_test_data'])

print(f"precision for kpwr = {correctly_predicted_senses_count/count_decisions_made_by_model}")
print(f"recall for kpwr = {correctly_predicted_senses_count/count_annotated_in_test_data}")

In [None]:
path_sherlock = 'test_gold_standard/sherlock/'
path_results = 'results/sherlock/'
files_sherlock = os.listdir(path_results)
full_sherlock_results_df = pd.DataFrame()

for file_name in files_sherlock:
  result_file_name = path_results + file_name
  file_name = path_sherlock + file_name
  df = pd.read_csv(file_name, sep = '\t')[["ORDER_ID", "WN_ID"]]
  result_df = pd.read_csv(result_file_name, sep = '\t')
  df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
  result_df = pd.merge(result_df, df, how='inner', on='ORDER_ID')
  full_sherlock_results_df = full_sherlock_results_df.append(result_df)

In [None]:
full_sherlock_results_df['correct_classif'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model'] = full_sherlock_results_df['WN_ID'] != "_"
full_sherlock_results_df['if_annotated_in_test_data'] = full_sherlock_results_df['WN_ID_ACTUAL'] != "_"

correctly_predicted_senses_count = sum(full_sherlock_results_df['correct_classif'])
count_decisions_made_by_model = sum(full_sherlock_results_df['if_decision_made_by_model'])
count_annotated_in_test_data = sum(full_sherlock_results_df['if_annotated_in_test_data'])

print(f"precision for sherlock = {correctly_predicted_senses_count/count_decisions_made_by_model}")
print(f"recall for sherlock = {correctly_predicted_senses_count/count_annotated_in_test_data}")

In [None]:
list_final_results = [full_kpwr_results_df, full_sherlock_results_df]
with open('list_final_results.pkl', 'wb') as fp:
    pickle.dump(list_final_results, fp)