In [1]:
import pandas as pd
import os
import random
from tqdm.notebook import tqdm
import functools

import pickle
import numpy as np
from sklearn import metrics
from pathlib import Path
import warnings
import statistics

from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel,  RobertaTokenizer, PreTrainedTokenizerFast, RobertaModel, HerbertTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import lr_scheduler
import time

warnings.filterwarnings('ignore')

In [2]:
torch.cuda.is_available()

True

In [4]:
#%mkdir results
#%cd results
#%mkdir kpwr
#%mkdir sherlock
#%cd ..

C:\Users\Daniel\WORKSPACE\Magisterka\data\results


A subdirectory or file results already exists.
A subdirectory or file kpwr already exists.


C:\Users\Daniel\WORKSPACE\Magisterka\data


A subdirectory or file sherlock already exists.


Training data

In [3]:
path_lemmas = 'lemmas.txt'
path_synsets = 'synsets.txt'
path_lexicalunits = 'lexicalunits.txt'
path_synsets_examples = 'synset_defs_examples.txt'

lemmas_df = pd.read_csv(path_lemmas, names=['lemma', 'POS'])
synsets_df = pd.read_csv(path_synsets, names=['synset'])
lexicalunits_df = pd.read_csv(path_lexicalunits, sep='\t', names=['lemma_idx', 'synset_idx'])
synsets_examples_df = pd.read_csv(path_synsets_examples, sep='\t', names=['synset', 'example'])

lemmas_df['lemma_idx'] = lemmas_df.index
lemmas_df['lemma_idx'] = lemmas_df['lemma_idx'].apply(lambda i: i+1)

synsets_df['synset_idx'] = synsets_df.index
synsets_df['synset_idx'] = synsets_df['synset_idx'].apply(lambda i: i+1)
synsets_df['synset'] = synsets_df['synset'].apply(lambda i: "s"+str(i))

full_df = pd.merge(lexicalunits_df, lemmas_df, how='left', on='lemma_idx')
full_df = pd.merge(full_df, synsets_df, how='left', on='synset_idx')
full_df = pd.merge(full_df, synsets_examples_df, how='left', on='synset')

full_df = full_df.drop(['lemma_idx', 'synset_idx'], axis=1)

full_df.head(30)

Unnamed: 0,lemma,POS,synset,example
0,absolutny,adj,s238698,"**absolutystyczny**, oparty na zasadach, idei ..."
1,absolutny,adj,s238698,Różnica między monarchią **absolutną** a despo...
2,absolutny,adj,s238698,Księstwo oficjalnie było de jure od 1905 monar...
3,absolutny,adj,s238698,Aby zaprowadzić rządy **absolutne** i despotyc...
4,absolutny,adj,s238698,"Jego nowa, **absolutystyczna** polityka, połąc..."
5,absolutny,adj,s238698,Armia jest zabawką w rękach władcy **absolutne...
6,absolutny,adj,s238698,Po zniknięciu z Europy **absolutystycznych** w...
7,absolutny,adj,s238698,"oparty na zasadach, idei absolutyzmu, uzsadnio..."
8,absolwent,noun,s12,
9,abstrakcyjny,adj,s103631,"Był to dość **abstrakcyjny** pomysł, zwłaszcza..."


In [4]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 729174 entries, 0 to 729173
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   lemma    729168 non-null  object
 1   POS      729174 non-null  object
 2   synset   729174 non-null  object
 3   example  396035 non-null  object
dtypes: object(4)
memory usage: 27.8+ MB


In [5]:
lemma_synset_df = full_df[['lemma', 'synset']].drop_duplicates().reset_index(drop=True)
len(lemma_synset_df)

505231

In [6]:
lemma_synset_pairs_df = full_df[['lemma', 'synset']].drop_duplicates().reset_index(drop=True)
len(lemma_synset_pairs_df)

505231

In [7]:
lemma_synset_pairs_df.head()

Unnamed: 0,lemma,synset
0,absolutny,s238698
1,absolwent,s12
2,abstrakcyjny,s103631
3,adherent,s4450
4,administracja,s14


In [8]:
lemma_synset_df['n_synset_per_lemma'] = lemma_synset_df.groupby('lemma')['lemma'].transform('count')

In [9]:
lemma_synset_df[lemma_synset_df.lemma == "absolutny"]

Unnamed: 0,lemma,synset,n_synset_per_lemma
0,absolutny,s238698,6.0
13910,absolutny,s442011,6.0
13911,absolutny,s9107,6.0
13912,absolutny,s9681,6.0
428127,absolutny,s442023,6.0
499009,absolutny,s7073629,6.0


In [10]:
non_missing_examples_df = full_df[full_df['example'].notna()]
len(non_missing_examples_df)

396035

In [11]:
non_missing_examples_lemma_synset_df = non_missing_examples_df[['lemma', 'synset']].drop_duplicates().reset_index(drop=True)
len(non_missing_examples_lemma_synset_df)

172169

In [12]:
non_missing_examples_lemma_synset_df['n_synset_per_lemma_with_example'] = non_missing_examples_lemma_synset_df.groupby('lemma')['lemma'].transform('count')

In [13]:
lemma_synset_df = lemma_synset_df[['lemma', 'n_synset_per_lemma']].drop_duplicates().reset_index(drop=True)
non_missing_examples_lemma_synset_df = non_missing_examples_lemma_synset_df[['lemma', 'n_synset_per_lemma_with_example']].drop_duplicates().reset_index(drop=True)
lemmas_with_synset_counts_df = pd.merge(lemma_synset_df, non_missing_examples_lemma_synset_df, how='left', on='lemma')
lemmas_with_synset_counts_df['n_synset_per_lemma_with_example'] = lemmas_with_synset_counts_df['n_synset_per_lemma_with_example'].fillna(0)
lemmas_with_synset_counts_df['n_synset_per_lemma_with_example'] = lemmas_with_synset_counts_df['n_synset_per_lemma_with_example'].astype(int)

In [14]:
lemmas_with_synset_counts_df.head()

Unnamed: 0,lemma,n_synset_per_lemma,n_synset_per_lemma_with_example
0,absolutny,6.0,6
1,absolwent,1.0,0
2,abstrakcyjny,6.0,6
3,adherent,3.0,0
4,administracja,4.0,4


In [15]:
full_df[full_df.lemma == "biały"]

Unnamed: 0,lemma,POS,synset,example
828,biały,adj,s190,Japonki dążą do idealnie **białej** skóry stos...
829,biały,adj,s190,Ciocia Gabrysia jeździ **białym** maluchem.
830,biały,adj,s190,**Biała** koszula i granatowa spódniczka - tak...
831,biały,adj,s190,"jasny, taki, który ma kolor uważany za najjaśn..."
34116,biały,noun,s9689,"człowiek, który ma jasną skórę; należący do bi..."
...,...,...,...,...
642412,biały,adj,s460700,Zaczynamy **białego** mazura.
642413,biały,adj,s460702,"taki, który jest oznaczony jakimś umownym symb..."
642414,biały,adj,s460702,"Zamówiłam podstawowy, **biały** pakiet programów."
642415,biały,adj,s460702,Wybraliśmy wycieczkę **białym** szlakiem.


In [16]:
full_df[['lemma', 'synset']].drop_duplicates(subset='synset').reset_index(drop=True)

Unnamed: 0,lemma,synset
0,absolutny,s238698
1,absolwent,s12
2,abstrakcyjny,s103631
3,adherent,s4450
4,administracja,s14
...,...,...
346388,ciorać,s7080065
346389,ściorać się,s7080066
346390,ściorać,s7080067
346391,rzutować,s7080068


In [17]:
unique_synset_lemma_df = full_df[['lemma', 'synset']].drop_duplicates(subset='synset').reset_index(drop=True)

synset_rels_df = pd.read_csv('synset_rels.txt', names=['synset_idx', 'rel'], sep='\t')
synset_rels_df['syn1'] = synset_rels_df.index
synset_rels_df = pd.merge(synset_rels_df, synsets_df, how='left', on='synset_idx')
synset_rels_df = pd.merge(synset_rels_df, unique_synset_lemma_df, how='left', on='synset')
synset_rels_df.columns = ['synset_idx1', 'rel', 'synset_idx', 'synset2', 'lemma2']
synset_rels_df = pd.merge(synset_rels_df, synsets_df, how='left', on='synset_idx')
synset_rels_df = pd.merge(synset_rels_df, unique_synset_lemma_df, how='left', on='synset')
synset_rels_df.columns = ['synset_idx1', 'rel', 'synset_idx', 'synset2', 'lemma2', 'synset1', 'lemma1']
del synset_rels_df['synset_idx1']
del synset_rels_df['synset_idx']

In [18]:
synset_rels_df

Unnamed: 0,rel,synset2,lemma2,synset1,lemma1
0,hiperonimia,s9139,mimowolny,s10,samoczynny
1,hiperonimia,s104191,automatyczny,s10,samoczynny
2,hiperonimia,s105404,samorzutny,s10,samoczynny
3,hiperonimia,s239111,samobieżny,s10,samoczynny
4,Hipo_plWN-PWN,s267039,unconditional,s10,samoczynny
...,...,...,...,...,...
1448383,hiponimia,s49993,umęczyć się,s7080066,ściorać się
1448384,hiponimia,s51813,zmęczyć,s7080067,ściorać
1448385,kauzacja,s7080066,ściorać się,s7080067,ściorać
1448386,hiponimia,s65236,odwzorowywać,s7080068,rzutować


In [19]:
synset_rels_df.to_csv("synset_rels_.csv", sep = ";", encoding='utf-8-sig')

In [20]:
rels_to_handle_df = pd.read_csv("rels_to_handle.csv", sep = ";", engine='python', encoding='ansi')
rels_to_handle_df

Unnamed: 0,rel,synset1,synset2,use
0,Also see,1,1,0
1,Attribute,1,1,0
2,bliskoznaczność,0,0,1
3,cecha_definicyjna,0,0,1
4,czas_przy_niewyrażonym_predykacie,0,0,0
...,...,...,...,...
113,Typ_PWN-plWN,1,0,0
114,uprzedniość_bez_tożsamości_podmiotu,0,0,0
115,uprzedniość_z_tożsamością_podmiotu,0,0,0
116,wartość_cechy,0,0,0


In [21]:
# Relationships to delete english synsets
rels_synsets1_to_del = list(rels_to_handle_df[rels_to_handle_df.synset1 == 1].rel)
rels_synsets2_to_del = list(rels_to_handle_df[rels_to_handle_df.synset2 == 1].rel)

In [22]:
synsets1_to_del = synset_rels_df[synset_rels_df.rel.isin(rels_synsets1_to_del)].loc[:, ['synset1', 'lemma1', 'rel']]
#okay: s1411, s11313, s28948, s34040, s101003, s103252, s7073064, s39237, s32864, s15641, s65387, s12958, s2898, s1235, s22810, s3456, s14523, s404211
#sprawdzać te, które mają inną długość niż 7

In [23]:
synsets2_to_del = synset_rels_df[synset_rels_df.rel.isin(rels_synsets2_to_del)].loc[:, ['synset2', 'lemma2', 'rel']]

In [24]:
synsets2_to_del[synsets2_to_del.lemma2 == "szczep"]

Unnamed: 0,synset2,lemma2,rel
525063,s404211,szczep,międzyjęzykowa_synonimia_międzyparadygmatyczna...


In [25]:
synsets1_to_del["len_syns"] = synsets1_to_del["synset1"].apply(len)
synsets2_to_del["len_syns"] = synsets2_to_del["synset2"].apply(len)

In [26]:
okay_synsets = ['s1411', 's11313', 's28948', 's34040', 's101003', 's103252', 's7073064', 's39237', 's404211',
                's32864', 's15641', 's65387', 's12958', 's2898', 's1235', 's22810', 's3456', 's14523','s34029',
                's232339','s362811','s360269','s360109','s360108','s323276','s305543','s300046','s289207','s226925',
                's389219','s389220','s394257','s400196','s414629','s414763','s428578','s472376','s7062595']

all_synsets_to_del = set(list(synsets1_to_del["synset1"]) + list(synsets2_to_del["synset2"]))
all_synsets_to_del = [x for x in all_synsets_to_del if x not in okay_synsets]

In [27]:
full_df = full_df[~full_df.synset.isin(all_synsets_to_del)]

In [28]:
full_df

Unnamed: 0,lemma,POS,synset,example
0,absolutny,adj,s238698,"**absolutystyczny**, oparty na zasadach, idei ..."
1,absolutny,adj,s238698,Różnica między monarchią **absolutną** a despo...
2,absolutny,adj,s238698,Księstwo oficjalnie było de jure od 1905 monar...
3,absolutny,adj,s238698,Aby zaprowadzić rządy **absolutne** i despotyc...
4,absolutny,adj,s238698,"Jego nowa, **absolutystyczna** polityka, połąc..."
...,...,...,...,...
729169,ciorać,verb,s7080065,
729170,ściorać się,verb,s7080066,
729171,ściorać,verb,s7080067,
729172,rzutować,verb,s7080068,


In [29]:
synset_rels_df.shape

(1448388, 5)

In [30]:
synset_rels_df = synset_rels_df[~synset_rels_df.synset2.isin(all_synsets_to_del) & ~synset_rels_df.synset1.isin(all_synsets_to_del)]
synset_rels_df.shape

(577315, 5)

In [31]:
lemma_synset_pairs_df = lemma_synset_pairs_df[~lemma_synset_pairs_df.synset.isin(all_synsets_to_del)]
lemma_synset_pairs_df.shape

(292251, 2)

In [32]:
synset_rels_df

Unnamed: 0,rel,synset2,lemma2,synset1,lemma1
0,hiperonimia,s9139,mimowolny,s10,samoczynny
1,hiperonimia,s104191,automatyczny,s10,samoczynny
2,hiperonimia,s105404,samorzutny,s10,samoczynny
3,hiperonimia,s239111,samobieżny,s10,samoczynny
5,hiponimia,s424226,niezależny,s10,samoczynny
...,...,...,...,...,...
1448383,hiponimia,s49993,umęczyć się,s7080066,ściorać się
1448384,hiponimia,s51813,zmęczyć,s7080067,ściorać
1448385,kauzacja,s7080066,ściorać się,s7080067,ściorać
1448386,hiponimia,s65236,odwzorowywać,s7080068,rzutować


In [33]:
synsets_examples_df.shape

(260197, 2)

In [34]:
synsets_examples_df = synsets_examples_df[~synsets_examples_df.synset.isin(all_synsets_to_del)]
synsets_examples_df.shape

(260197, 2)

# Relations for additional examples

In [35]:
rels_for_examples = list(rels_to_handle_df[rels_to_handle_df.use == 1].rel)
rels_for_examples

['bliskoznaczność',
 'cecha_definicyjna',
 'część',
 'deminutywność',
 'dystrybutywność',
 'egzemplarz',
 'ekspresywność | augmentatywność',
 'element kolekcji',
 'element taksonomiczny',
 'fuzzynimia_synsetów',
 'gradacyjność',
 'hiperonimia',
 'hiponimia',
 'inchoatywność',
 'istota młoda',
 'iteratywność',
 'kauzacja',
 'miejsce_przy_niewyrażonym_predykacie',
 'mieszkaniec',
 'obszar',
 'procesywność',
 'stanowość',
 'typ']

In [36]:
synset_rels_df.shape

(577315, 5)

In [37]:
synset_rels_df = synset_rels_df[synset_rels_df.rel.isin(rels_for_examples)]
synset_rels_df.shape

(539451, 5)

In [38]:
synset_rels_df

Unnamed: 0,rel,synset2,lemma2,synset1,lemma1
0,hiperonimia,s9139,mimowolny,s10,samoczynny
1,hiperonimia,s104191,automatyczny,s10,samoczynny
2,hiperonimia,s105404,samorzutny,s10,samoczynny
3,hiperonimia,s239111,samobieżny,s10,samoczynny
5,hiponimia,s424226,niezależny,s10,samoczynny
...,...,...,...,...,...
1448383,hiponimia,s49993,umęczyć się,s7080066,ściorać się
1448384,hiponimia,s51813,zmęczyć,s7080067,ściorać
1448385,kauzacja,s7080066,ściorać się,s7080067,ściorać
1448386,hiponimia,s65236,odwzorowywać,s7080068,rzutować


In [39]:
def concat_columns(row):
  a, b = str(row[1])[0], str(row[2])
  return a + '_' + b

lemmas_with_synset_counts_df['n_syns_per_lemma_all_with_example'] = lemmas_with_synset_counts_df.apply(concat_columns, axis=1)

In [40]:
non_missing_examples_df = pd.merge(non_missing_examples_df, 
                                   lemmas_with_synset_counts_df[['lemma', 'n_synset_per_lemma_with_example']], 
                                   how='left', 
                                   on='lemma')

non_missing_examples_df.head()

Unnamed: 0,lemma,POS,synset,example,n_synset_per_lemma_with_example
0,absolutny,adj,s238698,"**absolutystyczny**, oparty na zasadach, idei ...",6
1,absolutny,adj,s238698,Różnica między monarchią **absolutną** a despo...,6
2,absolutny,adj,s238698,Księstwo oficjalnie było de jure od 1905 monar...,6
3,absolutny,adj,s238698,Aby zaprowadzić rządy **absolutne** i despotyc...,6
4,absolutny,adj,s238698,"Jego nowa, **absolutystyczna** polityka, połąc...",6


In [41]:
training_df = non_missing_examples_df[non_missing_examples_df['n_synset_per_lemma_with_example'] > 1]
training_df = training_df.reset_index(drop=True)
training_df.head()

Unnamed: 0,lemma,POS,synset,example,n_synset_per_lemma_with_example
0,absolutny,adj,s238698,"**absolutystyczny**, oparty na zasadach, idei ...",6
1,absolutny,adj,s238698,Różnica między monarchią **absolutną** a despo...,6
2,absolutny,adj,s238698,Księstwo oficjalnie było de jure od 1905 monar...,6
3,absolutny,adj,s238698,Aby zaprowadzić rządy **absolutne** i despotyc...,6
4,absolutny,adj,s238698,"Jego nowa, **absolutystyczna** polityka, połąc...",6


In [194]:
examples = []

for i in tqdm(range(len(training_df))):
  lemma, synset, example = training_df['lemma'].loc[i], training_df['synset'].loc[i], training_df['example'].loc[i] #lista wszystkich indeksów wierszy z tymi samymi synsetami
  trues = training_df.index[training_df['synset'] == synset].tolist() #lista wszystkich indeksów wierszy z tymi samymi synsetami z wykluczeniem obecnego indeksu w pętli
  trues = [x for x in trues if x != i] #lista wszystkich indeksów wierszy z tymi samymi lematami
  falses = training_df.index[training_df['lemma'] == lemma].tolist() #lista wszystkich indeksów wierszy z tymi samymi lematami z wykluczeniem obecnego indeksu w pętli
  falses = [x for x in falses if x not in trues and x != i] 
  if len(trues) > 0:
      rp = random.choice(trues)
      lp, ep = training_df['lemma'].loc[rp], training_df['example'].loc[rp]
      examples.append({
          'l1': lemma,
          'e1': example,
          'l2': lp,
          'e2': ep,
          'label': True #wiersze dla których dwa example mówią o tym samym synsecie
      })
  if len(falses) > 0: 
      rn = random.choice(falses)
      ln, en = training_df['lemma'].loc[rn], training_df['example'].loc[rn]
      examples.append({
          'l1': lemma,
          'e1': example,
          'l2': ln,
          'e2': en,
          'label': False #wiersze dla których dwa example mówią o innym synsecie
      })

  0%|          | 0/233238 [00:00<?, ?it/s]

In [197]:
examples[60:100]

[{'l1': 'afgański',
  'e1': 'Nowy rząd **afgański** w dobrym geście i w celu rozwoju stosunków międzynarodowych, podarował trzy samoloty Airbus A300, odkupione od Air India.',
  'l2': 'afgański',
  'e2': 'związny z Afganistanem.',
  'label': True},
 {'l1': 'afgański',
  'e1': 'Nowy rząd **afgański** w dobrym geście i w celu rozwoju stosunków międzynarodowych, podarował trzy samoloty Airbus A300, odkupione od Air India.',
  'l2': 'afgański',
  'e2': 'W czasie podróży nauczyłem się kilku **afgańskich** słów.',
  'label': False},
 {'l1': 'afgański',
  'e1': 'związny z Afganistanem.',
  'l2': 'afgański',
  'e2': 'związany z Afganistanem.',
  'label': True},
 {'l1': 'afgański',
  'e1': 'związny z Afganistanem.',
  'l2': 'afgański',
  'e2': 'Też mam długopis zrobiony z łuski, ale mój nie jest **afgański**.',
  'label': False},
 {'l1': 'afgański',
  'e1': 'związany z Afganistanem.',
  'l2': 'afgański',
  'e2': 'Nowy rząd **afgański** w dobrym geście i w celu rozwoju stosunków międzynarodowych

In [42]:
l = [training_df, examples]
with open('wsddata.pkl', 'wb') as fp:
    pickle.dump(l, fp)

NameError: name 'examples' is not defined

Model

In [42]:
def create_tuple(example):
  l1, l2, e1, e2 = example['l1'], example['l2'], example['e1'], example['e2'] 
  return (l1 +': ' + e1, l2 + ': ' + e2)

In [43]:
class AverageMeter:
    """
    Computes and stores the average and current value
    """
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [44]:
class EarlyStopping:
    def __init__(self, patience=7, delta=0.0001):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        self.val_score = -np.Inf
            
    def __call__(self, epoch_score, model, model_path):
      
        score = np.copy(epoch_score)

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(
                "EarlyStopping counter: {} out of {}".format(
                    self.counter, self.patience
                )
            )
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        if epoch_score not in [-np.inf, np.inf, -np.nan, np.nan]:
            print(
                "Validation score improved ({} --> {}). Saving model!".format(
                    self.val_score, epoch_score
                )
            )
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [93]:
pretrained_model = "allegro/herbert-klej-cased-v1"
LR = 5e-5
WARMUP_STEPS = 8
MODEL_PATH = "WSD_herbert_model1.bin"
TOKENIZER = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 3

In [48]:
pretrained_model = 'dkleczek/bert-base-polish-uncased-v1'
LR = 5e-5
WARMUP_STEPS = 8
MODEL_PATH = "WSD_polbert_model2.bin"
TOKENIZER = BertTokenizer.from_pretrained(pretrained_model)

MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 3

In [46]:
class HerbertWSD(nn.Module):
    def __init__(self, pretrained_herbert_path):
        super(HerbertWSD, self).__init__()
        self.pretrained_herbert_path = pretrained_herbert_path
        self.herbert = AutoModel.from_pretrained(self.pretrained_herbert_path)
        self.dropout = nn.Dropout(0.25)
        self.ranking_linear = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, pooled_output = self.herbert(ids, attention_mask=mask, return_dict=False)
        do = self.dropout(pooled_output)
        return self.ranking_linear(do)

In [95]:
class PolbertWSD(nn.Module):
    def __init__(self, pretrained_polbert_path):
        super(PolbertWSD, self).__init__()
        self.pretrained_polbert_path = pretrained_polbert_path
        self.polbert = BertModel.from_pretrained(self.pretrained_polbert_path)
        self.dropout = nn.Dropout(0.25)
        self.ranking_linear = nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        _, pooled_output = self.polbert(ids, attention_mask=mask, return_dict=False)
        do = self.dropout(pooled_output)
        return self.ranking_linear(do)

In [47]:
class PrepareTrainingDataset:
    def __init__(self, examples, max_length, tokenizer):
        self.examples = examples
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        tpl = create_tuple(self.examples[item])
        tgt = int(self.examples[item]['label'])
        enc = self.tokenizer.encode_plus(tpl[0], tpl[1], padding='max_length', truncation='longest_first', max_length=self.max_length)      
        return {
            "ids": torch.tensor(enc.input_ids, dtype=torch.long),
            "mask": torch.tensor(enc.attention_mask, dtype=torch.long),
            "token_type_ids": torch.tensor(enc.token_type_ids, dtype=torch.long),
            "target": torch.tensor(tgt, dtype=torch.float) 
        }

In [48]:
class PrepareTestDataset:
    def __init__(self, examples, max_length, tokenizer):
        self.examples = examples
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        tpl = create_tuple(self.examples[item])
        enc = self.tokenizer.encode_plus(tpl[0], tpl[1], padding='max_length', truncation='longest_first', max_length=self.max_length)      
        return {
            "ids": torch.tensor(enc.input_ids, dtype=torch.long),
            "mask": torch.tensor(enc.attention_mask, dtype=torch.long),
            "token_type_ids": torch.tensor(enc.token_type_ids, dtype=torch.long)
        }        

In [49]:
def loss_function(logits, targets):
    return nn.BCEWithLogitsLoss()(logits, targets.view(-1, 1))

In [50]:
def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    losses = AverageMeter()

    tk0 = tqdm(data_loader, total=len(data_loader), mininterval=1.)

    for bi, d in enumerate(tk0):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]
        targets = d["target"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        losses.update(loss.item(), ids.size(0))
        tk0.set_postfix(loss=losses.avg)

def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    fin_loss = []

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader), mininterval=1.)
        for bi, d in enumerate(tk0):
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]
            targets = d["target"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            loss = loss_function(outputs, targets)

            fin_targets.append(targets.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())
            fin_loss.append(loss.cpu().detach().numpy())

    return np.concatenate(fin_outputs, axis=0), np.concatenate(fin_targets, axis=0), np.average(fin_loss)

def final_eval(data_loader, model, device):
    model.eval()
    fin_outputs = []

    with torch.no_grad():
        for bi, d in enumerate(data_loader):
            ids = d["ids"]
            mask = d["mask"]
            token_type_ids = d["token_type_ids"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)

            outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            fin_outputs.append(outputs.cpu().detach().numpy())

    return np.concatenate(fin_outputs, axis=0)

In [51]:
def run(train, valid, pretrained_model):

    train_dataset = PrepareTrainingDataset(
        examples=train,   
        max_length=MAX_LEN,
        tokenizer=TOKENIZER
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True,
        pin_memory=True
    )
    valid_dataset = PrepareTrainingDataset(
        examples=valid,   
        max_length=MAX_LEN,
        tokenizer=TOKENIZER
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=VALID_BATCH_SIZE,
        shuffle=False,
        pin_memory=True
    )

    device = 'cuda'
    num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
    #model = PolbertWSD(pretrained_model).to(device)
    model = HerbertWSD(pretrained_model).to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    
    optimizer_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if (any(nd in n for nd in no_decay))], 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_parameters, lr=LR)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=WARMUP_STEPS, 
        num_training_steps=num_train_steps
    )

    es = EarlyStopping(patience=4)
    print(f"Training is starting")

    for epoch in range(EPOCHS):
        train_loop_fn(train_data_loader, model, optimizer, device, scheduler=scheduler)
        outputs, targets, vloss = eval_loop_fn(valid_data_loader, model, device)
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"epoch = {epoch}, accuracy = {accuracy}, valid_loss = {vloss}")
        es(accuracy, model, model_path=MODEL_PATH)
        if es.early_stop:
            print("Early stopping")
            break
        

In [52]:
def evaluate(examples, model):

    test_dataset = PrepareTestDataset(
        examples=examples,   
        max_length=MAX_LEN,
        tokenizer=TOKENIZER
    )
    test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=TEST_BATCH_SIZE,
        shuffle=False,
        pin_memory=True
    )

    device = 'cuda'
    outputs = final_eval(test_data_loader, model, device)

    return outputs

Load data

In [53]:
with open('wsddata.pkl', 'rb') as fp:
    [training_df, examples] = pickle.load(fp)

In [54]:
from numpy.random import shuffle
shuffle(examples)

train = examples[:-20000]
valid = examples[-20000:]

In [64]:
train_dataset = PrepareTrainingDataset(
        examples=train,   
        max_length=MAX_LEN,
        tokenizer=TOKENIZER
    )

In [65]:
(train_dataset.examples)

[{'l1': 'cywil',
  'e1': 'W ostatniej wojnie ucierpiało przede wszystkim wielu **cywilów**.',
  'l2': 'cywil',
  'e2': 'człowiek, który nie jest w czynnej służbie uzbrojonych formacji mundurowych.',
  'label': True},
 {'l1': 'objazd',
  'e1': 'ostre pouczenie, złajanie, **reprymenda**.',
  'l2': 'objazd',
  'e2': 'podróż, kolejne odwiedzanie różnych miejscowości.',
  'label': False},
 {'l1': 'śmierdzący',
  'e1': 'taki, który śmierdzi.',
  'l2': 'śmierdzący',
  'e2': 'Russula foetens to gatunek pospolitych śmierdzących grzybów leśnych znanych jako gołąbki **śmierdzące**.',
  'label': True},
 {'l1': 'kormoran białolicy',
  'e1': 'Microcarbo melanoleucos - gatunek dużego ptaka wodnego z rodziny kormoranów (Phalacrocoracidae); zamieszkuje głównie kontynent australijski, ale bywa również spotykany na Nowej Gwinei, w Nowej Zelandii, Indonezji i na pobliskich wysepkach południowo-zachodniego Pacyfiku oraz na obszarze subantarktycznym.',
  'l2': 'kormoran białolicy',
  'e2': 'Microcarbo melan

In [66]:
pretrained_model

'allegro/herbert-base-cased'

Model running and saving

In [67]:
torch.cuda.is_available()

True

In [68]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [69]:
run(train, valid, pretrained_model)

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.sso.sso_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training is starting


  0%|          | 0/13234 [00:00<?, ?it/s]

RuntimeError: stack expects each tensor to be equal size, but got [1, 48] at entry 0 and [1, 42] at entry 1

In [84]:
train_valid = [train, valid]
with open('train_valid_data_herbert_klej.pkl', 'wb') as fp:
    pickle.dump(train_valid, fp)

# Disambiguation process

In [102]:
def get_related_synsets(synset, synset_rels_df):
  return synset_rels_df['synset2'][synset_rels_df['synset1'] == synset].tolist()

get_related_synsets('s14', synset_rels_df)

['s7639', 's7901', 's79350', 's228864', 's476109']

In [103]:
def disambiguate(lemma, 
                 row, 
                 df_conll, 
                 df_conll_prev, 
                 df_conll_next, 
                 lemma_synset_pairs_df, 
                 synsets_examples_df, 
                 tokenizer, 
                 model):
  
  words = df_conll['ORTH'].tolist() #lista wszystkich słów w zdaniu
  
  if isinstance(row, tuple):
    orth = list(df_conll['ORTH'].iloc[list(row)])
    words = ['**' + x + '**' if x in orth else x for x in words]
    row = [str(r) for r in row]
    row = ",".join(row)
  else:
    orth = df_conll['ORTH'].iloc[row] #właściwa forma słowa 
    words = ['**' + x + '**' if x == orth else x for x in words] #zmienna orth **(obecny wyraz)**
  
  sent1 = ' '.join(words) #całe zdanie z jednym wyrazem **.**
    
  if isinstance(df_conll_prev, str):
    if_prev = False
  else:
    if_prev = True
    sent_prev = ' '.join(df_conll_prev['ORTH'].tolist())
  if isinstance(df_conll_next, str):
    if_next = False
  else:
    if_next = True
    sent_next = ' '.join(df_conll_next['ORTH'].tolist())

  # find all synsets corresponding to lemma
  synsets = lemma_synset_pairs_df['synset'][lemma_synset_pairs_df['lemma'] == lemma].tolist() #wszystkie możliwe synsety dla lematu obecnego wyrazu 
  # find all examples per synset
  scores = []
  scores_prev = []
  scores_next = []
  for x in synsets: #x = każdy synset możliwy
    exmpls = synsets_examples_df[synsets_examples_df['synset'] == x]['example'].tolist() #ściągamy wszystkie zdania z examples dla danego synsetu w pętli
    if len(exmpls) == 0:
      related_synsets = get_related_synsets(x, synset_rels_df)
      for s in related_synsets:
        exmpls.extend(synsets_examples_df[synsets_examples_df['synset'] == s]['example'].tolist())    
    if len(exmpls) == 0:  #dla synsetu który nie posiada żadnego example dajemy score -999
      scores.append(-999)
      scores_prev.append(-999)
      scores_next.append(-999)
      continue #przejdź do kolejnego synsetu
    examples = []
    for ex in exmpls: #dla kazdego example z synsetu w pętli
        examples.append({
          'l1': lemma,
          'e1': sent1,
          'l2': lemma,
          'e2': ex 
        })
    scrs = evaluate(examples, model) #w examples mamy tyle elementów ile jest examples dla danego synset, różniące się tylko tym example -> lista scorów
    scores.append(scrs)
    if if_prev:
        examples = []
        for ex in exmpls: #dla kazdego example z synsetu w pętli
            examples.append({
              'l1': lemma,
              'e1': sent_prev,
              'l2': lemma,
              'e2': ex 
            })
        scrs = evaluate(examples, model) #w examples mamy tyle elementów ile jest examples dla danego synset, różniące się tylko tym example -> lista scorów
        scores_prev.append(scrs)
    else:
        scores_prev.append(-999)

    if if_next:
        examples = []
        for ex in exmpls: #dla kazdego example z synsetu w pętli
            examples.append({
              'l1': lemma,
              'e1': sent_next,
              'l2': lemma,
              'e2': ex 
            })
        scrs = evaluate(examples, model) #w examples mamy tyle elementów ile jest examples dla danego synset, różniące się tylko tym example -> lista scorów
        scores_next.append(scrs)
    else:
        scores_next.append(-999)
        

  return pd.DataFrame({'idx' : str(row),
                         'synset' : synsets,
                         'scores(i-1)' : scores_prev,
                         'scores(i)' : scores,
                         'scores(i+1)' : scores_next})


In [104]:
def disambiguate_lesk(lemma, 
                 row, 
                 df_conll, 
                 df_conll_prev, 
                 df_conll_next, 
                 lemma_synset_pairs_df, 
                 synsets_examples_df
                 ):
  
  words1 = df_conll['ORTH'].tolist() #lista wszystkich słów w zdaniu

  if isinstance(row, tuple):
    orth = list(df_conll['ORTH'].iloc[list(row)])
    words1 = [('**' + x + '**').lower() if x in orth else x.lower() for x in words1]
    words1 = [x for x in words1 if x not in [".", ","]]
    row = [str(r) for r in row]
    row = ",".join(row)
  else:
    orth = df_conll['ORTH'].iloc[row] #właściwa forma słowa 
    words1 = [('**' + x + '**').lower() if x == orth else x.lower() for x in words1] #zmienna orth **(obecny wyraz)**
    words1 = [x for x in words1 if x not in [".", ","]]
    
  if isinstance(df_conll_prev, str):
    if_prev = False
  else:
    if_prev = True
    words_prev = df_conll_prev['ORTH'].tolist() 
    words_prev = [x.lower() for x in words_prev]
    words_prev = [x for x in words_prev if x not in [".", ","]]
  if isinstance(df_conll_next, str):
    if_next = False
  else:
    if_next = True
    words_next = df_conll_next['ORTH'].tolist() 
    words_next = [x.lower() for x in words_next]
    words_next = [x for x in words_next if x not in [".", ","]]

  # find all synsets corresponding to lemma
  synsets = lemma_synset_pairs_df['synset'][lemma_synset_pairs_df['lemma'] == lemma].tolist() #wszystkie możliwe synsety dla lematu obecnego wyrazu 
  # find all examples per synset
  scores = []
  scores_prev = []
  scores_next = []
  for x in synsets: #x = każdy synset możliwy
    exmpls = synsets_examples_df[synsets_examples_df['synset'] == x]['example'].tolist() #ściągamy wszystkie zdania z examples dla danego synsetu w pętli
    if len(exmpls) == 0:
      related_synsets = get_related_synsets(x, synset_rels_df)
      for s in related_synsets:
        exmpls.extend(synsets_examples_df[synsets_examples_df['synset'] == s]['example'].tolist())    
    if len(exmpls) == 0:  #dla synsetu który nie posiada żadnego example dajemy score -999
      scores.append(-999)
      scores_prev.append(-999)
      scores_next.append(-999)
      continue #przejdź do kolejnego synsetu
    examples = []
    scrs = []
    for ex in exmpls: #dla kazdego example z synsetu w pętli
        ex_list = ex.split(" ")
        ex_list = [x.replace(",", "") for x in ex_list]
        ex_list = [x.replace(".", "") for x in ex_list]
        scr = len(set(words1).intersection(set(ex_list)))
        scrs.append(scr)
    scores.append(scrs)
    if if_prev:
        examples = []
        scrs = []
        for ex in exmpls: #dla kazdego example z synsetu w pętli
            ex_list = ex.split(" ")
            ex_list = [x.replace(",", "") for x in ex_list]
            ex_list = [x.replace(".", "") for x in ex_list]
            scr = len(set(words_prev).intersection(set(ex_list)))
            scrs.append(scr)
        scores_prev.append(scrs)
    else:
        scores_prev.append(-999)

    if if_next:
        examples = []
        scrs = []
        for ex in exmpls: #dla kazdego example z synsetu w pętli
            ex_list = ex.split(" ")
            ex_list = [x.replace(",", "") for x in ex_list]
            ex_list = [x.replace(".", "") for x in ex_list]
            scr = len(set(words_next).intersection(set(ex_list)))
            scrs.append(scr)
        scores_next.append(scrs)
    else:
        scores_next.append(-999)
        

  return pd.DataFrame({'idx' : str(row),
                         'synset' : synsets,
                         'scores(i-1)' : scores_prev,
                         'scores(i)' : scores,
                         'scores(i+1)' : scores_next})


In [105]:
import itertools
from datetime import datetime

def findsubsets(s, n): 
    return list(itertools.combinations(s, n)) 

#findsubsets(iter_lemmas, 2).extend(findsubsets(iter_lemmas, 3))

In [106]:
mwe_lemma_synset_pairs_df = lemma_synset_pairs_df.copy()
mwe_lemma_synset_pairs_df["LEMMA_LEN"] = mwe_lemma_synset_pairs_df.lemma.apply(lambda x: len(str(x).split(" ")))
mwe_lemma_synset_pairs_df = mwe_lemma_synset_pairs_df[mwe_lemma_synset_pairs_df["LEMMA_LEN"] > 1]
mwe_lemma_synset_pairs_df = mwe_lemma_synset_pairs_df.reset_index()
mwe_split_lemmas = mwe_lemma_synset_pairs_df.lemma.str.split(" ")

In [107]:
def join_lemmas_and_idx(x):
    lemmas = []
    indexes = []
    for y in x:
        lemmas.append(y[1])
        indexes.append(y[0])
    lemmas = " ".join(lemmas)
    lemmas = lemmas.lower()
    return [lemmas, indexes]

In [108]:
def get_num_list(str_list):
    y = []
    for x in str_list:
        y.append(int(x))
    return y

In [109]:
def get_max_score(score_tuple):
    if isinstance(score_tuple, int):
        return score_tuple
    scores = []
    for x in score_tuple:
        scores.append(x[0])
    return max(scores)

In [110]:
def get_avg_score(score_tuple):
    if isinstance(score_tuple, int):
        return score_tuple
    scores = []
    for x in score_tuple:
        scores.append(x[0])
    return sum(scores)/len(scores)

In [111]:
def get_max_score_lesk(score_tuple):
    if isinstance(score_tuple, int):
        return score_tuple
    return max(score_tuple)

In [132]:
max_id =len(sent_lemmas)
score_type = "avg_score"

In [142]:
single_words_df

Unnamed: 0,idx,synset,scores(i-1),scores(i),scores(i+1),if_mwe,if_best,max_score(i-1),max_score(i),max_score(i+1),avg_score(i-1),avg_score(i),avg_score(i+1),final_score,idx_num
0,[0],s15934,[[-7.158111]],[[-7.234226]],[[-7.0458474]],False,True,-7.158111,-7.234226,-7.045847,-7.158111,-7.234226,-7.045847,-7.234226,0
0,[1],s71295,"[[-3.245772], [-0.31632456], [0.17044613], [-2...","[[-4.4789824], [-2.4577265], [-4.3900414], [-0...","[[-3.656093], [0.9071501], [-6.4459662], [-3.9...",False,True,0.170446,0.620044,0.90715,-1.54114,-2.628428,-3.039646,-2.628428,1
1,[2],s66378,"[[-1.8357943], [-1.2131376], [3.707624], [-0.1...","[[-3.7450478], [2.4078987], [2.7497523], [-0.8...","[[-1.9999372], [0.16313665], [4.6035733], [-2....",False,True,3.707624,2.749752,4.603573,0.126507,0.134032,-0.015744,0.134032,2
3,[4],s50720,"[[-0.53573906], [-1.9566714], [-1.2967299], [-...","[[3.6641474], [-2.0944066], [-2.901428], [-0.8...","[[-0.8249446], [-4.1488614], [0.018809356], [-...",False,True,-0.535739,3.664147,0.018809,-1.293955,-0.554717,-1.649142,-0.554717,4
0,[5],s460680,"[[-1.6091082], [-1.7557523], [2.4647791]]","[[-2.96921], [-2.2987082], [2.9362938]]","[[-5.759439], [-3.849377], [1.8745321]]",False,True,2.464779,2.936294,1.874532,-0.300027,-0.777208,-2.578095,-0.777208,5
4,[6],s7067125,"[[-1.896687], [-6.366314]]","[[2.1000245], [-0.056473486]]","[[-1.4830059], [-1.0833529]]",False,True,-1.896687,2.100024,-1.083353,-4.1315,1.021775,-1.283179,1.021775,6
0,[7],s102822,"[[-3.2860887], [-3.1795993], [-4.1905103]]","[[-3.7476423], [-0.7648507], [-1.6670487]]","[[2.9009638], [-4.4341326], [1.211641]]",False,True,-3.179599,-0.764851,2.900964,-3.552066,-2.059847,-0.107176,-2.059847,7


In [144]:
sentence_df

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,LEMMA,CTAG,FROM,TO,SENT,WN_ID_X,WN_ID_A,WN_ID_A_X,WN_ID_AM,WN_ID_AM_X,WN_ID_MX,WN_ID_MX_X,WN_ID_M,WN_ID_M_X,WN_ID_W_X,WN_ID_W,WN_ID
0,6,0,Na,na,prep:loc,26,27,1,s15934,_,_,_,_,_,_,_,_,_,_,s15934
1,7,1,blogach,blog,subst:pl:loc:m3,28,34,1,s71295,_,_,_,_,_,_,_,_,_,_,s71295
2,8,2,toczą,toczyć,fin:pl:ter:imperf,35,39,1,s250898,_,_,_,_,_,_,_,_,_,_,s250898
3,9,3,się,się,qub,40,42,1,s250898,_,_,_,_,_,_,_,_,_,_,s250898
4,10,4,czasem,czas,subst:sg:inst:m3,43,48,1,s50720,_,_,_,_,_,_,_,_,_,_,s50720
5,11,5,bardzo,bardzo,adv:pos,49,54,1,s460680,_,_,_,_,_,_,_,_,_,_,s460680
6,12,6,inteligentne,inteligentny,adj:pl:nom:m3:pos,55,66,1,s7067125,_,_,_,_,_,_,_,_,_,_,s7067125
7,13,7,spory,spór,subst:pl:nom:m3,67,71,1,s102822,_,_,_,_,_,_,_,_,_,_,s102822
8,14,8,.,.,interp,72,72,1,_,_,_,_,_,_,_,_,_,_,_,_


In [134]:
    
    sentence_results_df["if_best"] = False 
    
    sentence_results_df["max_score(i-1)"] = sentence_results_df["scores(i-1)"].apply(get_max_score)
    sentence_results_df["max_score(i)"] = sentence_results_df["scores(i)"].apply(get_max_score)
    sentence_results_df["max_score(i+1)"] = sentence_results_df["scores(i+1)"].apply(get_max_score)

    sentence_results_df["avg_score(i-1)"] = sentence_results_df["scores(i-1)"].apply(get_avg_score)
    sentence_results_df["avg_score(i)"] = sentence_results_df["scores(i)"].apply(get_avg_score)
    sentence_results_df["avg_score(i+1)"] = sentence_results_df["scores(i+1)"].apply(get_avg_score)
    
    if score_type == "weight_score":
        sentence_results_df["final_score"] = sentence_results_df["max_score(i-1)"]*0.25 +\
                                             sentence_results_df["max_score(i+1)"]*0.25 +\
                                             sentence_results_df["max_score(i)"]*0.5
    elif score_type == "max_avg_score":
        sentence_results_df["final_score"] = sentence_results_df[["avg_score(i)", "avg_score(i+1)", "avg_score(i-1)"]].max(axis=1)
    elif score_type == "max_max_score":
        sentence_results_df["final_score"] = sentence_results_df[["max_score(i)", "max_score(i+1)", "max_score(i-1)"]].max(axis=1)
    elif score_type == "avg_score":
        sentence_results_df["final_score"] = sentence_results_df["avg_score(i)"]
    elif score_type == "max_score":
        sentence_results_df["final_score"] = sentence_results_df["max_score(i)"]

        
    single_words_df = sentence_results_df.copy()
    single_words_df = single_words_df[~single_words_df.if_mwe]
    single_words_df["idx_num"] = single_words_df.idx.apply(lambda x: int(x[0]))
    single_words_df["if_best"] = False
    
    mwe_df = sentence_results_df.copy()
    mwe_df = mwe_df[mwe_df.if_mwe]
    mwe_df["idx_num"] = mwe_df.idx.apply(get_num_list)
    
    result_synsets_list = []
    for i in range(max_id):
        single_candidates = single_words_df.copy()
        single_candidates = single_candidates[single_candidates.idx_num == i]
        if len(single_candidates) < 1:
            continue
        best_synset = single_candidates.loc[single_candidates.final_score == max(single_candidates.final_score), 'synset'].iloc[0]
        single_words_df.loc[single_words_df['synset'] == best_synset, 'if_best'] = True
        sentence_df.loc[sentence_df['TOKEN_ID'] == i, 'WN_ID'] = best_synset
        
    single_words_df = single_words_df[single_words_df.if_best]
    mwe_idxes = mwe_df['idx_num'].drop_duplicates()
    mwe_df['rejected'] = True

    done_idxes = []

    for mwe_idx in mwe_idxes:

        if mwe_idx in done_idxes:
            continue

        done_idxes.append(mwe_idx)
        other_mwe_idxes = [x for x in mwe_idxes if x != mwe_idx]
        for done_idx in done_idxes:
            other_mwe_idxes = [x for x in other_mwe_idxes if x != done_idx]
        intersect_idxes = []
        if_scored = False
        for other_mwe_idx in other_mwe_idxes:
            if len(set(mwe_idx).intersection(other_mwe_idx)) == 0:
                continue
            else:
                possible_synsets = [x[1]['synset'] for x in mwe_df.iterrows() if (x[1]['idx_num'] == other_mwe_idx) | (x[1]['idx_num'] == mwe_idx)]
                mwe_best_synset = mwe_df.copy()
                mwe_best_synset = mwe_best_synset[mwe_best_synset.synset.isin(possible_synsets)]
                mwe_best_synset = mwe_best_synset[mwe_best_synset.final_score == max(mwe_best_synset.final_score)]['synset']
                if len(mwe_best_synset) > 1:
                    continue
                mwe_df.loc[mwe_df['synset'] == mwe_best_synset.iloc[0], 'rejected'] = False
                if_scored = True
                done_idxes.append(other_mwe_idx)
                break
        if if_scored:
            continue

        possible_synsets = [x[1]['synset'] for x in mwe_df.iterrows() if (x[1]['idx_num'] == mwe_idx)]
        mwe_best_synset = mwe_df.copy()
        mwe_best_synset = mwe_best_synset[mwe_best_synset.synset.isin(possible_synsets)]
        mwe_best_synset = mwe_best_synset[mwe_best_synset.final_score == max(mwe_best_synset.final_score)]['synset']
        if len(mwe_best_synset) > 1:
            continue
        mwe_df.loc[mwe_df['synset'] == mwe_best_synset.iloc[0], 'rejected'] = False

    mwe_df = mwe_df[mwe_df.rejected == False]
    
    sentence_df_mwe = sentence_df.copy()
    
    for i in range(len(mwe_df)):
        missing_words = False
        mwe_word_idxes = mwe_df.loc[:, 'idx_num'].iloc[i]
        score_mwe = mwe_df.loc[:, 'final_score'].iloc[i]
        scores_non_mwe = []
        for idx in mwe_word_idxes:
            if len(single_words_df.loc[single_words_df.idx_num == idx, 'final_score']) == 0:
                missing_words = True
                break
            scores_non_mwe.append(single_words_df.loc[single_words_df.idx_num == idx, 'final_score'].iloc[0])
        if missing_words == True:
            choosen_synset = mwe_df.loc[:, 'synset'].iloc[i]
            for idx in mwe_word_idxes:
                sentence_df.loc[sentence_df.TOKEN_ID == idx, 'WN_ID'] = choosen_synset
        elif statistics.mean(scores_non_mwe) < score_mwe:
            choosen_synset = mwe_df.loc[:, 'synset'].iloc[i]
            for idx in mwe_word_idxes:
                sentence_df.loc[sentence_df.TOKEN_ID == idx, 'WN_ID'] = choosen_synset        

    for i in range(len(mwe_df)):
        missing_words = False
        mwe_word_idxes = mwe_df.loc[:, 'idx_num'].iloc[i]
        score_mwe = mwe_df.loc[:, 'final_score'].iloc[i]
        choosen_synset = mwe_df.loc[:, 'synset'].iloc[i]
        for idx in mwe_word_idxes:
            sentence_df_mwe.loc[sentence_df_mwe.TOKEN_ID == idx, 'WN_ID'] = choosen_synset
    
    sentence_df = sentence_df.drop("WN_ID_X", axis = 1)
    sentence_df_mwe = sentence_df_mwe.drop("WN_ID_X", axis = 1)
    sentence_df = sentence_df.rename(columns = {"WN_ID":"WN_ID_X"}).merge(sentence_df_mwe.loc[:, ["TOKEN_ID", "WN_ID"]], on = "TOKEN_ID")


In [112]:
def get_num_list(str_list):
    y = []
    for x in str_list:
        y.append(int(x))
    return y
    

def synset_decision(sentence_results_df, sntc_df, max_id, score_type):
    
    sentence_df = sntc_df.copy()
    
    sentence_results_df["if_best"] = False 
    
    sentence_results_df["max_score(i-1)"] = sentence_results_df["scores(i-1)"].apply(get_max_score)
    sentence_results_df["max_score(i)"] = sentence_results_df["scores(i)"].apply(get_max_score)
    sentence_results_df["max_score(i+1)"] = sentence_results_df["scores(i+1)"].apply(get_max_score)

    sentence_results_df["avg_score(i-1)"] = sentence_results_df["scores(i-1)"].apply(get_avg_score)
    sentence_results_df["avg_score(i)"] = sentence_results_df["scores(i)"].apply(get_avg_score)
    sentence_results_df["avg_score(i+1)"] = sentence_results_df["scores(i+1)"].apply(get_avg_score)
    
    if score_type == "weight_score":
        sentence_results_df["final_score"] = sentence_results_df["max_score(i-1)"]*0.25 +\
                                             sentence_results_df["max_score(i+1)"]*0.25 +\
                                             sentence_results_df["max_score(i)"]*0.5
    elif score_type == "max_avg_score":
        sentence_results_df["final_score"] = sentence_results_df[["avg_score(i)", "avg_score(i+1)", "avg_score(i-1)"]].max(axis=1)
    elif score_type == "max_max_score":
        sentence_results_df["final_score"] = sentence_results_df[["max_score(i)", "max_score(i+1)", "max_score(i-1)"]].max(axis=1)
    elif score_type == "avg_score":
        sentence_results_df["final_score"] = sentence_results_df["avg_score(i)"]
    elif score_type == "max_score":
        sentence_results_df["final_score"] = sentence_results_df["max_score(i)"]

        
    single_words_df = sentence_results_df.copy()
    single_words_df = single_words_df[~single_words_df.if_mwe]
    single_words_df["idx_num"] = single_words_df.idx.apply(lambda x: int(x[0]))
    single_words_df["if_best"] = False
    
    mwe_df = sentence_results_df.copy()
    mwe_df = mwe_df[mwe_df.if_mwe]
    mwe_df["idx_num"] = mwe_df.idx.apply(get_num_list)
    
    result_synsets_list = []
    for i in range(max_id):
        single_candidates = single_words_df.copy()
        single_candidates = single_candidates[single_candidates.idx_num == i]
        if len(single_candidates) < 1:
            continue
        best_synset = single_candidates.loc[single_candidates.final_score == max(single_candidates.final_score), 'synset'].iloc[0]
        single_words_df.loc[single_words_df['synset'] == best_synset, 'if_best'] = True
        sentence_df.loc[sentence_df['TOKEN_ID'] == i, 'WN_ID'] = best_synset
        
    single_words_df = single_words_df[single_words_df.if_best]
    mwe_idxes = mwe_df['idx_num'].drop_duplicates()
    mwe_df['rejected'] = True

    done_idxes = []

    for mwe_idx in mwe_idxes:

        if mwe_idx in done_idxes:
            continue

        done_idxes.append(mwe_idx)
        other_mwe_idxes = [x for x in mwe_idxes if x != mwe_idx]
        for done_idx in done_idxes:
            other_mwe_idxes = [x for x in other_mwe_idxes if x != done_idx]
        intersect_idxes = []
        if_scored = False
        for other_mwe_idx in other_mwe_idxes:
            if len(set(mwe_idx).intersection(other_mwe_idx)) == 0:
                continue
            else:
                possible_synsets = [x[1]['synset'] for x in mwe_df.iterrows() if (x[1]['idx_num'] == other_mwe_idx) | (x[1]['idx_num'] == mwe_idx)]
                mwe_best_synset = mwe_df.copy()
                mwe_best_synset = mwe_best_synset[mwe_best_synset.synset.isin(possible_synsets)]
                mwe_best_synset = mwe_best_synset[mwe_best_synset.final_score == max(mwe_best_synset.final_score)]['synset']
                if len(mwe_best_synset) > 1:
                    continue
                mwe_df.loc[mwe_df['synset'] == mwe_best_synset.iloc[0], 'rejected'] = False
                if_scored = True
                done_idxes.append(other_mwe_idx)
                break
        if if_scored:
            continue

        possible_synsets = [x[1]['synset'] for x in mwe_df.iterrows() if (x[1]['idx_num'] == mwe_idx)]
        mwe_best_synset = mwe_df.copy()
        mwe_best_synset = mwe_best_synset[mwe_best_synset.synset.isin(possible_synsets)]
        mwe_best_synset = mwe_best_synset[mwe_best_synset.final_score == max(mwe_best_synset.final_score)]['synset']
        if len(mwe_best_synset) > 1:
            continue
        mwe_df.loc[mwe_df['synset'] == mwe_best_synset.iloc[0], 'rejected'] = False

    mwe_df = mwe_df[mwe_df.rejected == False]
    
    sentence_df_mwe = sentence_df.copy()
    
    for i in range(len(mwe_df)):
        missing_words = False
        mwe_word_idxes = mwe_df.loc[:, 'idx_num'].iloc[i]
        score_mwe = mwe_df.loc[:, 'final_score'].iloc[i]
        scores_non_mwe = []
        for idx in mwe_word_idxes:
            if len(single_words_df.loc[single_words_df.idx_num == idx, 'final_score']) == 0:
                missing_words = True
                break
            scores_non_mwe.append(single_words_df.loc[single_words_df.idx_num == idx, 'final_score'].iloc[0])
        if missing_words == True:
            choosen_synset = mwe_df.loc[:, 'synset'].iloc[i]
            for idx in mwe_word_idxes:
                sentence_df.loc[sentence_df.TOKEN_ID == idx, 'WN_ID'] = choosen_synset
        elif statistics.mean(scores_non_mwe) < score_mwe:
            choosen_synset = mwe_df.loc[:, 'synset'].iloc[i]
            for idx in mwe_word_idxes:
                sentence_df.loc[sentence_df.TOKEN_ID == idx, 'WN_ID'] = choosen_synset        

    for i in range(len(mwe_df)):
        missing_words = False
        mwe_word_idxes = mwe_df.loc[:, 'idx_num'].iloc[i]
        score_mwe = mwe_df.loc[:, 'final_score'].iloc[i]
        choosen_synset = mwe_df.loc[:, 'synset'].iloc[i]
        for idx in mwe_word_idxes:
            sentence_df_mwe.loc[sentence_df_mwe.TOKEN_ID == idx, 'WN_ID'] = choosen_synset
    
    sentence_df = sentence_df.drop("WN_ID_X", axis = 1)
    sentence_df_mwe = sentence_df_mwe.drop("WN_ID_X", axis = 1)
    sentence_df = sentence_df.rename(columns = {"WN_ID":"WN_ID_X"}).merge(sentence_df_mwe.loc[:, ["TOKEN_ID", "WN_ID"]], on = "TOKEN_ID")
                
    return sentence_df
    

In [113]:
def synset_decision_lesk(sentence_results_df, sntc_df, max_id, weight_score = True):
    
    sentence_df = sntc_df.copy()
    
    sentence_results_df["if_best"] = False 
    
    sentence_results_df["max_score(i-1)"] = sentence_results_df["scores(i-1)"].apply(get_max_score_lesk)
    sentence_results_df["max_score(i)"] = sentence_results_df["scores(i)"].apply(get_max_score_lesk)
    sentence_results_df["max_score(i+1)"] = sentence_results_df["scores(i+1)"].apply(get_max_score_lesk)
    
    if weight_score:
        sentence_results_df["final_score"] = sentence_results_df["max_score(i-1)"]*0.25 +\
                                             sentence_results_df["max_score(i+1)"]*0.25 +\
                                             sentence_results_df["max_score(i)"]*0.5
    else:
        sentence_results_df["final_score"] = sentence_results_df[["max_score(i)", "max_score(i+1)", "max_score(i-1)"]].max(axis=1)
    
    single_words_df = sentence_results_df.copy()
    single_words_df = single_words_df[~single_words_df.if_mwe]
    single_words_df["idx_num"] = single_words_df.idx.apply(lambda x: int(x[0]))
    single_words_df["if_best"] = False
    
    mwe_df = sentence_results_df.copy()
    mwe_df = mwe_df[mwe_df.if_mwe]
    mwe_df["idx_num"] = mwe_df.idx.apply(get_num_list)
    
    result_synsets_list = []
    for i in range(max_id):
        single_candidates = single_words_df.copy()
        single_candidates = single_candidates[single_candidates.idx_num == i]
        if len(single_candidates) < 1:
            continue
        best_synset = single_candidates.loc[single_candidates.final_score == max(single_candidates.final_score), 'synset'].iloc[0]
        single_words_df.loc[single_words_df['synset'] == best_synset, 'if_best'] = True
        sentence_df.loc[sentence_df['TOKEN_ID'] == i, 'WN_ID'] = best_synset
        
    single_words_df = single_words_df[single_words_df.if_best]
    mwe_idxes = mwe_df['idx_num'].drop_duplicates()
    mwe_df['rejected'] = True

    done_idxes = []

    for mwe_idx in mwe_idxes:

        if mwe_idx in done_idxes:
            continue

        done_idxes.append(mwe_idx)
        other_mwe_idxes = [x for x in mwe_idxes if x != mwe_idx]
        for done_idx in done_idxes:
            other_mwe_idxes = [x for x in other_mwe_idxes if x != done_idx]
        intersect_idxes = []
        if_scored = False
        for other_mwe_idx in other_mwe_idxes:
            if len(set(mwe_idx).intersection(other_mwe_idx)) == 0:
                continue
            else:
                possible_synsets = [x[1]['synset'] for x in mwe_df.iterrows() if (x[1]['idx_num'] == other_mwe_idx) | (x[1]['idx_num'] == mwe_idx)]
                mwe_best_synset = mwe_df.copy()
                mwe_best_synset = mwe_best_synset[mwe_best_synset.synset.isin(possible_synsets)]
                mwe_best_synset = mwe_best_synset[mwe_best_synset.final_score == max(mwe_best_synset.final_score)]['synset']
                if len(mwe_best_synset) > 1:
                    continue
                mwe_df.loc[mwe_df['synset'] == mwe_best_synset.iloc[0], 'rejected'] = False
                if_scored = True
                done_idxes.append(other_mwe_idx)
                break
        if if_scored:
            continue

        possible_synsets = [x[1]['synset'] for x in mwe_df.iterrows() if (x[1]['idx_num'] == mwe_idx)]
        mwe_best_synset = mwe_df.copy()
        mwe_best_synset = mwe_best_synset[mwe_best_synset.synset.isin(possible_synsets)]
        mwe_best_synset = mwe_best_synset[mwe_best_synset.final_score == max(mwe_best_synset.final_score)]['synset']
        if len(mwe_best_synset) > 1:
            continue
        mwe_df.loc[mwe_df['synset'] == mwe_best_synset.iloc[0], 'rejected'] = False

    mwe_df = mwe_df[mwe_df.rejected == False]
    
    for i in range(len(mwe_df)):
        missing_words = False
        mwe_word_idxes = mwe_df.loc[:, 'idx_num'].iloc[i]
        score_mwe = mwe_df.loc[:, 'final_score'].iloc[i]
        scores_non_mwe = []
        for idx in mwe_word_idxes:
            if len(single_words_df.loc[single_words_df.idx_num == idx, 'final_score']) == 0:
                missing_words = True
                break
            scores_non_mwe.append(single_words_df.loc[single_words_df.idx_num == idx, 'final_score'].iloc[0])
        if missing_words == True:
            choosen_synset = mwe_df.loc[:, 'synset'].iloc[i]
            for idx in mwe_word_idxes:
                sentence_df.loc[sentence_df.TOKEN_ID == idx, 'WN_ID'] = choosen_synset
        elif statistics.mean(scores_non_mwe) < score_mwe:
            choosen_synset = mwe_df.loc[:, 'synset'].iloc[i]
            for idx in mwe_word_idxes:
                sentence_df.loc[sentence_df.TOKEN_ID == idx, 'WN_ID'] = choosen_synset        
        
    return sentence_df
    

In [114]:
def attach_disambiguated_synsets(sentence_df, df, column_name):
    df_ = df.copy()
    for ordr_id in sentence_df.ORDER_ID:
        df_.loc[df_.ORDER_ID == ordr_id, column_name] = sentence_df.loc[sentence_df.ORDER_ID == ordr_id, "WN_ID"]
    return df_

In [115]:
def attach_disambiguated_synsets(sentence_df, df, column_names):
    column_name1 = column_names[0]
    column_name2 = column_names[1]
    df_ = df.copy()

    for ordr_id in sentence_df.ORDER_ID:
        df_.loc[df_.ORDER_ID == ordr_id, column_name1] = sentence_df.loc[sentence_df.ORDER_ID == ordr_id, "WN_ID"].iloc[0]
        df_.loc[df_.ORDER_ID == ordr_id, column_name2] = sentence_df.loc[sentence_df.ORDER_ID == ordr_id, "WN_ID_X"].iloc[0]
    return df_

In [116]:
def process_conll(path):
    df = pd.read_csv(path, sep='\t')
    tknids = df.TOKEN_ID.to_list()
    sntids = []
    s = -1
    for tknid in tknids:
        if tknid == 0: s += 1
        sntids.append(s)

    df['SENT'] = sntids
    df['WN_ID'] = "_"
    df['WN_ID_X'] = "_"
    df['WN_ID_A'] = "_"
    df['WN_ID_A_X'] = "_"
    df['WN_ID_AM'] = "_"
    df['WN_ID_AM_X'] = "_"
    df['WN_ID_MX'] = "_"
    df['WN_ID_MX_X'] = "_"
    df['WN_ID_M'] = "_"
    df['WN_ID_M_X'] = "_"
    df['WN_ID_W_X'] = "_"
    df['WN_ID_W'] = "_"
    ###

    print(time.gmtime())

    for sent in set(sntids):
        sentence_results_df = pd.DataFrame(columns = ['idx', 'synset', 'scores(i-1)', 'scores(i)', 'scores(i+1)'])
        #sent = 1 #do usunięcia!!!!!
        sentence_df = df.copy()
        sentence_df = sentence_df[sentence_df.SENT == sent]
        sent_lemmas = list(sentence_df['LEMMA'])
        sent_orth = list(sentence_df.ORTH.str.lower())
        max_id = len(sent_lemmas)-4
        sent_possible_mwes = []

        for idx in range(0, max_id):
            iter_lemmas = list(enumerate(sent_lemmas[idx:idx+4], idx))
            iter_orth = list(enumerate(sent_orth[idx:idx+4], idx))
            possible_mwes = (findsubsets(iter_lemmas, 4) + findsubsets(iter_orth, 4) + 
                           findsubsets(iter_lemmas, 3) + findsubsets(iter_orth, 3))
            sent_possible_mwes.extend(possible_mwes)
            
        for idx in range(0, max_id+1):
            iter_lemmas = list(enumerate(sent_lemmas[idx:idx+3], idx))
            iter_orth = list(enumerate(sent_orth[idx:idx+3], idx))
            possible_mwes = (findsubsets(iter_lemmas, 2) + findsubsets(iter_orth, 2))
            sent_possible_mwes.extend(possible_mwes)        

        sent_possible_mwes = [join_lemmas_and_idx(x) for x in set(sent_possible_mwes)]
        possible_mwes = [[x[0].split(" ")[1] + " " + x[0].split(" ")[0], 
                          [x[1][1], x[1][0]]] for x in sent_possible_mwes if len(x[1]) == 2]
        sent_possible_mwes.extend(possible_mwes)
        
        found_mwes_df = pd.DataFrame(columns=['lemma', 'idxes'])

        for mwe in sent_possible_mwes:
            filtered_pairs_df = mwe_lemma_synset_pairs_df[mwe_lemma_synset_pairs_df.lemma == mwe[0]]
            if len(filtered_pairs_df) > 0:
                for fnd_lemma_idx in range(len(filtered_pairs_df)):
                        found_mwes_df = found_mwes_df.append({'lemma' : mwe[0],
                                                              #'synset' : filtered_pairs_df.iloc[fnd_lemma_idx, 2],
                                                              'idxes' : mwe[1]}, ignore_index=True)
        found_mwes_df['idxes'] = found_mwes_df['idxes'].apply(tuple)
        found_mwes_df = found_mwes_df.drop_duplicates()

        for i in range(len(df[df.SENT == sent])):
            lemma = sentence_df['LEMMA'].iloc[i].lower()
            synsets = lemma_synset_pairs_df['synset'][lemma_synset_pairs_df['lemma'] == lemma].tolist()
            if len(synsets) > 0:
                if sent == 0:
                    if sent == max(set(sntids)):
                        new_row = disambiguate(lemma, 
                                               i, 
                                               sentence_df,
                                               "missing",
                                               "missing",
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df, 
                                               TOKENIZER, 
                                               model)
                        sentence_results_df = sentence_results_df.append(new_row)
                    else:
                        next_sentence_df = df[df.SENT == sent+1]
                        new_row = disambiguate(lemma, 
                                               i, 
                                               sentence_df,
                                               "missing",
                                               next_sentence_df,
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df, 
                                               TOKENIZER, 
                                               model)
                        sentence_results_df = sentence_results_df.append(new_row)
                else:
                    if sent == max(set(sntids)):
                        prev_sentence_df = df[df.SENT == sent-1]
                        new_row = disambiguate(lemma, 
                                               i, 
                                               sentence_df,
                                               prev_sentence_df,
                                               "missing",
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df, 
                                               TOKENIZER, 
                                               model)
                        sentence_results_df = sentence_results_df.append(new_row)
                    else:
                        next_sentence_df = df[df.SENT == sent+1]
                        prev_sentence_df = df[df.SENT == sent-1]
                        new_row = disambiguate(lemma, 
                                               i, 
                                               sentence_df,
                                               prev_sentence_df,
                                               next_sentence_df,
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df, 
                                               TOKENIZER, 
                                               model)
                        sentence_results_df = sentence_results_df.append(new_row)


        for mwe in range(len(found_mwes_df)):
            lemma = found_mwes_df.iloc[mwe, 0]
            mwe_idxes = found_mwes_df.iloc[mwe, 1]
            if sent == 0:
                if sent == max(set(sntids)):
                    new_row = disambiguate(lemma, 
                                               mwe_idxes, 
                                               sentence_df,
                                               "missing",
                                               "missing",
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df, 
                                               TOKENIZER, 
                                               model)
                    sentence_results_df = sentence_results_df.append(new_row)
                else:
                    next_sentence_df = df[df.SENT == sent+1]
                    new_row = disambiguate(lemma, 
                                               mwe_idxes, 
                                               sentence_df,
                                               "missing",
                                               next_sentence_df,
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df, 
                                               TOKENIZER, 
                                               model)
                    sentence_results_df = sentence_results_df.append(new_row)
            else:
                if sent == max(set(sntids)):
                    prev_sentence_df = df[df.SENT == sent-1]
                    new_row = disambiguate(lemma, 
                                               mwe_idxes, 
                                               sentence_df,
                                               prev_sentence_df,
                                               "missing",
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df, 
                                               TOKENIZER, 
                                               model)
                    sentence_results_df = sentence_results_df.append(new_row)
                else:
                    next_sentence_df = df[df.SENT == sent+1]
                    prev_sentence_df = df[df.SENT == sent-1]
                    new_row = disambiguate(lemma, 
                                               mwe_idxes, 
                                               sentence_df,
                                               prev_sentence_df,
                                               next_sentence_df,
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df, 
                                               TOKENIZER, 
                                               model)
                    sentence_results_df = sentence_results_df.append(new_row)

        sentence_results_df.idx = sentence_results_df.idx.str.split(',')        
        sentence_results_df["if_mwe"] = [True if len(x) > 1 else False for x in sentence_results_df.idx]    

        if len(sentence_results_df) > 0:
            sentence_df_avg = synset_decision(sentence_results_df, sentence_df, len(sent_lemmas), score_type = "avg_score")
            sentence_df_avg_max = synset_decision(sentence_results_df, sentence_df, len(sent_lemmas), score_type = "max_avg_score")
            sentence_df_max = synset_decision(sentence_results_df, sentence_df, len(sent_lemmas), score_type = "max_score")
            sentence_df_max_max = synset_decision(sentence_results_df, sentence_df, len(sent_lemmas), score_type = "max_max_score")
            sentence_df_weight = synset_decision(sentence_results_df, sentence_df, len(sent_lemmas), score_type = "weight_score")

            df = attach_disambiguated_synsets(sentence_df_avg, df, ["WN_ID_A", "WN_ID_A_X"])
            df = attach_disambiguated_synsets(sentence_df_avg_max, df, ["WN_ID_AM", "WN_ID_AM_X"])
            df = attach_disambiguated_synsets(sentence_df_max, df, ["WN_ID_MX", "WN_ID_MX_X"])
            df = attach_disambiguated_synsets(sentence_df_max_max, df, ["WN_ID_M", "WN_ID_M_X"])
            df = attach_disambiguated_synsets(sentence_df_weight, df, ["WN_ID_W", "WN_ID_W_X"])
            

    df = df.drop(columns=['SENT', 'WN_ID'])
    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_A"] = "_"

    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_AM"] = "_"

    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_MX"] = "_"

    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_M"] = "_"
    
    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_W"] = "_"
    
    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_A_X"] = "_"

    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_AM_X"] = "_"

    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_MX_X"] = "_"

    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_M_X"] = "_"
    
    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_W_X"] = "_"

    
    [folder, fname] = path.split('/')[-2:]
    rpath = 'results_herbert/' + folder + '/' + fname
    df.to_csv(rpath, sep='\t', index=False)

In [117]:
def process_conll_lesk(path):
    df = pd.read_csv(path, sep='\t')
    tknids = df.TOKEN_ID.to_list()
    sntids = []
    s = -1
    for tknid in tknids:
        if tknid == 0: s += 1
        sntids.append(s)

    df['SENT'] = sntids
    df['WN_ID'] = "_"

    print(time.gmtime())

    for sent in set(sntids):
        sentence_results_df = pd.DataFrame(columns = ['idx', 'synset', 'scores(i-1)', 'scores(i)', 'scores(i+1)'])
        #sent = 1 #do usunięcia!!!!!
        sentence_df = df.copy()
        sentence_df = sentence_df[sentence_df.SENT == sent]
        sent_lemmas = list(sentence_df['LEMMA'])
        sent_orth = list(sentence_df.ORTH.str.lower())
        max_id = len(sent_lemmas)-4
        sent_possible_mwes = []

        for idx in range(0, max_id):
            iter_lemmas = list(enumerate(sent_lemmas[idx:idx+4], idx))
            iter_orth = list(enumerate(sent_orth[idx:idx+4], idx))
            possible_mwes = (findsubsets(iter_lemmas, 4) + findsubsets(iter_orth, 4) + 
                           findsubsets(iter_lemmas, 3) + findsubsets(iter_orth, 3))
            sent_possible_mwes.extend(possible_mwes)
            
        for idx in range(0, max_id+1):
            iter_lemmas = list(enumerate(sent_lemmas[idx:idx+3], idx))
            iter_orth = list(enumerate(sent_orth[idx:idx+3], idx))
            possible_mwes = (findsubsets(iter_lemmas, 2) + findsubsets(iter_orth, 2))
            sent_possible_mwes.extend(possible_mwes)        

        sent_possible_mwes = [join_lemmas_and_idx(x) for x in set(sent_possible_mwes)]
        possible_mwes = [[x[0].split(" ")[1] + " " + x[0].split(" ")[0], 
                          [x[1][1], x[1][0]]] for x in sent_possible_mwes if len(x[1]) == 2]
        sent_possible_mwes.extend(possible_mwes)
        
        found_mwes_df = pd.DataFrame(columns=['lemma', 'idxes'])

        for mwe in sent_possible_mwes:
            filtered_pairs_df = mwe_lemma_synset_pairs_df[mwe_lemma_synset_pairs_df.lemma == mwe[0]]
            if len(filtered_pairs_df) > 0:
                for fnd_lemma_idx in range(len(filtered_pairs_df)):
                        found_mwes_df = found_mwes_df.append({'lemma' : mwe[0],
                                                              #'synset' : filtered_pairs_df.iloc[fnd_lemma_idx, 2],
                                                              'idxes' : mwe[1]}, ignore_index=True)
        found_mwes_df['idxes'] = found_mwes_df['idxes'].apply(tuple)
        found_mwes_df = found_mwes_df.drop_duplicates()

        for i in range(len(df[df.SENT == sent])):
            lemma = sentence_df['LEMMA'].iloc[i].lower()
            synsets = lemma_synset_pairs_df['synset'][lemma_synset_pairs_df['lemma'] == lemma].tolist()
            if len(synsets) > 0:
                if sent == 0:
                    if sent == max(set(sntids)):
                        new_row = disambiguate_lesk(lemma, 
                                               i, 
                                               sentence_df,
                                               "missing",
                                               "missing",
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df)
                        sentence_results_df = sentence_results_df.append(new_row)
                    else:
                        next_sentence_df = df[df.SENT == sent+1]
                        new_row = disambiguate_lesk(lemma, 
                                               i, 
                                               sentence_df,
                                               "missing",
                                               next_sentence_df,
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df)
                        sentence_results_df = sentence_results_df.append(new_row)
                else:
                    if sent == max(set(sntids)):
                        prev_sentence_df = df[df.SENT == sent-1]
                        new_row = disambiguate_lesk(lemma, 
                                               i, 
                                               sentence_df,
                                               prev_sentence_df,
                                               "missing",
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df)
                        sentence_results_df = sentence_results_df.append(new_row)
                    else:
                        next_sentence_df = df[df.SENT == sent+1]
                        prev_sentence_df = df[df.SENT == sent-1]
                        new_row = disambiguate_lesk(lemma, 
                                               i, 
                                               sentence_df,
                                               prev_sentence_df,
                                               next_sentence_df,
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df)
                        sentence_results_df = sentence_results_df.append(new_row)


        for mwe in range(len(found_mwes_df)):
            lemma = found_mwes_df.iloc[mwe, 0]
            mwe_idxes = found_mwes_df.iloc[mwe, 1]
            if sent == 0:
                if sent == max(set(sntids)):
                    new_row = disambiguate_lesk(lemma, 
                                               mwe_idxes, 
                                               sentence_df,
                                               "missing",
                                               "missing",
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df)
                    sentence_results_df = sentence_results_df.append(new_row)
                else:
                    next_sentence_df = df[df.SENT == sent+1]
                    new_row = disambiguate_lesk(lemma, 
                                               mwe_idxes, 
                                               sentence_df,
                                               "missing",
                                               next_sentence_df,
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df)
                    sentence_results_df = sentence_results_df.append(new_row)
            else:
                if sent == max(set(sntids)):
                    prev_sentence_df = df[df.SENT == sent-1]
                    new_row = disambiguate_lesk(lemma, 
                                               mwe_idxes, 
                                               sentence_df,
                                               prev_sentence_df,
                                               "missing",
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df)
                    sentence_results_df = sentence_results_df.append(new_row)
                else:
                    next_sentence_df = df[df.SENT == sent+1]
                    prev_sentence_df = df[df.SENT == sent-1]
                    new_row = disambiguate_lesk(lemma, 
                                               mwe_idxes, 
                                               sentence_df,
                                               prev_sentence_df,
                                               next_sentence_df,
                                               lemma_synset_pairs_df, 
                                               synsets_examples_df)
                    sentence_results_df = sentence_results_df.append(new_row)

        sentence_results_df.idx = sentence_results_df.idx.str.split(',')        
        sentence_results_df["if_mwe"] = [True if len(x) > 1 else False for x in sentence_results_df.idx]    

        if len(sentence_results_df) > 0:
            sentence_df_max = synset_decision_lesk(sentence_results_df, sentence_df, len(sent_lemmas), weight_score = False)

            df = attach_disambiguated_synsets(sentence_df_max, df, "WN_ID_L")

    df = df.drop(columns=['SENT', 'WN_ID'])
    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_W_L"] = "_"

    [folder, fname] = path.split('/')[-2:]
    rpath = 'results_lesk/' + folder + '/' + fname
    df.to_csv(rpath, sep='\t', index=False)

## Polbert

In [118]:
model = PolbertWSD(pretrained_model)
#model = HerbertWSD(pretrained_model)

model.to('cuda')
model.load_state_dict(torch.load('WSD_polbert_model2.bin'))
model.eval();

RuntimeError: Error(s) in loading state_dict for HerbertWSD:
	Missing key(s) in state_dict: "herbert.embeddings.position_ids", "herbert.embeddings.word_embeddings.weight", "herbert.embeddings.position_embeddings.weight", "herbert.embeddings.token_type_embeddings.weight", "herbert.embeddings.LayerNorm.weight", "herbert.embeddings.LayerNorm.bias", "herbert.encoder.layer.0.attention.self.query.weight", "herbert.encoder.layer.0.attention.self.query.bias", "herbert.encoder.layer.0.attention.self.key.weight", "herbert.encoder.layer.0.attention.self.key.bias", "herbert.encoder.layer.0.attention.self.value.weight", "herbert.encoder.layer.0.attention.self.value.bias", "herbert.encoder.layer.0.attention.output.dense.weight", "herbert.encoder.layer.0.attention.output.dense.bias", "herbert.encoder.layer.0.attention.output.LayerNorm.weight", "herbert.encoder.layer.0.attention.output.LayerNorm.bias", "herbert.encoder.layer.0.intermediate.dense.weight", "herbert.encoder.layer.0.intermediate.dense.bias", "herbert.encoder.layer.0.output.dense.weight", "herbert.encoder.layer.0.output.dense.bias", "herbert.encoder.layer.0.output.LayerNorm.weight", "herbert.encoder.layer.0.output.LayerNorm.bias", "herbert.encoder.layer.1.attention.self.query.weight", "herbert.encoder.layer.1.attention.self.query.bias", "herbert.encoder.layer.1.attention.self.key.weight", "herbert.encoder.layer.1.attention.self.key.bias", "herbert.encoder.layer.1.attention.self.value.weight", "herbert.encoder.layer.1.attention.self.value.bias", "herbert.encoder.layer.1.attention.output.dense.weight", "herbert.encoder.layer.1.attention.output.dense.bias", "herbert.encoder.layer.1.attention.output.LayerNorm.weight", "herbert.encoder.layer.1.attention.output.LayerNorm.bias", "herbert.encoder.layer.1.intermediate.dense.weight", "herbert.encoder.layer.1.intermediate.dense.bias", "herbert.encoder.layer.1.output.dense.weight", "herbert.encoder.layer.1.output.dense.bias", "herbert.encoder.layer.1.output.LayerNorm.weight", "herbert.encoder.layer.1.output.LayerNorm.bias", "herbert.encoder.layer.2.attention.self.query.weight", "herbert.encoder.layer.2.attention.self.query.bias", "herbert.encoder.layer.2.attention.self.key.weight", "herbert.encoder.layer.2.attention.self.key.bias", "herbert.encoder.layer.2.attention.self.value.weight", "herbert.encoder.layer.2.attention.self.value.bias", "herbert.encoder.layer.2.attention.output.dense.weight", "herbert.encoder.layer.2.attention.output.dense.bias", "herbert.encoder.layer.2.attention.output.LayerNorm.weight", "herbert.encoder.layer.2.attention.output.LayerNorm.bias", "herbert.encoder.layer.2.intermediate.dense.weight", "herbert.encoder.layer.2.intermediate.dense.bias", "herbert.encoder.layer.2.output.dense.weight", "herbert.encoder.layer.2.output.dense.bias", "herbert.encoder.layer.2.output.LayerNorm.weight", "herbert.encoder.layer.2.output.LayerNorm.bias", "herbert.encoder.layer.3.attention.self.query.weight", "herbert.encoder.layer.3.attention.self.query.bias", "herbert.encoder.layer.3.attention.self.key.weight", "herbert.encoder.layer.3.attention.self.key.bias", "herbert.encoder.layer.3.attention.self.value.weight", "herbert.encoder.layer.3.attention.self.value.bias", "herbert.encoder.layer.3.attention.output.dense.weight", "herbert.encoder.layer.3.attention.output.dense.bias", "herbert.encoder.layer.3.attention.output.LayerNorm.weight", "herbert.encoder.layer.3.attention.output.LayerNorm.bias", "herbert.encoder.layer.3.intermediate.dense.weight", "herbert.encoder.layer.3.intermediate.dense.bias", "herbert.encoder.layer.3.output.dense.weight", "herbert.encoder.layer.3.output.dense.bias", "herbert.encoder.layer.3.output.LayerNorm.weight", "herbert.encoder.layer.3.output.LayerNorm.bias", "herbert.encoder.layer.4.attention.self.query.weight", "herbert.encoder.layer.4.attention.self.query.bias", "herbert.encoder.layer.4.attention.self.key.weight", "herbert.encoder.layer.4.attention.self.key.bias", "herbert.encoder.layer.4.attention.self.value.weight", "herbert.encoder.layer.4.attention.self.value.bias", "herbert.encoder.layer.4.attention.output.dense.weight", "herbert.encoder.layer.4.attention.output.dense.bias", "herbert.encoder.layer.4.attention.output.LayerNorm.weight", "herbert.encoder.layer.4.attention.output.LayerNorm.bias", "herbert.encoder.layer.4.intermediate.dense.weight", "herbert.encoder.layer.4.intermediate.dense.bias", "herbert.encoder.layer.4.output.dense.weight", "herbert.encoder.layer.4.output.dense.bias", "herbert.encoder.layer.4.output.LayerNorm.weight", "herbert.encoder.layer.4.output.LayerNorm.bias", "herbert.encoder.layer.5.attention.self.query.weight", "herbert.encoder.layer.5.attention.self.query.bias", "herbert.encoder.layer.5.attention.self.key.weight", "herbert.encoder.layer.5.attention.self.key.bias", "herbert.encoder.layer.5.attention.self.value.weight", "herbert.encoder.layer.5.attention.self.value.bias", "herbert.encoder.layer.5.attention.output.dense.weight", "herbert.encoder.layer.5.attention.output.dense.bias", "herbert.encoder.layer.5.attention.output.LayerNorm.weight", "herbert.encoder.layer.5.attention.output.LayerNorm.bias", "herbert.encoder.layer.5.intermediate.dense.weight", "herbert.encoder.layer.5.intermediate.dense.bias", "herbert.encoder.layer.5.output.dense.weight", "herbert.encoder.layer.5.output.dense.bias", "herbert.encoder.layer.5.output.LayerNorm.weight", "herbert.encoder.layer.5.output.LayerNorm.bias", "herbert.encoder.layer.6.attention.self.query.weight", "herbert.encoder.layer.6.attention.self.query.bias", "herbert.encoder.layer.6.attention.self.key.weight", "herbert.encoder.layer.6.attention.self.key.bias", "herbert.encoder.layer.6.attention.self.value.weight", "herbert.encoder.layer.6.attention.self.value.bias", "herbert.encoder.layer.6.attention.output.dense.weight", "herbert.encoder.layer.6.attention.output.dense.bias", "herbert.encoder.layer.6.attention.output.LayerNorm.weight", "herbert.encoder.layer.6.attention.output.LayerNorm.bias", "herbert.encoder.layer.6.intermediate.dense.weight", "herbert.encoder.layer.6.intermediate.dense.bias", "herbert.encoder.layer.6.output.dense.weight", "herbert.encoder.layer.6.output.dense.bias", "herbert.encoder.layer.6.output.LayerNorm.weight", "herbert.encoder.layer.6.output.LayerNorm.bias", "herbert.encoder.layer.7.attention.self.query.weight", "herbert.encoder.layer.7.attention.self.query.bias", "herbert.encoder.layer.7.attention.self.key.weight", "herbert.encoder.layer.7.attention.self.key.bias", "herbert.encoder.layer.7.attention.self.value.weight", "herbert.encoder.layer.7.attention.self.value.bias", "herbert.encoder.layer.7.attention.output.dense.weight", "herbert.encoder.layer.7.attention.output.dense.bias", "herbert.encoder.layer.7.attention.output.LayerNorm.weight", "herbert.encoder.layer.7.attention.output.LayerNorm.bias", "herbert.encoder.layer.7.intermediate.dense.weight", "herbert.encoder.layer.7.intermediate.dense.bias", "herbert.encoder.layer.7.output.dense.weight", "herbert.encoder.layer.7.output.dense.bias", "herbert.encoder.layer.7.output.LayerNorm.weight", "herbert.encoder.layer.7.output.LayerNorm.bias", "herbert.encoder.layer.8.attention.self.query.weight", "herbert.encoder.layer.8.attention.self.query.bias", "herbert.encoder.layer.8.attention.self.key.weight", "herbert.encoder.layer.8.attention.self.key.bias", "herbert.encoder.layer.8.attention.self.value.weight", "herbert.encoder.layer.8.attention.self.value.bias", "herbert.encoder.layer.8.attention.output.dense.weight", "herbert.encoder.layer.8.attention.output.dense.bias", "herbert.encoder.layer.8.attention.output.LayerNorm.weight", "herbert.encoder.layer.8.attention.output.LayerNorm.bias", "herbert.encoder.layer.8.intermediate.dense.weight", "herbert.encoder.layer.8.intermediate.dense.bias", "herbert.encoder.layer.8.output.dense.weight", "herbert.encoder.layer.8.output.dense.bias", "herbert.encoder.layer.8.output.LayerNorm.weight", "herbert.encoder.layer.8.output.LayerNorm.bias", "herbert.encoder.layer.9.attention.self.query.weight", "herbert.encoder.layer.9.attention.self.query.bias", "herbert.encoder.layer.9.attention.self.key.weight", "herbert.encoder.layer.9.attention.self.key.bias", "herbert.encoder.layer.9.attention.self.value.weight", "herbert.encoder.layer.9.attention.self.value.bias", "herbert.encoder.layer.9.attention.output.dense.weight", "herbert.encoder.layer.9.attention.output.dense.bias", "herbert.encoder.layer.9.attention.output.LayerNorm.weight", "herbert.encoder.layer.9.attention.output.LayerNorm.bias", "herbert.encoder.layer.9.intermediate.dense.weight", "herbert.encoder.layer.9.intermediate.dense.bias", "herbert.encoder.layer.9.output.dense.weight", "herbert.encoder.layer.9.output.dense.bias", "herbert.encoder.layer.9.output.LayerNorm.weight", "herbert.encoder.layer.9.output.LayerNorm.bias", "herbert.encoder.layer.10.attention.self.query.weight", "herbert.encoder.layer.10.attention.self.query.bias", "herbert.encoder.layer.10.attention.self.key.weight", "herbert.encoder.layer.10.attention.self.key.bias", "herbert.encoder.layer.10.attention.self.value.weight", "herbert.encoder.layer.10.attention.self.value.bias", "herbert.encoder.layer.10.attention.output.dense.weight", "herbert.encoder.layer.10.attention.output.dense.bias", "herbert.encoder.layer.10.attention.output.LayerNorm.weight", "herbert.encoder.layer.10.attention.output.LayerNorm.bias", "herbert.encoder.layer.10.intermediate.dense.weight", "herbert.encoder.layer.10.intermediate.dense.bias", "herbert.encoder.layer.10.output.dense.weight", "herbert.encoder.layer.10.output.dense.bias", "herbert.encoder.layer.10.output.LayerNorm.weight", "herbert.encoder.layer.10.output.LayerNorm.bias", "herbert.encoder.layer.11.attention.self.query.weight", "herbert.encoder.layer.11.attention.self.query.bias", "herbert.encoder.layer.11.attention.self.key.weight", "herbert.encoder.layer.11.attention.self.key.bias", "herbert.encoder.layer.11.attention.self.value.weight", "herbert.encoder.layer.11.attention.self.value.bias", "herbert.encoder.layer.11.attention.output.dense.weight", "herbert.encoder.layer.11.attention.output.dense.bias", "herbert.encoder.layer.11.attention.output.LayerNorm.weight", "herbert.encoder.layer.11.attention.output.LayerNorm.bias", "herbert.encoder.layer.11.intermediate.dense.weight", "herbert.encoder.layer.11.intermediate.dense.bias", "herbert.encoder.layer.11.output.dense.weight", "herbert.encoder.layer.11.output.dense.bias", "herbert.encoder.layer.11.output.LayerNorm.weight", "herbert.encoder.layer.11.output.LayerNorm.bias", "herbert.pooler.dense.weight", "herbert.pooler.dense.bias". 
	Unexpected key(s) in state_dict: "polbert.embeddings.position_ids", "polbert.embeddings.word_embeddings.weight", "polbert.embeddings.position_embeddings.weight", "polbert.embeddings.token_type_embeddings.weight", "polbert.embeddings.LayerNorm.weight", "polbert.embeddings.LayerNorm.bias", "polbert.encoder.layer.0.attention.self.query.weight", "polbert.encoder.layer.0.attention.self.query.bias", "polbert.encoder.layer.0.attention.self.key.weight", "polbert.encoder.layer.0.attention.self.key.bias", "polbert.encoder.layer.0.attention.self.value.weight", "polbert.encoder.layer.0.attention.self.value.bias", "polbert.encoder.layer.0.attention.output.dense.weight", "polbert.encoder.layer.0.attention.output.dense.bias", "polbert.encoder.layer.0.attention.output.LayerNorm.weight", "polbert.encoder.layer.0.attention.output.LayerNorm.bias", "polbert.encoder.layer.0.intermediate.dense.weight", "polbert.encoder.layer.0.intermediate.dense.bias", "polbert.encoder.layer.0.output.dense.weight", "polbert.encoder.layer.0.output.dense.bias", "polbert.encoder.layer.0.output.LayerNorm.weight", "polbert.encoder.layer.0.output.LayerNorm.bias", "polbert.encoder.layer.1.attention.self.query.weight", "polbert.encoder.layer.1.attention.self.query.bias", "polbert.encoder.layer.1.attention.self.key.weight", "polbert.encoder.layer.1.attention.self.key.bias", "polbert.encoder.layer.1.attention.self.value.weight", "polbert.encoder.layer.1.attention.self.value.bias", "polbert.encoder.layer.1.attention.output.dense.weight", "polbert.encoder.layer.1.attention.output.dense.bias", "polbert.encoder.layer.1.attention.output.LayerNorm.weight", "polbert.encoder.layer.1.attention.output.LayerNorm.bias", "polbert.encoder.layer.1.intermediate.dense.weight", "polbert.encoder.layer.1.intermediate.dense.bias", "polbert.encoder.layer.1.output.dense.weight", "polbert.encoder.layer.1.output.dense.bias", "polbert.encoder.layer.1.output.LayerNorm.weight", "polbert.encoder.layer.1.output.LayerNorm.bias", "polbert.encoder.layer.2.attention.self.query.weight", "polbert.encoder.layer.2.attention.self.query.bias", "polbert.encoder.layer.2.attention.self.key.weight", "polbert.encoder.layer.2.attention.self.key.bias", "polbert.encoder.layer.2.attention.self.value.weight", "polbert.encoder.layer.2.attention.self.value.bias", "polbert.encoder.layer.2.attention.output.dense.weight", "polbert.encoder.layer.2.attention.output.dense.bias", "polbert.encoder.layer.2.attention.output.LayerNorm.weight", "polbert.encoder.layer.2.attention.output.LayerNorm.bias", "polbert.encoder.layer.2.intermediate.dense.weight", "polbert.encoder.layer.2.intermediate.dense.bias", "polbert.encoder.layer.2.output.dense.weight", "polbert.encoder.layer.2.output.dense.bias", "polbert.encoder.layer.2.output.LayerNorm.weight", "polbert.encoder.layer.2.output.LayerNorm.bias", "polbert.encoder.layer.3.attention.self.query.weight", "polbert.encoder.layer.3.attention.self.query.bias", "polbert.encoder.layer.3.attention.self.key.weight", "polbert.encoder.layer.3.attention.self.key.bias", "polbert.encoder.layer.3.attention.self.value.weight", "polbert.encoder.layer.3.attention.self.value.bias", "polbert.encoder.layer.3.attention.output.dense.weight", "polbert.encoder.layer.3.attention.output.dense.bias", "polbert.encoder.layer.3.attention.output.LayerNorm.weight", "polbert.encoder.layer.3.attention.output.LayerNorm.bias", "polbert.encoder.layer.3.intermediate.dense.weight", "polbert.encoder.layer.3.intermediate.dense.bias", "polbert.encoder.layer.3.output.dense.weight", "polbert.encoder.layer.3.output.dense.bias", "polbert.encoder.layer.3.output.LayerNorm.weight", "polbert.encoder.layer.3.output.LayerNorm.bias", "polbert.encoder.layer.4.attention.self.query.weight", "polbert.encoder.layer.4.attention.self.query.bias", "polbert.encoder.layer.4.attention.self.key.weight", "polbert.encoder.layer.4.attention.self.key.bias", "polbert.encoder.layer.4.attention.self.value.weight", "polbert.encoder.layer.4.attention.self.value.bias", "polbert.encoder.layer.4.attention.output.dense.weight", "polbert.encoder.layer.4.attention.output.dense.bias", "polbert.encoder.layer.4.attention.output.LayerNorm.weight", "polbert.encoder.layer.4.attention.output.LayerNorm.bias", "polbert.encoder.layer.4.intermediate.dense.weight", "polbert.encoder.layer.4.intermediate.dense.bias", "polbert.encoder.layer.4.output.dense.weight", "polbert.encoder.layer.4.output.dense.bias", "polbert.encoder.layer.4.output.LayerNorm.weight", "polbert.encoder.layer.4.output.LayerNorm.bias", "polbert.encoder.layer.5.attention.self.query.weight", "polbert.encoder.layer.5.attention.self.query.bias", "polbert.encoder.layer.5.attention.self.key.weight", "polbert.encoder.layer.5.attention.self.key.bias", "polbert.encoder.layer.5.attention.self.value.weight", "polbert.encoder.layer.5.attention.self.value.bias", "polbert.encoder.layer.5.attention.output.dense.weight", "polbert.encoder.layer.5.attention.output.dense.bias", "polbert.encoder.layer.5.attention.output.LayerNorm.weight", "polbert.encoder.layer.5.attention.output.LayerNorm.bias", "polbert.encoder.layer.5.intermediate.dense.weight", "polbert.encoder.layer.5.intermediate.dense.bias", "polbert.encoder.layer.5.output.dense.weight", "polbert.encoder.layer.5.output.dense.bias", "polbert.encoder.layer.5.output.LayerNorm.weight", "polbert.encoder.layer.5.output.LayerNorm.bias", "polbert.encoder.layer.6.attention.self.query.weight", "polbert.encoder.layer.6.attention.self.query.bias", "polbert.encoder.layer.6.attention.self.key.weight", "polbert.encoder.layer.6.attention.self.key.bias", "polbert.encoder.layer.6.attention.self.value.weight", "polbert.encoder.layer.6.attention.self.value.bias", "polbert.encoder.layer.6.attention.output.dense.weight", "polbert.encoder.layer.6.attention.output.dense.bias", "polbert.encoder.layer.6.attention.output.LayerNorm.weight", "polbert.encoder.layer.6.attention.output.LayerNorm.bias", "polbert.encoder.layer.6.intermediate.dense.weight", "polbert.encoder.layer.6.intermediate.dense.bias", "polbert.encoder.layer.6.output.dense.weight", "polbert.encoder.layer.6.output.dense.bias", "polbert.encoder.layer.6.output.LayerNorm.weight", "polbert.encoder.layer.6.output.LayerNorm.bias", "polbert.encoder.layer.7.attention.self.query.weight", "polbert.encoder.layer.7.attention.self.query.bias", "polbert.encoder.layer.7.attention.self.key.weight", "polbert.encoder.layer.7.attention.self.key.bias", "polbert.encoder.layer.7.attention.self.value.weight", "polbert.encoder.layer.7.attention.self.value.bias", "polbert.encoder.layer.7.attention.output.dense.weight", "polbert.encoder.layer.7.attention.output.dense.bias", "polbert.encoder.layer.7.attention.output.LayerNorm.weight", "polbert.encoder.layer.7.attention.output.LayerNorm.bias", "polbert.encoder.layer.7.intermediate.dense.weight", "polbert.encoder.layer.7.intermediate.dense.bias", "polbert.encoder.layer.7.output.dense.weight", "polbert.encoder.layer.7.output.dense.bias", "polbert.encoder.layer.7.output.LayerNorm.weight", "polbert.encoder.layer.7.output.LayerNorm.bias", "polbert.encoder.layer.8.attention.self.query.weight", "polbert.encoder.layer.8.attention.self.query.bias", "polbert.encoder.layer.8.attention.self.key.weight", "polbert.encoder.layer.8.attention.self.key.bias", "polbert.encoder.layer.8.attention.self.value.weight", "polbert.encoder.layer.8.attention.self.value.bias", "polbert.encoder.layer.8.attention.output.dense.weight", "polbert.encoder.layer.8.attention.output.dense.bias", "polbert.encoder.layer.8.attention.output.LayerNorm.weight", "polbert.encoder.layer.8.attention.output.LayerNorm.bias", "polbert.encoder.layer.8.intermediate.dense.weight", "polbert.encoder.layer.8.intermediate.dense.bias", "polbert.encoder.layer.8.output.dense.weight", "polbert.encoder.layer.8.output.dense.bias", "polbert.encoder.layer.8.output.LayerNorm.weight", "polbert.encoder.layer.8.output.LayerNorm.bias", "polbert.encoder.layer.9.attention.self.query.weight", "polbert.encoder.layer.9.attention.self.query.bias", "polbert.encoder.layer.9.attention.self.key.weight", "polbert.encoder.layer.9.attention.self.key.bias", "polbert.encoder.layer.9.attention.self.value.weight", "polbert.encoder.layer.9.attention.self.value.bias", "polbert.encoder.layer.9.attention.output.dense.weight", "polbert.encoder.layer.9.attention.output.dense.bias", "polbert.encoder.layer.9.attention.output.LayerNorm.weight", "polbert.encoder.layer.9.attention.output.LayerNorm.bias", "polbert.encoder.layer.9.intermediate.dense.weight", "polbert.encoder.layer.9.intermediate.dense.bias", "polbert.encoder.layer.9.output.dense.weight", "polbert.encoder.layer.9.output.dense.bias", "polbert.encoder.layer.9.output.LayerNorm.weight", "polbert.encoder.layer.9.output.LayerNorm.bias", "polbert.encoder.layer.10.attention.self.query.weight", "polbert.encoder.layer.10.attention.self.query.bias", "polbert.encoder.layer.10.attention.self.key.weight", "polbert.encoder.layer.10.attention.self.key.bias", "polbert.encoder.layer.10.attention.self.value.weight", "polbert.encoder.layer.10.attention.self.value.bias", "polbert.encoder.layer.10.attention.output.dense.weight", "polbert.encoder.layer.10.attention.output.dense.bias", "polbert.encoder.layer.10.attention.output.LayerNorm.weight", "polbert.encoder.layer.10.attention.output.LayerNorm.bias", "polbert.encoder.layer.10.intermediate.dense.weight", "polbert.encoder.layer.10.intermediate.dense.bias", "polbert.encoder.layer.10.output.dense.weight", "polbert.encoder.layer.10.output.dense.bias", "polbert.encoder.layer.10.output.LayerNorm.weight", "polbert.encoder.layer.10.output.LayerNorm.bias", "polbert.encoder.layer.11.attention.self.query.weight", "polbert.encoder.layer.11.attention.self.query.bias", "polbert.encoder.layer.11.attention.self.key.weight", "polbert.encoder.layer.11.attention.self.key.bias", "polbert.encoder.layer.11.attention.self.value.weight", "polbert.encoder.layer.11.attention.self.value.bias", "polbert.encoder.layer.11.attention.output.dense.weight", "polbert.encoder.layer.11.attention.output.dense.bias", "polbert.encoder.layer.11.attention.output.LayerNorm.weight", "polbert.encoder.layer.11.attention.output.LayerNorm.bias", "polbert.encoder.layer.11.intermediate.dense.weight", "polbert.encoder.layer.11.intermediate.dense.bias", "polbert.encoder.layer.11.output.dense.weight", "polbert.encoder.layer.11.output.dense.bias", "polbert.encoder.layer.11.output.LayerNorm.weight", "polbert.encoder.layer.11.output.LayerNorm.bias", "polbert.pooler.dense.weight", "polbert.pooler.dense.bias". 

In [77]:
model

PolbertWSD(
  (polbert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(60000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

In [261]:
pathlist = list(Path('testdata/testdata').glob('**/*.conll'))
pathlist_str = [str(x).replace("\\", "/") for x in pathlist]
pathlist_done = list(Path('results').glob('**/*.conll'))
pathlist_done_str = ["testdata/testdata" + str(x).replace("\\", "/")[7:] for x in pathlist_done]

pathlist_to_do = list(set(pathlist_str) - set(pathlist_done_str))

for path in tqdm(pathlist_to_do):
    process_conll(path)

  0%|          | 0/129 [00:00<?, ?it/s]

time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=0, tm_min=38, tm_sec=22, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=0, tm_min=38, tm_sec=34, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=0, tm_min=39, tm_sec=32, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=0, tm_min=41, tm_sec=9, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=0, tm_min=41, tm_sec=30, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=0, tm_min=44, tm_sec=20, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=0, tm_min=44, tm_sec=38, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=0, tm_min=46, tm_sec=18, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=

time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=2, tm_min=39, tm_sec=14, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=2, tm_min=41, tm_sec=35, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=2, tm_min=43, tm_sec=28, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=2, tm_min=48, tm_sec=4, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=2, tm_min=50, tm_sec=20, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=2, tm_min=50, tm_sec=38, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=2, tm_min=52, tm_sec=32, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=2, tm_min=56, tm_sec=17, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=

In [263]:
import gc

gc.collect()

torch.cuda.empty_cache()

## LESK ALGORITHM

In [283]:

pathlist = list(Path('testdata/testdata').glob('**/*.conll'))
pathlist_str = [str(x).replace("\\", "/") for x in pathlist]
pathlist_done = list(Path('results_lesk').glob('**/*.conll'))
pathlist_done_str = ["testdata/testdata" + str(x).replace("\\", "/")[7:] for x in pathlist_done]

pathlist_to_do = list(set(pathlist_str) - set(pathlist_done_str))

for path in tqdm(pathlist_to_do):
    process_conll_lesk(path)

  0%|          | 0/129 [00:00<?, ?it/s]

time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=10, tm_min=22, tm_sec=39, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=10, tm_min=22, tm_sec=46, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=10, tm_min=23, tm_sec=20, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=10, tm_min=24, tm_sec=17, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=10, tm_min=24, tm_sec=30, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=10, tm_min=26, tm_sec=9, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=10, tm_min=26, tm_sec=22, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=10, tm_min=27, tm_sec=22, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(

time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=11, tm_min=33, tm_sec=32, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=11, tm_min=34, tm_sec=28, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=11, tm_min=35, tm_sec=58, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=11, tm_min=37, tm_sec=10, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=11, tm_min=40, tm_sec=5, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=11, tm_min=41, tm_sec=31, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=11, tm_min=41, tm_sec=42, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=11, tm_min=42, tm_sec=54, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(

## Herbert

In [264]:
pretrained_model = "allegro/herbert-klej-cased-v1"
LR = 5e-5
WARMUP_STEPS = 8
MODEL_PATH = "WSD_herbert_model1.bin"
TOKENIZER = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")

MAX_LEN = 64
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 3

In [119]:
#model = PolbertWSD(pretrained_model)
model = HerbertWSD(pretrained_model)

model.to('cuda')
model.load_state_dict(torch.load('WSD_herbert_model1.bin'))
model.eval();

In [266]:
pathlist = list(Path('testdata/testdata').glob('**/*.conll'))
pathlist_str = [str(x).replace("\\", "/") for x in pathlist]
pathlist_done = list(Path('results_herbert').glob('**/*.conll'))
pathlist_done_str = ["poltestdata/testdata" + str(x).replace("\\", "/")[7:] for x in pathlist_done]

pathlist_to_do = list(set(pathlist_str) - set(pathlist_done_str))

for path in tqdm(pathlist_to_do):
    process_conll(path)

  0%|          | 0/129 [00:00<?, ?it/s]

time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=4, tm_min=32, tm_sec=32, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=4, tm_min=32, tm_sec=44, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=4, tm_min=33, tm_sec=40, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=4, tm_min=35, tm_sec=14, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=4, tm_min=35, tm_sec=35, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=4, tm_min=38, tm_sec=22, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=4, tm_min=38, tm_sec=40, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=4, tm_min=40, tm_sec=19, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year

time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=6, tm_min=32, tm_sec=40, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=6, tm_min=35, tm_sec=1, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=6, tm_min=36, tm_sec=56, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=6, tm_min=41, tm_sec=33, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=6, tm_min=43, tm_sec=49, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=6, tm_min=44, tm_sec=6, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=6, tm_min=46, tm_sec=1, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=2021, tm_mon=8, tm_mday=24, tm_hour=6, tm_min=49, tm_sec=47, tm_wday=1, tm_yday=236, tm_isdst=0)
time.struct_time(tm_year=20

# Test data metrics

## Polbert

In [293]:
metrics_df = pd.DataFrame(columns = ["Precision_kpwr", "Recall_kpwr", "Precision_sherlock", "Recall_sherlock"])

In [294]:
path_kpwr = 'test_gold_standard/test_gold_standard/kpwr/'
path_results = 'results/kpwr/'
files_kpwr = os.listdir(path_results)
full_kpwr_results_df = pd.DataFrame()

for file_name in files_kpwr:
  result_file_name = path_results + file_name
  file_name = path_kpwr + file_name
  df = pd.read_csv(file_name, sep = '\t', quoting=3, error_bad_lines=False)[["ORDER_ID", "WN_ID"]]
  result_df = pd.read_csv(result_file_name, sep = '\t')
  df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
  result_df = pd.merge(result_df, df, how='inner', on='ORDER_ID')
  full_kpwr_results_df = full_kpwr_results_df.append(result_df)
  
full_kpwr_results_df = full_kpwr_results_df.reset_index(drop=True)

full_kpwr_results_df['WN_ID_A'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_A'])
full_kpwr_results_df['WN_ID_AM'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_AM'])
full_kpwr_results_df['WN_ID_MX'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_MX'])
full_kpwr_results_df['WN_ID_M'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_M'])
full_kpwr_results_df['WN_ID_W'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_W'])
full_kpwr_results_df['WN_ID_A_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_A_X'])
full_kpwr_results_df['WN_ID_AM_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_AM_X'])
full_kpwr_results_df['WN_ID_MX_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_MX_X'])
full_kpwr_results_df['WN_ID_M_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_M_X'])
full_kpwr_results_df['WN_ID_W_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_W_X'])

full_kpwr_results_df['correct_classif_weight'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_W'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_W'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_weight'] = full_kpwr_results_df['WN_ID_W'] != "_"
full_kpwr_results_df['if_annotated_in_test_data'] = full_kpwr_results_df['WN_ID_ACTUAL'] != "_"

correctly_predicted_senses_count_weight = sum(full_kpwr_results_df['correct_classif_weight'])
count_decisions_made_by_model_weight = sum(full_kpwr_results_df['if_decision_made_by_model_weight'])
count_annotated_in_test_data = sum(full_kpwr_results_df['if_annotated_in_test_data'])

metrics_df.loc["POLBERT_Weighted_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight
metrics_df.loc["POLBERT_Weighted_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_weight/count_annotated_in_test_data

print(f"precision for kpwr weighted = {correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight}")
print(f"recall for kpwr weighted = {correctly_predicted_senses_count_weight/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_MX'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_MX'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max'] = full_kpwr_results_df['WN_ID_MX'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max'])

metrics_df.loc["POLBERT_Max_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Max_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_M'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_M'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_max'] = full_kpwr_results_df['WN_ID_M'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max_max'])

metrics_df.loc["POLBERT_Maxfrommax_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Maxfrommax_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max max = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max max = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_avg'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_A'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_A'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_avg'] = full_kpwr_results_df['WN_ID_A'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_avg'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_avg'])

metrics_df.loc["POLBERT_Avg_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Avg_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr avg = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr avg = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_avg'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_AM'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_AM'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_avg'] = full_kpwr_results_df['WN_ID_AM'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max'])

metrics_df.loc["POLBERT_Maxfromavg_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Maxfromavg_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max avg = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max avg = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_W_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_W_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_max'] = full_kpwr_results_df['WN_ID_W_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max_max'])

metrics_df.loc["POLBERT_Weighted_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Weighted_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr weighted not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr weighted not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_MX_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_MX_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max'] = full_kpwr_results_df['WN_ID_MX_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max'])

metrics_df.loc["POLBERT_Max_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Max_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_M_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_M_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_max'] = full_kpwr_results_df['WN_ID_M_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max_max'])

metrics_df.loc["POLBERT_Maxfrommax_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Maxfrommax_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max max not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max max not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_avg'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_A_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_A_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_avg'] = full_kpwr_results_df['WN_ID_A_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_avg'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_avg'])

metrics_df.loc["POLBERT_Avg_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Avg_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr avg not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr avg not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_avg'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_AM_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_AM_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_avg'] = full_kpwr_results_df['WN_ID_AM_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max'])

metrics_df.loc["POLBERT_Maxfromavg_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Maxfromavg_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max avg not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max avg not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

precision for kpwr weighted = 0.40145057970247233
recall for kpwr weighted = 0.5458930242603124
precision for kpwr max = 0.40264690312334567
recall for kpwr max = 0.5475487725865669
precision for kpwr max max = 0.391147818720881
recall for kpwr max max = 0.5318551580159816
precision for kpwr avg = 0.4003176283748015
recall for kpwr avg = 0.5443812540493845
precision for kpwr max avg = 0.40264690312334567
recall for kpwr max avg = 0.5475487725865669
precision for kpwr weighted not always mwe = 0.3966859018476362
recall for kpwr weighted not always mwe = 0.5394140090706213
precision for kpwr max not always mwe = 0.39756484912652196
recall for kpwr max not always mwe = 0.5406378230508962
precision for kpwr max max not always mwe = 0.3865417196103346
recall for kpwr max max not always mwe = 0.5255921099992801
precision for kpwr avg not always mwe = 0.3966119640021175
recall for kpwr avg not always mwe = 0.5393420200129581
precision for kpwr max avg not always mwe = 0.39756484912652196
reca

In [295]:
path_sherlock = 'test_gold_standard/test_gold_standard/sherlock/'
path_results = 'results/sherlock/'
files_sherlock = os.listdir(path_results)
full_sherlock_results_df = pd.DataFrame()

for file_name in files_sherlock:
  result_file_name = path_results + file_name
  file_name = path_sherlock + file_name
  df = pd.read_csv(file_name, sep = '\t', quoting=3, error_bad_lines=False)[["ORDER_ID", "WN_ID"]]
  result_df = pd.read_csv(result_file_name, sep = '\t')
  df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
  result_df = pd.merge(result_df, df, how='inner', on='ORDER_ID')
  full_sherlock_results_df = full_sherlock_results_df.append(result_df)
  
full_sherlock_results_df = full_sherlock_results_df.reset_index(drop=True)

full_sherlock_results_df['WN_ID_A'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_A'])
full_sherlock_results_df['WN_ID_AM'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_AM'])
full_sherlock_results_df['WN_ID_MX'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_MX'])
full_sherlock_results_df['WN_ID_M'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_M'])
full_sherlock_results_df['WN_ID_W'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_W'])
full_sherlock_results_df['WN_ID_A_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_A_X'])
full_sherlock_results_df['WN_ID_AM_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_AM_X'])
full_sherlock_results_df['WN_ID_MX_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_MX_X'])
full_sherlock_results_df['WN_ID_M_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_M_X'])
full_sherlock_results_df['WN_ID_W_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_W_X'])

full_sherlock_results_df['correct_classif_weight'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_W'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_W'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_weight'] = full_sherlock_results_df['WN_ID_W'] != "_"
full_sherlock_results_df['if_annotated_in_test_data'] = full_sherlock_results_df['WN_ID_ACTUAL'] != "_"

correctly_predicted_senses_count_weight = sum(full_sherlock_results_df['correct_classif_weight'])
count_decisions_made_by_model_weight = sum(full_sherlock_results_df['if_decision_made_by_model_weight'])
count_annotated_in_test_data = sum(full_sherlock_results_df['if_annotated_in_test_data'])

metrics_df.loc["POLBERT_Weighted_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight
metrics_df.loc["POLBERT_Weighted_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_weight/count_annotated_in_test_data

print(f"precision for sherlock weighted = {correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight}")
print(f"recall for sherlock weighted = {correctly_predicted_senses_count_weight/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_MX'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_MX'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max'] = full_sherlock_results_df['WN_ID_MX'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max'])

metrics_df.loc["POLBERT_Max_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Max_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_M'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_M'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_max'] = full_sherlock_results_df['WN_ID_M'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max_max'])

metrics_df.loc["POLBERT_Maxfrommax_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Maxfrommax_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max max = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max max = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_avg'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_A'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_A'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_avg'] = full_sherlock_results_df['WN_ID_A'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_avg'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_avg'])

metrics_df.loc["POLBERT_Avg_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Avg_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock avg = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock avg = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_avg'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_AM'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_AM'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_avg'] = full_sherlock_results_df['WN_ID_AM'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max'])

metrics_df.loc["POLBERT_Maxfromavg_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Maxfromavg_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max avg = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max avg = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_W_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_W_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_max'] = full_sherlock_results_df['WN_ID_W_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max_max'])

metrics_df.loc["POLBERT_Weighted_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Weighted_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock weighted not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock weighted not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_MX_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_MX_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max'] = full_sherlock_results_df['WN_ID_MX_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max'])

metrics_df.loc["POLBERT_Max_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Max_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_M_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_M_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_max'] = full_sherlock_results_df['WN_ID_M_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max_max'])

metrics_df.loc["POLBERT_Maxfrommax_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Maxfrommax_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max max not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max max not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_avg'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_A_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_A_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_avg'] = full_sherlock_results_df['WN_ID_A_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_avg'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_avg'])

metrics_df.loc["POLBERT_Avg_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Avg_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock avg not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock avg not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_avg'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_AM_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_AM_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_avg'] = full_sherlock_results_df['WN_ID_AM_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max'])

metrics_df.loc["POLBERT_Maxfromavg_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
metrics_df.loc["POLBERT_Maxfromavg_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max avg not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max avg not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

precision for sherlock weighted = 0.3607734806629834
recall for sherlock weighted = 0.5306067172264355
precision for sherlock max = 0.3704181248848775
recall for sherlock max = 0.5446912242686891
precision for sherlock max max = 0.35556374355195286
recall for sherlock max max = 0.5227518959913326
precision for sherlock avg = 0.36892488954344627
recall for sherlock avg = 0.542795232936078
precision for sherlock max avg = 0.3704181248848775
recall for sherlock max avg = 0.5446912242686891
precision for sherlock weighted not always mwe = 0.36187845303867405
recall for sherlock weighted not always mwe = 0.5322318526543879
precision for sherlock max not always mwe = 0.37023392890034995
recall for sherlock max not always mwe = 0.5444203683640303
precision for sherlock max max not always mwe = 0.355379513633014
recall for sherlock max max not always mwe = 0.5224810400866738
precision for sherlock avg not always mwe = 0.3698453608247423
recall for sherlock avg not always mwe = 0.54414951245937

In [1335]:
list_final_results = [full_kpwr_results_df, full_sherlock_results_df]
with open('list_final_results_polbert_v2.pkl', 'wb') as fp:
    pickle.dump(list_final_results, fp)

## Herbert

In [3]:
path_kpwr = 'test_gold_standard/test_gold_standard/kpwr/'
path_results = 'results_herbert/kpwr/'
files_kpwr = os.listdir(path_results)
full_kpwr_results_df = pd.DataFrame()

for file_name in files_kpwr:
  result_file_name = path_results + file_name
  file_name = path_kpwr + file_name
  df = pd.read_csv(file_name, sep = '\t', quoting=3, error_bad_lines=False)[["ORDER_ID", "WN_ID"]]
  result_df = pd.read_csv(result_file_name, sep = '\t')
  df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
  result_df = pd.merge(result_df, df, how='inner', on='ORDER_ID')
  full_kpwr_results_df = full_kpwr_results_df.append(result_df)
  
full_kpwr_results_df = full_kpwr_results_df.reset_index(drop=True)

full_kpwr_results_df['WN_ID_A'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_A'])
full_kpwr_results_df['WN_ID_AM'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_AM'])
full_kpwr_results_df['WN_ID_MX'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_MX'])
full_kpwr_results_df['WN_ID_M'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_M'])
full_kpwr_results_df['WN_ID_W'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_W'])
full_kpwr_results_df['WN_ID_A_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_A_X'])
full_kpwr_results_df['WN_ID_AM_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_AM_X'])
full_kpwr_results_df['WN_ID_MX_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_MX_X'])
full_kpwr_results_df['WN_ID_M_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_M_X'])
full_kpwr_results_df['WN_ID_W_X'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_W_X'])

full_kpwr_results_df['correct_classif_weight'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_W'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_W'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_weight'] = full_kpwr_results_df['WN_ID_W'] != "_"
full_kpwr_results_df['if_annotated_in_test_data'] = full_kpwr_results_df['WN_ID_ACTUAL'] != "_"

correctly_predicted_senses_count_weight = sum(full_kpwr_results_df['correct_classif_weight'])
count_decisions_made_by_model_weight = sum(full_kpwr_results_df['if_decision_made_by_model_weight'])
count_annotated_in_test_data = sum(full_kpwr_results_df['if_annotated_in_test_data'])

#metrics_df.loc["HERBERT_Weighted_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight
#metrics_df.loc["HERBERT_Weighted_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_weight/count_annotated_in_test_data

print(f"precision for kpwr weighted = {correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight}")
print(f"recall for kpwr weighted = {correctly_predicted_senses_count_weight/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_MX'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_MX'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max'] = full_kpwr_results_df['WN_ID_MX'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max'])

#metrics_df.loc["HERBERT_Max_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Max_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_M'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_M'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_max'] = full_kpwr_results_df['WN_ID_M'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max_max'])

#metrics_df.loc["HERBERT_Maxfrommax_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Maxfrommax_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max max = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max max = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_avg'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_A'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_A'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_avg'] = full_kpwr_results_df['WN_ID_A'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_avg'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_avg'])

#metrics_df.loc["HERBERT_Avg_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Avg_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr avg = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr avg = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_avg'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_AM'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_AM'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_avg'] = full_kpwr_results_df['WN_ID_AM'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max'])

#metrics_df.loc["HERBERT_Maxfromavg_alwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Maxfromavg_alwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max avg = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max avg = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_W_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_W_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_max'] = full_kpwr_results_df['WN_ID_W_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max_max'])

#metrics_df.loc["HERBERT_Weighted_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Weighted_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr weighted not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr weighted not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_MX_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_MX_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max'] = full_kpwr_results_df['WN_ID_MX_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max'])

#metrics_df.loc["HERBERT_Max_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Max_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_M_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_M_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_max'] = full_kpwr_results_df['WN_ID_M_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max_max'])

#metrics_df.loc["HERBERT_Maxfrommax_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Maxfrommax_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max max not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max max not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_avg'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_A_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_A_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_avg'] = full_kpwr_results_df['WN_ID_A_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_avg'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_avg'])

#metrics_df.loc["HERBERT_Avg_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Avg_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr avg not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr avg not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_kpwr_results_df['correct_classif_max_avg'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_AM_X'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_AM_X'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_max_avg'] = full_kpwr_results_df['WN_ID_AM_X'] != "_"

correctly_predicted_senses_count_max = sum(full_kpwr_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_kpwr_results_df['if_decision_made_by_model_max'])

#metrics_df.loc["HERBERT_Maxfromavg_notalwaysmwe", "Precision_kpwr"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Maxfromavg_notalwaysmwe", "Recall_kpwr"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for kpwr max avg not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for kpwr max avg not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

precision for kpwr weighted = 0.41880160914672876
recall for kpwr weighted = 0.5695774242315168
precision for kpwr max = 0.42356553038323097
recall for kpwr max = 0.576056439421208
precision for kpwr max max = 0.3999258710155671
recall for kpwr max max = 0.5437333525304153
precision for kpwr avg = 0.4246241795468982
recall for kpwr avg = 0.5774962205744727
precision for kpwr max avg = 0.42356553038323097
recall for kpwr max avg = 0.576056439421208
precision for kpwr weighted not always mwe = 0.4143552826593267
recall for kpwr weighted not always mwe = 0.563530343387805
precision for kpwr max not always mwe = 0.41959559601947916
recall for kpwr max not always mwe = 0.5706572600964653
precision for kpwr max max not always mwe = 0.39595467542094676
recall for kpwr max max not always mwe = 0.5383341732056728
precision for kpwr avg not always mwe = 0.4225598136777472
recall for kpwr avg not always mwe = 0.5746886473256065
precision for kpwr max avg not always mwe = 0.41959559601947916
recal

In [4]:
path_sherlock = 'test_gold_standard/test_gold_standard/sherlock/'
path_results = 'results_herbert/sherlock/'
files_sherlock = os.listdir(path_results)
full_sherlock_results_df = pd.DataFrame()

for file_name in files_sherlock:
  result_file_name = path_results + file_name
  file_name = path_sherlock + file_name
  df = pd.read_csv(file_name, sep = '\t', quoting=3, error_bad_lines=False)[["ORDER_ID", "WN_ID"]]
  result_df = pd.read_csv(result_file_name, sep = '\t')
  df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
  result_df = pd.merge(result_df, df, how='inner', on='ORDER_ID')
  full_sherlock_results_df = full_sherlock_results_df.append(result_df)
  
full_sherlock_results_df = full_sherlock_results_df.reset_index(drop=True)

full_sherlock_results_df['WN_ID_A'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_A'])
full_sherlock_results_df['WN_ID_AM'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_AM'])
full_sherlock_results_df['WN_ID_MX'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_MX'])
full_sherlock_results_df['WN_ID_M'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_M'])
full_sherlock_results_df['WN_ID_W'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_W'])
full_sherlock_results_df['WN_ID_A_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_A_X'])
full_sherlock_results_df['WN_ID_AM_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_AM_X'])
full_sherlock_results_df['WN_ID_MX_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_MX_X'])
full_sherlock_results_df['WN_ID_M_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_M_X'])
full_sherlock_results_df['WN_ID_W_X'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_W_X'])

full_sherlock_results_df['correct_classif_weight'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_W'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_W'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_weight'] = full_sherlock_results_df['WN_ID_W'] != "_"
full_sherlock_results_df['if_annotated_in_test_data'] = full_sherlock_results_df['WN_ID_ACTUAL'] != "_"

correctly_predicted_senses_count_weight = sum(full_sherlock_results_df['correct_classif_weight'])
count_decisions_made_by_model_weight = sum(full_sherlock_results_df['if_decision_made_by_model_weight'])
count_annotated_in_test_data = sum(full_sherlock_results_df['if_annotated_in_test_data'])

#metrics_df.loc["HERBERT_Weighted_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight
#metrics_df.loc["HERBERT_Weighted_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_weight/count_annotated_in_test_data

print(f"precision for sherlock weighted = {correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight}")
print(f"recall for sherlock weighted = {correctly_predicted_senses_count_weight/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_MX'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_MX'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max'] = full_sherlock_results_df['WN_ID_MX'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max'])

#metrics_df.loc["HERBERT_Max_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Max_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_M'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_M'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_max'] = full_sherlock_results_df['WN_ID_M'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max_max'])

#metrics_df.loc["HERBERT_Maxfrommax_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Maxfrommax_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max max = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max max = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_avg'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_A'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_A'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_avg'] = full_sherlock_results_df['WN_ID_A'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_avg'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_avg'])

#metrics_df.loc["HERBERT_Avg_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Avg_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock avg = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock avg = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_avg'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_AM'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_AM'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_avg'] = full_sherlock_results_df['WN_ID_AM'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max'])

#metrics_df.loc["HERBERT_Maxfromavg_alwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Maxfromavg_alwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max avg = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max avg = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_W_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_W_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_max'] = full_sherlock_results_df['WN_ID_W_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max_max'])

#metrics_df.loc["HERBERT_Weighted_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Weighted_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock weighted not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock weighted not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_MX_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_MX_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max'] = full_sherlock_results_df['WN_ID_MX_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max'])

#metrics_df.loc["HERBERT_Max_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Max_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_max'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_M_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_M_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_max'] = full_sherlock_results_df['WN_ID_M_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max_max'])

#metrics_df.loc["HERBERT_Maxfrommax_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Maxfrommax_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max max not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max max not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_avg'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_A_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_A_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_avg'] = full_sherlock_results_df['WN_ID_A_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_avg'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_avg'])

#metrics_df.loc["HERBERT_Avg_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Avg_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock avg not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock avg not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

full_sherlock_results_df['correct_classif_max_avg'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_AM_X'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_AM_X'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_max_avg'] = full_sherlock_results_df['WN_ID_AM_X'] != "_"

correctly_predicted_senses_count_max = sum(full_sherlock_results_df['correct_classif_max'])
count_decisions_made_by_model_max = sum(full_sherlock_results_df['if_decision_made_by_model_max'])

#metrics_df.loc["HERBERT_Maxfromavg_notalwaysmwe", "Precision_sherlock"] = correctly_predicted_senses_count_max/count_decisions_made_by_model_max
#metrics_df.loc["HERBERT_Maxfromavg_notalwaysmwe", "Recall_sherlock"] = correctly_predicted_senses_count_max/count_annotated_in_test_data

print(f"precision for sherlock max avg not always mwe = {correctly_predicted_senses_count_max/count_decisions_made_by_model_max}")
print(f"recall for sherlock max avg not always mwe = {correctly_predicted_senses_count_max/count_annotated_in_test_data}")

precision for sherlock weighted = 0.3743325354446695
recall for sherlock weighted = 0.550650054171181
precision for sherlock max = 0.3751381215469613
recall for sherlock max = 0.5517334777898159
precision for sherlock max max = 0.3574585635359116
recall for sherlock max max = 0.5257313109425785
precision for sherlock avg = 0.3815135334192598
recall for sherlock avg = 0.5612134344528711
precision for sherlock max avg = 0.3751381215469613
recall for sherlock max avg = 0.5517334777898159
precision for sherlock weighted not always mwe = 0.37470079175105875
recall for sherlock weighted not always mwe = 0.5511917659804983
precision for sherlock max not always mwe = 0.3753222836095764
recall for sherlock max not always mwe = 0.5520043336944745
precision for sherlock max max not always mwe = 0.358195211786372
recall for sherlock max max not always mwe = 0.5268147345612134
precision for sherlock avg not always mwe = 0.3829865586448168
recall for sherlock avg not always mwe = 0.5633802816901409


In [97]:
lemmas_with_synset_counts_df[lemmas_with_synset_counts_df.lemma == "sprawa"]

Unnamed: 0,lemma,n_synset_per_lemma,n_synset_per_lemma_with_example,n_syns_per_lemma_all_with_example
6238,sprawa,4.0,4,4_4


In [102]:
lemmas_with_synset_counts_df[lemmas_with_synset_counts_df.lemma == "okno"]

Unnamed: 0,lemma,n_synset_per_lemma,n_synset_per_lemma_with_example,n_syns_per_lemma_all_with_example
3776,okno,7.0,7,7_7


In [125]:
full_df[full_df.lemma == "zapalić"]

Unnamed: 0,lemma,POS,synset,example
23742,zapalić,verb,s4097,Po wielu próbach silnik maszyny wreszcie **zap...
23743,zapalić,verb,s4097,o urządzeniu posiadającym silnik: uruchomić się.
23744,zapalić,verb,s4097,"Był taki mróz, że samochód nie chciał mi rano ..."
49357,zapalić,verb,s15220,"Nie jest pewne, kto dokładnie **zapalił** ten ..."
49358,zapalić,verb,s15220,"**Zapal** świeczkę, lubię, kiedy pachnie."
49359,zapalić,verb,s15220,"sprawić, że coś zaczyna płonąć, jest ogniem lu..."
49360,zapalić,verb,s15220,Szarmanckim gestem **zapalił** jej papierosa.
49361,zapalić,verb,s15220,Ryszard sprawnie **zapalił** ogień w palenisku.
121612,zapalić,verb,s51828,"Najpierw **zapalili**, a dopiero potem wzięli ..."
121613,zapalić,verb,s51828,Już nigdy więcej nie **zapalę** fajki bez filt...


In [11]:
sher_kpwr_df = full_sherlock_results_df.copy()
sher_kpwr_df = sher_kpwr_df.append(full_kpwr_results_df)

In [40]:
sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"]

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,LEMMA,CTAG,FROM,TO,WN_ID_X,WN_ID_A,WN_ID_A_X,...,if_decision_made_by_model_weight,if_annotated_in_test_data,correct_classif_max,if_decision_made_by_model_max,correct_classif_max_max,if_decision_made_by_model_max_max,correct_classif_avg,if_decision_made_by_model_avg,correct_classif_max_avg,if_decision_made_by_model_max_avg
69,69,64,spraw,sprawa,subst:pl:gen:f,365,369.0,_,s102695,s102695,...,True,True,False,True,False,True,True,True,False,True
356,65,26,sprawę,sprawa,subst:sg:acc:f,260,265.0,_,s102695,s102695,...,True,True,True,True,False,True,True,True,False,True
2950,251,1,sprawę,sprawa,subst:sg:acc:f,1102,1107.0,_,s3200,s3200,...,True,True,False,True,False,True,False,True,False,True
3528,129,3,sprawa,sprawa,subst:sg:nom:f,625,630.0,_,s102695,s102695,...,True,True,True,True,True,True,True,True,True,True
3656,14,2,sprawy,sprawa,subst:pl:acc:f,57,62.0,_,s1107,s1107,...,True,True,False,True,False,True,False,True,False,True
3696,54,6,sprawami,sprawa,subst:pl:inst:f,250,257.0,_,s102695,s102695,...,True,True,True,True,False,True,True,True,False,True
5540,59,10,sprawę,sprawa,subst:sg:acc:f,292,297.0,_,s102695,s102695,...,True,True,True,True,True,True,True,True,True,True
6420,75,0,Sprawa,sprawa,subst:sg:nom:f,329,334.0,_,s3200,s3200,...,True,True,False,True,False,True,False,True,False,True
1017,180,22,sprawie,sprawa,subst:sg:loc:f,947,953.0,_,s3200,s3200,...,True,True,False,True,False,True,False,True,False,True
2858,155,3,sprawy,sprawa,subst:pl:acc:f,724,729.0,_,s4664,s4664,...,True,True,True,True,True,True,False,True,False,True


In [37]:
sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"][sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"].WN_ID_A == sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"].WN_ID_ACTUAL]

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,LEMMA,CTAG,FROM,TO,WN_ID_X,WN_ID_A,WN_ID_A_X,...,if_decision_made_by_model_weight,if_annotated_in_test_data,correct_classif_max,if_decision_made_by_model_max,correct_classif_max_max,if_decision_made_by_model_max_max,correct_classif_avg,if_decision_made_by_model_avg,correct_classif_max_avg,if_decision_made_by_model_max_avg
69,69,64,spraw,sprawa,subst:pl:gen:f,365,369.0,_,s102695,s102695,...,True,True,False,True,False,True,True,True,False,True
356,65,26,sprawę,sprawa,subst:sg:acc:f,260,265.0,_,s102695,s102695,...,True,True,True,True,False,True,True,True,False,True
3528,129,3,sprawa,sprawa,subst:sg:nom:f,625,630.0,_,s102695,s102695,...,True,True,True,True,True,True,True,True,True,True
3696,54,6,sprawami,sprawa,subst:pl:inst:f,250,257.0,_,s102695,s102695,...,True,True,True,True,False,True,True,True,False,True
5540,59,10,sprawę,sprawa,subst:sg:acc:f,292,297.0,_,s102695,s102695,...,True,True,True,True,True,True,True,True,True,True
9010,121,9,sprawie,sprawa,subst:sg:loc:f,551,557.0,_,s102695,s102695,...,True,True,False,True,False,True,True,True,True,True
12950,371,0,Sprawa,sprawa,subst:sg:nom:f,2084,2089.0,_,s102695,s102695,...,True,True,True,True,False,True,True,True,False,True
14975,63,19,sprawie,sprawa,subst:sg:loc:f,308,314.0,_,s102695,s102695,...,True,True,True,True,True,True,True,True,True,True
14990,78,11,sprawie,sprawa,subst:sg:loc:f,400,406.0,_,s102695,s102695,...,True,True,True,True,True,True,True,True,True,True
15080,168,24,spraw,sprawa,subst:pl:gen:f,855,859.0,_,s102695,s102695,...,True,True,True,True,False,True,True,True,False,True


In [39]:
len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"][sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"].WN_ID_A == sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"].WN_ID_ACTUAL])/len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"])

0.5

In [48]:
len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s102695"])

22

In [42]:
len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s3200"][sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s3200"].WN_ID_A == sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s3200"].WN_ID_ACTUAL])/len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s3200"])

0.6

In [49]:
len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s3200"])

10

In [44]:
len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s4664"])

0

In [47]:
len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s1107"][sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s1107"].WN_ID_A == sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s1107"].WN_ID_ACTUAL])/len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s1107"])

0.0

In [50]:
len(sher_kpwr_df[sher_kpwr_df.WN_ID_ACTUAL == "s1107"])

1

In [12]:
sher_kpwr_df[sher_kpwr_df.LEMMA == "sprawa"]

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,LEMMA,CTAG,FROM,TO,WN_ID_X,WN_ID_A,WN_ID_A_X,...,if_decision_made_by_model_weight,if_annotated_in_test_data,correct_classif_max,if_decision_made_by_model_max,correct_classif_max_max,if_decision_made_by_model_max_max,correct_classif_avg,if_decision_made_by_model_avg,correct_classif_max_avg,if_decision_made_by_model_max_avg
15,15,10,spraw,sprawa,subst:pl:gen:f,86,90.0,_,s102695,s102695,...,True,False,False,True,False,True,False,True,False,True
69,69,64,spraw,sprawa,subst:pl:gen:f,365,369.0,_,s102695,s102695,...,True,True,False,True,False,True,True,True,False,True
170,170,4,sprawy,sprawa,subst:sg:gen:f,869,874.0,_,s102695,s102695,...,True,False,False,True,False,True,False,True,False,True
356,65,26,sprawę,sprawa,subst:sg:acc:f,260,265.0,_,s102695,s102695,...,True,True,True,True,False,True,True,True,False,True
985,191,9,sprawę,sprawa,subst:sg:acc:f,836,841.0,_,s4664,s4664,...,True,False,False,True,False,True,False,True,False,True
1019,225,13,sprawie,sprawa,subst:sg:dat:f,964,970.0,_,s4664,s4664,...,True,False,False,True,False,True,False,True,False,True
1025,231,19,sprawie,sprawa,subst:sg:dat:f,987,993.0,_,s4664,s4664,...,True,False,False,True,False,True,False,True,False,True
1065,271,22,sprawą,sprawa,subst:sg:inst:f,1160,1165.0,_,s1107,s1107,...,True,False,False,True,False,True,False,True,False,True
1318,234,22,sprawę,sprawa,subst:sg:acc:f,1166,1171.0,_,s102695,s102695,...,True,False,False,True,False,True,False,True,False,True
1665,41,11,sprawami,sprawa,subst:pl:inst:f,189,196.0,_,s3200,s3200,...,True,True,True,True,True,True,True,True,True,True


In [5]:
full_sherlock_results_df[full_sherlock_results_df.LEMMA == "sprawa"]

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,LEMMA,CTAG,FROM,TO,WN_ID_X,WN_ID_A,WN_ID_A_X,...,if_decision_made_by_model_weight,if_annotated_in_test_data,correct_classif_max,if_decision_made_by_model_max,correct_classif_max_max,if_decision_made_by_model_max_max,correct_classif_avg,if_decision_made_by_model_avg,correct_classif_max_avg,if_decision_made_by_model_max_avg
15,15,10,spraw,sprawa,subst:pl:gen:f,86,90,_,s102695,s102695,...,True,False,False,True,False,True,False,True,False,True
69,69,64,spraw,sprawa,subst:pl:gen:f,365,369,_,s102695,s102695,...,True,True,False,True,False,True,True,True,False,True
170,170,4,sprawy,sprawa,subst:sg:gen:f,869,874,_,s102695,s102695,...,True,False,False,True,False,True,False,True,False,True
356,65,26,sprawę,sprawa,subst:sg:acc:f,260,265,_,s102695,s102695,...,True,True,True,True,False,True,True,True,False,True
985,191,9,sprawę,sprawa,subst:sg:acc:f,836,841,_,s4664,s4664,...,True,False,False,True,False,True,False,True,False,True
1019,225,13,sprawie,sprawa,subst:sg:dat:f,964,970,_,s4664,s4664,...,True,False,False,True,False,True,False,True,False,True
1025,231,19,sprawie,sprawa,subst:sg:dat:f,987,993,_,s4664,s4664,...,True,False,False,True,False,True,False,True,False,True
1065,271,22,sprawą,sprawa,subst:sg:inst:f,1160,1165,_,s1107,s1107,...,True,False,False,True,False,True,False,True,False,True
1318,234,22,sprawę,sprawa,subst:sg:acc:f,1166,1171,_,s102695,s102695,...,True,False,False,True,False,True,False,True,False,True
1665,41,11,sprawami,sprawa,subst:pl:inst:f,189,196,_,s3200,s3200,...,True,True,True,True,True,True,True,True,True,True


In [28]:
full_sherlock_results_df.iloc[360:390, [1,2,3,8,9,10,11,12,13,14,15,16,17,18]]

Unnamed: 0,TOKEN_ID,ORTH,LEMMA,WN_ID_A,WN_ID_A_X,WN_ID_AM,WN_ID_AM_X,WN_ID_MX,WN_ID_MX_X,WN_ID_M,WN_ID_M_X,WN_ID_W_X,WN_ID_W,WN_ID_ACTUAL
360,0,Mniemam,mniemać,s3009,s3009,s3009,s3009,s3009,s3009,s3009,s3009,s3009,s3009,s3009
361,1,",",",",_,_,_,_,_,_,_,_,_,_,_
362,2,że,że,_,_,_,_,_,_,_,_,_,_,_
363,3,zechciał,zechcieć,s57936,s57936,s57936,s57936,s57936,s57936,s57936,s57936,s57936,s57936,s57936
364,4,by,by,s284021,s284021,s284021,s284021,s284021,s284021,s284021,s284021,s284021,s284021,_
365,5,ś,być,_,_,_,_,_,_,_,_,_,_,_
366,6,być,być,s250925,s250925,s250925,s250925,s50321,s50321,s50321,s50321,s50321,s50321,_
367,7,w,w,_,_,_,_,_,_,_,_,_,_,_
368,8,nią,on,s281572,s281572,s281572,s281572,s281572,s281572,s281572,s281572,s281572,s281572,_
369,9,wtajemniczony,wtajemniczyć,s54630,s54630,s54630,s54630,s54630,s54630,s54630,s54630,s54630,s54630,s54630


In [6]:
full_sherlock_results_df.iloc[330:360, [1,2,3,8,9,10,11,12,13,14,15,16,17,18]]

Unnamed: 0,TOKEN_ID,ORTH,LEMMA,WN_ID_A,WN_ID_A_X,WN_ID_AM,WN_ID_AM_X,WN_ID_MX,WN_ID_MX_X,WN_ID_M,WN_ID_M_X,WN_ID_W_X,WN_ID_W,WN_ID_ACTUAL
330,0,Watsonie,Watson,s237651,s237651,s237651,s237651,s237651,s237651,s237651,s237651,s237651,s237651,_
331,1,",",",",_,_,_,_,_,_,_,_,_,_,_
332,2,młode,młody,s16524,s16524,s16524,s16524,s16524,s16524,s223693,s223693,s16524,s16524,s1733
333,3,damy,dama,s512,s512,s512,s512,s75775,s75775,s512,s512,s6065,s6065,s512
334,4,",",",",_,_,_,_,_,_,_,_,_,_,_
335,5,które,który,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,_
336,6,zapuszczają,zapuszczać,s51203,s51203,s51203,s51203,s51203,s51203,s51203,s51203,s51203,s51203,s51203
337,7,się,się,s51203,s51203,s51203,s51203,s51203,s51203,s51203,s51203,s51203,s51203,s51203
338,8,do,do,_,_,_,_,_,_,_,_,_,_,_
339,9,miasta,miasto,s385227,s385227,s385227,s385227,s385227,s385227,s385227,s385227,s385227,s385227,_


In [7]:
full_sherlock_results_df.iloc[1654:1684, [1,2,3,8,9,10,11,12,13,14,15,16,17,18]]

Unnamed: 0,TOKEN_ID,ORTH,LEMMA,WN_ID_A,WN_ID_A_X,WN_ID_AM,WN_ID_AM_X,WN_ID_MX,WN_ID_MX_X,WN_ID_M,WN_ID_M_X,WN_ID_W_X,WN_ID_W,WN_ID_ACTUAL
1654,0,Ciągle,ciągle,s452232,s452232,s452232,s452232,s452232,s452232,s452232,s452232,s452232,s452232,s452232
1655,1,wdawał,wdawać,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s61014
1656,2,się,się,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s7075516,s61014
1657,3,w,w,_,_,_,_,_,_,_,_,_,_,_
1658,4,awantury,awantura,s102835,s102835,s102835,s102835,s102835,s102835,s102835,s102835,s102835,s102835,s102835
1659,5,",",",",_,_,_,_,_,_,_,_,_,_,_
1660,6,z,z,_,_,_,_,_,_,_,_,_,_,_
1661,7,których,który,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,_
1662,8,dwie,dwa,_,_,_,_,_,_,_,_,_,_,_
1663,9,skończyły,skończyć,s55785,s55785,s55785,s55785,s55785,s55785,s55785,s55785,s55785,s55785,_


In [8]:
full_sherlock_results_df.iloc[3654:3684, [1,2,3,8,9,10,11,12,13,14,15,16,17,18]]

Unnamed: 0,TOKEN_ID,ORTH,LEMMA,WN_ID_A,WN_ID_A_X,WN_ID_AM,WN_ID_AM_X,WN_ID_MX,WN_ID_MX_X,WN_ID_M,WN_ID_M_X,WN_ID_W_X,WN_ID_W,WN_ID_ACTUAL
3654,0,Mam,mieć,s8913,s8913,s1719,s1719,s8913,s8913,s1719,s1719,s1719,s1719,_
3655,1,pewne,pewien,s104789,s104789,s1151,s1151,s104789,s104789,s104789,s104789,s104789,s104789,_
3656,2,sprawy,sprawa,s1107,s1107,s1107,s1107,s1107,s1107,s1107,s1107,s1107,s1107,s102695
3657,3,",",",",_,_,_,_,_,_,_,_,_,_,_
3658,4,którymi,który,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,s9558,_
3659,5,chciała,chcieć,s9529,s9529,s9529,s9529,s9529,s9529,s9529,s9529,s9529,s9529,s9529
3660,6,by,by,s284021,s284021,s284021,s284021,s284021,s284021,s284021,s284021,s284021,s284021,_
3661,7,m,być,_,_,_,_,_,_,_,_,_,_,_
3662,8,się,się,s54470,s54470,s54470,s54470,s2497,s2497,s2497,s2497,s2497,s2497,s2497
3663,9,zająć,zająć,s54470,s54470,s54470,s54470,s2497,s2497,s2497,s2497,s2497,s2497,s2497


In [26]:
list(full_df.loc[full_df.synset == "s3200", "example"])[2]

'Przebieg tego **procesu** wszystkich zupełnie zaskoczył, bo w wyniku niespodziewanie okazanych dowodów oskarżony został uniewinniony.'

In [19]:
full_df[full_df.lemma == "sprawa"]

Unnamed: 0,lemma,POS,synset,example
18081,sprawa,noun,s102695,Idź z tym do dziekana od **spraw** dydaktycznych.
18082,sprawa,noun,s102695,"Nie da się tej **sprawy** obejść. **sprawa**, ..."
18083,sprawa,noun,s102695,Głównym przedmiotem jego zainteresowań w karie...
18084,sprawa,noun,s102695,"**materia**, zespół określonych okoliczności, ..."
18085,sprawa,noun,s102695,Rozmowa dotyczyła delikatnej **materii**.
18086,sprawa,noun,s3200,"Ta **sprawa** ciągnęła się bardzo długo, bo gł..."
18087,sprawa,noun,s3200,"**proces** sądowy prowadzony przez sąd, któreg..."
18088,sprawa,noun,s3200,Przebieg tego **procesu** wszystkich zupełnie ...
18089,sprawa,noun,s3200,Ten **przewód** sądowy ciągnął się bardzo dług...
18090,sprawa,noun,s3200,"Ten **przewód** ciągnął się bardzo długo, bo g..."


In [124]:
full_sherlock_results_df.groupby("LEMMA").size().sort_values(ascending=False)[150:180]

LEMMA
wszystek        7
dać             7
wąż             7
korytarz        7
wskazywać       7
krzesło         7
zapalić         7
lampa           7
między          7
gwizd           7
cel             7
temu            7
zaś             7
uwaga           7
posiadłość      7
Cygan           7
pomyśleć        7
nad             7
pierwszy        7
mniemać         6
wiele           6
pod             6
dobrze          6
spojrzeć        6
wyjść           6
gepard          6
wentylacyjny    6
zwrócić         6
trzymać         6
znajdować       6
dtype: int64

In [1338]:
list_final_results = [full_kpwr_results_df, full_sherlock_results_df]
with open('list_final_results_herbert_v2.pkl', 'wb') as fp:
    pickle.dump(list_final_results, fp)

# Lesk

In [126]:
path_results = 'results_lesk/kpwr/'
files_kpwr = os.listdir(path_results)
files_kpwr

for file_name in files_kpwr:
    #print(file_name)
    result_file_name = path_results + file_name
    df = pd.read_csv(result_file_name, sep = '\t')
    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_L"] = "_"
    df.loc[df.WN_ID_L.isna(), "WN_ID_L"] = "_"
    df = df.drop(['WN_ID_W_L'], axis=1)
    df.rename(columns={'WN_ID_L': 'WN_ID'}, inplace=True)
    df.to_csv(result_file_name, sep='\t', index=False)

In [127]:
path_results = 'results_lesk/sherlock/'
files_kpwr = os.listdir(path_results)
files_kpwr

for file_name in files_kpwr:
    #print(file_name)
    result_file_name = path_results + file_name
    df = pd.read_csv(result_file_name, sep = '\t')
    df.loc[df.CTAG.str.startswith('num') |
           df.CTAG.str.startswith('conj') |
           (df.CTAG.str.startswith('brev') & ~df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           df.CTAG.str.startswith('prep') |
           df.CTAG.str.startswith('aglt'), "WN_ID_L"] = "_"
    df.loc[df.WN_ID_L.isna(), "WN_ID_L"] = "_"
    df = df.drop(['WN_ID_W_L'], axis=1)
    df.rename(columns={'WN_ID_L': 'WN_ID'}, inplace=True)
    df.to_csv(result_file_name, sep='\t', index=False)

In [299]:
path_kpwr = 'test_gold_standard/test_gold_standard/kpwr/'
path_results = 'results_lesk/kpwr/'
files_kpwr = os.listdir(path_results)
full_kpwr_results_df = pd.DataFrame()

for file_name in files_kpwr:
  result_file_name = path_results + file_name
  file_name = path_kpwr + file_name
  df = pd.read_csv(file_name, sep = '\t', quoting=3, error_bad_lines=False)[["ORDER_ID", "WN_ID"]]
  result_df = pd.read_csv(result_file_name, sep = '\t')
  df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
  result_df = pd.merge(result_df, df, how='inner', on='ORDER_ID')
  full_kpwr_results_df = full_kpwr_results_df.append(result_df)

full_kpwr_results_df = full_kpwr_results_df.reset_index(drop=True)

full_kpwr_results_df.loc[full_kpwr_results_df.CTAG.str.startswith('num') |
           full_kpwr_results_df.CTAG.str.startswith('conj') |
           (full_kpwr_results_df.CTAG.str.startswith('brev') & ~full_kpwr_results_df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           full_kpwr_results_df.CTAG.str.startswith('prep') |
           full_kpwr_results_df.CTAG.str.startswith('aglt'), "WN_ID_L"] = "_"

full_kpwr_results_df['WN_ID_L'] = np.where(full_kpwr_results_df['LEMMA'].str.isdigit(), "_", full_kpwr_results_df['WN_ID_L'])

full_kpwr_results_df['correct_classif_weight'] = np.where(np.logical_and(full_kpwr_results_df['WN_ID_L'] == full_kpwr_results_df['WN_ID_ACTUAL'], full_kpwr_results_df['WN_ID_L'] != "_"), True, False)
full_kpwr_results_df['if_decision_made_by_model_weight'] = full_kpwr_results_df['WN_ID_L'] != "_"
full_kpwr_results_df['if_annotated_in_test_data'] = full_kpwr_results_df['WN_ID_ACTUAL'] != "_"

correctly_predicted_senses_count_weight = sum(full_kpwr_results_df['correct_classif_weight'])
count_decisions_made_by_model_weight = sum(full_kpwr_results_df['if_decision_made_by_model_weight'])
count_annotated_in_test_data = sum(full_kpwr_results_df['if_annotated_in_test_data'])

metrics_df.loc["LESK_Max", "Precision_kpwr"] = correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight
metrics_df.loc["LESK_Max", "Recall_kpwr"] = correctly_predicted_senses_count_weight/count_annotated_in_test_data

print(f"precision for kpwr lesk = {correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight}")
print(f"recall for kpwr lesk = {correctly_predicted_senses_count_weight/count_annotated_in_test_data}")

precision for kpwr lesk = 0.3413663145602208
recall for kpwr lesk = 0.4629616298322655


In [300]:
path_sherlock = 'test_gold_standard/test_gold_standard/sherlock/'
path_results = 'results_lesk/sherlock/'
files_sherlock = os.listdir(path_results)
full_sherlock_results_df = pd.DataFrame()

for file_name in files_sherlock:
  result_file_name = path_results + file_name
  file_name = path_sherlock + file_name
  df = pd.read_csv(file_name, sep = '\t', quoting=3, error_bad_lines=False)[["ORDER_ID", "WN_ID"]]
  result_df = pd.read_csv(result_file_name, sep = '\t')
  df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
  result_df = pd.merge(result_df, df, how='inner', on='ORDER_ID')
  full_sherlock_results_df = full_sherlock_results_df.append(result_df)

full_sherlock_results_df = full_sherlock_results_df.reset_index(drop=True)

full_sherlock_results_df.loc[full_sherlock_results_df.CTAG.str.startswith('num') |
           full_sherlock_results_df.CTAG.str.startswith('conj') |
           (full_sherlock_results_df.CTAG.str.startswith('brev') & ~full_sherlock_results_df.ORTH.isin(['km', 'ul', 'mln', 'mld', 'NDK', 'tab', 'kg', 'ks', 'godz', 'km2'])) |
           full_sherlock_results_df.CTAG.str.startswith('prep') |
           full_sherlock_results_df.CTAG.str.startswith('aglt'), "WN_ID_L"] = "_"

full_sherlock_results_df['WN_ID_L'] = np.where(full_sherlock_results_df['LEMMA'].str.isdigit(), "_", full_sherlock_results_df['WN_ID_L'])

full_sherlock_results_df['correct_classif_weight'] = np.where(np.logical_and(full_sherlock_results_df['WN_ID_L'] == full_sherlock_results_df['WN_ID_ACTUAL'], full_sherlock_results_df['WN_ID_L'] != "_"), True, False)
full_sherlock_results_df['if_decision_made_by_model_weight'] = full_sherlock_results_df['WN_ID_L'] != "_"
full_sherlock_results_df['if_annotated_in_test_data'] = full_sherlock_results_df['WN_ID_ACTUAL'] != "_"

correctly_predicted_senses_count_weight = sum(full_sherlock_results_df['correct_classif_weight'])
count_decisions_made_by_model_weight = sum(full_sherlock_results_df['if_decision_made_by_model_weight'])
count_annotated_in_test_data = sum(full_sherlock_results_df['if_annotated_in_test_data'])

metrics_df.loc["LESK_Max", "Precision_sherlock"] = correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight
metrics_df.loc["LESK_Max", "Recall_sherlock"] = correctly_predicted_senses_count_weight/count_annotated_in_test_data

print(f"precision for sherlock lesk = {correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight}")
print(f"recall for sherlock lesk = {correctly_predicted_senses_count_weight/count_annotated_in_test_data}")

precision for sherlock lesk = 0.3107030235577815
recall for sherlock lesk = 0.4536836403033586


In [307]:
metrics_df.to_csv("final_results_metrics.csv", sep = ";", decimal = ",")

# For WoSeDoN Clarin WSD

In [22]:
pathlist = list(Path('testdata/testdata').glob('**/*.conll'))
pathlist_str = [str(x).replace("\\", "/") for x in pathlist]

pathlist_to_do = set(pathlist_str)

pathlist_to_do_kpwr = [x for x in pathlist_to_do if "kpwr" in x]
pathlist_to_do_kpwr = sorted(pathlist_to_do_kpwr)

pathlist_to_do_sherlock = [x for x in pathlist_to_do if "sherlock" in x]
pathlist_to_do_sherlock = sorted(pathlist_to_do_sherlock)

text_kpwr = ""
for path in tqdm(pathlist_to_do_kpwr):
    df = pd.read_csv(path, sep='\t')
    text_kpwr = text_kpwr + " ".join(df["ORTH"]) + " "

text_sherlock = ""
for path in tqdm(pathlist_to_do_sherlock):
    df = pd.read_csv(path, sep='\t')
    text_sherlock = text_sherlock + " ".join(df["ORTH"]) + " "


  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

In [133]:
#!pip install xmltodict
import xmltodict

sherlock_clarin = open("sherlock_clarin.xml", "r", encoding='utf-8')
sherlock_clarin = sherlock_clarin.read()

sherlock_clarin_result = xmltodict.parse(sherlock_clarin)


In [197]:
results_clarin_df = pd.DataFrame(columns = ['ORTH',
                                            'WN_ID_C'])

for chunk in sherlock_clarin_result['chunkList']['chunk']:
    sentence_mwe = dict()
    sentence_results_clarin_df = pd.DataFrame(columns = ['ORTH',
                                                         'WN_ID_C',
                                                         'MWE'])
    if isinstance(chunk['sentence']['tok'], dict):
        tok = chunk['sentence']['tok']
        if 'prop' in tok.keys():
            if isinstance(tok['prop'], list):
                for prop in tok['prop']:
                    if prop['@key'] == 'sense:ukb:syns_id':
                        tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                              'WN_ID_C': [prop['#text']]})
                if 'ann' in tok.keys():
                    if tok['ann']['#text'] != "0":
                        sentence_mwe[tok['ann']['#text']] = tok_results_clarin_df['WN_ID_C'][0]
            
            else: 
                tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                          'WN_ID_C': ["_"]})
                
        else:
            tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                              'WN_ID_C': ["_"]})
                
        if 'ann' in tok.keys():
            tok_results_clarin_df['MWE'] = tok['ann']['#text']
        
        sentence_results_clarin_df = sentence_results_clarin_df.append(tok_results_clarin_df)
        
        for key, value in sentence_mwe.items():
            sentence_results_clarin_df.loc[sentence_results_clarin_df.MWE == key, "WN_ID_C"] = value

    
    else:
        for i, tok in enumerate(chunk['sentence']['tok']):
            if 'prop' in tok.keys():
                if isinstance(tok['prop'], list):
                    for prop in tok['prop']:
                        if prop['@key'] == 'sense:ukb:syns_id':
                            tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                                  'WN_ID_C': [prop['#text']]})
                    if 'ann' in tok.keys():
                        if tok['ann']['#text'] != "0":
                            sentence_mwe[tok['ann']['#text']] = tok_results_clarin_df['WN_ID_C'][0]

                else: 
                    tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                              'WN_ID_C': ["_"]})

            else:
                tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                  'WN_ID_C': ["_"]})

            if 'ann' in tok.keys():
                tok_results_clarin_df['MWE'] = tok['ann']['#text']

            sentence_results_clarin_df = sentence_results_clarin_df.append(tok_results_clarin_df)

            for key, value in sentence_mwe.items():
                sentence_results_clarin_df.loc[sentence_results_clarin_df.MWE == key, "WN_ID_C"] = value
            
    sentence_results_clarin_df = sentence_results_clarin_df.drop(['MWE'], axis=1)

    results_clarin_df = results_clarin_df.append(sentence_results_clarin_df)
        

In [221]:
sherlock_full_df = pd.DataFrame()
pathlist_to_do_sherlock_gold = ['test_gold_standard/test_gold_standard/sherlock/' + x[-14:] for x in pathlist_to_do_sherlock]
for path in tqdm(pathlist_to_do_sherlock_gold):
    df = pd.read_csv(path, sep='\t')
    sherlock_full_df = sherlock_full_df.append(df)
    
sherlock_full_df['WN_ID_C'] = ["s" + x if x != "_" else x for x in list(results_clarin_df['WN_ID_C']) ]



  0%|          | 0/31 [00:00<?, ?it/s]

In [224]:
sherlock_full_df['correct_classif_weight'] = np.where(np.logical_and(sherlock_full_df['WN_ID_C'] == sherlock_full_df['WN_ID'], sherlock_full_df['WN_ID_C'] != "_"), True, False)
sherlock_full_df['if_decision_made_by_model_weight'] = sherlock_full_df['WN_ID_C'] != "_"
sherlock_full_df['if_annotated_in_test_data'] = sherlock_full_df['WN_ID'] != "_"

correctly_predicted_senses_count_weight = sum(sherlock_full_df['correct_classif_weight'])
count_decisions_made_by_model_weight = sum(sherlock_full_df['if_decision_made_by_model_weight'])
count_annotated_in_test_data = sum(sherlock_full_df['if_annotated_in_test_data'])

print(f"precision for sherlock lesk = {correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight}")
print(f"recall for sherlock lesk = {correctly_predicted_senses_count_weight/count_annotated_in_test_data}")

precision for sherlock lesk = 0.42121082009446115
recall for sherlock lesk = 0.5314192849404117


In [226]:
kpwr_clarin = open("kpwr_clarin.xml", "r", encoding='utf-8')
kpwr_clarin = kpwr_clarin.read()

kpwr_clarin_result = xmltodict.parse(kpwr_clarin)

results_clarin_df = pd.DataFrame(columns = ['ORTH',
                                            'WN_ID_C'])

for chunk in kpwr_clarin_result['chunkList']['chunk']:
    sentence_mwe = dict()
    sentence_results_clarin_df = pd.DataFrame(columns = ['ORTH',
                                                         'WN_ID_C',
                                                         'MWE'])
    if isinstance(chunk['sentence']['tok'], dict):
        tok = chunk['sentence']['tok']
        if 'prop' in tok.keys():
            if isinstance(tok['prop'], list):
                for prop in tok['prop']:
                    if prop['@key'] == 'sense:ukb:syns_id':
                        tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                              'WN_ID_C': [prop['#text']]})
                if 'ann' in tok.keys():
                    if tok['ann']['#text'] != "0":
                        sentence_mwe[tok['ann']['#text']] = tok_results_clarin_df['WN_ID_C'][0]
            
            else: 
                tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                          'WN_ID_C': ["_"]})
                
        else:
            tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                              'WN_ID_C': ["_"]})
                
        if 'ann' in tok.keys():
            tok_results_clarin_df['MWE'] = tok['ann']['#text']
        
        sentence_results_clarin_df = sentence_results_clarin_df.append(tok_results_clarin_df)
        
        for key, value in sentence_mwe.items():
            sentence_results_clarin_df.loc[sentence_results_clarin_df.MWE == key, "WN_ID_C"] = value

    
    else:
        for i, tok in enumerate(chunk['sentence']['tok']):
            if 'prop' in tok.keys():
                if isinstance(tok['prop'], list):
                    for prop in tok['prop']:
                        if prop['@key'] == 'sense:ukb:syns_id':
                            tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                                  'WN_ID_C': [prop['#text']]})
                    if 'ann' in tok.keys():
                        if tok['ann']['#text'] != "0":
                            sentence_mwe[tok['ann']['#text']] = tok_results_clarin_df['WN_ID_C'][0]

                else: 
                    tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                              'WN_ID_C': ["_"]})

            else:
                tok_results_clarin_df = pd.DataFrame({'ORTH': [tok['orth']],
                                                  'WN_ID_C': ["_"]})

            if 'ann' in tok.keys():
                tok_results_clarin_df['MWE'] = tok['ann']['#text']

            sentence_results_clarin_df = sentence_results_clarin_df.append(tok_results_clarin_df)

            for key, value in sentence_mwe.items():
                sentence_results_clarin_df.loc[sentence_results_clarin_df.MWE == key, "WN_ID_C"] = value
            
    sentence_results_clarin_df = sentence_results_clarin_df.drop(['MWE'], axis=1)

    results_clarin_df = results_clarin_df.append(sentence_results_clarin_df)
        
kpwr_full_df = pd.DataFrame()
pathlist_to_do_kpwr_gold = ['test_gold_standard/test_gold_standard/kpwr/' + x[-14:] for x in pathlist_to_do_kpwr]
for path in tqdm(pathlist_to_do_kpwr_gold):
    df = pd.read_csv(path, sep='\t', quoting=3, error_bad_lines=False)
    kpwr_full_df = kpwr_full_df.append(df)


  0%|          | 0/98 [00:00<?, ?it/s]

ValueError: Length of values (32620) does not match length of index (32522)

# Brudnopis

In [258]:
results_clarin_df.loc[6800, "ORTH"] = results_clarin_df.loc[6800, "ORTH"] + "m"
results_clarin_df = results_clarin_df.drop(6801, axis=0)

In [265]:
results_clarin_df = results_clarin_df.drop(19933, axis=0)

In [272]:
results_clarin_df = results_clarin_df.drop(22784, axis=0)

In [276]:
results_clarin_df = results_clarin_df.drop(23330, axis=0)

In [277]:
results_clarin_df

Unnamed: 0,level_0,index,ORTH,WN_ID_C
0,0,0,Toronto,_
1,1,0,Dominion,_
2,2,0,Centre,_
3,3,0,Toronto,_
4,4,0,Dominion,_
...,...,...,...,...
32520,32521,0,zostało,49997
32521,32522,0,zmuszonych,4398
32522,32523,0,do,_
32523,32524,0,emigracji,26581


In [278]:
kpwr_full_df

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,CTAG,FROM,TO,WN_ID
0,0,0,Toronto,subst:sg:nom:n,0,6,_
1,1,1,Dominion,subst:sg:nom:n,7,14,_
2,2,2,Centre,subst:sg:nom:n,15,20,_
3,3,0,Toronto,subst:sg:nom:n,21,27,_
4,4,1,Dominion,subst:sg:nom:n,28,35,_
...,...,...,...,...,...,...,...
642,642,35,zostało,praet:sg:n:perf,3327,3333,_
643,643,36,zmuszonych,ppas:pl:gen:m1:perf:aff,3334,3343,s57965
644,644,37,do,prep:gen,3344,3345,_
645,645,38,emigracji,subst:sg:gen:f,3346,3354,s26581


In [279]:
kpwr_full_df['WN_ID_C'] = ["s" + x if x != "_" else x for x in list(results_clarin_df['WN_ID_C']) ]

kpwr_full_df['correct_classif_weight'] = np.where(np.logical_and(kpwr_full_df['WN_ID_C'] == kpwr_full_df['WN_ID'], kpwr_full_df['WN_ID_C'] != "_"), True, False)
kpwr_full_df['if_decision_made_by_model_weight'] = kpwr_full_df['WN_ID_C'] != "_"
kpwr_full_df['if_annotated_in_test_data'] = kpwr_full_df['WN_ID'] != "_"

correctly_predicted_senses_count_weight = sum(kpwr_full_df['correct_classif_weight'])
count_decisions_made_by_model_weight = sum(kpwr_full_df['if_decision_made_by_model_weight'])
count_annotated_in_test_data = sum(kpwr_full_df['if_annotated_in_test_data'])

print(f"precision for kpwr lesk = {correctly_predicted_senses_count_weight/count_decisions_made_by_model_weight}")
print(f"recall for kpwr lesk = {correctly_predicted_senses_count_weight/count_annotated_in_test_data}")

precision for kpwr lesk = 0.4100435898966979
recall for kpwr lesk = 0.49434885897343606


In [282]:
kpwr_full_df.to_csv("kpwr_clarin_results.csv", sep = ";", encoding='utf-8-sig', decimal = ',')

In [283]:
sherlock_full_df.to_csv("sherlock_clarin_results.csv", sep = ";", encoding='utf-8-sig', decimal = ',')

In [208]:
kpwr_full_df = pd.read_csv("kpwr_clarin_results.csv", sep = ";", encoding='utf-8-sig', decimal = ',')
kpwr_full_df = kpwr_full_df.iloc[:, 1:9]
kpwr_full_df = kpwr_full_df.drop("WN_ID", axis = 1)
kpwr_full_df

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,CTAG,FROM,TO,WN_ID_C
0,0,0,Toronto,subst:sg:nom:n,0,6,_
1,1,1,Dominion,subst:sg:nom:n,7,14,_
2,2,2,Centre,subst:sg:nom:n,15,20,_
3,3,0,Toronto,subst:sg:nom:n,21,27,_
4,4,1,Dominion,subst:sg:nom:n,28,35,_
...,...,...,...,...,...,...,...
32517,642,35,zostało,praet:sg:n:perf,3327,3333,s49997
32518,643,36,zmuszonych,ppas:pl:gen:m1:perf:aff,3334,3343,s4398
32519,644,37,do,prep:gen,3344,3345,_
32520,645,38,emigracji,subst:sg:gen:f,3346,3354,s26581


In [216]:
sherlock_full_df = pd.read_csv("sherlock_clarin_results.csv", sep = ";", encoding='utf-8-sig', decimal = ',')
sherlock_full_df = sherlock_full_df.iloc[:, 1:9]
sherlock_full_df = sherlock_full_df.drop("WN_ID", axis = 1)
sherlock_full_df

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,CTAG,FROM,TO,WN_ID_C
0,0,0,Sherlock,subst:sg:nom:m1,0,7,_
1,1,1,Holmes,subst:sg:nom:m1,8,13,s75885
2,2,2,i,conj,14,14,_
3,3,3,Pstrokata,adj:sg:nom:f:pos,15,23,s56611
4,4,4,Opaska,subst:sg:nom:f,24,29,_
...,...,...,...,...,...,...,...
9082,327,18,bardzo,adv:pos,1601,1606,s460680
9083,328,19,obciążyć,inf:perf,1607,1614,s1986
9084,329,20,moje,adj:sg:acc:n:pos,1615,1618,_
9085,330,21,sumienie,subst:sg:acc:n,1619,1626,s3320


In [210]:
list(kpwr_full_df.columns[0:-1])

['ORDER_ID', 'TOKEN_ID', 'ORTH', 'CTAG', 'FROM', 'TO']

In [223]:
print(kpwr_full_df.drop("WN_ID_C", axis = 1).drop_duplicates().shape)
print(kpwr_full_df.shape)

(32486, 6)
(32487, 7)


In [225]:
kpwr_full_df.columns[0:-1]

Index(['ORDER_ID', 'TOKEN_ID', 'ORTH', 'CTAG', 'FROM', 'TO'], dtype='object')

In [239]:
kpwr_full_df["problem"] = kpwr_full_df.groupby(list(kpwr_full_df.columns[0:-1])).transform(len)

In [253]:
kpwr_full_df[kpwr_full_df.problem > 1]

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,CTAG,FROM,TO,WN_ID_C,problem
18556,4,4,się,qub,29,31,s60377,2


In [254]:
#kpwr_full_df = kpwr_full_df.drop(1530)
kpwr_full_df = kpwr_full_df.drop("problem", axis = 1)

In [224]:
print(sherlock_full_df.drop("WN_ID_C", axis = 1).drop_duplicates().shape)
print(sherlock_full_df.shape)

(9083, 6)
(9083, 7)


In [256]:
pathlist = list(Path('testdata/testdata').glob('**/*.conll'))
pathlist_str = [str(x).replace("\\", "/") for x in pathlist]

pathlist_to_do = set(pathlist_str)

pathlist_to_do_kpwr = [x for x in pathlist_to_do if "kpwr" in x]
pathlist_to_do_kpwr = sorted(pathlist_to_do_kpwr)

pathlist_to_do_sherlock = [x for x in pathlist_to_do if "sherlock" in x]
pathlist_to_do_sherlock = sorted(pathlist_to_do_sherlock)

join_columns = list(kpwr_full_df.columns[0:-1])

kpwr_full_df = kpwr_full_df.drop_duplicates()
sherlock_full_df = sherlock_full_df.drop_duplicates()

pathlist_to_do_kpwr_gold = ['testdata/testdata/kpwr/' + x[-14:] for x in pathlist_to_do_kpwr]
for path in tqdm(pathlist_to_do_kpwr_gold):
    df = pd.read_csv(path, sep='\t', quoting=3, error_bad_lines=False)
    df = df.merge(kpwr_full_df, how = 'left', on = join_columns)
    df.loc[df.WN_ID_C.isna(), "WN_ID_C"] = "_"
    df = df.rename({"WN_ID_C" : "WN_ID"}, axis = 1)
    result_path = "results_wosedon/kpwr" + path[-15:]
    df.to_csv(result_path, sep='\t', index=False)
    
pathlist_to_do_sherlock_gold = ['testdata/testdata/sherlock/' + x[-14:] for x in pathlist_to_do_sherlock]
for path in tqdm(pathlist_to_do_sherlock_gold):
    df = pd.read_csv(path, sep='\t', quoting=3, error_bad_lines=False)
    df = df.merge(sherlock_full_df, how = 'left', on = join_columns)
    df.loc[df.WN_ID_C.isna(), "WN_ID_C"] = "_"
    df = df.rename({"WN_ID_C" : "WN_ID"}, axis = 1)
    result_path = "results_wosedon/sherlock" + path[-15:]
    df.to_csv(result_path, sep='\t', index=False)

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

In [25]:
columns = ['WN_ID_A', 'WN_ID_A_X', 'WN_ID_MX', 'WN_ID_MX_X', 'WN_ID_M', 'WN_ID_M_X', 'WN_ID_W', 'WN_ID_W_X',
          'WN_ID_AM', 'WN_ID_AM_X']
keep_cols = ['ORDER_ID', 'TOKEN_ID', 'ORTH', 'LEMMA', 'CTAG', 'FROM', 'TO']
for col in columns:
    all_cols = keep_cols.copy()
    all_cols.append(col)
    df = df.loc[:, all_cols]
    df.rename(columns={col: 'WN_ID'}, inplace=True)
    break

In [26]:
df

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,LEMMA,CTAG,FROM,TO,WN_ID
0,0,0,2010-07-17,2010-07-17,adj:sg:nom:m3:pos,0,9,_
1,1,1,:,:,interp,10,10,_
2,2,2,Darfur,darfur,subst:sg:nom:f,11,16,_
3,3,3,:,:,interp,17,17,_
4,4,4,w,w,prep:loc:nwok,18,18,_
...,...,...,...,...,...,...,...,...
642,642,35,zostało,zostać,praet:sg:n:perf,3327,3333,s49997
643,643,36,zmuszonych,zmusić,ppas:pl:gen:m1:perf:aff,3334,3343,s4398
644,644,37,do,do,prep:gen,3344,3345,_
645,645,38,emigracji,emigracja,subst:sg:gen:f,3346,3354,s26581


In [87]:
path_results = 'results/kpwr/'
files_kpwr = os.listdir(path_results)
columns = ['WN_ID_A', 'WN_ID_A_X', 'WN_ID_MX', 'WN_ID_MX_X', 'WN_ID_M', 'WN_ID_M_X', 'WN_ID_W', 'WN_ID_W_X',
          'WN_ID_AM', 'WN_ID_AM_X']
keep_cols = ['ORDER_ID', 'TOKEN_ID', 'ORTH', 'LEMMA', 'CTAG', 'FROM', 'TO']


for file_name in files_kpwr:
    file_name1 = path_results + file_name
    df = pd.read_csv(file_name1, sep = '\t')
    df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
    df['WN_ID_A'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_A'])
    df['WN_ID_AM'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_AM'])
    df['WN_ID_MX'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_MX'])
    df['WN_ID_M'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_M'])
    df['WN_ID_W'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_W'])
    df['WN_ID_A_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_A_X'])
    df['WN_ID_AM_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_AM_X'])
    df['WN_ID_MX_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_MX_X'])
    df['WN_ID_M_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_M_X'])
    df['WN_ID_W_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_W_X'])
    for col in columns:
        df1 = df.copy()
        all_cols = keep_cols.copy()
        all_cols.append(col)
        df1 = df1.loc[:, all_cols]
        df1.rename(columns={col: 'WN_ID'}, inplace=True)
        results_path = "results_rozbite/" + col + "/kpwr/" + file_name
        df1.to_csv(results_path, sep='\t', index=False)

In [88]:
path_results = 'results/sherlock/'
files_kpwr = os.listdir(path_results)
columns = ['WN_ID_A', 'WN_ID_A_X', 'WN_ID_MX', 'WN_ID_MX_X', 'WN_ID_M', 'WN_ID_M_X', 'WN_ID_W', 'WN_ID_W_X',
          'WN_ID_AM', 'WN_ID_AM_X']
keep_cols = ['ORDER_ID', 'TOKEN_ID', 'ORTH', 'LEMMA', 'CTAG', 'FROM', 'TO']


for file_name in files_kpwr:
    file_name1 = path_results + file_name
    df = pd.read_csv(file_name1, sep = '\t')
    df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
    df['WN_ID_A'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_A'])
    df['WN_ID_AM'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_AM'])
    df['WN_ID_MX'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_MX'])
    df['WN_ID_M'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_M'])
    df['WN_ID_W'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_W'])
    df['WN_ID_A_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_A_X'])
    df['WN_ID_AM_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_AM_X'])
    df['WN_ID_MX_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_MX_X'])
    df['WN_ID_M_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_M_X'])
    df['WN_ID_W_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_W_X'])
    for col in columns:
        df1 = df.copy()
        all_cols = keep_cols.copy()
        all_cols.append(col)
        df1 = df1.loc[:, all_cols]
        df1.rename(columns={col: 'WN_ID'}, inplace=True)
        results_path = "results_rozbite/" + col + "/sherlock/" + file_name
        df1.to_csv(results_path, sep='\t', index=False)

In [91]:
path_results = 'results_herbert/kpwr/'
files_kpwr = os.listdir(path_results)
columns = ['WN_ID_A', 'WN_ID_A_X', 'WN_ID_MX', 'WN_ID_MX_X', 'WN_ID_M', 'WN_ID_M_X', 'WN_ID_W', 'WN_ID_W_X',
          'WN_ID_AM', 'WN_ID_AM_X']
keep_cols = ['ORDER_ID', 'TOKEN_ID', 'ORTH', 'LEMMA', 'CTAG', 'FROM', 'TO']


for file_name in files_kpwr:
    file_name = "00103632.conll"
    file_name1 = path_results + file_name
    df = pd.read_csv(file_name1, sep = '\t')
    df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
    df['WN_ID_A'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_A'])
    df['WN_ID_AM'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_AM'])
    df['WN_ID_MX'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_MX'])
    df['WN_ID_M'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_M'])
    df['WN_ID_W'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_W'])
    df['WN_ID_A_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_A_X'])
    df['WN_ID_AM_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_AM_X'])
    df['WN_ID_MX_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_MX_X'])
    df['WN_ID_M_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_M_X'])
    df['WN_ID_W_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_W_X'])
    for col in columns:
        df1 = df.copy()
        all_cols = keep_cols.copy()
        all_cols.append(col)
        df1 = df1.loc[:, all_cols]
        df1.rename(columns={col: 'WN_ID'}, inplace=True)
        results_path = "results_rozbite_herbert/" + col + "/kpwr/" + file_name
        df1.to_csv(results_path, sep='\t', index=False)
        break
    break


In [103]:
path = "results_herbert/kpwr/00103632.conll"
df = pd.read_csv(path, sep = '\t').replace('"','', regex=True)

In [104]:
df[130:160]

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,LEMMA,CTAG,FROM,TO,WN_ID_X,WN_ID_A,WN_ID_A_X,WN_ID_AM,WN_ID_AM_X,WN_ID_MX,WN_ID_MX_X,WN_ID_M,WN_ID_M_X,WN_ID_W_X,WN_ID_W
130,130,11,na,na,prep:loc,607,608.0,_,_,_,_,_,_,_,_,_,_,_
131,131,12,wierze,Wiera,subst:sg:loc:f,609,614.0,_,_,_,_,_,_,_,_,_,_,_
132,132,13,Piotra,Piotr,subst:sg:gen:m1,615,620.0,_,s400159,s400159,s400159,s400159,s400159,s400159,s400159,s400159,s400159,s400159
133,133,14,jakąś,jakiś,adj:sg:inst:f:pos,621,625.0,_,s9558,s9558,s9558,s9558,s7060149,s7060149,s9558,s9558,s9558,s9558
134,134,15,wiarą,wiara,subst:sg:inst:f,626,630.0,_,s3764,s3764,s248091,s248091,s3764,s3764,s248091,s248091,s248091,s248091
135,135,16,jest,być,fin:sg:ter:imperf,631,634.0,_,s250925,s250925,s250925,s250925,s55138,s55138,s55138,s55138,s55138,s55138
136,136,17,",",",",interp,635,635.0,_,_,_,_,_,_,_,_,_,_,_
137,137,18,ale,ale,conj,636,638.0,_,_,_,_,_,_,_,_,_,_,_
138,138,19,nie,nie,qub,639,641.0,_,s235290,s235290,s235290,s235290,s235290,s235290,s235290,s235290,s235290,s235290
139,139,20,tą,ten,adj:sg:inst:f:pos,642,643.0,_,s514,s514,s514,s514,s514,s514,s514,s514,s514,s514


In [105]:
path = "results_rozbite_herbert/WN_ID_AM_X/kpwr/00103632.conll"
dftst = pd.read_csv(path, sep = '\t')

In [106]:
dftst[130:160]

Unnamed: 0,ORDER_ID,TOKEN_ID,ORTH,LEMMA,CTAG,FROM,TO,WN_ID_X,WN_ID_A,WN_ID_A_X,WN_ID_AM,WN_ID_AM_X,WN_ID_MX,WN_ID_MX_X,WN_ID_M,WN_ID_M_X,WN_ID_W_X,WN_ID_W
130,130,11,na,na,prep:loc,607,608.0,_,_,_,_,_,_,_,_,_,_,_
131,131,12,wierze,Wiera,subst:sg:loc:f,609,614.0,_,_,_,_,_,_,_,_,_,_,_
132,132,13,Piotra,Piotr,subst:sg:gen:m1,615,620.0,_,s400159,s400159,s400159,s400159,s400159,s400159,s400159,s400159,s400159,s400159
133,133,14,jakąś,jakiś,adj:sg:inst:f:pos,621,625.0,_,s9558,s9558,s9558,s9558,s7060149,s7060149,s9558,s9558,s9558,s9558
134,134,15,wiarą,wiara,subst:sg:inst:f,626,630.0,_,s3764,s3764,s248091,s248091,s3764,s3764,s248091,s248091,s248091,s248091
135,135,16,jest,być,fin:sg:ter:imperf,631,634.0,_,s250925,s250925,s250925,s250925,s55138,s55138,s55138,s55138,s55138,s55138
136,136,17,",",",",interp,635,635.0,_,_,_,_,_,_,_,_,_,_,_
137,137,18,ale,ale,conj,636,638.0,_,_,_,_,_,_,_,_,_,_,_
138,138,19,nie,nie,qub,639,641.0,_,s235290,s235290,s235290,s235290,s235290,s235290,s235290,s235290,s235290,s235290
139,139,20,tą,ten,adj:sg:inst:f:pos,642,643.0,_,s514,s514,s514,s514,s514,s514,s514,s514,s514,s514


In [57]:
import csv

In [90]:
path_results = 'results_herbert/sherlock/'
files_kpwr = os.listdir(path_results)
columns = ['WN_ID_A', 'WN_ID_A_X', 'WN_ID_MX', 'WN_ID_MX_X', 'WN_ID_M', 'WN_ID_M_X', 'WN_ID_W', 'WN_ID_W_X',
          'WN_ID_AM', 'WN_ID_AM_X']
keep_cols = ['ORDER_ID', 'TOKEN_ID', 'ORTH', 'LEMMA', 'CTAG', 'FROM', 'TO']


for file_name in files_kpwr:
    file_name1 = path_results + file_name
    df = pd.read_csv(file_name1, sep = '\t')
    df.rename(columns={'WN_ID': 'WN_ID_ACTUAL'}, inplace=True)
    df['WN_ID_A'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_A'])
    df['WN_ID_AM'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_AM'])
    df['WN_ID_MX'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_MX'])
    df['WN_ID_M'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_M'])
    df['WN_ID_W'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_W'])
    df['WN_ID_A_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_A_X'])
    df['WN_ID_AM_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_AM_X'])
    df['WN_ID_MX_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_MX_X'])
    df['WN_ID_M_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_M_X'])
    df['WN_ID_W_X'] = np.where(df['LEMMA'].str.isdigit(), "_", df['WN_ID_W_X'])
    for col in columns:
        df1 = df.copy()
        all_cols = keep_cols.copy()
        all_cols.append(col)
        df1 = df1.loc[:, all_cols]
        df1.rename(columns={col: 'WN_ID'}, inplace=True)
        results_path = "results_rozbite_herbert/" + col + "/sherlock/" + file_name
        df1.to_csv(results_path, sep='\t', index=False)

In [116]:
path = "results_rozbite_herbert/WN_ID_AM_X/kpwr/00103632.conll"
with open(path, 'r', encoding="utf-8") as file:
    data = file.read().replace('"', '&')

file1 = open("00103632_.conll","w", encoding="utf-8")#write mode
file1.write(data)
file1.close()