## Data extraction notebook

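This notebook converts the sense-annotated data into context–gloss pairs following the weak-supervision (WS) input format of GlossBERT: the target word is wrapped in quotation marks inside its sentence, and the sentence is paired with candidate WordNet glosses labelled `Yes` (correct sense) or `No` (incorrect sense).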


In [None]:
!pip install transformers
# from google.colab import drive
# drive.mount('/content/drive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
     6.3/6.3 MB 57.7 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
     7.6/7.6 MB 58.8 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
     190.3/190.3 KB 20.8 MB/s eta 0:00:00
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0
Mounted at /content/drive


In [None]:
import nltk
# Fetch the WordNet corpus that the extraction cell queries through
# nltk.corpus.wordnet (synset lookup and gloss definitions).
nltk.download('wordnet')
# Also fetch the 'omw-1.4' package.
# NOTE(review): presumably needed by the WordNet reader in recent NLTK
# releases — confirm before removing.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
from nltk.toolbox import to_settings_string
import pandas as pd
from nltk.corpus import wordnet
from nltk.corpus.reader.wordnet import WordNetError
# Build GlossBERT-style (weak supervision) context-gloss pairs for one split.
#
# Input format, as consumed below:
#   * a line starting with '# raw sent =' carries the text of the next sentence;
#   * any other line starting with '#' is ignored;
#   * every remaining line is tab-separated, with the token in column 0 and its
#     sense annotation in column 1 ("O" means the token has no annotation,
#     anything else is passed to wordnet.synset()).
#
# For every annotated token with enough WordNet senses, one 'Yes' row (correct
# gloss) and up to NUM_NEGATIVES 'No' rows (other glosses of the same token)
# are produced and finally written to a CSV file.
part = 'test'
data_path = f'/data/only_sns/{part}'

# A token is only used when WordNet lists at least this many senses for it, so
# that enough distractor glosses are available.
MIN_SENSES = 4
# Number of incorrect glosses paired with each annotated token.
NUM_NEGATIVES = 3

raw_sent = ""     # text of the sentence currently being read
cursor = 0        # position in raw_sent just after the last located token
sents = []        # context sentence with the target token in double quotes
sns_values = []   # '<token>: <gloss>' strings
labels = []       # 'Yes' for the annotated sense, 'No' for distractors
offsets = []      # WordNet offset of the synset behind each gloss
concat = []       # '<context> [SEP] <token>: <gloss>' model inputs
errors = []       # (token, sense) pairs that WordNet could not resolve
uniques = 0       # number of annotated tokens that produced a 'Yes' row

# Explicit encoding so the result does not depend on the platform default.
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        if line.startswith('# raw sent ='):
            # maxsplit=1 keeps sentences that themselves contain '=' intact.
            raw_sent = line.strip().split('=', 1)[1].strip()
            cursor = 0
            continue
        if line.startswith('#'):
            continue

        stripped = line.strip()
        split_row = stripped.split('\t')
        if len(split_row) < 2:
            # Blank or malformed row: there is no token/sense pair to process.
            # Skipping here prevents the previous row's token from being
            # processed a second time.
            if stripped == ".":
                # reset the sentence and continue
                raw_sent = ""
                cursor = 0
            continue

        # column 0 is the token, column 1 its sense annotation
        tok, sns = split_row[0], split_row[1]

        # Locate this token in the raw sentence, searching only to the right of
        # the previous token, so that repeated words and substrings of other
        # words are not confused with the target occurrence.
        tok_start = raw_sent.find(tok, cursor) if tok else -1
        if tok_start != -1:
            cursor = tok_start + len(tok)

        if sns == "O":
            continue

        # Get all wordnet synsets
        synsets = wordnet.synsets(tok)
        if len(synsets) < MIN_SENSES:
            continue

        try:
            wn_synset = wordnet.synset(sns)
        except WordNetError:
            # Remember the unresolved annotation; it is reported at the end.
            errors.append((tok, sns))
            continue

        if tok_start != -1:
            # Quote only the occurrence that belongs to this token.
            WS_sent = f'{raw_sent[:tok_start]}"{tok}"{raw_sent[cursor:]}'
        else:
            # Token text could not be aligned with the raw sentence: fall back
            # to quoting every textual match.
            WS_sent = raw_sent.replace(tok, f'"{tok}"')

        correct_def = wn_synset.definition()
        uniques += 1

        # Positive example. The '<token>: ' prefix is the same one used for the
        # negatives, so the prefix itself cannot give away the label.
        sents.append(WS_sent)
        sns_values.append(f'{tok}: {correct_def}')
        labels.append('Yes')
        concat.append(f'{WS_sent} [SEP] {tok}: {correct_def}')
        offsets.append(wn_synset.offset())

        # Negative examples: the first NUM_NEGATIVES senses of the token whose
        # gloss differs from the correct one.
        negatives = [s for s in synsets if s.definition() != correct_def]
        for s in negatives[:NUM_NEGATIVES]:
            definition = s.definition()
            sents.append(WS_sent)
            sns_values.append(f'{tok}: {definition}')
            labels.append('No')
            concat.append(f'{WS_sent} [SEP] {tok}: {definition}')
            offsets.append(s.offset())


df = pd.DataFrame({"sent": sents, "sns": sns_values, "labels": labels, "input": concat, "offset": offsets})

print(df.head())
print(f'Done processing {part}, Total sentences: {len(sents)}, Unique sentences: {len(set(sents))}')
print(f'During processing, we encountered {len(errors)} tokens for which we could not retrieve the WordNet definition')
print(f'{uniques} Unique Context-Gloss pairs created')
store_path = f'/data/processed/{part}.csv'
df.to_csv(store_path)

                               sent  \
0  A animal is "grazing" in a field   
1  A animal is "grazing" in a field   
2  A animal is "grazing" in a field   
3  A animal is "grazing" in a field   
4  A animal is grazing in a "field"   

                                                 sns labels  \
0            grazing: feed as in a meadow or pasture    Yes   
1                        grazing: the act of grazing     No   
2  grazing: the act of brushing against while pas...     No   
3  grazing: break the skin (of a body part) by sc...     No   
4  field: a piece of land cleared of trees and us...    Yes   

                                               input   offset  
0  A animal is "grazing" in a field [SEP] feed as...  1576165  
1  A animal is "grazing" in a field [SEP] the act...   841091  
2  A animal is "grazing" in a field [SEP] the act...   150762  
3  A animal is "grazing" in a field [SEP] break t...  1608508  
4  A animal is grazing in a "field" [SEP] a piece...  8569998  
Do