In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import datasets
import transformers
from transformers import AutoTokenizer, AutoModelForMaskedLM

import mlm
from mlm.scorers import MLMScorerPT 
from mlm.models import get_pretrained

import mxnet as mx
import torch

import pathlib
import os
import warnings

from dataset_orm import *
from wordbank_tasks import *

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker



## NOTES

* **TODO**: verify that the changes I made to the `mlm.scorers` codebase to accept RoBERTa are correct
* If we want other models, we'll have to add them there, too, perhaps with a bit more work if their output format is very different
* The function below implements the very basic test. Next steps I can see us wanting to do:
    * Combine it with sentences from the real data
    * Check at least two alternative word-replacement strategies (within category, between categories)
    * Write more of a pipeline that samples words, sentences, replacement words for each sentence, and spits out scorers
* Open questions:
    * How do we measure how well the model did? The rank of the correct sentence? The NLL difference between the correct sentence and the best-scoring alternative? Both?

In [3]:
# Flip to True to run on the first GPU instead of the CPU.
USE_GPU = False

# MXNet context list and PyTorch device; both are handed to MLMScorerPT when
# the scorers are built, so they must always be switched together.
CONTEXTS = [mx.gpu(0)] if USE_GPU else [mx.cpu()]
DEVICE = torch.device('cuda:0' if USE_GPU else 'cpu')

# Single-process rendezvous settings (one worker, rank 0, on localhost).
# NOTE(review): presumably read by the distributed setup inside the scoring
# library -- confirm which component actually needs them.
os.environ.update({
    'MASTER_ADDR': '127.0.0.1',
    'MASTER_PORT': '10000',
    'WORLD_SIZE': '1',
    'RANK': '0',
})

In [4]:
def scorer_from_transformers_checkpoint(checkpotint_name):
    """Build an ``MLMScorerPT`` from a Hugging Face checkpoint name.

    The tokenizer and the masked-LM model are both loaded with
    ``from_pretrained`` from the same checkpoint identifier, then wrapped in a
    scorer that uses the module-level ``CONTEXTS`` and ``DEVICE``.

    Args:
        checkpotint_name: Checkpoint identifier forwarded unchanged to
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForMaskedLM.from_pretrained``. The misspelt name is kept
            on purpose so existing keyword callers keep working.

    Returns:
        The ``MLMScorerPT`` instance wrapping the loaded model and tokenizer.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(checkpotint_name)
    masked_lm = AutoModelForMaskedLM.from_pretrained(checkpotint_name)
    # The second positional argument is deliberately None.
    # NOTE(review): presumably the vocab slot of MLMScorerPT -- confirm.
    scorer = MLMScorerPT(masked_lm, None, hf_tokenizer, CONTEXTS, device=DEVICE)
    return scorer

# Build one scorer per model under comparison. Each call loads the tokenizer
# and masked-LM weights for the named checkpoint.
roberta_scorer = scorer_from_transformers_checkpoint('nyu-mll/roberta-base-100M-1')
bert_scorer = scorer_from_transformers_checkpoint('bert-base-uncased')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# The SQLite database is expected at <parent of the working directory>/data/.
DB_FILE = 'wordbank.db'
DB_PATH = pathlib.Path.cwd().parent.joinpath('data', DB_FILE)

# `Session` is a session *factory* bound to the engine; call it to open a session.
engine = create_engine(f'sqlite:///{DB_PATH}')
Session = sessionmaker(bind=engine)

In [6]:
warnings.filterwarnings('ignore', category=UserWarning, module='gluonnlp.data')

In [7]:
# Note: the task receives the session factory (`Session`), not this `session`
# instance; the instance is only bound here and is not passed to the call.
session = Session()

# Run the discriminative task with both scorers.
# NOTE(review): `model_names` and `model_scorers` are presumably paired by
# position ('bert' <-> bert_scorer, 'roberta' <-> roberta_scorer) -- confirm.
all_words_df = discriminative_task_all_words(
    session_maker=Session,
    n_sentences_per_word=10,
    n_alternative_words=10,
    model_names=('bert', 'roberta'),
    model_scorers=[bert_scorer, roberta_scorer],
    criterion_func=smallest_nll_criterion,
)

  3%|▎         | 20/592 [01:01<29:08,  3.06s/it]


RuntimeError: CUDA out of memory. Tried to allocate 7.75 GiB (GPU 0; 11.78 GiB total capacity; 1.27 GiB already allocated; 7.19 GiB free; 3.60 GiB reserved in total by PyTorch)

In [None]:
# Open a fresh session and fetch the WordbankWord row whose word is 'table'.
session = Session()
table_word = (
    session.query(WordbankWord)
    .filter(WordbankWord.word == 'table')
    .one()  # raises unless exactly one row matches
)

In [None]:
# Query over just the id and word attributes of WordbankWord; it is only
# constructed here, nothing in this cell iterates or executes it.
word_query = session.query(WordbankWord.id, WordbankWord.word)

In [None]:
# Scratch check: split groups of (id, word) pairs into two parallel tuples,
# one of per-group id tuples and one of per-group word tuples.
l = [[(0, 'a'), (1, 'b'), (2, 'c')], [(10, 'd'), (11, 'e'), (12, 'f')]]
# The inner zip transposes each group; the outer zip regroups by field.
ids, words = zip(*(zip(*group) for group in l))
print(ids)
print(words)

In [None]:
# Transpose (id, word) pairs into a list holding one ids tuple and one words tuple.
list(zip((0, 'a'), (1, 'b'), (2, 'c')))

In [None]:
import pandas as pd

# Load the tab-separated word list (path is relative to the working directory).
words_df = pd.read_csv('../data/worbank_with_category.tsv', sep='\t')

In [None]:
# Count the entries in `words_df.value` that contain at least one space.
sum(1 for s in words_df.value if len(s.split(' ')) > 1)

In [None]:
# Show the entries of `words_df.value` that split into more than one piece on
# a space, selected with a boolean mask built from the same test.
words_df.value[[len(s.split(' ')) > 1 for s in words_df.value]]