In [2]:
from functools import partial

import torch
from transformers import BertTokenizer, BertForNextSentencePrediction


def get_most_probable_following_sentence(tokenizer, model, text1, text2):
    """Score whether `text2` follows `text1` with a BERT next-sentence-prediction model.

    The two texts are tokenized and packed as ``[CLS] text1 [SEP] text2 [SEP]``,
    with segment id 0 for the first part (including ``[CLS]`` and its ``[SEP]``)
    and segment id 1 for the second part.

    Args:
        tokenizer: object exposing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
        model: callable as ``model(input_ids, token_type_ids=...)`` whose first
            output holds the next-sentence logits with shape ``(1, 2)``; it is
            switched to evaluation mode via ``model.eval()``.
        text1: the leading text.
        text2: the candidate continuation.

    Returns:
        A 1-D tensor with the softmax over the two logits. In the recorded runs
        of this notebook index 0 is high for a plausible continuation and
        index 1 is high for an unrelated sentence.
    """
    first_tokens = ["[CLS]"] + tokenizer.tokenize(text1) + ["[SEP]"]
    second_tokens = tokenizer.tokenize(text2) + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(first_tokens + second_tokens)
    segment_ids = [0] * len(first_tokens) + [1] * len(second_tokens)

    # The model expects a batch dimension, hence the extra list nesting.
    tokens_tensor = torch.tensor([token_ids])
    segments_tensor = torch.tensor([segment_ids])

    model.eval()
    # Pure inference: skip autograd bookkeeping so the result carries no
    # grad_fn and no computation graph is kept alive between calls.
    with torch.no_grad():
        logits = model(tokens_tensor, token_type_ids=segments_tensor)[0]

    return torch.softmax(logits, dim=1)[0]

I0127 23:54:07.764195 140057921382144 file_utils.py:35] PyTorch version 1.0.1.post2 available.
W0127 23:54:08.549579 140057921382144 __init__.py:28] To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [7]:
# Load the tokenizer and the next-sentence-prediction model from the
# 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

# Pre-bind tokenizer and model so the cells below only pass the two texts.
partial_get_most_probable_following_sentence = partial(get_most_probable_following_sentence, tokenizer, model)

I0127 23:04:39.549858 139702180816640 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/gabrielamelo/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0127 23:04:40.129470 139702180816640 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/gabrielamelo/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0127 23:04:40.130248 139702180816640 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "ini

In [9]:
# Sanity check with an unrelated follow-up sentence: in the recorded output
# nearly all probability mass lands on index 1.
text1 = "How old are you?"
text2 = "The Eiffel Tower is in Paris"
prediction = partial_get_most_probable_following_sentence(text1, text2)
print(prediction)

tensor([4.1673e-04, 9.9958e-01], grad_fn=<SelectBackward>)


In [29]:
# Sanity check with a plausible answer: in the recorded output nearly all
# probability mass lands on index 0.
text1 = "How old are you?"
text2 = "I am 22 years old"
prediction = partial_get_most_probable_following_sentence(text1, text2)
print(prediction)
# Index 0 is the entry later code compares between candidate sentences.
print(prediction[0])

tensor([9.9999e-01, 9.6342e-06], grad_fn=<SelectBackward>)
tensor(1.0000, grad_fn=<SelectBackward>)


In [None]:
# Load the 'bert-base-multilingual-cased' checkpoint instead.
# NOTE(review): this rebinds `tokenizer`, `model` and the partial created from
# 'bert-base-uncased' earlier in the notebook, so results of later cells depend
# on which of the two loading cells ran last.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-multilingual-cased')

partial_get_most_probable_following_sentence = partial(get_most_probable_following_sentence, tokenizer, model)

I0127 23:54:14.157467 140057921382144 file_utils.py:362] https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt not found in cache or force_download set to True, downloading to /tmp/tmpcuwu985q
I0127 23:54:15.504419 140057921382144 file_utils.py:377] copying /tmp/tmpcuwu985q to cache at /home/gabrielamelo/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
I0127 23:54:15.506328 140057921382144 file_utils.py:381] creating metadata file for /home/gabrielamelo/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
I0127 23:54:15.507202 140057921382144 file_utils.py:390] removing temp file /tmp/tmpcuwu985q
I0127 23:54:15.507760 140057921382144 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multiling

In [None]:
# Unrelated-pair check with a Portuguese question followed by an English sentence.
# NOTE(review): presumably meant for the multilingual checkpoint; the model
# actually used is whichever one the partial was last bound to. No output was
# recorded for this cell.
text1 = "Quantos anos você tem?"
text2 = "The Eiffel Tower is in Paris"
prediction = partial_get_most_probable_following_sentence(text1, text2)
print(prediction)

In [19]:
def get_sentence_breaks(first_sentence, second_sentence):
    """Return the word index at which two sentences start to differ.

    Both sentences are split on whitespace and compared word by word.

    Args:
        first_sentence: first sentence as a string.
        second_sentence: second sentence as a string.

    Returns:
        The index of the first word that differs, i.e. the number of leading
        words the sentences share. When no word differs within the shared
        length (identical sentences, one sentence being a prefix of the other,
        or an empty sentence) the length of the shorter word list is returned,
        so ``words[:index]`` is always exactly the common prefix.
    """
    # Split once up front instead of re-splitting both strings per iteration.
    first_words = first_sentence.split()
    second_words = second_sentence.split()

    # zip stops at the shorter sentence, so a shorter second sentence can no
    # longer raise IndexError.
    for index, (first_word, second_word) in enumerate(zip(first_words, second_words)):
        if first_word != second_word:
            return index

    # No mismatch found (this also covers empty input, where the loop never runs).
    return min(len(first_words), len(second_words))

In [28]:
def test_get_sentence_breaks():
    first_sentence = 'The city councilmen refused the demonstrators a permit because the city councilmen feared violence.'
    second_sentence = 'The city councilmen refused the demonstrators a permit because the demonstrators feared violence.'
    i = get_sentence_breaks(first_sentence, second_sentence)
    assert ' '.join(first_sentence.split()[:i]) == \
        'The city councilmen refused the demonstrators a permit because the'
    assert ' '.join(second_sentence.split()[:i]) == \
        'The city councilmen refused the demonstrators a permit because the'
    assert ' '.join(first_sentence.split()[i:]) == \
        'city councilmen feared violence.'
    assert ' '.join(second_sentence.split()[i:]) == \
        'demonstrators feared violence.'
    
    first_sentence = 'Os vereadores recusaram a autorização aos manifestantes porque os vereadores temiam a violência.'
    second_sentence = 'Os vereadores recusaram a autorização aos manifestantes porque os manifestantes temiam a violência.'
    
    i = get_sentence_breaks(first_sentence, second_sentence)
    assert ' '.join(first_sentence.split()[:i]) == \
        'Os vereadores recusaram a autorização aos manifestantes porque os'
    assert ' '.join(second_sentence.split()[:i]) == \
        'Os vereadores recusaram a autorização aos manifestantes porque os'
    assert ' '.join(first_sentence.split()[i:]) == \
        'vereadores temiam a violência.'
    assert ' '.join(second_sentence.split()[i:]) == \
        'manifestantes temiam a violência.'
    
test_get_sentence_breaks()
# TODO: also test with sentences as they look after tokenizing

In [30]:
# function call is different
def analyse_single_wsc_bert(model, tokenizer, correct_sentence, wrong_sentence):
    """Decide whether the model prefers the correct sentence of a Winograd pair.

    Both sentences are split at the word where they start to differ (as given
    by `get_sentence_breaks`). For each sentence the part before the break is
    scored against the part after it with
    `get_most_probable_following_sentence`, and entry 0 of the two results is
    compared.

    Args:
        model: model forwarded to `get_most_probable_following_sentence`.
        tokenizer: tokenizer forwarded to `get_most_probable_following_sentence`.
        correct_sentence: the sentence with the correct substitution.
        wrong_sentence: the sentence with the incorrect substitution.

    Returns:
        A ``(full_result, partial_result)`` tuple. ``full_result`` is True when
        the correct sentence scores strictly higher than the wrong one.
        ``partial_result`` is always 0 for this scorer. ``(False, False)`` is
        returned when either sentence is an empty string.
    """
    if correct_sentence == '' or wrong_sentence == '':
        return False, False

    break_index = get_sentence_breaks(correct_sentence, wrong_sentence)

    def continuation_score(sentence):
        # break_index counts words, not characters, so the sentence has to be
        # split into words before slicing and joined back afterwards.
        words = sentence.split()
        context = ' '.join(words[:break_index])
        continuation = ' '.join(words[break_index:])
        return get_most_probable_following_sentence(tokenizer, model, context, continuation)[0]

    prob_correct_sentence_correct = continuation_score(correct_sentence)
    prob_wrong_sentence_correct = continuation_score(wrong_sentence)

    # bool() turns the 0-dim tensor comparison into a plain Python bool,
    # matching the type of the early return above.
    result = bool(prob_correct_sentence_correct > prob_wrong_sentence_correct)

    return result, 0  # let's always return 0 for partial result

In [31]:
def run_bert_test_for_col(df, model, tokenizer, result_col):
    """Score every row of `df` with the BERT next-sentence scorer and store the results.

    Args:
        df: DataFrame holding the sentence pairs; it is modified in place.
        model: model forwarded to `analyse_single_wsc_bert`.
        tokenizer: tokenizer forwarded to `analyse_single_wsc_bert`.
        result_col: which sentence pair to score. 'original' reads
            'correct_sentence' / 'incorrect_sentence', 'switched' reads
            'correct_switched' / 'incorrect_switched', and any other value
            reads the 'manually_fixed_*' columns.

    Returns:
        The same `df`, with '<result_col>_result_full' and
        '<result_col>_result_partial' filled in for every row.
    """
    if result_col == 'original':
        correct_column = 'correct_sentence'
        incorrect_column = 'incorrect_sentence'
    elif result_col == 'switched':
        correct_column = 'correct_switched'
        incorrect_column = 'incorrect_switched'
    else:
        correct_column = 'manually_fixed_correct_sentence'
        incorrect_column = 'manually_fixed_incorrect_sentence'

    full_column = f'{result_col}_result_full'
    partial_column = f'{result_col}_result_partial'

    partial_analyse_single_wsc_bert = partial(analyse_single_wsc_bert, model, tokenizer)

    for i, row in df.iterrows():
        # Call the partial defined just above; the previous code referenced
        # `partial_analyse_single_wsc`, a name that is not defined in this function.
        full_result, partial_result = partial_analyse_single_wsc_bert(
            row[correct_column], row[incorrect_column])
        df.loc[i, full_column] = full_result
        df.loc[i, partial_column] = partial_result

    return df

I0127 23:23:43.880078 139702180816640 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/gabrielamelo/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0127 23:23:44.446662 139702180816640 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/gabrielamelo/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I0127 23:23:44.447212 139702180816640 configuration_utils.py:199] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "ini

In [None]:
def winograd_test(df, corpus, model_file_name, device, model, english=False, use_bert=False):
    """Run the Winograd evaluation over `df` and report the resulting metrics.

    Keeps only the rows selected by the `translated` column, prepares the text
    and result columns through `prepare_text_cols` / `add_results_columns`,
    scores the 'original' and 'switched' sentence pairs (plus 'manually_fixed'
    when that data is present), and finally hands the frame to
    `calculate_metrics` and `generate_report`.

    Args:
        df: table of examples; it is used with `df[df.translated]`, `.columns`
            and `.iloc`, so presumably a pandas DataFrame — TODO confirm.
        corpus: forwarded to `prepare_text_cols` and `run_test_for_col`.
        model_file_name: forwarded to `run_test_for_col` (non-BERT path only).
        device: forwarded to `run_test_for_col` (non-BERT path only).
        model: model for the non-BERT path; replaced when `use_bert` is True.
        english: forwarded to `prepare_text_cols`.
        use_bert: when True, score with `run_bert_test_for_col` and a
            'bert-base-uncased' model loaded inside this function.
    """
    # Work on a copy of the selected rows so the caller's frame is not modified.
    df = df[df.translated].copy()
    df = prepare_text_cols(df, corpus, english)
    df = add_results_columns(df)

    if use_bert:
        # NOTE(review): the `model` argument is overwritten here, and the
        # checkpoint is always 'bert-base-uncased' regardless of `english`.
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')        
        partial_run_test_for_col = partial(run_bert_test_for_col, df, model, tokenizer)
    else:
        partial_run_test_for_col = partial(run_test_for_col, df, model, model_file_name, corpus, device)

    # Both scorers are bound to `df` above; only the column group varies per call.
    df = partial_run_test_for_col(result_col='original')
    df = partial_run_test_for_col(result_col='switched')

    # Only the first row is inspected to decide whether manually fixed
    # sentences exist. NOTE(review): `df.iloc[0]` would fail on an empty frame.
    test_on_manually_fixed = (
        'manually_fixed_correct_sentence' in df.columns and
        df.iloc[0]['manually_fixed_correct_sentence'] != ''
    )
    if test_on_manually_fixed:
        df = partial_run_test_for_col(result_col='manually_fixed')

    metrics = calculate_metrics(df, test_on_manually_fixed)
    generate_report(metrics)