In [1]:
import json
import torch
import torch.nn as nn
import random
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
dev_dict = json.load(open('../data/coqa-dev-v1.0.json', encoding='utf8'))

In [4]:
train_list = json.load(open('../data/qa_train_list.json', encoding='utf8'))

In [5]:
dev_list = json.load(open('../data/qa_dev_list.json', encoding='utf8'))

In [6]:
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def batchify(data, n):
    len_dict = {}
    for item in data:
        length = item.shape[1]
        try:
            len_dict[length].append(item)
        except:
            len_dict[length] = [item]

    batch_chunks = []
    for k in len_dict.keys():
        vectors = len_dict[k]
        batch_chunks += chunks(vectors, n)

    batches = []
    for chunk in batch_chunks:
        inputs = torch.stack([item[0] for item in chunk])
        batches.append((inputs))

    return batches

In [7]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.cuda()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [8]:
_question_prompt = '\nQ: '
_answer_prompt = '\nA: '
    
def get_text_up_to_question_number(text, number):
    pos = text.find(_answer_prompt)
    for _ in range(number):
        pos = text.find(_answer_prompt, pos + 1)
    return text[0:pos + 1]
    
def get_answers_number(text):
    return text.count(_answer_prompt)

def get_answer_number(text, number):
    pos = text.find(_answer_prompt)
    for _ in range(number):
        pos = text.find(_answer_prompt, pos + 1)
    end = text.find('\n', pos + len(_answer_prompt))
    return text[pos + len(_answer_prompt):end]

def get_question_number(text, number):
    pos = text.find(_question_prompt)
    for _ in range(number):
        pos = text.find(_question_prompt, pos + 1)
    end = text.find('\n', pos + len(_question_prompt))
    return text[pos + len(_question_prompt):end]

def get_all_answers(dev_dict, dev_index):
    answers = [[item['input_text'] for item in dev_dict['data'][dev_index]['answers']]]
    answers += [[item['input_text'] for item in dev_dict['data'][dev_index]['additional_answers'][str(index)]] for index in range(3)]
    return [list(set([answers[j][i] for j in range(len(answers))])) for i in range(len(answers[0]))]

In [9]:
def generate_answer_number(model, text, number):
    prompt = get_text_up_to_question_number(text, number)
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 20
    tokens_length = tokens.shape[1]
    output = model.generate(
             tokens,
             max_length=tokens_length + _length,
             temperature=0,
             pad_token_id=50256
    )
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    return get_answer_number(output, number)

In [10]:
def get_text_from_data_item(item, max_num_questions=0, question_number=-1, last_question=True):
    text = 'In the text below two people are discussing a story.\n\n'
    text += 'Story:\n' + item['story'] + '\n\n'
    text += 'Discussion:\n'
    text += '\n'.join(['Q: ' + q['input_text'] 
                       + '\nA: ' + a['input_text'] 
                       for q, a in zip(item['questions'][max(0,question_number-max_num_questions):question_number+1], 
                                       item['answers'][max(0,question_number-max_num_questions):question_number+1]) 
                      ])
    if not last_question:
        text = '\n'.join(text.split('\n')[:-1]) + '\n'
    return text

In [11]:
def generate_answer(model, prompt):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 50
    tokens_length = tokens.shape[1]
    if tokens_length + _length > 1024:
        return ''
    output = model.generate(
             tokens.cuda(),
             max_length=tokens_length + _length,
             #temperature=0,
             pad_token_id=50256
    )
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    offset = len(prompt)
    start = offset + 1
    end = output.find('\n', start)
    return output[start:end].split(':')[-1].strip()

In [12]:
def generate_answer_with_typical_decoding(model, prompt):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 50
    tokens_length = tokens.shape[1]
    if tokens_length + _length > 1024:
        return ''
    generated_entropy = 0
    while tokens.shape[-1] < tokens_length + _length:
        new_tokens = model(tokens.cuda())
        normalized = torch.nn.functional.log_softmax(new_tokens.logits, dim=-1)
        p = torch.exp(normalized)
        entropy = -(normalized * p).nansum(-1, keepdim=True)
        shifted_scores = torch.abs(normalized + entropy)
        pred_ids = torch.argmin(shifted_scores, dim=-1)
        last_token = pred_ids[:, -1].cpu().detach()
        tokens = torch.cat([tokens, torch.tensor([[last_token]])], dim=-1)
        last_output = tokenizer.decode(last_token, skip_special_tokens=True)
        if last_output == '\n':
            break
        
    generated_output = tokens[:, tokens_length:]
    output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    end = output.find('\n')
    return output[:end].replace('A: ', '').strip()

In [13]:
def compute_accuracy_of_model(model):
    total_number_of_questions = 0
    correct_answers = 0
    wrong_predictions = []

    false_positives = []
    dlist = dev_list[:100]
    for index, text in tqdm(enumerate(dlist), total=len(dlist)):

        all_answers = get_all_answers(dev_dict, index)
        total_questions = len(all_answers)        
        
        for number in range(total_questions):
            small_text = get_text_from_data_item(dev_dict['data'][index], 
                                                 max_num_questions=8,
                                                 question_number=number,
                                                 last_question=False)
            prediction = generate_answer(model, small_text)
            if not prediction:
                print('NO PREDICTION!!')
                continue
            prediction = prediction.replace('.', '').replace('"', '')
            it_was_answered = False
            for label in all_answers[number]:
                label = label.replace('.', '').replace('"', '')

                if prediction.lower() != 'unknown' and label.lower() == 'unknown':
                    false_positives.append(prediction)
                
                if prediction.lower() == label.lower():
                    correct_answers += 1
                    it_was_answered = True
                    break
                elif prediction.lower() in label.lower():
                    correct_answers += 1
                    it_was_answered = True
                    break
                elif label.lower() in prediction.lower():
                    correct_answers += 1
                    it_was_answered = True
                    break
                else:
                    wrong_predictions.append({'label': label, 'prediction': prediction})
            total_number_of_questions += 1

    return correct_answers/total_number_of_questions, wrong_predictions, false_positives

In [14]:
checkpoint = torch.load('save_small' + str(6))
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DebertaTokenizer
import torch

model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [27]:
device = "cuda"

model.to(device)
premise = 'The user says: "My name is Alberto."'
hypothesis = 'The user says their name.'
input = tokenizer(premise, hypothesis, truncation=False, return_tensors="pt")
print(input.input_ids)
output = model(input["input_ids"].to(device))
prediction = torch.softmax(output["logits"][0], -1).tolist()
label_names = ["entailment", "neutral", "contradiction"]
prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
print(prediction)

tensor([[    1,   133,  3018,   161,    35,    22,  2387,   766,    16, 21071,
            72,     2,   133,  3018,   161,    49,   766,     4,     2]])
{'entailment': 26.3, 'neutral': 34.7, 'contradiction': 39.1}


In [29]:
hg_model_hub_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"

tokenizer = AutoTokenizer.from_pretrained(hg_model_hub_name)
model = AutoModelForSequenceClassification.from_pretrained(hg_model_hub_name)

Some weights of the model checkpoint at ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [75]:
####

premise = "The user says: 'is the underground running today'"
hypothesis = "The wants inquires about a tube line"
max_length = 512

tokenized_input_seq_pair = tokenizer.encode_plus(premise, hypothesis,
                                                 max_length=max_length,
                                                 return_token_type_ids=True, truncation=True)

input_ids = torch.Tensor(tokenized_input_seq_pair['input_ids']).long().unsqueeze(0)
# remember bart doesn't have 'token_type_ids', remove the line below if you are using bart.
token_type_ids = torch.Tensor(tokenized_input_seq_pair['token_type_ids']).long().unsqueeze(0)
attention_mask = torch.Tensor(tokenized_input_seq_pair['attention_mask']).long().unsqueeze(0)

outputs = model(input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=None)
    # Note:
    # "id2label": {
    #     "0": "entailment",
    #     "1": "neutral",
    #     "2": "contradiction"
    # },

In [76]:
torch.nn.functional.softmax(outputs.logits)

  torch.nn.functional.softmax(outputs.logits)


tensor([[0.7117, 0.2707, 0.0176]], grad_fn=<SoftmaxBackward0>)

In [23]:
def generate_multiple_answers(model, prompt, num, length=100):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = length
    tokens_length = tokens.shape[1]
    if tokens_length + _length > 1024:
        return ''
    generated_ids = model.generate(tokens.to("cuda"), max_length=_length, num_beams=num, num_return_sequences=num)
    generated_sentences = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    sentences = []
    for index, item in enumerate(generated_sentences):
        output = generated_sentences[index]
        offset = len(prompt)
        start = offset + 1
        end = output.find('\n', start)
        sentences.append(item[start: end])
    
    return sentences

In [24]:
import numpy as np

max_probs = 5

def generate_answer_and_get_confidence(model, prompt):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 50
    tokens_length = tokens.shape[1]
    if tokens_length + _length > 1024:
        return ''
    generated_entropy = 0
    while tokens.shape[-1] < tokens_length + _length:
        new_token = model(tokens.cuda())
        probs = torch.softmax(new_token.logits[:, -1, :], dim=-1)
        probs_and_indices = [(p, index) for index, p in enumerate(probs[0].cpu().detach())]
        probs_and_indices = sorted(probs_and_indices, key=lambda x: -x[0])
        probs = [item[0] for item in probs_and_indices[:max_probs]]
        generated_entropy -= np.dot(probs, np.log(probs))
        tokens = torch.cat([tokens, torch.tensor([[torch.argmax(new_token.logits[:, -1, :])]])], dim=-1)
        last_token = tokens[:, -1]
        last_output = tokenizer.decode(last_token, skip_special_tokens=True)
        if last_output == '\n':
            break
        
    print(tokens.shape)
    print(tokens_length)
    generated_output = tokens[:, tokens_length:]
    print(generated_output.shape)
    output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    end = output.find('\n')
    return output[:end].strip(), generated_entropy

In [25]:
prompt = """
In the text below two people are discussing a story.

Story:
The Wag says: "My ship is called George".

Discussion:
Q: Who was the first man to walk on the moon?
A: 
""".strip()

generate_multiple_answers(model, prompt, num=15)

IndexError: too many indices for tensor of dimension 2

In [26]:
prompt = """
In the text below two people are discussing a story.

Story:
The Wag says: "My ship is called George".

Discussion:
Q: What is the Wag's ship called?
A: 
""".strip()

generate_answer_with_typical_decoding(model, prompt)

'George".'

In [46]:
prompt = """
In the text below two people are discussing a story.

Story:
The user says: "My ship is called George".

Discussion:
Q: Who is speaking?
A: 
""".strip()

generate_answer_with_typical_decoding(model, prompt)

'"My ship is called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called called calle'

In [34]:
prompt = """
In the text below two people are discussing a story.

Story:
The Wag speaks: "My ship is called George".

Discussion:
Q: Who is speaking?
A: 
""".strip()

generate_answer_with_typical_decoding(model, prompt)

'The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The Th'

In [35]:
%%time
generate_answer_and_get_confidence(model, prompt)

KeyboardInterrupt: 

In [36]:
def generate_answer_greedy(model, prompt, max_length= 50):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    tokens_length = tokens.shape[1]
    if tokens_length + max_length > 1024:
        return ''
    
    while tokens.shape[-1] < tokens_length + max_length:
        new_tokens = model(tokens.cuda())
        pred_ids = torch.argmax(new_tokens.logits, dim=-1)
        last_token = pred_ids[:, -1].cpu().detach()
        tokens = torch.cat([tokens, torch.tensor([[last_token]])], dim=-1)
        last_output = tokenizer.decode(last_token, skip_special_tokens=True)
        if last_output == '\n':
            break
        
    generated_output = tokens[:, tokens_length:]
    output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    end = output.find('\n')
    return output[:end].replace('A: ', '').strip()

In [37]:
%%time
generate_answer_greedy(model, prompt)

CPU times: user 436 ms, sys: 4.22 ms, total: 441 ms
Wall time: 438 ms


'The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The The Th'

#### Compute prob of specific sequences

In [38]:
def get_sequence_probability_given_prompt(model, prompt, sequence):
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    sequence_tokens = tokenizer.encode(sequence, return_tensors='pt')[0]
    token_index = 0
    total_prob = 1
    while token_index < len(sequence_tokens):
        new_tokens = model(tokens.cuda())
        last_distribution = new_tokens.logits[:, -1].cpu().detach()
        probs = torch.nn.functional.softmax(last_distribution)
        total_prob *= probs[0][sequence_tokens[token_index]]
        token_index += 1
        
        pred_ids = torch.argmax(new_tokens.logits, dim=-1)
        last_token = pred_ids[:, -1].cpu().detach()
        tokens = torch.cat([tokens, torch.tensor([[last_token]])], dim=-1)

            
    return total_prob

In [43]:
prompt = """
In the text below two people are discussing a story.

Story:
The user speaks: "My ship is called George".

Discussion:
Q: Who is the user speaking to?
A: 
""".strip()

In [44]:
get_sequence_probability_given_prompt(model, prompt, "user")

  probs = torch.nn.functional.softmax(last_distribution)


tensor(1.8337e-06)

In [45]:
generate_answer_greedy(model, prompt)

'"My ship is called George".'