In [1]:
import json
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR

import random
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
# Number of passes over the training batches; one checkpoint is written per
# epoch, and the evaluation loop later iterates over the same range.
num_epochs = 10

In [3]:
# Load the train/dev splits. Each file holds a JSON list; judging by the example
# printed below, every item is one prompt-formatted string (claim, yes/no
# verdict, explanation).
# Context managers guarantee the file handles are closed once parsing is done.
with open('../data/creak_train_list.json') as train_file:
    train_list = json.load(train_file)
with open('../data/creak_dev_list.json') as dev_file:
    dev_list = json.load(dev_file)

In [4]:
# Number of training examples that were loaded.
len(train_list)

10176

In [5]:
# Show one raw training example to illustrate the prompt layout.
print(train_list[5])

The claim is:
The crack in the Liberty Bell sets it apart from other famous bells.

The claim makes sense:
Yes.

Because:
The Liberty Bell is famous for having a large crack in its side.


In [6]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)

def batchify(data, n):
    """Group same-length token tensors into batches of at most ``n`` rows.

    Items are bucketed by ``item.shape[1]`` so that every batch can be stacked
    without padding. Buckets appear in the order in which each length was first
    seen, and items keep their input order inside a bucket.

    Args:
        data: iterable of tensors that expose ``shape[1]`` and are indexed as
            ``item[0]`` (in this notebook: the ``(1, seq_len)`` results of
            ``tokenizer.encode(..., return_tensors='pt')``).
        n: maximum number of items per batch; must be at least 1.

    Returns:
        A list of tensors, each being ``torch.stack`` of the ``item[0]`` rows of
        one group of up to ``n`` same-length items.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        # A negative step used to produce no batches at all, silently.
        raise ValueError('n must be a positive integer')

    # dict.setdefault replaces the former try / bare-except bucketing, which
    # would also have swallowed unrelated errors such as KeyboardInterrupt.
    by_length = {}
    for item in data:
        by_length.setdefault(item.shape[1], []).append(item)

    batches = []
    for same_length in by_length.values():
        for start in range(0, len(same_length), n):
            group = same_length[start:start + n]
            batches.append(torch.stack([item[0] for item in group]))

    return batches

In [7]:
# Pretrained GPT-2 tokenizer; the encoding loop and the generation helpers
# below read it as a module-level name.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [8]:
# Start from the pretrained GPT-2 LM-head model and move it to the GPU.
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.cuda()
# NOTE(review): `criterion` is handed to train() but never used there; the loss
# is taken from the model's own output when it is called with `labels=`.
# Consider dropping it together with the matching train() argument.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [9]:
# Encode every training example; anything longer than `_limit` tokens is
# dropped and counted (1024 is presumably GPT-2's context size - confirm).
_limit = 1024
data = []
total_skipped = 0
for item in train_list:
    tokens = tokenizer.encode(item, return_tensors='pt')
    if tokens.shape[1] <= _limit:
        data.append(tokens)
    else:
        total_skipped += 1
print(f'Skipped {total_skipped} out of {len(train_list)}')

Skipped 0 out of 10176


In [10]:
# Batch size 1: every encoded example becomes its own batch.
train_batches = batchify(data, 1)

In [11]:
def train(train_model, batches, optimizer, criterion):
    """Run one epoch of language-model fine-tuning and return the mean batch loss.

    Args:
        train_model: model called as ``train_model(inputs, labels=inputs)``;
            the first element of its output is used as the loss.
        batches: sequence (must support ``len``) of token tensors.
        optimizer: optimizer that updates ``train_model``'s parameters.
        criterion: unused; kept so existing callers keep working.

    Returns:
        The average of the per-batch losses, or ``0.0`` if ``batches`` is empty
        (previously an empty sequence ended in ZeroDivisionError).
    """
    if len(batches) == 0:
        return 0.0

    # Switch the model that is actually being trained into training mode. The
    # earlier code called `model.train()` on the notebook-global `model`, which
    # is only correct when the caller happens to pass that same object.
    train_model.train()

    total_loss = 0.
    for batch in tqdm(batches, total=len(batches)):
        inputs = batch.cuda()  # one host-to-GPU copy, reused as input and labels
        optimizer.zero_grad()
        loss = train_model(inputs, labels=inputs)[0]
        loss.backward()
        # Clip the global gradient norm to 0.5 before the update.
        torch.nn.utils.clip_grad_norm_(train_model.parameters(), 0.5)
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(batches)

In [12]:
# With batch size 1 this equals the number of encoded training examples.
len(train_batches)

10176

In [13]:
#train_batches = train_batches[:2000]

In [14]:
# Fine-tune for `num_epochs` epochs, writing one checkpoint file per epoch.
# The batches are shuffled once up front and again at the start of every epoch.
random.shuffle(train_batches)
# Multiply the learning rate by 0.8 after every second epoch.
scheduler = StepLR(optimizer, step_size=2, gamma=0.8)
for epoch in range(num_epochs):
    random.shuffle(train_batches)
    loss = train(model, train_batches, optimizer, criterion)
    #test(model, dev_list[:2000])
    print('Epoch:', epoch, 'Loss:', loss)
    checkpoint_state = {'epoch': epoch, 'model_state_dict': model.state_dict()}
    torch.save(checkpoint_state, f'save_creak{epoch}')
    scheduler.step()

100%|█████████████████████████████████████| 10176/10176 [07:34<00:00, 22.40it/s]


Epoch: 0 Loss: 1.8558932856873334


100%|█████████████████████████████████████| 10176/10176 [07:34<00:00, 22.39it/s]


Epoch: 1 Loss: 1.5762933424714975


100%|█████████████████████████████████████| 10176/10176 [07:35<00:00, 22.33it/s]


Epoch: 2 Loss: 1.3989894967057608


100%|█████████████████████████████████████| 10176/10176 [07:34<00:00, 22.37it/s]


Epoch: 3 Loss: 1.2796877883111109


100%|█████████████████████████████████████| 10176/10176 [07:34<00:00, 22.39it/s]


Epoch: 4 Loss: 1.1587342018929292


100%|█████████████████████████████████████| 10176/10176 [07:33<00:00, 22.42it/s]


Epoch: 5 Loss: 1.0724251906697948


100%|█████████████████████████████████████| 10176/10176 [07:34<00:00, 22.40it/s]


Epoch: 6 Loss: 0.9820866109334729


100%|█████████████████████████████████████| 10176/10176 [07:34<00:00, 22.40it/s]


Epoch: 7 Loss: 0.9207002168928372


100%|█████████████████████████████████████| 10176/10176 [07:36<00:00, 22.29it/s]


Epoch: 8 Loss: 0.8521475448264336


100%|█████████████████████████████████████| 10176/10176 [07:36<00:00, 22.27it/s]


Epoch: 9 Loss: 0.8066815156636045


## Testing

In [7]:
import sys
import traceback

def test(model, data):
    model.eval()
    tp = 0
    fp = 0
    fn = 0

    skipped = 0

    for item in tqdm(data):
        expected = get_answer_from_text(item)
        predicted = ''
        try:
            predicted = generate_answer(model, item)
        except (IndexError, RuntimeError) as e:
            skipped += 1
            continue

        if expected == predicted:
            tp += 1
        if expected == 'N' and predicted == 'Y':
            fp += 1
        if expected == 'Y' and predicted == 'N':
            fn += 1

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1:', f1)
    print('Skipped:', skipped)

In [8]:
def get_text_up_to_question(text):
    """Return the prompt part of ``text``: everything through the question line.

    Args:
        text: string containing the line 'The claim makes sense:' followed by a
            newline.

    Returns:
        The prefix of ``text`` that ends right after the first occurrence of
        that line (newline included).

    Raises:
        RuntimeError: if the question line does not occur in ``text``.
            RuntimeError is used because the evaluation loop in this notebook
            already treats it as "skip this item".
    """
    _claim_yn = 'The claim makes sense:\n'
    marker_start = text.find(_claim_yn)
    if marker_start < 0:
        # str.find returned -1; slicing with it would silently hand back an
        # arbitrary 22-character prefix instead of a prompt.
        raise RuntimeError('Text does not contain the question line')
    return text[:marker_start + len(_claim_yn)]

In [9]:
def get_answer_from_text(text):
    """Return the first character that follows the question line in ``text``.

    For the prompt layout used in this notebook that character is the first
    letter of the verdict, e.g. 'Y' or 'N'.

    Args:
        text: string containing the line 'The claim makes sense:' followed by a
            newline and the answer.

    Returns:
        A one-character string.

    Raises:
        RuntimeError: if the question line does not occur in ``text``.
        IndexError: if nothing follows the question line.
    """
    _claim_yn = 'The claim makes sense:\n'
    marker_start = text.find(_claim_yn)
    if marker_start < 0:
        # str.find returned -1; indexing with it would return whatever
        # character happens to sit at position 22 of an unrelated text.
        raise RuntimeError('Text does not contain the question line')
    return text[marker_start + len(_claim_yn)]

In [10]:
def generate_answer(model, text):
    """Let ``model`` continue the prompt in ``text`` and return the answer character.

    The prompt is ``text`` cut right after the question line. It is encoded with
    the module-level ``tokenizer``, extended by at most one generated token,
    decoded again, and passed to ``get_answer_from_text``.

    Raises:
        RuntimeError: if prompt length plus one reaches 1024 tokens.
    """
    new_tokens = 1
    prompt_ids = tokenizer.encode(get_text_up_to_question(text), return_tensors='pt')
    target_length = prompt_ids.shape[1] + new_tokens
    if target_length >= 1024:
        raise RuntimeError('Text is longer than 1024')
    generated = model.generate(
        prompt_ids.cuda(),
        max_length=target_length,
        pad_token_id=50256,
    )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return get_answer_from_text(decoded)

In [11]:
def generate_full_answer(model, text):
    """Generate a full continuation (answer and explanation) for the prompt in ``text``.

    The prompt is ``text`` cut right after the question line; up to 70 further
    tokens are generated.

    Args:
        model: model providing ``generate`` and callable with ``labels=``.
        text: string containing the question line.

    Returns:
        A tuple ``(out_text, score)``: the decoded continuation without the
        prompt, and the first element of ``model(output, labels=output)`` as a
        float (the model's loss over prompt plus continuation; lower presumably
        means a more likely sequence).

    Raises:
        RuntimeError: if prompt length plus 70 reaches 1024 tokens.
    """
    prompt = get_text_up_to_question(text)
    tokens = tokenizer.encode(prompt, return_tensors='pt')
    _length = 70
    tokens_length = tokens.shape[1]
    if tokens_length + _length >= 1024:
        raise RuntimeError('Text is longer than 1024')
    output = model.generate(
             tokens.cuda(),
             max_length=tokens_length + _length,
             pad_token_id=50256
    )
    # Scoring is inference only: without no_grad the forward pass would record
    # an autograd graph that is thrown away immediately.
    with torch.no_grad():
        score = model(output, labels=output)[0]
    out_text = tokenizer.decode(output[0][tokens_length:], skip_special_tokens=True)

    return out_text, float(score)

In [12]:
# Fresh tokenizer and GPT-2 model on the GPU, then restore the weights that
# were saved after epoch 0.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.cuda()
checkpoint = torch.load('save_creak0')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [13]:
# Gold answer character of one dev example.
get_answer_from_text(dev_list[1])

'N'

In [14]:
# Prediction of the currently loaded checkpoint for the same dev example.
generate_answer(model, dev_list[1])

'Y'

In [15]:
# Evaluate every per-epoch checkpoint on the dev set, reusing one model
# instance whose weights are overwritten for each epoch.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.cuda()
for epoch in range(num_epochs):
    # Alternative checkpoint family: f'save_fact_check{epoch}'
    checkpoint = torch.load(f'save_creak{epoch}')
    model.load_state_dict(checkpoint['model_state_dict'])
    _ = model.eval()
    print(f'Epoch {epoch}')
    test(model, dev_list)

KeyboardInterrupt: 

In [16]:
# Restore the weights saved after epoch 5 for the manual experiments below.
checkpoint = torch.load('save_creak5')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [25]:
story = "'battery' is part of a grocery list"

# Same layout as the training examples, ending right after the question line
# so that the model has to produce the verdict.
text = f"The claim is:\n{story}\n\nThe claim makes sense:\n"


In [44]:
# NOTE(review): this cell carries execution count 44, i.e. it was run out of
# order; the outputs shown further down still belong to the previous `story`.
story = '"add salad" more of a request than an item.'

# Same layout as the training examples, ending right after the question line.
text = f"The claim is:\n{story}\n\nThe claim makes sense:\n"


In [26]:
# Inspect the assembled prompt.
print(text)

The claim is:
'battery' is part of a grocery list

The claim makes sense:



In [27]:
# First answer character the model generates for the hand-written claim.
generate_answer(model, text)

'N'

In [28]:
# Full generated continuation (verdict plus explanation); [0] drops the score.
print(generate_full_answer(model, text)[0])

Nope.

Because:
'battery' is not a part of a grocery list. It is a term for electronic equipment. It is used to charge and charge batteries. It is not a physical characteristic of a device. It is a way to organize and organize things. It is not a physical characteristic of a device. It is


## Code for running the common sense classifier

In [None]:
class CommonSense:
    """Yes/no classifier that asks a language model whether a claim makes sense."""

    def __init__(self, model):
        """Keep the model that is later passed to ``generate_answer``."""
        self._model = model

    def makes_sense(self, claim):
        """Wrap ``claim`` in the prompt layout and classify it.

        Args:
            claim: the statement to judge, inserted verbatim into the prompt.

        Returns:
            ``Answer(text='True')`` when the generated answer character is 'Y',
            otherwise ``Answer(text='False')``.
        """
        text = f"""
The claim is:
{claim}

The claim makes sense:
"""[1:]
        # generate_answer takes (model, text). The model argument used to be
        # missing, so every call failed with TypeError.
        # NOTE(review): `Answer` is neither defined nor imported in this
        # notebook - presumably supplied by the project this class is copied
        # into; confirm before use.
        if generate_answer(self._model, text) == 'Y':
            return Answer(text='True')

        return Answer(text='False')
    
def generate_answer(model, text):
    """Let ``model`` continue the prompt in ``text`` and return the answer character.

    The prompt is ``text`` cut right after the question line. It is encoded with
    the module-level ``tokenizer``, extended by at most one generated token,
    decoded again, and passed to ``get_answer_from_text``.

    Raises:
        RuntimeError: if prompt length plus one reaches 1024 tokens.
    """
    new_tokens = 1
    prompt_ids = tokenizer.encode(get_text_up_to_question(text), return_tensors='pt')
    target_length = prompt_ids.shape[1] + new_tokens
    if target_length >= 1024:
        raise RuntimeError('Text is longer than 1024')
    generated = model.generate(
        prompt_ids.cuda(),
        max_length=target_length,
        pad_token_id=50256,
    )
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    return get_answer_from_text(decoded)