In [1]:
import os
import pickle
import sys
import torch

from constants import *
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

In [2]:
class ClozeModel(object): 
    
    def __init__(self, model_type): 
        if (model_type == "bert-base-uncased"): 
            self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif (model_type == "bert-base-cased"): 
            self.model = BertForMaskedLM.from_pretrained('bert-base-cased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        elif (model_type == 'gpt'): 
            self.model = OpenAIGPTModel.from_pretrained('openai-gpt')
            self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        self.model.eval() 
        
    def predict_candidate_GPT(sentence, candidates): 
        tokenized_text = self.tokenizer.tokenize(sentence)
        tokenized_candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]

        masked_index = tokenized_text.index('MASK')

        candidates_ids = self.tokenizer.convert_tokens_to_ids(tokenized_candidates)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        best_candidate = None
        best_perp = sys.maxint
        for i, candidate_id in enumerate(candidate_ids): 
            token_ids[masked_index] = candidate_id 
            tokens_tensor = torch.tensor([token_ids])

            # If you have a GPU, put everything on cuda
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

            predictions = self.model(tokens_tensor)

            perplexity = math.exp(torch.sum(predictions) / len(tokens_tensor))
            if perplexity > best_perp: 
                best_candidate = candidates[i]
                best_perp = perplexity

        print("The best candidate is: ", best_candidate)
        return best_candidate
       
    def predict_candidate_BERT(sentence, candidates):
        tokenized_text = self.tokenizer.tokenize(sentence)
        candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]
        masked_index = tokenized_text.index('MASK')

        candidates = [x.lower() for x in candidates]
        candidates_ids = self.tokenizer.convert_tokens_to_ids(candidates)

        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        segments_ids = [0] * len(tokenized_text)

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # If you have a GPU, put everything on cuda
        tokens_tensor = tokens_tensor.to('cuda')
        segments_tensors = segments_tensors.to('cuda')
        model.to('cuda')

        predictions = language_model(tokens_tensor, segments_tensors)
        predictions_candidates = predictions[0, masked_index, candidates_ids]
        answer_idx = torch.argmax(predictions_candidates).item()

        print('The most likely word is: ', candidates[answer_idx])

        return candidates[answer_idx]

    def predict_candidate(self, sentence, candidates):
        if self.model_type == "bert-base-uncased" or self.model_type == "bert-base-cased": 
            predict_candidate_BERT(sentence, candidates)
        elif self.model_type == "gpt": 
            predict_candidate_GPT(sentence, candidates)

In [5]:
    def read_examples(self): 
        correct_count = 0 
        total_count = 0 
        for data_split in DATA_SPLITS: 
            for data_type in DATA_TYPES:
                path = CLEANED_PATH + data_split + data_type 
                filenames = os.listdir(path)
                for filename in filenames: 
                    filename_path = path + filename
                    with open(filename_path, "rb") as pickle_file: 
                        examples = pickle.load(pickle_file)
                        for example in examples: 
                            sentence = example['sentence']
                            candidates = example['candidates']
                            answer = example['answer']
                            correct_answer = self.predict_candidate(sentence, candidates)
                            total_count += 1
                            if answer == correct_answer: 
                              correct_count += 1  

        return correct_count, total_count
                    

In [6]:
model = ClozeModel("bert-base-uncased")
model.read_examples() 

AttributeError: 'ClozeModel' object has no attribute 'read_examples'

In [None]:
with open("data/cleaned/train/high/high1495.pickle", "rb") as pickle_file: 
    examples = pickle.load(pickle_file)
    print(examples)