In [162]:
import math
import os
import pickle
import re
import sys
import torch

import nltk


from constants import *
from nltk import word_tokenize
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import Laplace
from nltk.lm import Vocabulary
from nltk.util import ngrams

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from pytorch_pretrained_bert import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel

In [197]:
class ClozeModel(object): 
    
    def __init__(self, model_type): 
        self.model_type = model_type
        if (model_type == "bert-base-uncased"): 
            self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif (model_type == "bert-base-cased"): 
            self.model = BertForMaskedLM.from_pretrained('bert-base-cased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        elif (model_type == "bert-large-uncased"): 
            self.model = BertForMaskedLM.from_pretrained('bert-large-uncased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        elif (model_type == "bert-base-uncased-pretrained"):
            self.model = BertForMaskedLM.from_pretrained('bert_finetuned_lm')
            self.tokenizer = BertTokenizer.from_pretrained('bert_finetuned_lm')
        elif (model_type == "bert-large-uncased-pretrained"):
            self.model = BertForMaskedLM.from_pretrained('bert_finetuned_large_lm')
            self.tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        elif (model_type == 'gpt'): 
            self.model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
            self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        elif (model_type == 'gpt2'): 
            self.model = GPT2LMHeadModel.from_pretrained('gpt2')
            self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        elif (model_type == 'ngrams'): 
            self.model = self.get_ngrams_model()
        else: 
            self.model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
            self.tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        if (model_type != 'ngrams'):
            self.model.eval() 
        
    def get_ngrams_model(self):
        """Fit a Laplace-smoothed bigram model on the sentences of ``corpus.txt``.

        Each non-empty line of the file is treated as one sentence; it is
        tokenized with ``word_tokenize`` and every token is lower-cased before
        training.

        Returns:
            The fitted ``Laplace`` model of order 2.
        """
        order = 2

        with open("corpus.txt", "r") as handle:
            sentences = []
            for raw_line in handle:
                stripped = raw_line.rstrip('\n')
                # Blank lines carry no sentence.
                if stripped:
                    sentences.append(stripped)

        lowered_sentences = [
            [token.lower() for token in word_tokenize(sentence)]
            for sentence in sentences
        ]

        ngram_stream, padded_vocab = padded_everygram_pipeline(order, lowered_sentences)
        bigram_lm = Laplace(order)
        bigram_lm.fit(ngram_stream, padded_vocab)
        return bigram_lm

    def get_masked_index(self, tokenized_text): 
        if self.model_type == 'gpt': 
            return tokenized_text.index('mask</w>')
        elif self.model_type == 'gpt2':
            if 'Ġmask' in tokenized_text: 
                return tokenized_text.index('Ġmask')
            else: 
                return tokenized_text.index('mask')
        elif self.model_type == 'transformerxl': 
            return tokenized_text.index('MASK')
        return tokenized_text.index('mask')
            
       
    def predict_candidate_transformerXL(self, sentence, candidates, correct_answer): 
        tokenized_text = self.tokenizer.tokenize(sentence)
        tokenized_candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]
        
        masked_index = self.get_masked_index(tokenized_text)

        candidate_ids = self.tokenizer.convert_tokens_to_ids(tokenized_candidates)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        best_candidate = None
        best_perp = sys.maxsize
        correct_perp = None 
        print(len(token_ids))
        for i, candidate_id in enumerate(candidate_ids): 
            token_ids[masked_index] = candidate_id 
            tokens_tensor = torch.tensor([token_ids])

            # If you have a GPU, put everything on cuda
            tokens_tensor = tokens_tensor.to('cuda')
            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
            print(input_ids.shape)
            print(tokens_tensor.shape)
            self.model.to('cuda')
            input_ids = input_ids.to('cuda')

            loss, mems = self.model(input_ids, target=None)
            print(loss.shape)

#             perplexity = math.exp(torch.sum(loss) / tokens_tensor.shape[1])
#             if perplexity < best_perp: 
#                 best_candidate = candidates[i]
#                 best_perp = perplexity
                
#             if correct_answer == candidates[i]: 
#                 correct_perp = perplexity
            print(loss)
            perplexity = math.exp(torch.sum(loss) / len(token_ids))
            print(perplexity)
            if perplexity < best_perp: 
                best_candidate = candidates[i]
                best_perp = perplexity
            if candidates[i] == correct_answer: 
                correct_perp = perplexity
            

        return best_candidate, correct_perp
         
        
    def predict_candidate_GPT(self, sentence, candidates, correct_answer, version=1): 
        if version == 2: 
            sentence = sentence.lower()
        tokenized_text = self.tokenizer.tokenize(sentence)
        tokenized_candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]
        
        masked_index = self.get_masked_index(tokenized_text)

        candidate_ids = self.tokenizer.convert_tokens_to_ids(tokenized_candidates)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        best_candidate = None
        best_perp = sys.maxsize
        correct_perp = None 
        for i, candidate_id in enumerate(candidate_ids): 
            token_ids[masked_index] = candidate_id 
            tokens_tensor = torch.tensor([token_ids])
            
            # If you have a GPU, put everything on cuda
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')
            print

            if version == 1: 
                loss = self.model(tokens_tensor, lm_labels = tokens_tensor)
            elif version == 2: 
                loss = self.model(tokens_tensor, lm_labels = tokens_tensor)
            perplexity = math.exp(torch.sum(loss) / tokens_tensor.shape[1])
            if perplexity < best_perp: 
                best_candidate = candidates[i]
                best_perp = perplexity
                
            if correct_answer == candidates[i]: 
                correct_perp = perplexity

        return best_candidate, correct_perp
    
    def replace_with_unk(self, word_list): 
        for i, word in enumerate(word_list): 
            if self.model.vocab.lookup(word) == '<UNK>' and word != 'mask': 
                word_list[i] == '<UNK>'
        return word_list
        
    # get predictions 
    def predict_candidate_ngrams(self, sentence, candidates, correct_answer): 
        tokenized_text = list(map(str.lower, nltk.tokenize.word_tokenize(sentence)))
        tokenized_candidates = [list(map(str.lower, nltk.tokenize.word_tokenize(candidate))) for candidate in candidates]
        tokenized_text = self.replace_with_unk(tokenized_text)
        tokenized_candidates = self.replace_with_unk(tokenized_candidates)
        masked_index = tokenized_text.index('mask')
        best_perplexity = 0.0 
        best_candidate = None
        correct_perplexity = None 
        print(candidates)
        for i, tokenized_candidate in enumerate(tokenized_candidates): 
            tokenized_text[masked_index] = tokenized_candidate[0]
            test_data = nltk.bigrams(tokenized_text,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>")
            perplexity = self.model.perplexity(test_data)
            if perplexity > best_perplexity: 
                best_perplexity = perplexity 
                best_candidate = candidates[i]
#             if candidates[i] == correct_answer: 
#                 correct_perplexity = perplexity
        print(best_candidate)
        return best_candidate
       
    def predict_candidate_BERT(self, sentence, candidates):
        tokenized_text = self.tokenizer.tokenize(sentence.lower())
        candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]
        
        masked_index = self.get_masked_index(tokenized_text)

        candidates_ids = self.tokenizer.convert_tokens_to_ids(candidates)

        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        segments_ids = [0] * len(tokenized_text)

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # If you have a GPU, put everything on cuda
        tokens_tensor = tokens_tensor.to('cuda')
        segments_tensors = segments_tensors.to('cuda')
        self.model.to('cuda')

        predictions = self.model(tokens_tensor, segments_tensors)
        predictions_candidates = predictions[0, masked_index, candidates_ids]
        answer_idx = torch.argmax(predictions_candidates).item()
        loss = predictions_candidates[answer_idx]
        perplexity = math.exp(loss / len(tokens_tensor))
        return candidates[answer_idx], perplexity

    def predict_candidate(self, sentence, candidates, correct_answer):
        if self.model_type in ["bert-base-uncased", "bert-base-cased", "bert-large-uncased", "bert-base-uncased-pretrained", "bert-large-uncased-pretrained"]: 
            return self.predict_candidate_BERT(sentence, candidates)
        elif self.model_type == "gpt": 
            return self.predict_candidate_GPT(sentence, candidates, correct_answer)
        elif self.model_type == 'gpt2': 
            return self.predict_candidate_GPT(sentence, candidates, correct_answer, version=2)
        elif self.model_type == "transformerxl":
            return self.predict_candidate_transformerXL(sentence, correct_answer, candidates)
        elif self.model_type == "ngrams": 
            return self.predict_candidate_ngrams(sentence, correct_answer, candidates)
        
    def read_examples(self): 
        correct_count = 0 
        total_count = 0 
        total_perplexity = 0.0 
        for data_split in ['test/']: 
            for data_type in DATA_TYPES:
                path = CLEANED_PATH + data_split + data_type 
                filenames = os.listdir(path)
                for filename in filenames: 
                    filename_path = path + filename
                    if os.path.getsize(filename_path) > 0: 
                        with open(filename_path, "rb") as pickle_file: 
                            examples = pickle.load(pickle_file)
                            for example in examples: 
                                sentence = example['sentence']
                                candidates = example['candidates']
                                answer = example['answer']
                                correct_answer = self.predict_candidate(sentence, candidates, answer)
#                                 total_perplexity += perplexity
                                total_count += 1
                                if answer == correct_answer: 
                                  correct_count += 1 
                                if total_count % 100 == 0: 
                                    print("processed so far: ", total_count) 
                                    print("accuracy so far: ", correct_count * 1.0 / total_count) 
        return correct_count, total_count, total_perplexity 
    
            
    def get_num_examples(self): 
        num_examples = 0
        for data_split in ['test/']: 
            for data_type in DATA_TYPES:
                path = CLEANED_PATH + data_split + data_type 
                filenames = os.listdir(path)
                for filename in filenames: 
                    filename_path = path + filename
                    if os.path.getsize(filename_path) > 0: 
                        with open(filename_path, "rb") as pickle_file: 
                            examples = pickle.load(pickle_file)
                            num_examples += len(examples)
                            
        return num_test_examples

In [198]:
# Build the bigram baseline; the constructor trains it by calling
# get_ngrams_model(), which reads corpus.txt.
model = ClozeModel("ngrams")
# Full evaluation run, currently disabled:
# correct_count, total_count, total_perplexity = model.read_examples() 
# Print the number of pickled examples in the test split.
print(model.get_num_examples())

NameError: name 'num_test_examples' is not defined

In [181]:
# Report the evaluation totals.
# NOTE(review): correct_count, total_count and total_perplexity are not
# assigned by any active line in the cells shown here (the read_examples()
# call is commented out), so this cell depends on earlier kernel state and
# will fail after a kernel restart.
print("correct count: ", correct_count)
print("total count: ", total_count)
print("total_perplexity: ", total_perplexity)

correct count:  3593
total count:  8989
total_perplexity:  44919373.63105837


In [None]:
# Load one pickled training file and print its contents for inspection.
# SECURITY: pickle.load can execute arbitrary code; only open trusted files.
with open("data/cleaned/train/high/high1495.pickle", "rb") as pickle_file: 
    examples = pickle.load(pickle_file)
    print(examples)