In [23]:
import math
import os
import pickle
import re
import sys
import torch

from constants import *
# from nltk.lm.preprocessing import padded_everygram_pipeline
# from nltk.lm import MLE
# from nltk.lm import Vocabulary
# from nltk.util import ngrams

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
from pytorch_pretrained_bert import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel

In [24]:
def get_ngrams(sentence): 
    sentence = sentence.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    output = list(ngrams(tokens, 2))
    return output



# fit model 

def get_ngrams_model(corpus): 
    with open("corpus.txt", "r") as corpus_file: 
        lines = [line.rstrip('\n') for line in open('filename')]
        train_sentences = [line for line in strings if line]

        tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) for sent in train_sentences]
        n = 2
        train_data = [nltk.bigrams(t,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>") for t in tokenized_text]
        words = [word for sent in tokenized_text for word in sent]
        words.extend(["<s>", "</s>"])
        padded_vocab = Vocabulary(words)
        model = MLE(n)
        model.fit(train_data, padded_vocab)
    return model 

# get predictions 
def get_top_sentence(sentence, candidates): 
    tokenized_text = list(map(str.lower, nltk.tokenize.word_tokenize(sentence)))
    tokenized_candidates = [list(map(str.lower, nltk.tokenize.word_tokenize(candidate))) for candidate in candidates]
    masked_index = tokenized_text.index('mask')
    best_perplexity = 0.0 
    best_candidate = None
    for i, tokenized_candidate in enumerate(tokenized_candidates): 
        tokenized_text[masked_index] = tokenized_candidate
        test_data = nltk.bigrams(tokenized_text,  pad_right=True, pad_left=True, left_pad_symbol="<s>", right_pad_symbol="</s>")
#         for test in test_data:
#             print ("MLE Estimates:", ((ngram[-1], ngram[:-1]),model.score(ngram[-1], ngram[:-1])) for ngram in test])
        perplexity = model.perplexity(test_data)
        if perplexity > best_perplexity: 
            best_perplexity = perplexity 
            best_candidate = candidates[i]
        print("PP({0}):{1}".format(tokenized_candidate, model.perplexity(test_data)))
    return best_candidate

def make_predictions():
    correct_count = 0 
    total_count = 0
    for data_split in ['test/']: 
        for data_type in DATA_TYPES:
            path = CLEANED_PATH + data_split + data_type 
            filenames = os.listdir(path)
            for filename in filenames: 
                filename_path = path + filename
                if os.path.getsize(filename_path) > 0: 
                    with open(filename_path, "rb") as pickle_file: 
                        examples = pickle.load(pickle_file)
                        for example in examples: 
                            sentence = example['sentence']
                            candidates = example['candidates']
                            answer = example['answer']
                            predicted_answer = get_top_sentence(sentence, candidates)
                            total_count += 1
                            if answer == correct_answer: 
                                correct_count += 1 
                            if total_count % 100 == 0: 
                                print("processed so far: ", total_count) 
                                print("accuracy so far: ", correct_count * 1.0 / total_count) 
                            


In [12]:
class ClozeModel(object): 
    
    def __init__(self, model_type): 
        self.model_type = model_type
        if (model_type == "bert-base-uncased"): 
            self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        elif (model_type == "bert-base-cased"): 
            self.model = BertForMaskedLM.from_pretrained('bert-base-cased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        elif (model_type == "bert-large-uncased"): 
            self.model = BertForMaskedLM.from_pretrained('bert-large-uncased')
            self.tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        elif (model_type == "bert-base-uncased-pretrained"):
            self.model = BertForMaskedLM.from_pretrained('bert_finetuned_lm')
            self.tokenizer = BertTokenizer.from_pretrained('bert_finetuned_lm')
#             self.model.load_state_dict(torch.load("bert_finetuned_lm/pytorch_model.bin"))
        elif (model_type == "bert-large-uncased-pretrained"):
            self.model = BertForMaskedLM.from_pretrained('bert_finetuned_large_lm')
            self.tokenizer = BertTokenizer.from_pretrained('bert_finetuned_large_lm')
        elif (model_type == 'gpt'): 
            self.model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
            self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        else: 
            print("about to initialize model")
            self.model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
            print("model initialized")
            self.tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
            print("tokenizer set up")
        self.model.eval() 
        
    def get_masked_index(self, tokenized_text): 
        if self.model_type == 'gpt': 
            return tokenized_text.index('mask</w>')
        elif self.model_type == 'transformerxl': 
            return tokenized_text.index('MASK')
        return tokenized_text.index('mask')
            
       
    def predict_candidate_transformerXL(self, sentence, candidates): 
        tokenized_text = self.tokenizer.tokenize(sentence)
        tokenized_candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]
        
        masked_index = self.get_masked_index(tokenized_text)

        candidate_ids = self.tokenizer.convert_tokens_to_ids(tokenized_candidates)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        best_candidate = None
        best_perp = sys.maxsize
        for i, candidate_id in enumerate(candidate_ids): 
            token_ids[masked_index] = candidate_id 
            tokens_tensor = torch.tensor([token_ids])

            # If you have a GPU, put everything on cuda
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

            predictions, mems = self.model(tokens_tensor)

            perplexity = math.exp(torch.sum(predictions) / len(token_ids))
            if perplexity < best_perp: 
                best_candidate = candidates[i]
                best_perp = perplexity

        return best_candidate
    
    def predict_candidate_GPT(self, sentence, candidates): 
        tokenized_text = self.tokenizer.tokenize(sentence)
        tokenized_candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]
        # print(tokenized_text)
        
        masked_index = self.get_masked_index(tokenized_text)
        # print(masked_index)

        candidate_ids = self.tokenizer.convert_tokens_to_ids(tokenized_candidates)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        best_candidate = None
        best_perp = sys.maxsize
        for i, candidate_id in enumerate(candidate_ids): 
            token_ids[masked_index] = candidate_id 
            tokens_tensor = torch.tensor([token_ids])

            # If you have a GPU, put everything on cuda
            tokens_tensor = tokens_tensor.to('cuda')
            self.model.to('cuda')

            predictions = self.model(tokens_tensor)
    
            perplexity = math.exp(torch.sum(predictions) / len(token_ids))
            if perplexity < best_perp: 
                best_candidate = candidates[i]
                best_perp = perplexity

        return best_candidate
    
#     def predict_candidate_transformerXL(self, sentence, candidates):
#         tokenized_text = self.tokenizer.tokenize(sentence)
#         tokenized_candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]
        
#         masked_index = tokenized_text.index('mask</w>')
        
#         candidate_ids = self.tokenizer.convert_tokens_to_ids(tokenized_candidates)
#         token_ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        
#         best_candidate = None
#         best_perp = sys.maxsize
#         for i, candidate_id in enumerate(candidate_ids): 
#             token_ids[masked_index] = candidate_id 
#             tokens_tensor = torch.tensor([token_ids])

#             # If you have a GPU, put everything on cuda
#             tokens_tensor = tokens_tensor.to('cuda')
#             self.model.to('cuda')

#             predictions = self.model(tokens_tensor)

#             perplexity = math.exp(torch.sum(predictions) / len(tokens_tensor))
#             if perplexity < best_perp: 
#                 best_candidate = candidates[i]
#                 best_perp = perplexity

#         return best_candidate
       
    def predict_candidate_BERT(self, sentence, candidates):
        tokenized_text = self.tokenizer.tokenize(sentence.lower())
        candidates = [self.tokenizer.tokenize(candidate)[0] for candidate in candidates]
        
#         lower_text = [item.lower() for item in tokenized_text]
        masked_index = self.get_masked_index(tokenized_text)

        candidates_ids = self.tokenizer.convert_tokens_to_ids(candidates)

        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        segments_ids = [0] * len(tokenized_text)

        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        # If you have a GPU, put everything on cuda
        tokens_tensor = tokens_tensor.to('cuda')
        segments_tensors = segments_tensors.to('cuda')
        self.model.to('cuda')

        predictions = self.model(tokens_tensor, segments_tensors)
        predictions_candidates = predictions[0, masked_index, candidates_ids]
        answer_idx = torch.argmax(predictions_candidates).item()
        return candidates[answer_idx]

    def predict_candidate(self, sentence, candidates):
        if self.model_type == "bert-base-uncased" or self.model_type == "bert-base-cased" or self.model_type == "bert-large-uncased" or self.model_type == "bert-base-uncased-pretrained": 
            return self.predict_candidate_BERT(sentence, candidates)
        elif self.model_type == "gpt": 
            return self.predict_candidate_GPT(sentence, candidates)
        elif self.model_type == "transformerxl":
            return self.predict_candidate_transformerXL(sentence, candidates)
        
    def read_examples(self): 
        correct_count = 0 
        total_count = 0 
        for data_split in ['test/']: 
            for data_type in DATA_TYPES:
                path = CLEANED_PATH + data_split + data_type 
                filenames = os.listdir(path)
                for filename in filenames: 
                    filename_path = path + filename
                    if os.path.getsize(filename_path) > 0: 
                        with open(filename_path, "rb") as pickle_file: 
                            examples = pickle.load(pickle_file)
                            for example in examples: 
                                sentence = example['sentence']
                                candidates = example['candidates']
                                answer = example['answer']
                                correct_answer = self.predict_candidate(sentence, candidates)
                                total_count += 1
                                if answer == correct_answer: 
                                  correct_count += 1 
                                if total_count % 100 == 0: 
                                    print("processed so far: ", total_count) 
                                    print("accuracy so far: ", correct_count * 1.0 / total_count) 
        return correct_count, total_count


In [25]:
model = ClozeModel("bert-base-uncased-pretrained")
correct_count, total_count = model.read_examples() 

about to set up model
processed so far:  100
accuracy so far:  0.49
processed so far:  200
accuracy so far:  0.47
processed so far:  300
accuracy so far:  0.43
processed so far:  400
accuracy so far:  0.455
processed so far:  500
accuracy so far:  0.446
processed so far:  600
accuracy so far:  0.44666666666666666
processed so far:  700
accuracy so far:  0.43857142857142856
processed so far:  800
accuracy so far:  0.44625
processed so far:  900
accuracy so far:  0.4411111111111111
processed so far:  1000
accuracy so far:  0.44
processed so far:  1100
accuracy so far:  0.4481818181818182
processed so far:  1200
accuracy so far:  0.45666666666666667
processed so far:  1300
accuracy so far:  0.4584615384615385
processed so far:  1400
accuracy so far:  0.4585714285714286
processed so far:  1500
accuracy so far:  0.458
processed so far:  1600
accuracy so far:  0.459375
processed so far:  1700
accuracy so far:  0.4623529411764706
processed so far:  1800
accuracy so far:  0.4638888888888889
pr

In [33]:
print("correct count: ", correct_count)
print("total count: ", total_count)

correct count:  4319
total count:  8989


In [None]:
with open("data/cleaned/train/high/high1495.pickle", "rb") as pickle_file: 
    examples = pickle.load(pickle_file)
    print(examples)