In [1]:
def find_candidates(word, vocabulary, word_to_id_map, missing_token="@"):
    candidates = []

    for vocab_word in vocabulary:
        if len(word) != len(vocab_word):
            continue  # Skip words with different lengths

        candidate = []
        for char1, char2 in zip(word, vocab_word):
            if char1 == missing_token:
                candidate.append(char2)
            elif char1 == char2:
                candidate.append(char2)  
            else:
                break #mismatch, skip
        else:
            try:
                candidates.append(word_to_id_map["".join(candidate)])
            except:
                print(word, vocab_word, candidate,"".join(candidate) )
                raise Exception

    return candidates

In [2]:
#load wikitext data

train = []
test = []

train_file_path = '../Srilm/newtraincorpus.txt'
test_file_path = '../Srilm/newtestcorpus.txt'

with open(train_file_path, 'r', encoding='utf-8') as file:
    for line in file:
            # Process each line as a sentence
            words = (line.strip().split())
            train.append(words)

with open(test_file_path, 'r', encoding='utf-8') as file:
    for line in file:
            # Process each line as a sentence
            words = (line.strip().split())
            test.append(words)

print(len(train))
print(len(test))

77363
9418


In [3]:
#create vocabulary

UNK_symbol = "<UNK>"
vocab = set([UNK_symbol])



# create term frequency of the words
words_term_frequency_train = {}
for doc in train:
    for word in doc:
        # this will calculate term frequency
        # since we are taking all words now
        words_term_frequency_train[word] = words_term_frequency_train.get(word,0) + 1

# create vocabulary
for doc in train:
    for word in doc:
        if words_term_frequency_train.get(word,0) >= 5:
            vocab.add(word)

# remove "@-@" from vocab
vocab.remove("@-@")

print(len(vocab))

19114


In [4]:
#create 4grams

import numpy as np

x_test = []
y_test = []


# create word to id mappings
word_to_id_mappings = {}
for idx,word in enumerate(vocab):
    word_to_id_mappings[word] = idx

id_to_word_mappings = {v: k for k, v in word_to_id_mappings.items()}

# function to get id for a given word
# return <UNK> id if not found
def get_id_of_word(word):
    unknown_word_id = word_to_id_mappings['<UNK>']
    return word_to_id_mappings.get(word,unknown_word_id)


for sentence in test:
    for i,word in enumerate(sentence):
        if i+3 >= len(sentence):
            # sentence boundary reached
            # ignoring sentence less than 3 words
            break
        # convert word to id
        x_extract = [get_id_of_word(word),get_id_of_word(sentence[i+1]), get_id_of_word(sentence[i+2])]
        y_extract = [get_id_of_word(sentence[i+3])]

        x_test.append(x_extract)
        y_test.append(y_extract)
  


x_test = np.array(x_test)
y_test = np.array(y_test)  
  
print(x_test.shape)
print(y_test.shape)

(164980, 3)
(164980, 1)


In [5]:
from sklearn.utils import shuffle

#shuffle data cause this is too much for this 
x_test, y_test = shuffle(x_test, y_test)

x_test = x_test[0:10000]
y_test = y_test[0:10000]



In [6]:
#this dataset will add a version of the label, (third word in the trigram), with a single character removed 
from torch.utils.data import Dataset
import random

import random

def randomly_replace_char(input_str, missing_token="@"):
    if not input_str:
        return input_str  

    num_replacements = random.randint(1, min(len(input_str), 3))  
    indices_to_replace = random.sample(range(len(input_str)), num_replacements)

    modified_str = input_str
    for index in indices_to_replace:
        modified_str = modified_str[:index] + missing_token + modified_str[index + 1:]

    return modified_str

class charMaskDataset(Dataset):
    def __init__(self, data, labels, id_to_word):
        """
        Args:
            data (list): List of input data samples.
            labels (list): List of corresponding labels.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.data = data
        self.labels = labels
        self.id_to_word = id_to_word

    def __len__(self):
        return len(self.data)
    
    def __iter__(self):
        for index in range(len(self)):
            yield self[index]

    def __getitem__(self, index):

        if index < len(self.data):

            context = [self.id_to_word[word] for word in self.data[index]]
            label = self.id_to_word[self.labels[index][0]]


            #randomly remove a single char from the label word

            masked_word = randomly_replace_char(label)

    

            return context, label, masked_word

In [7]:


dataset = charMaskDataset(x_test, y_test, id_to_word_mappings)




In [8]:
import spacy

# Load the pre-trained model
nlp = spacy.load("en_core_web_lg")

In [9]:
def compute_avg_similarity(context, target):

    avg = 0
    target = nlp(target)
    for word in context:
        avg += target.similarity(nlp(word))

    return avg / len(context)


In [10]:
def predict(dataset):

    preds = []
    true = []

    for i, example in enumerate(dataset):

        
        context, label, masked_word = example

        pred = "<UNK>"
        pred_score = -1
        candidates = find_candidates(masked_word, vocab, word_to_id_mappings)

        for cand in candidates:
            cand = id_to_word_mappings[cand]

            score = compute_avg_similarity(context, cand)

            if score > pred_score:
                pred = cand
                pred_score = score
        
        preds.append(pred)
        true.append(label)

    
    return preds, true




In [11]:
predictions, true_labels = predict(dataset)

correct = 0
masked_correct = [0,0,0,0]
masked_totals = [0,0,0,0]

masks = [e[2] for e in dataset]

for i, p in enumerate(predictions):
    if p == true_labels[i]:
        correct += 1

        #see how it does depending on how masked the word is
        masked_correct[masks[i].count('@')] += 1

    masked_totals[masks[i].count('@')] += 1

print(f'Total Acc: {correct / len(predictions)}')
print(f'Acc by num chars missing: 0 : {masked_correct[0] / masked_totals[0]}, 1 : {masked_correct[1] / masked_totals[1]}, 2 : {masked_correct[2] / masked_totals[2]}, 3 : {masked_correct[3] / masked_totals[3]}')

  avg += target.similarity(nlp(word))


Total Acc: 0.6806


ZeroDivisionError: division by zero

In [22]:
dataset[23]

(['in', 'other', 'aspects'], 'especially', 'e@peci@l@y')

In [12]:
masked_correct

[0, 2499, 2299, 2008]

In [13]:
masked_totals

[0, 3805, 3480, 2715]

In [14]:
print(f'Total Acc: {correct / len(predictions)}')
print(f'Acc by num chars missing:  1 : {masked_correct[1] / masked_totals[1]}, 2 : {masked_correct[2] / masked_totals[2]}, 3 : {masked_correct[3] / masked_totals[3]}')

Total Acc: 0.6806
Acc by num chars missing:  1 : 0.6567674113009199, 2 : 0.660632183908046, 3 : 0.7395948434622468
