In [1]:
import transformers 
import torch
import math
import random

tokenizer = transformers.AutoTokenizer.from_pretrained("roberta-base")
model = transformers.RobertaForMaskedLM.from_pretrained("roberta-base")

inputs = tokenizer("I am so <mask>", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of <mask>
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

labels = tokenizer("I am so <mask>", return_tensors="pt")["input_ids"]
# mask labels of non-<mask> tokens
labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)

print(labels)
outputs = model(**inputs, labels=labels)
outputs.loss.item()

  from .autonotebook import tqdm as notebook_tqdm


tensor([[ -100,  -100,  -100,  -100, 50264,  -100]])


15.918440818786621

In [43]:
text = "I am so <mask>"
top_k = 5
input_ids = tokenizer.encode(text, return_tensors="pt")

# Get the logits for masked positions
logits = model(input_ids)["logits"]

# Find the position of the '<mask>' token in the input
mask_token_index = torch.where(input_ids == 524)[1].item()
print(tokenizer.mask_token_id)
print(mask_token_index)
# Get the probabilities for the top-k predictions
top_k_values, top_k_indices = torch.topk(logits[0, mask_token_index], top_k)
top_k_probabilities = torch.nn.functional.softmax(top_k_values, dim=-1)

# Convert indices back to tokens
predicted_tokens = tokenizer.convert_ids_to_tokens(top_k_indices.tolist())

predicted_tokens, top_k_probabilities.tolist()

50264
2


(['Ġam', 'Ġis', "'m", 'Ġwas', 'Ġfeel'],
 [0.9999134540557861,
  3.9379392546834424e-05,
  2.9938039006083272e-05,
  8.689116839377675e-06,
  8.550994607503526e-06])

In [1]:
import torch
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM
from sklearn.metrics.pairwise import cosine_similarity

model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
roberta_model = RobertaModel.from_pretrained(model_name)
masked_language_model = RobertaForMaskedLM.from_pretrained(model_name)

###############1.1###############
def get_word_vector(word_index, tokenized_sentence, model):
    outputs = model(**tokenized_sentence)
    last_hidden_states = outputs.last_hidden_state
    word_vector = last_hidden_states[0, word_index]
    return word_vector

###############2###############
similar_sentence_1 = 'I love you'
similar_sentence_2 = 'I love him'
tokenized_similar_sentence_1 = tokenizer(similar_sentence_1, return_tensors='pt')
tokenized_similar_sentence_2 = tokenizer(similar_sentence_2, return_tensors='pt')

love_1_index = tokenized_similar_sentence_1["input_ids"][0].tolist().index(tokenizer.encode(" love")[1])
love_2_index = tokenized_similar_sentence_2["input_ids"][0].tolist().index(tokenizer.encode(" love")[1])
love_1_vector = get_word_vector(love_1_index, tokenized_similar_sentence_1, roberta_model)
love_2_vector = get_word_vector(love_2_index, tokenized_similar_sentence_2, roberta_model)

print('love similarity')
print(cosine_similarity(love_1_vector.detach().numpy().reshape(1, -1), love_2_vector.detach().numpy().reshape(1, -1)))
###############3###############
different_sentence_1 = 'She decided to clip her hair back with a colorful barrette.'
different_sentence_2 = 'Some have come out of nowhere, others have taken months to catch on, and all of them could become ubiquitous in the blink of a TikTok clip.'
tokenized_different_sentence_1 = tokenizer(different_sentence_1, return_tensors='pt')
tokenized_different_sentence_2 = tokenizer(different_sentence_2, return_tensors='pt')

bass_1_index = tokenized_different_sentence_1["input_ids"][0].tolist().index(tokenizer.encode(" clip")[1])
bass_2_index = tokenized_different_sentence_2["input_ids"][0].tolist().index(tokenizer.encode(" clip")[1])
bass_1_vector = get_word_vector(bass_1_index, tokenized_different_sentence_1, roberta_model)
bass_2_vector = get_word_vector(bass_2_index, tokenized_different_sentence_2, roberta_model)

print('bass similarity')
print(cosine_similarity(bass_1_vector.detach().numpy().reshape(1, -1), bass_2_vector.detach().numpy().reshape(1, -1)))

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


love similarity
[[0.9897104]]
bass similarity
[[0.86344]]


In [64]:
###############3###############
similar
different_sentence_1 = 'The teacher asked Sarah to lead the class discussion.'
different_sentence_2 = 'Be careful not to touch that pencil; it has a lead tip.'
tokenized_different_sentence_1 = tokenizer(different_sentence_1, return_tensors='pt')
tokenized_different_sentence_2 = tokenizer(different_sentence_2, return_tensors='pt')

bass_1_index = tokenized_different_sentence_1["input_ids"][0].tolist().index(tokenizer.encode(" lead")[1])
bass_2_index = tokenized_different_sentence_2["input_ids"][0].tolist().index(tokenizer.encode(" lead")[1])
bass_1_vector = get_word_vector(bass_1_index, tokenized_different_sentence_1, roberta_model)
bass_2_vector = get_word_vector(bass_2_index, tokenized_different_sentence_2, roberta_model)

print('bass similarity')
print(cosine_similarity(bass_1_vector.detach().numpy().reshape(1, -1), bass_2_vector.detach().numpy().reshape(1, -1)))

bass similarity
[[0.8710439]]


In [61]:
from collections import Counter
#Returns 2 values: Article dicts: containing the Topic and the article text
def get_file_data(file_name, lower = False):
    #Each topic is separated by header line, empty line before the text and another empty line
    #So we will skip the first two lines, get the thrid line and skip the fourth one 
    print("Reading file")
    with open(file_name, 'r') as file:   
        file_data = file.read().splitlines()
    
    if lower:
        file_data = [s.lower() for s in file_data]
        
    return file_data

def get_all_words_no_anotation(data):
    words = []
    for sen in data:
        for word in sen.split(' '):
            words.append(word)
    
    return words

def get_all_words_pos(data):
    pos_list = []
    for sen in data:
        for token in sen.split():
            word_split = token.rsplit('/', 1)
            word_pos = (word_split[0], word_split[1])
            pos_list.append(word_pos)
    
    return pos_list

def get_word_freq_per_pos(data):
    counts = {}
    for word, pos in data:
        if word not in counts:
            counts[word] = {}

        counts[word][pos] = counts[word].get(pos, 0) + 1
        
    return counts

def get_word_most_freq_pos(data):
    return {word: max(data[word], key=data[word].get) for word in data}

def fill_for_missing_word(data):
    all_pos_tags = [pos_tag for pos_dict in data.values() for pos_tag in pos_dict]

    return max(set(all_pos_tags), key=all_pos_tags.count)

def predict(train_pos_dist_data, test_data, fill_pos_dist, word_sample = False, fill_sample = False):
    random.seed(42)
    word_pos_pred = []
    if not word_sample:
        train_word_most_freq_pos = get_word_most_freq_pos(train_pos_dist_data)
    if not fill_sample:
        fill_value = fill_for_missing_word(train_pos_dist_data)
        
    for sentence in test_data:
        for word in sentence.split():
            if word not in train_pos_dist_data:
                if fill_sample:
                    pos = random.choices(list(fill_pos_dist.keys()), weights = list(fill_pos_dist.values()))[0]
                else:
                    pos = fill_value

                word_pos_pred.append(pos)
            else:
                if word_sample:
                    pos = random.choices(list(train_pos_dist_data[word].keys()), weights = list(train_pos_dist_data[word].values()))[0]
                else:
                    pos = train_word_most_freq_pos[word]

                word_pos_pred.append(pos)
    
    return word_pos_pred

def calc_Accuracy(y_pred, y_true):
    if len(y_pred) != len(y_true):
        raise ValueError("Lists need to be the same length")
    
    return sum(p == t for p, t in zip(y_pred, y_true)) / len(y_true)

In [69]:
def predict_with_inflection(train_pos_dist_data, test_data, fill_pos_dist, word_sample = False, fill_sample = False):
    random.seed(42)
    word_pos_pred = []
    if not word_sample:
        train_word_most_freq_pos = get_word_most_freq_pos(train_pos_dist_data)
    if not fill_sample:
        fill_value = fill_for_missing_word(train_pos_dist_data)
        
    for sentence in test_data:
        for word in sentence.split():
            if all(w not in train_pos_dist_data for w in [word, word.lower(), word.capitalize()]):
                if fill_sample:
                    pos = random.choices(list(fill_pos_dist.keys()), weights = list(fill_pos_dist.values()))[0]
                else:
                    pos = fill_value

                word_pos_pred.append(pos)
            else:
                if word in train_pos_dist_data:
                    pass
                elif word.lower() in train_pos_dist_data:
                    word = word.lower()
                elif word.capitalize() in train_pos_dist_data:
                    word = word.capitalize()
                    
                if word_sample:
                    pos = random.choices(list(train_pos_dist_data[word].keys()), weights = list(train_pos_dist_data[word].values()))[0]
                else:
                    pos = train_word_most_freq_pos[word]

                word_pos_pred.append(pos)
    
    return word_pos_pred


In [71]:
def get_bigram_dict(data):
    counts = {}
    for sentence in data:
        prev_pos = ''
        for token in sentence.split():
            word_split = token.rsplit('/', 1)
            pos_word = f'{prev_pos}_{word_split[0]}'
            if pos_word not in counts:
                counts[pos_word] = {}

            counts[pos_word][word_split[1]] = counts[pos_word].get(word_split[1], 0) + 1
            prev_pos = word_split[1]
            
    return counts

def bigram_predict(train_pos_word_dist_data, train_pos_dist_data, test_data, fill_pos_dist, word_sample = False, fill_sample = False):
    random.seed(42)
    word_pos_pred = []
    train_pos_word_most_freq_pos = get_word_most_freq_pos(train_pos_word_dist_data)
    if not word_sample:
        train_word_most_freq_pos = get_word_most_freq_pos(train_pos_dist_data)
    if not fill_sample:
        fill_value = fill_for_missing_word(train_pos_dist_data)
        
    for sentence in test_data:
        prev_pos = ''
        for word in sentence.split():
            if f'{prev_pos}_{word}' not in train_pos_word_dist_data:
                if word not in train_pos_dist_data:
                    pos = fill_value
                else:
                    pos = train_word_most_freq_pos[word]
            else:
                pos = train_pos_word_most_freq_pos[f'{prev_pos}_{word}']

            word_pos_pred.append(pos)
            prev_pos = pos
    
    return word_pos_pred

def bigram_predict_with_inflection(train_pos_word_dist_data, train_pos_dist_data, test_data, fill_pos_dist, word_sample = False, fill_sample = False):
    random.seed(42)
    word_pos_pred = []
    train_pos_word_most_freq_pos = get_word_most_freq_pos(train_pos_word_dist_data)
    if not word_sample:
        train_word_most_freq_pos = get_word_most_freq_pos(train_pos_dist_data)
    if not fill_sample:
        fill_value = fill_for_missing_word(train_pos_dist_data)
        
    for sentence in test_data:
        prev_pos = ''
        for word in sentence.split():
            if all(w not in train_pos_word_dist_data for w in [f'{prev_pos}_{word}',
                                                          f'{prev_pos}_{word.lower()}',
                                                          f'{prev_pos}_{word.capitalize()}']):
                if all(w not in train_pos_dist_data for w in [word, word.lower(), word.capitalize()]):
                    pos = fill_value
                else:
                    if word in train_pos_dist_data:
                        pass
                    elif word.lower() in train_pos_dist_data:
                        word = word.lower()
                    elif word.capitalize() in train_pos_dist_data:
                        word = word.capitalize()
                    pos = train_word_most_freq_pos[word]
            else:
                if f'{prev_pos}_{word}' in train_pos_word_dist_data:
                    pass
                elif f'{prev_pos}_{word.lower()}' in train_pos_word_dist_data:
                    word = word.lower()
                elif f'{prev_pos}_{word.capitalize()}' in train_pos_word_dist_data:
                    word = word.capitalize()
                pos = train_pos_word_most_freq_pos[f'{prev_pos}_{word}']

            word_pos_pred.append(pos)
            prev_pos = pos
    
    return word_pos_pred

In [66]:
train_file = './data/pos/ass1-tagger-train'
train_sentences = get_file_data(train_file)
words_pos = get_all_words_pos(train_sentences)
train_data_freq = get_word_freq_per_pos(words_pos)
train_data_mdl = get_word_most_freq_pos(train_data_freq)

Reading file


In [67]:
dev_input_file = './data/pos/ass1-tagger-dev-input'
dev_file = './data/pos/ass1-tagger-dev'
dev_sentences = get_file_data(dev_file)
dev_actual = [pos for word, pos in get_all_words_pos(dev_sentences)]
train_pos_word_freq = get_bigram_dict(train_sentences)
dev_input_sentences = get_file_data(dev_input_file)
train_pos_freq_dict = Counter([v for k, v in words_pos])
dev_predict_bigrm = bigram_predict(train_pos_word_freq, train_data_freq, dev_input_sentences, train_pos_freq_dict)
print(calc_Accuracy(dev_predict_bigrm, dev_actual))

Reading file
Reading file
0.9291606836162964


In [72]:
dev_predict_bigrm_inflection = bigram_predict_with_inflection(train_pos_word_freq, train_data_freq, dev_input_sentences, train_pos_freq_dict)
print(calc_Accuracy(dev_predict_bigrm_inflection, dev_actual))

0.9271897964355098


In [70]:
dev_input_file = './data/pos/ass1-tagger-dev-input'
dev_file = './data/pos/ass1-tagger-dev'
dev_input_sentences = get_file_data(dev_input_file)
dev_sentences = get_file_data(dev_file)
dev_actual = [pos for word, pos in get_all_words_pos(dev_sentences)]
train_pos_freq_dict = Counter([v for k, v in words_pos])
sentences = get_file_data(dev_input_file)
dev_predict_f_f = predict(train_data_freq, dev_input_sentences, train_pos_freq_dict, word_sample = False, fill_sample = False)
dev_predict_f_t = predict(train_data_freq, dev_input_sentences, train_pos_freq_dict, word_sample = False, fill_sample = True)
dev_predict_t_f = predict(train_data_freq, dev_input_sentences, train_pos_freq_dict, word_sample = True, fill_sample = False)
dev_predict_t_t = predict(train_data_freq, dev_input_sentences, train_pos_freq_dict, word_sample = True, fill_sample = True)
print(calc_Accuracy(dev_predict_f_f, dev_actual))
print(calc_Accuracy(dev_predict_f_t, dev_actual))
print(calc_Accuracy(dev_predict_t_f, dev_actual))
print(calc_Accuracy(dev_predict_t_t, dev_actual))


Reading file
Reading file
Reading file
0.912182898330377
0.9055381929780105
0.8926711152405891
0.8849283441732129


In [160]:
[s.lower() for s in dev_input_sentences]

["influential members of the house ways and means committee introduced legislation that would restrict how the new savings-and-loan bailout agency can raise capital , creating another potential obstacle to the government 's sale of sick thrifts .",
 "the bill , whose backers include chairman dan rostenkowski ( d. , ill. ) , would prevent the resolution trust corp. from raising temporary working capital by having an rtc-owned bank or thrift issue debt that would n't be counted on the federal budget .",
 'the bill intends to restrict the rtc to treasury borrowings only , unless the agency receives specific congressional authorization .',
 "`` such agency ` self-help ' borrowing is unauthorized and expensive , far more expensive than direct treasury borrowing , '' said rep. fortney stark ( d. , calif. ) , the bill 's chief sponsor .",
 'the complex financing plan in the s&l bailout law includes raising $ 30 billion from debt issued by the newly created rtc .',
 'this financing system was 