In [2]:
import torch
# Report whether CUDA is available to this kernel
print(f"Using GPU: {torch.cuda.is_available()}")

Using GPU: True


In [3]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Base GPT-2 (medium) tokenizer and language model that will be fine-tuned;
# the model is placed on the GPU as soon as it is loaded.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to("cuda")

In [4]:
import pandas as pd

# Read the real -> fake word pairs from the CSV ("Real" and "Fake" columns)
rtf_df = pd.read_csv("/home/ubuntu/test/datasets/realtofake.csv")
real_words_list = rtf_df["Real"].tolist()
fake_words_list = rtf_df["Fake"].tolist()

# Map every real word to the fake word on the same row
# (if a real word repeats, the last row wins)
real_to_fake_dict = dict(zip(real_words_list, fake_words_list))

In [5]:
from nltk.corpus import wordnet as wn

# Build one definition string per real word, in the same order as real_words_list.
# Each string is every WordNet synset definition for the word, each followed by ". ";
# a word with no synsets therefore gets an empty string.
final_definitions = [
    "".join(synset.definition() + ". " for synset in wn.synsets(real_word))
    for real_word in real_words_list
]

# Quick sanity check: words, fake words, the mapping and the definitions must all line up
definition_count = len(final_definitions)
assert len(real_words_list) == definition_count
assert len(fake_words_list) == definition_count
assert len(real_to_fake_dict) == definition_count

In [6]:
# Following Hewitt's approach: compute the average of all existing token embeddings.
# `mu` is the column-wise mean of the model's input embedding matrix.
params = model.state_dict()
embeddings = params['transformer.wte.weight']
mu = embeddings.mean(dim=0)

In [7]:
# Load the saved tokenizer and model whose hidden states are later used as the
# embeddings for the new tokens; the model is moved to the GPU.
predict_tokenizer = GPT2Tokenizer.from_pretrained("/home/ubuntu/test/weights/G2GMaskingT")
predict_model = GPT2LMHeadModel.from_pretrained("/home/ubuntu/test/weights/G2GMaskingBestM").to("cuda")

In [8]:
# Helpful debug message
num_definitions = len(final_definitions)
print("Number of total definitions: " + str(num_definitions))

# Named constants for values that were previously repeated inline
MAX_DEFINITION_TOKENS = 511  # every definition is padded/truncated to this many tokens
EMBED_BATCH_SIZE = 4         # number of definitions per forward pass

# Tokenize all of the definitions at once, then build a " [CLS]" suffix for each row
tokenized_inputs = predict_tokenizer(final_definitions, return_tensors="pt", padding='max_length', truncation=True, max_length=MAX_DEFINITION_TOKENS)
tokenized_cls = predict_tokenizer([" [CLS]"] * num_definitions, return_tensors="pt")

# Append the [CLS] ids and attention mask after the padded definition, and move both to the GPU.
# NOTE(review): reading position MAX_DEFINITION_TOKENS below assumes " [CLS]" encodes to a
# single token, so that index is the [CLS] position -- TODO confirm against the saved tokenizer.
tokenized_inputs['input_ids'] = torch.cat((tokenized_inputs['input_ids'], tokenized_cls['input_ids']), dim=1).to("cuda")
tokenized_inputs['attention_mask'] = torch.cat((tokenized_inputs['attention_mask'], tokenized_cls['attention_mask']), dim=1).to("cuda")

# Add the new tokens and resize the model embedding matrix.
# `displacement` is the vocabulary size before the additions, i.e. the row of the first new token.
displacement = len(tokenizer)
num_added = tokenizer.add_tokens(fake_words_list)
if num_added != len(fake_words_list):
    # Row displacement + k is written with the embedding for fake_words_list[k]; that is only
    # the right token id if every fake word was actually added as a brand-new token.
    raise ValueError(
        "Expected to add " + str(len(fake_words_list)) + " new tokens but the tokenizer added "
        + str(num_added) + "; duplicate or already-known fake words would misalign the new embedding rows"
    )
model.resize_token_embeddings(len(tokenizer))
params = model.state_dict()

# Fill the new embedding rows batch by batch. Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    for start in range(0, num_definitions, EMBED_BATCH_SIZE):
        end = min(num_definitions, start + EMBED_BATCH_SIZE)
        outputs = predict_model(
            input_ids=tokenized_inputs['input_ids'][start:end],
            attention_mask=tokenized_inputs['attention_mask'][start:end],
            output_hidden_states=True,
        )
        # Last-layer hidden state at the position right after the padded definition
        params['transformer.wte.weight'][displacement + start: displacement + end, :] = outputs.hidden_states[-1][:, MAX_DEFINITION_TOKENS, :]
model.load_state_dict(params)

Number of total definitions: 343


<All keys matched successfully>

In [9]:
# The prediction model and tokenizer are no longer needed: drop the references
# and ask PyTorch to release its cached GPU memory.
del predict_model, predict_tokenizer
torch.cuda.empty_cache()

In [10]:
# Re-read the model weights and pull out the input embedding matrix
embedding_key = 'transformer.wte.weight'
params = model.state_dict()
embeddings = params[embedding_key]

In [11]:
# Keep only the embedding rows that belong to the newly added tokens.
# `displacement` is the vocabulary size recorded before the tokens were added, so it
# replaces the hard-coded 50257. Slicing a fresh read of the weights (instead of
# re-slicing `embeddings`) gives the same result when this cell is run more than once.
embeddings = model.state_dict()['transformer.wte.weight'][displacement:]
print(len(embeddings))

print(mu)

343
tensor([-0.0558, -0.0170, -0.0460,  ...,  0.0417, -0.0542, -0.0225],
       device='cuda:0')


In [12]:
import torch.nn as nn

mse_loss = nn.MSELoss()

print()

# Average, over the new-token embedding rows, of the MSE between `mu` and that row
total_loss = sum(mse_loss(mu, embedding_row) for embedding_row in embeddings)
print(total_loss/len(embeddings))



tensor(0.0014, device='cuda:0')


In [13]:
# Compare the vector norm of the mean new-token embedding with that of `mu`
predicted_mu = embeddings.mean(dim=0)
for mean_vector in (predicted_mu, mu):
    print(torch.linalg.vector_norm(mean_vector))

tensor(1.6543, device='cuda:0')
tensor(2.0444, device='cuda:0')
