## CS 224N - Finetuning on Example Sentences
Finetune on the downstream task of sample sentences.

In [1]:
import torch
print("Using GPU: " + str(torch.cuda.is_available()))

Using GPU: True


## Get all WinoDict Words
Get all the words from the WinoDict dataset.

In [2]:
import pandas as pd
first_set = pd.read_csv("winodict/prob1_of_5.csv")
real_words, winodict_words = first_set['lemma'].tolist(), first_set['fake_lemma'].tolist()

## Function to Get WordNet Definition of WinoDict Word
Using WordNet to get the WordNet definition of a fake word.

In [3]:
from nltk.corpus import wordnet as wn

def find_definition(word):
    definition = ""
    for synset in wn.synsets(word):
        definition += synset.definition() + ". "
    return definition

## Load in G2G/R2G Model for Predicting Embeddings
This will be used to initialize our initial word embeddings for the fake words.

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# G2G/R2G used to Predict
predict_model = GPT2LMHeadModel.from_pretrained("weights/G2GNext1").to("cuda")
predict_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
predict_tokenizer.add_tokens(['[CLS]'])

# GPT-2 Model and Tokenizer to be fine-tuned
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to("cuda")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

# Standalone tokenizer
ORIG_TOKENIZER = GPT2Tokenizer.from_pretrained('gpt2-medium')

## Create 1:1 Fake Word to Real Word Ratio
Make sure there is established parity.

In [5]:
# Set for all real and fake words seen
real_words_seen = set()
fake_words_seen = set()

# Final dataset -- should have 1:1 for each
final_real_words = []
final_fake_words = []

# Iterate through all combinations
for i in range(len(real_words)):
    # Case 1: not already seen
    if real_words[i] not in real_words_seen and winodict_words[i] not in fake_words_seen:
        # Add to seen sets
        fake_words_seen.add(winodict_words[i])
        real_words_seen.add(real_words[i])
        
        # Add to arrays
        final_real_words.append(real_words[i])
        final_fake_words.append(winodict_words[i])
    
    # Case 2: real word has not been seen, fake word has
    elif real_words[i] not in real_words_seen and winodict_words[i] in fake_words_seen:
        # Modify until new word in vocabulary
        curr_word = winodict_words[i]
        while (curr_word in fake_words_seen):
            curr_word += "z"
        
        # Add to seen sets
        fake_words_seen.add(curr_word)
        real_words_seen.add(real_words[i])
        
        # Add to arrays
        final_real_words.append(real_words[i])
        final_fake_words.append(curr_word)

# Quick sanity check
assert(len(final_real_words) == len(final_fake_words))

## Save the Real and Fake Word Pairings
Take the two arrays and save them together.

In [6]:
import csv
with open('datasets/realtofake.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    writer.writerow(["Real", "Fake"])
    for i in range(len(final_real_words)):
        writer.writerow([final_real_words[i], final_fake_words[i]])

## Collect All Definitions for the 1:1 Dataset
Gather all of the definitions together for each of the real words.

In [7]:
# Gather all of the WordNet definitions
final_definitions = []
for real_word in final_real_words:
    final_definitions.append(find_definition(real_word))

# Quick sanity check
assert(len(final_definitions) == len(final_fake_words))
assert(len(final_definitions) == len(final_real_words))

## Batch Add All Embeddings
For each of the corresponding real and fake words, plus definitions, create an embedding using G2G/R2G.

In [8]:
# Helpful Debug Message
print("Number of total definitions: " + str(len(final_definitions)))

# Tokenizing all of the definitions at once
predict_tokenizer.pad_token = tokenizer.eos_token
tokenized_inputs = predict_tokenizer(final_definitions, return_tensors="pt", padding='max_length', truncation=True, max_length=511)
tokenized_cls = predict_tokenizer([" [CLS]"] * len(final_definitions), return_tensors="pt")
tokenized_inputs['input_ids'] = torch.cat((tokenized_inputs['input_ids'], tokenized_cls['input_ids']), dim=1).to("cuda")

# Add the new tokens and resize the model embeddings matrix
displacement = len(tokenizer)
tokenizer.add_tokens(final_fake_words)
model.resize_token_embeddings(len(tokenizer))
params = model.state_dict()

# Adding new embeddings in a range of 4
for i in range(0, len(final_definitions), 4):
    outputs = predict_model(input_ids=tokenized_inputs['input_ids'][i:min(len(final_definitions), i + 4)], output_hidden_states=True)
    params['transformer.wte.weight'][displacement + i: displacement + min(len(final_definitions), i + 4),:] = outputs.hidden_states[-1][:,511,:].detach().clone()
model.load_state_dict(params)

Number of total definitions: 343


<All keys matched successfully>

## Sanity Check for Maintaining Index Parity
Make sure that all of the embeddings, definitions, and fake words are in the same place!

In [9]:
# Sanity Check #1: at index 41
print(final_fake_words[41])
print(final_definitions[41])
print(tokenizer("prodittionzzz", return_tensors="pt"))
print(tokenizer.decode(displacement + 41))
print(displacement + 41)

# Sanity Check #1: at the last index
print(final_fake_words[-1])
print(final_definitions[-1])
print(tokenizer("nturyzzz", return_tensors="pt"))
print(tokenizer.decode(displacement + len(final_fake_words) - 1))

prodittionzzz
a boxlike container in a piece of furniture; made so as to slide in and out. the person who writes a check or draft instructing the drawee to pay someone else. an artist skilled at drawing. 
{'input_ids': tensor([[50298]]), 'attention_mask': tensor([[1]])}
prodittionzzz
50298
nturyzzz
having or showing feelings of unwarranted importance out of overbearing pride. 
{'input_ids': tensor([[50599]]), 'attention_mask': tensor([[1]])}
nturyzzz


## Ensure Correct Tokenization of Sample Sentences
Make sure all of the sample sentences are correctly tokenized.

In [10]:
import json
from collections import defaultdict

# Load in all of the example sentences
example_sentences = json.load(open('datasets/example_sentences.json'))

# Collecting all of the sentences and words used for training
fake_word_to_sentences = defaultdict(list)

# Iterate through each example sentence
for word in example_sentences:
    for sentence in example_sentences[word]:
        # Get the corresponding fake word and craft the new sentence
        fake_word = final_fake_words[final_real_words.index(word)]
        fake_sentence = sentence.replace(word, fake_word) 
        
        # Check if tokenized correctly
        tokens = tokenizer(fake_sentence)['input_ids']
        word_token = tokenizer(fake_word)['input_ids'][0]
        if (word_token in tokens):
            fake_word_to_sentences[fake_word].append(fake_sentence)


## Dataset Class for Example Sentences
Key: makes sure that the training sentences are already a tensor of the tokenized sentences.

In [11]:
from torch.utils.data import Dataset, DataLoader

class ExampleSentencesData(Dataset):

    def __init__(self, training_sentences, tokenizer):
        self.training_sentences = tokenizer(training_sentences, return_tensors="pt", padding='max_length', truncation=True)['input_ids'].to("cuda")

    def __len__(self):
        return len(self.training_sentences)

    def __getitem__(self, idx):
        return {
            'sentence': self.training_sentences[idx]
        }

## Training Loop
Creating the training loader and training the model

In [13]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Reset the padding token and set up parameters for the training loader
tokenizer.pad_token = tokenizer.eos_token

# Set up Adam optimizer and parameters for training loader
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_params = {
    'batch_size': 2,        
    'shuffle': False,        
    'num_workers': 0
}

# Zero out non embedding layers by setting require_grad = false
params = model.state_dict()
for param in params:
    if (param != "transformer.wte.weight"):
        params[param].requires_grad = False
model.load_state_dict(params)

# Iterate through each of the real words
embedding_mask = torch.zeros(model.transformer.wte.weight.shape).to("cuda")
for key in tqdm(fake_word_to_sentences):
    # Set up our dataset and our training oader
    special_word_dataset = ExampleSentencesData(fake_word_to_sentences[key], tokenizer)
    training_loader = DataLoader(special_word_dataset, **train_params)
    
    # Enumerate all examples in our training loader
    for _ in range(2):
        for j, data in enumerate(training_loader, 0):
            optimizer.zero_grad()

            # Get the tokens for each of the words in the fake sentence
            #sentence_tokens = tokenizer(fake_word_to_sentences[key], return_tensors="pt", padding='max_length', truncation=True)['input_ids']
            #cuda_sentence_tokens = sentence_tokens.to('cuda')
            
            # Run it through the actual model and calculate the loss
            outputs = model(input_ids=data['sentence'], labels=data['sentence'])
            outputs.loss.backward()
            
            # Zero out gradient layers
            embedding_mask[tokenizer(key)['input_ids'][0]] = 1
            model.transformer.wte.weight.grad = model.transformer.wte.weight.grad * embedding_mask
            embedding_mask[tokenizer(key)['input_ids'][0]] = 0
            
            # Take a step with the optimizer
            optimizer.step()
        


  0%|‚ñè                                                                    | 1/303 [00:11<58:11, 11.56s/it]


KeyboardInterrupt: 

In [14]:
model.save_pretrained('weights/G2G-Finetuned-2-Epochs')
tokenizer.save_pretrained('weights/G2G-Finetuned-2-Epochs-T')

('weights/G2G-Finetuned-2-Epochs-T/tokenizer_config.json',
 'weights/G2G-Finetuned-2-Epochs-T/special_tokens_map.json',
 'weights/G2G-Finetuned-2-Epochs-T/vocab.json',
 'weights/G2G-Finetuned-2-Epochs-T/merges.txt',
 'weights/G2G-Finetuned-2-Epochs-T/added_tokens.json')

In [None]:
quit()

In [None]:
import time
time.sleep(99999)