## Produce continuations

### Import packages

In [1]:
import os
import pandas as pd
import numpy as np
import json
import torch
import pickle 

import string
import re

### Load and preprocess the data


In [2]:
## Load the prompts used in the study.
df = pd.read_excel("../data/prompts.xlsx")

## Collect the distinct verbs in alphabetical order
## (value_counts() leaves out missing values, so only real verbs end up in the list).
verb_list = sorted(df["verb"].value_counts().index)
print("all verbs used in the study:", verb_list)

all verbs used in the study: ['bewundern', 'enttäuschen', 'faszinieren', 'hassen ', 'inspirieren', 'respektieren', 'schockieren', 'verabscheuen']


In [3]:
## Load the prompts used for hyperparameter testing.
## Only `df_hptest` is bound here: a chained assignment (`df_hptest = df = ...`) would
## replace the study data in `df`, and the study dictionary built from `df` would stay empty.
df_hptest = pd.read_excel("../data/prompts_for_hyperparameter_testing.xlsx")

## Collect the distinct verbs in alphabetical order
## (value_counts() leaves out missing values).
verb_list_hptest = sorted(df_hptest["verb"].value_counts().index)
print("all verbs used for parameter testing:", verb_list_hptest)

all verbs used for parameter testing: ['Mitleid mit DP haben', 'amüsieren', 'beneiden', 'entzücken', 'fürchten', 'langeweilen ', 'stören', 'vergöttern']


In [4]:
## create a dictionary with all the data:

def create_data_dict(dataset1, verblist):
    """
    Builds a nested dictionary from the prompt dataset, keyed by verb.

    To be called once per model (GPT-2 etc.); the "model_generation_*" lists are filled later
    by the decoding functions.

    Args:
        dataset1: DataFrame with the columns "verb", "prompt condition", "prompt", "verb_type",
            "gender-order", "sentiment" and "human continuations".
        verblist: iterable of verbs that become the keys of the outer dictionary.
            Rows whose verb is not in this list are ignored.

    Returns:
        dict mapping each verb to a dict of lists. The lists copied from the dataset
        ("prompt_condition", "prompt", "arg_structure", "gender_order", "sentiment",
        "human_reference") are index-aligned: position i in each of them comes from the same row.
        "human_reference_selected", "generation_length" and the "model_generation_*" lists
        are created empty.
    """
    data_dict = {
        verb: {
            "prompt_condition": [],
            "prompt": [],
            "arg_structure": [],
            "gender_order": [],  # order of male-female arguments in the prompt
            "sentiment": [],
            "human_reference": [],  # only the continuation (i.e. excluding the prompt)
            "human_reference_selected": [],  # for eval. metrics that can only handle one reference per prediction
            "generation_length": [],
            # Decoding procedures:
            "model_generation_diversebeam": [],
            "model_generation_nucleus_sampling": [],
            "model_generation_typical_sampling": [],
        }
        for verb in verblist
    }

    # (key in the data dict, column in the dataset)
    column_for_key = (
        ("prompt_condition", "prompt condition"),
        ("prompt", "prompt"),
        ("arg_structure", "verb_type"),
        ("gender_order", "gender-order"),
        ("sentiment", "sentiment"),
        ("human_reference", "human continuations"),
    )

    for _, row in dataset1.iterrows():
        # Direct lookup instead of comparing the row against every verb; this also makes sure
        # a verb that is listed twice in verblist does not get every row appended twice.
        verb_entry = data_dict.get(row["verb"])
        if verb_entry is None:
            continue
        for key, column in column_for_key:
            verb_entry[key].append(row[column])

    return data_dict

In [5]:
## Data dict for the study: one entry per verb in `verb_list`, filled from `df`.
## The model generations are added to it further below.
gendict = create_data_dict(dataset1=df, verblist=verb_list)

In [6]:
## Data dict for hyperparameter testing: one entry per verb in `verb_list_hptest`,
## filled from `df_hptest`. The model generations are added to it further below.
hptest_dict =  create_data_dict(dataset1=df_hptest, verblist=verb_list_hptest)

### Load models and generate continuations (for different decoding strategies)

- Load only one model at a time: generate, save the result (with pickle, see below), then restart the kernel before switching to the next model (otherwise the generations of the different models all end up in the same dictionary)

- If dealing with a lot of data: move the model (and the tokenized inputs) to the device

In [8]:
## Uncomment the one that should be used (only one model at a time)

#### GPT-2: https://huggingface.co/dbmdz/german-gpt2
# AutoModelForCausalLM is used instead of the deprecated AutoModelWithLMHead;
# for a GPT-2 checkpoint it loads the same causal language model.
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("dbmdz/german-gpt2")
model = AutoModelForCausalLM.from_pretrained("dbmdz/german-gpt2")#.to("mps")

#### mGPT: https://huggingface.co/ai-forever/mGPT 
# NOTE(review): the model id below uses the "sberbank-ai" namespace while the link uses
# "ai-forever" -- presumably the same repository under its former name; verify before use.
#from transformers import AutoTokenizer, AutoModelForCausalLM
#model = AutoModelForCausalLM.from_pretrained("sberbank-ai/mGPT")
#tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/mGPT")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
### create the prompt_list (needed as input for the models)
def extract_prompts(datadict):
    """
    Collects the distinct prompts of the datadict (i.e., all the data).

    Args:
        datadict: dict mapping each verb to a dict that holds a "prompt" list.

    Returns:
        list in which each prompt appears exactly once, ordered by first appearance
        (verbs in dictionary order, prompts in list order).
    """
    # dict.fromkeys keeps the first occurrence of every key in insertion order, so the
    # de-duplication is a single pass instead of a list search for every prompt.
    return list(dict.fromkeys(
        prompt
        for verb_data in datadict.values()
        for prompt in verb_data["prompt"]
    ))

#### HYPERPARAMETERS

- **results after hyperparameter testing with automatic evaluation metrics:**
    - **diverse beam search:** best config (in 4/5 metrics): beam size =10 and diversity penalty = 0.7
    - **nucleus sampling:** best config (in 3/5 metrics): top p = 0.85 with temperature = 0.7 
    - **typical sampling:** best config (in 5/5 metrics): typical p = 0.9 with temperature = 0.7

In [11]:
GENERATION_LENGTH = 45
# Length limit for the generations; handed to `model.generate` as `max_length` by the decoding
# functions below. In Hugging Face transformers `max_length` counts the prompt tokens as well
# as the newly generated tokens. The generations are additionally cut off after the first
# full stop in a later step.

NO_REPEAT_N_GRAM_SIZE = 2  
# -> n-gram penalties as introduced by Paulus et al. (2017) and Klein et al. (2017): 
# makes sure that no n-gram of this size appears twice, by setting the probability of every
# next word that would create an already seen n-gram to 0.

# Diverse beam search (best configuration in hyperparameter testing):
BEAM_SIZE = 10
DIVERSITY_PENALTY = 0.7

# Nucleus (top-p) sampling (best configuration in hyperparameter testing):
P = 0.85

# (Local) typical sampling (best configuration in hyperparameter testing):
TYPICAL_P = 0.9

# Temperature shared by both sampling methods:
TEMPERATURE = 0.7 

# Seed PyTorch's random number generator so that the sampling-based generations are reproducible.
# NOTE(review): the seed is set only once here, so reproducing the samples presumably also
# requires running the generation cells in the same order -- confirm.
torch.manual_seed(23)

<torch._C.Generator at 0x12ec218d0>

#### DECODING FUNCTIONS

In [13]:
# Shared helpers for the three decoding functions below.
def _generate_continuations(promptlist, model, tokenizer, **decoding_kwargs):
    """
    Lets the model generate one text per prompt.

    Args:
        promptlist: list of prompt strings.
        model: model object providing `generate()` and a `device` attribute.
        tokenizer: the tokenizer belonging to the model.
        **decoding_kwargs: strategy-specific arguments that are passed on to `model.generate()`.

    Returns:
        dict mapping each prompt to the decoded text of the first returned sequence
        (special tokens removed).
    """
    generation_by_prompt = {}
    for prompt in promptlist:
        # The tokenized inputs have to be on the same device as the model.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output_ids = model.generate(**inputs,
                                    max_length=GENERATION_LENGTH,
                                    no_repeat_ngram_size=NO_REPEAT_N_GRAM_SIZE,
                                    early_stopping=True,
                                    **decoding_kwargs)
        generation_by_prompt[prompt] = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generation_by_prompt


def _attach_generations(datadict, key, generation_by_prompt):
    """
    Writes the generations into datadict[verb][key], index-aligned with datadict[verb]["prompt"].

    Each prompt is looked up directly, so entry i of the generation list always belongs to
    prompt i (and thus to human reference i), regardless of the order in which the prompts
    are stored. The lists are replaced rather than extended, so repeating a call does not
    create duplicates.

    Raises:
        ValueError: if a prompt of the datadict has no generation; nothing is modified then.
    """
    missing = [prompt
               for verb_data in datadict.values()
               for prompt in verb_data["prompt"]
               if prompt not in generation_by_prompt]
    if missing:
        raise ValueError(f"no generation available for {len(missing)} prompt(s), e.g. {missing[0]!r}")

    for verb_data in datadict.values():
        prompts = verb_data["prompt"]
        verb_data[key] = [generation_by_prompt[prompt] for prompt in prompts]
        verb_data["generation_length"] = [GENERATION_LENGTH] * len(prompts)

    return datadict


# Diverse beam 
def generate_diversebeam(datadict, promptlist, model, tokenizer):
    """
    Takes the datadict (i.e., all the data), the prompt list, the model and the tokenizer as input. 
    Generates the model continuations for the individual prompts from the list 
    and adds them to the datadict so that every human reference has a respective model prediction. 
    Returns the dictionary with the added **diverse beam** model generations.

    The generations are stored under "model_generation_diversebeam"; "generation_length" is
    filled with GENERATION_LENGTH (one entry per prompt). Calling the function again replaces
    the stored generations instead of duplicating them.

    Raises a ValueError if the datadict contains a prompt that is not part of promptlist.
    """
    generation_by_prompt = _generate_continuations(
        promptlist, model, tokenizer,
        num_beams=BEAM_SIZE,
        num_beam_groups=BEAM_SIZE,  # num_beam_groups must be <= num_beams
        do_sample=False,
        diversity_penalty=DIVERSITY_PENALTY)

    return _attach_generations(datadict, "model_generation_diversebeam", generation_by_prompt)


# Nucleus Sampling
def generate_nucleus_sampling(datadict, promptlist, model, tokenizer):
    """
    Does the same as generate_diversebeam but with the **nucleus-sampling (top-p)** decoding procedure.
    The generations are stored under "model_generation_nucleus_sampling".
    """
    generation_by_prompt = _generate_continuations(
        promptlist, model, tokenizer,
        do_sample=True,
        top_p=P,
        top_k=0,  # here k needs to be 0 (no top-k filtering on top of top-p)
        temperature=TEMPERATURE)

    return _attach_generations(datadict, "model_generation_nucleus_sampling", generation_by_prompt)


# (Local) Typical Sampling
def generate_typical_sampling(datadict, promptlist, model, tokenizer):
    """
    Does the same as generate_diversebeam but with the **typical sampling (typical p)** decoding procedure.
    The generations are stored under "model_generation_typical_sampling".
    """
    generation_by_prompt = _generate_continuations(
        promptlist, model, tokenizer,
        do_sample=True,
        typical_p=TYPICAL_P,
        top_k=0,  # no top-k filtering on top of typical-p
        temperature=TEMPERATURE)

    return _attach_generations(datadict, "model_generation_typical_sampling", generation_by_prompt)
    

    

#### GENERATE...
- call the function to produce the model generations and thereby add them into the data_dict
- remember: only one model at a time!

In [24]:
## hyperparameter testing:

# first get the prompts
#prompt_list_hptest = extract_prompts(datadict=hptest_dict)

# call the decoding functions 
#hptest_dict = generate_diversebeam(datadict=hptest_dict, promptlist=prompt_list_hptest, 
#                                model =model, tokenizer=tokenizer)
#hptest_dict = generate_nucleus_sampling(datadict=hptest_dict, promptlist=prompt_list_hptest, 
#                                     model =model, tokenizer=tokenizer)
#hptest_dict = generate_typical_sampling(datadict=hptest_dict, promptlist=prompt_list_hptest, 
#                                     model =model, tokenizer=tokenizer)

In [22]:
## Generate the continuations for all prompts of the study.

# Collect every distinct prompt once.
prompt_list = extract_prompts(gendict)

# Run the three decoding strategies one after the other;
# each call stores its generations in gendict and returns it.
gendict = generate_diversebeam(gendict, prompt_list, model, tokenizer)
gendict = generate_nucleus_sampling(gendict, prompt_list, model, tokenizer)
gendict = generate_typical_sampling(gendict, prompt_list, model, tokenizer)

#### Cut off the model generations after the first full stop:

In [23]:
def cut_sentence(sentence):
    """
    Input = str (the sentence that needs to be modified).
    Cuts off the sentence after the first full stop and returns this truncated sentence
    (including the full stop).
    If the sentence contains no full stop, it is returned unchanged.
    """
    # partition() splits at the first "."; without a "." the separator part is empty and
    # the head is the whole sentence, so nothing gets lost.
    head, full_stop, _ = sentence.partition(".")

    return head + full_stop

## applying this to all generations of the datadict:
def cut_generations(datadict):
    """
    Input = dictionary (containing the model generations that need to be cut off).
    Applies the cut_sentence() function to all the model generations of all three decoding
    strategies and returns the dictionary.

    The generation lists are modified in place. A strategy that has not been generated yet
    (empty list) is simply left empty.
    """
    generation_keys = ("model_generation_diversebeam",
                       "model_generation_nucleus_sampling",
                       "model_generation_typical_sampling")

    for verb_data in datadict.values():
        for key in generation_keys:
            generations = verb_data[key]
            # Slice assignment keeps the same list object while replacing its entries.
            generations[:] = [cut_sentence(generation) for generation in generations]

    return datadict


In [24]:
# Hyperparameter-testing counterpart (uncomment when working with hptest_dict):
#hptest_dict = cut_generations(hptest_dict)

# Truncate every study generation after its first full stop.
gendict = cut_generations(gendict)

In [14]:
#gendict

#### Save the generations 

In [16]:
### Save the data dict (prompts, human references and model generations) as a pickle file.
# NOTE: the file name is fixed to gpt2.pkl -- change it when a different model was loaded above,
# otherwise previously saved results are overwritten.
with open("../data/pkl_files/gpt2.pkl", "wb") as pickle_file:
    pickle.dump(gendict, pickle_file)

In [13]:
### Load the saved data dicts of both models.
# pickle can execute arbitrary code while loading, so only open files created by this notebook.
with open("../data/pkl_files/mgpt.pkl", "rb") as mgpt_file:
    mgpt_data = pickle.load(mgpt_file)

with open("../data/pkl_files/gpt2.pkl", "rb") as gpt2_file:
    gpt2_data = pickle.load(gpt2_file)