In [1]:
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
tqdm.pandas()

import warnings
# warnings.filterwarnings(action='once')
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

  from tqdm.autonotebook import tqdm


## Read & Preprocess Data

In [2]:
df_data = pd.read_csv("../data/ROCStories/cloze_test_val__winter2018-cloze_test_ALL_val.csv")
df_data = df_data.head(100)
df_data.shape
df_data.head()

(100, 8)

Unnamed: 0,InputStoryid,InputSentence1,InputSentence2,InputSentence3,InputSentence4,RandomFifthSentenceQuiz1,RandomFifthSentenceQuiz2,AnswerRightEnding
0,138d5bfb-05cc-41e3-bf2c-fa85ebad14e2,Rick grew up in a troubled household.,"He never found good support in family, and tur...",It wasn't long before Rick got shot in a robbery.,The incident caused him to turn a new leaf.,He is happy now.,He joined a gang.,1
1,bff9f820-9605-4875-b9af-fe6f14d04256,Laverne needs to prepare something for her fri...,She decides to bake a batch of brownies.,She chooses a recipe and follows it closely.,Laverne tests one of the brownies to make sure...,The brownies are so delicious Laverne eats two...,Laverne doesn't go to her friend's party.,1
2,e8f628d5-9f97-40ed-8611-fc0e774673c4,Sarah had been dreaming of visiting Europe for...,She had finally saved enough for the trip.,She landed in Spain and traveled east across t...,She didn't like how different everything was.,Sarah then decided to move to Europe.,Sarah decided that she preferred her home over...,2
3,f5226bfe-9f26-4377-b05f-3d9568dbdec1,Gina was worried the cookie dough in the tube ...,She was very happy to find she was wrong.,The cookies from the tube were as good as from...,Gina intended to only eat 2 cookies and save t...,Gina liked the cookies so much she ate them al...,Gina gave the cookies away at her church.,1
4,69ac9b05-b956-402f-9fff-1f926ef9176b,It was my final performance in marching band.,I was playing the snare drum in the band.,We played Thriller and Radar Love.,The performance was flawless.,I was very proud of my performance.,I was very ashamed of my performance.,1


In [3]:
# add 5th sentence
df_data['InputSentence5'] = df_data.apply(
    lambda row: row['RandomFifthSentenceQuiz1'] if row['AnswerRightEnding']==1 \
    else row['RandomFifthSentenceQuiz2'],
    axis=1
)

# combine all 5 sentences into outline
df_data['outline'] = df_data['InputSentence1'] + "\n" + \
                     df_data['InputSentence2'] + "\n" + \
                     df_data['InputSentence3'] + "\n" + \
                     df_data['InputSentence4'] + "\n" + \
                     df_data['InputSentence5']

# create prompt
df_data['prompt'] = "Generate a multi-paragaph story using the following outline:\nOUTLINE:\n" + \
                    df_data['outline'] + "\n" + "STORY:"

In [4]:
df_data.head(2)

Unnamed: 0,InputStoryid,InputSentence1,InputSentence2,InputSentence3,InputSentence4,RandomFifthSentenceQuiz1,RandomFifthSentenceQuiz2,AnswerRightEnding,InputSentence5,outline,prompt
0,138d5bfb-05cc-41e3-bf2c-fa85ebad14e2,Rick grew up in a troubled household.,"He never found good support in family, and tur...",It wasn't long before Rick got shot in a robbery.,The incident caused him to turn a new leaf.,He is happy now.,He joined a gang.,1,He is happy now.,Rick grew up in a troubled household.\nHe neve...,Generate a multi-paragaph story using the foll...
1,bff9f820-9605-4875-b9af-fe6f14d04256,Laverne needs to prepare something for her fri...,She decides to bake a batch of brownies.,She chooses a recipe and follows it closely.,Laverne tests one of the brownies to make sure...,The brownies are so delicious Laverne eats two...,Laverne doesn't go to her friend's party.,1,The brownies are so delicious Laverne eats two...,Laverne needs to prepare something for her fri...,Generate a multi-paragaph story using the foll...


In [5]:
print(df_data['outline'][0])

Rick grew up in a troubled household.
He never found good support in family, and turned to gangs.
It wasn't long before Rick got shot in a robbery.
The incident caused him to turn a new leaf.
He is happy now.


In [6]:
print(df_data['prompt'][0])

Generate a multi-paragaph story using the following outline:
OUTLINE:
Rick grew up in a troubled household.
He never found good support in family, and turned to gangs.
It wasn't long before Rick got shot in a robbery.
The incident caused him to turn a new leaf.
He is happy now.
STORY:


## Generate Story

In [7]:
# checkpoint_bloomz_1b7 = "bigscience/bloomz-1b7"
# checkpoint = "gpt2"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")

In [8]:
# def generate_stories(model, tokenizer, prompts, do_sample=True, min_length=30, max_new_tokens=100):
#     generated_stories = []
#     for prompt in tqdm(prompts):
#         inputs = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
#         outputs = model.generate(inputs, 
#                                  do_sample=do_sample, 
#                                  min_length=min_length, 
#                                  max_new_tokens=max_new_tokens,
#                                  top_k=50,
#                                  top_p=0.90,
#                                  # temperature=0.7,
#                                  # num_return_sequences=3
#                                  # repetition_penalty=0.9
#                                  # num_beams=5,
#                                  # no_repeat_ngram_size=3,
#                                  # early_stopping=True,
#                                 )
#         generated_stories.append(tokenizer.decode(outputs[0]))
#     return generated_stories

In [9]:
def generate_stories(model, tokenizer, prompts, do_sample=True, min_length=30, max_new_tokens=100):
    generated_stories = []
    for prompt in tqdm(prompts):
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
        output_ids = model.generate(inputs["input_ids"], 
                                 do_sample=do_sample, 
                                 min_length=min_length, 
                                 max_new_tokens=max_new_tokens,
                                 top_k=50,
                                 top_p=0.90,
                                 # temperature=0.7,
                                 num_return_sequences=3,
                                 # repetition_penalty=0.9,
                                 # num_beams=5,
                                 # no_repeat_ngram_size=3,
                                 # early_stopping=True,
                                )
        outputs = tokenizer.batch_decode(output_ids)
        
        # select best story
        gen_stories_len = [len(x) for x in outputs]
        final_story = outputs[np.argmax(gen_stories_len)]
        final_story = final_story.split("STORY:")[-1].strip()
        
        generated_stories.append(final_story)
    return generated_stories

In [10]:
# outline = df_data['prompt'][0]
# stories = generate_stories(
#     model, 
#     tokenizer, 
#     [outline], 
#     do_sample=True, min_length=50, max_new_tokens=500
# )

In [11]:
# for x in stories:
#     print(x)

In [12]:
# gen_stories_len = [len(x) for x in stories[0]]
# final_story = stories[0][np.argmax(gen_stories_len)]
# final_story = final_story.split("STORY:")[-1].strip()
# print(final_story)

In [13]:
checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")

df_data[f'gen_stories_{checkpoint}'] = generate_stories(
    model, 
    tokenizer, 
    df_data['prompt'].tolist(), 
    do_sample=True, min_length=50, max_new_tokens=500
)

  0%|          | 0/100 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [14]:
checkpoint = "bigscience/bloomz-1b7"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype="auto", device_map="auto")
checkpoint = checkpoint.split('/')[-1]
df_data[f'gen_stories_{checkpoint}'] = generate_stories(
    model, 
    tokenizer, 
    df_data['prompt'].tolist(), 
    do_sample=True, min_length=50, max_new_tokens=500
)

  0%|          | 0/100 [00:00<?, ?it/s]

In [15]:
df_data.to_parquet("../data/generated_stories/generated_stories_non_gpt3_100.parquet")