# Abstractive summarization with T5
This notebook runs a baseline T5 summarization model for our final project.
The notebook is [here](https://github.com/datasci-w266/2023-spring-main/blob/master/materials/lesson_notebooks/lesson_7_summarization_QA.ipynb)


In [None]:
# Install the packages this notebook imports below (tokenizer backend, model
# library, and the ROUGE metric with its loader) into the kernel's environment.
# NOTE(review): no versions are pinned, so a later re-run may pull different
# releases than the ones that produced the saved outputs — consider pinning.
%pip install -q sentencepiece
%pip install -q transformers
%pip install -q rouge_score
%pip install -q evaluate

In [None]:
import evaluate
from google.colab import output
import pprint
import timeit
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import zipfile
from io import BytesIO, StringIO
import pprint
import operator
# NOTE: this rebinds the name `pprint` from the module (imported twice above)
# to the function, so `pprint.pprint(...)` no longer works after this line.
from pprint import pprint
from google.colab import drive
# Mount Google Drive at /content/drive so the project data files are readable.
drive.mount('/content/drive')
# Play a short audio clip in the browser (presumably an audible
# "cell finished" notification).
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

In [None]:
# additional packages
# Re-import the `pprint` module: the earlier `from pprint import pprint` bound
# the name to the function, while the generation loop calls `pprint.pprint(...)`.
import pprint

In [None]:
# Load the training metadata table (one row per movie: title, IMDb id, paths to
# the annotated/lemmatised script files, plot outline, and extracted text).
# The path is absolute: Drive is mounted at /content/drive, and a relative
# 'content/drive/...' path only resolves when the working directory is '/'.
train_df = pd.read_csv('/content/drive/MyDrive/w266/final_proj_data/train_wiki_bert_lemma.csv')
train_df.head()

Unnamed: 0.1,Unnamed: 0,title,imdb_id,bert_data,lemma_data,plot_outline,bert_dialog,bert_text
0,221,Three Billboards Outside Ebbing Missouri,5027774,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"THREE BILLBOARDS OUTSIDE EBBING, MISSOURI is a...","by\n Martin McDonagh\n You Red Welby?\n Yes, ...","MILDRED HAYES, a woman in her early 50's, dri..."
1,346,Candle to Water,2387411,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,To present five different contemporary stories...,\n Written by\n Nick Green\n Shooting draft (...,"The car is stuck in rush-hour traffic, which ..."
2,277,1917,8579674,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"April 1917, the Western Front. Two British sol...",1917\n Written by\n Sam Mendes\n &\n Krysty W...,The following script takes place in real time...
3,420,Friends with Benefits,1632708,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Jamie Rellis (Mila Kunis) is a New York City h...,No. Not even close.\n I know. I'll be there.\...,He almost steps on a dog sleeping on the floo...
4,315,Anonymous,1521197,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"Edward De Vere, Earl of Oxford, is presented a...","\n Written by\n John Orloff\n up, etc...\n th...",TITLES BEGIN over the SOUNDS of city traffic....


In [None]:
# Draw a small, reproducible random sample of rows to use for the baseline run.
SAMPLE_SIZE = 30
SAMPLE_SEED = 266

subset_train_df = train_df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED, axis=0)
subset_train_df.shape

(30, 8)

In [None]:
# List the current working directory (used here to check that the mounted
# `drive/` folder is visible).
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [None]:
# Repeats the Drive mount already done in the imports cell; when Drive is
# already mounted this only prints a notice (see the output below).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the lemmatised script of every sampled movie and keep only its first
# MAX_LEMMA_WORDS whitespace-separated tokens, joined by single spaces.
# `lemma_text` ends up with one string per row of `subset_train_df`, in the
# frame's row order.
MAX_LEMMA_WORDS = 1000

lemma_text = []

for raw_lemma in subset_train_df['lemma_data']:
    with open(raw_lemma, 'r', encoding='utf-8') as file:
        # str.split() with no argument already treats newlines as whitespace.
        # Deleting '\n' first (as the previous version did) fused the last
        # word of a line with the first word of the next one.
        words = file.read().split()
    text_data = ' '.join(words[:MAX_LEMMA_WORDS])
    # Progress log: file path, truncated length in characters, and a preview.
    print(raw_lemma)
    print(len(text_data))
    print(text_data[:60])
    print(50*'*')
    lemma_text.append(text_data)
# Lemma files live under:
# /content/drive/MyDrive/W266_Movie_Data/raw_text_lemmas/raw_text_lemmas

/content/drive/MyDrive/W266_Movie_Data/raw_text_lemmas/raw_text_lemmas/The Guard_1540133_lemmas.txt
4575
the guard by JOHN michael mcdonagh final shoot draft - conta
**************************************************
/content/drive/MyDrive/W266_Movie_Data/raw_text_lemmas/raw_text_lemmas/Batman Year One_1672723_lemmas.txt
4568
batman year one by Frank Miller sign in sign - up welcome ! 
**************************************************
/content/drive/MyDrive/W266_Movie_Data/raw_text_lemmas/raw_text_lemmas/Inherent Vice_1791528_lemmas.txt
4456
inherent vice screenplay by Paul Thomas Anderson base on the
**************************************************
/content/drive/MyDrive/W266_Movie_Data/raw_text_lemmas/raw_text_lemmas/Colombiana_1657507_lemmas.txt
4333
colombiana written by Robert Mark kamen &amp ; luc besson Au
**************************************************
/content/drive/MyDrive/W266_Movie_Data/raw_text_lemmas/raw_text_lemmas/Barney s Version_1423894_lemmas.txt
4500
barney 's

In [None]:
# Number of texts collected; should equal the number of sampled rows.
len(lemma_text)

30

# Functions to get the texts

In [None]:
def get_movie_title(script_txt_file):
    '''Return the part of a script file name that precedes its first underscore.

    For names shaped like "<title>_<id>_<suffix>.txt" this is the movie title.
    A name without any underscore is returned unchanged.
    '''
    title, _, _ = script_txt_file.partition('_')
    return title

def get_script_length(file_path, script_txt_file):
    '''Return the number of lines in a script file.

    The file location is `file_path` and `script_txt_file` concatenated as
    strings, so `file_path` must already end with a path separator.
    '''
    script_path = ''.join([str(file_path), str(script_txt_file)])
    with open(script_path, 'r') as script_file:
        # Count lines one at a time instead of materialising them in a list.
        line_count = sum(1 for _ in script_file)
    return line_count

def read_script(file_path, script_txt_file):
    '''Read a script file and return its full contents as one string.

    The file location is `file_path` and `script_txt_file` concatenated as
    strings, so `file_path` must already end with a path separator.
    '''
    # Use a context manager so the handle is closed once the text is read;
    # the previous version left the file open.
    with open(str(file_path) + str(script_txt_file), 'r') as script_file:
        return script_file.read()

def count_script_elements(file_path, script_txt_file):
    '''Count how often each script element label occurs in an annotated script.

    The label of a line is everything before its first ':' (for example
    dialog, text, speaker_heading, scene_heading). A line without ':' is
    counted under its own full text, trailing newline included.

    Returns a dict mapping label -> number of lines carrying that label.
    '''
    script_path = ''.join([str(file_path), str(script_txt_file)])
    label_counts = {}
    with open(script_path, 'r') as script_file:
        for raw_line in script_file:
            label, _, _ = raw_line.partition(':')
            label_counts[label] = label_counts.get(label, 0) + 1
    return label_counts

def identify_characters(file_path, script_txt_file):
    '''Count speaking turns per character in an annotated script.

    Every line whose label (the text before the first ':') contains
    'speaker_heading' is treated as a character cue. The cue text is cleaned
    up (whitespace, lowercase letters, ' (O.S.)' marker, trailing
    punctuation) and tallied.

    The file location is `file_path` and `script_txt_file` concatenated as
    strings, so `file_path` must already end with a path separator.

    Returns a dict mapping character name -> number of speaker headings,
    restricted to characters that appear more than once. Also prints the
    number of distinct names before and after that filtering.
    '''
    has_letter = re.compile('[a-zA-Z]')
    speaker_heading_dict = {}
    with open(str(file_path) + str(script_txt_file), 'r') as script_file:
        for line in script_file:
            fields = line.split(':')

            # if the script element is 'speaker_heading' then that is a character
            if 'speaker_heading' not in fields[0]:
                continue

            # a heading label without any ':' has no name field; skip it
            # instead of failing with an IndexError
            if len(fields) < 2:
                continue

            # some speaker_headings do not contain character names
            name_field = fields[1]
            if has_letter.search(name_field) is None:
                continue

            # remove leading and trailing whitespace (including the newline)
            character = name_field.strip()

            # remove text that is not uppercase
            character = ''.join(ch for ch in character if not ch.islower())

            # remove (O.S.) off screen from character name
            character = character.replace(' (O.S.)', '')

            # remove trailing punctuation
            character = character.rstrip('.').rstrip('?').rstrip('!')

            ##### NEED TO ADD LOGIC TO DEAL WITH CONTINUOUS, CONTINUED, and CONT'D #####

            speaker_heading_dict[character] = speaker_heading_dict.get(character, 0) + 1

    # remove characters that only have one speaking line
    character_dict = {k: v for k, v in speaker_heading_dict.items() if v > 1}
    print(f'character_dict length before removing single speaking lines: {len(speaker_heading_dict)}')
    print(f'character_dict length after removing single speaking lines: {len(character_dict)}')

    return character_dict

# Look into one raw text


In [None]:
# Attach the truncated lemma texts as a new column. `lemma_text` was built by
# walking the sampled frame in row order, so the list lines up with the rows.
subset_train_df['lemmas_raw_text'] = lemma_text
subset_train_df.head()


Unnamed: 0.1,Unnamed: 0,title,imdb_id,bert_data,lemma_data,plot_outline,bert_dialog,bert_text,lemmas_raw_text
168,118,The Guard,1540133,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Sergeant Gerry Boyle is a small-town Irish cop...,Final Shooting Draft - Containing all revisio...,HELICOPTER SHOT -- a red car speeds through t...,the guard by JOHN michael mcdonagh final shoot...
129,320,Batman Year One,1672723,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Two men come to Gotham City: Bruce Wayne after...,\n BATMAN YEAR ONE By Frank Miller\n \n Sign ...,LIGHTNING RIPS across a jet-black SKY.\n LIGH...,batman year one by Frank Miller sign in sign -...
147,497,Inherent Vice,1791528,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"During the psychedelic 60s and 70s Larry ""Doc""...",screenplay by\n Paul Thomas Anderson\n based ...,"A sweet, young woman’s voice narrates.\n half...",inherent vice screenplay by Paul Thomas Anders...
272,375,Colombiana,1657507,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,A young woman grows up to be a stone-cold assa...,\n Written by\n Robert Mark Kamen &amp; Luc B...,ON A BLACK SCREEN\n CLOSE ON A PAGE OF A LARA...,colombiana written by Robert Mark kamen &amp ;...
66,310,Barney s Version,1423894,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Take a ride through the life and memories of B...,Barney's Version\n by\n Michael Konyves\n Bas...,A tumbler of Scotch. A half empty bottle of M...,barney 's version by Michael konyve base on th...


# T5 for Generic Summarization - Model Loading

In [None]:
# Load the pretrained "t5-base" checkpoint (TensorFlow model class) and its
# tokenizer; both are downloaded on first use, as the progress output shows.
# NOTE(review): this import sits mid-notebook rather than in the imports cell.
from transformers import T5Tokenizer, TFT5ForConditionalGeneration
t5model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
t5tokenizer = T5Tokenizer.from_pretrained("t5-base")
# Play a short audio clip in the browser (presumably a "finished" notification).
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/892M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# Load and display the "t5-base" configuration. Its
# `task_specific_params["summarization"]` entry (see output) lists the
# checkpoint's own summarization settings, including the "summarize: " prefix.
from transformers import AutoConfig
config = AutoConfig.from_pretrained("t5-base")
config

T5Config {
  "_name_or_path": "t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
      "num_beams": 4,
      "pre

In [None]:
# Print the model's layer/parameter overview.
t5model.summary()


Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
Total params: 222,903,552
Trainable params: 222,903,552
Non-trainable params: 0
_________________________________________________________________


In [None]:
# T5 is steered by a task prefix; prepend it to every truncated script text.
PROMPT = 'summarize: '

subset_train_df['lemmas_raw_text_prompt'] = PROMPT + subset_train_df['lemmas_raw_text']
subset_train_df.head()




Unnamed: 0.1,Unnamed: 0,title,imdb_id,bert_data,lemma_data,plot_outline,bert_dialog,bert_text,lemmas_raw_text,lemmas_raw_text_prompt
168,118,The Guard,1540133,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Sergeant Gerry Boyle is a small-town Irish cop...,Final Shooting Draft - Containing all revisio...,HELICOPTER SHOT -- a red car speeds through t...,the guard by JOHN michael mcdonagh final shoot...,summarize: the guard by JOHN michael mcdonagh ...
129,320,Batman Year One,1672723,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Two men come to Gotham City: Bruce Wayne after...,\n BATMAN YEAR ONE By Frank Miller\n \n Sign ...,LIGHTNING RIPS across a jet-black SKY.\n LIGH...,batman year one by Frank Miller sign in sign -...,summarize: batman year one by Frank Miller sig...
147,497,Inherent Vice,1791528,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"During the psychedelic 60s and 70s Larry ""Doc""...",screenplay by\n Paul Thomas Anderson\n based ...,"A sweet, young woman’s voice narrates.\n half...",inherent vice screenplay by Paul Thomas Anders...,summarize: inherent vice screenplay by Paul Th...
272,375,Colombiana,1657507,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,A young woman grows up to be a stone-cold assa...,\n Written by\n Robert Mark Kamen &amp; Luc B...,ON A BLACK SCREEN\n CLOSE ON A PAGE OF A LARA...,colombiana written by Robert Mark kamen &amp ;...,summarize: colombiana written by Robert Mark k...
66,310,Barney s Version,1423894,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Take a ride through the life and memories of B...,Barney's Version\n by\n Michael Konyves\n Bas...,A tumbler of Scotch. A half empty bottle of M...,barney 's version by Michael konyve base on th...,summarize: barney 's version by Michael konyve...


In [None]:
# Display the frame with a fresh positional index. The result is not assigned,
# so `subset_train_df` itself keeps its original index labels.
subset_train_df.reset_index()

Unnamed: 0.1,index,Unnamed: 0,title,imdb_id,bert_data,lemma_data,plot_outline,bert_dialog,bert_text,lemmas_raw_text,lemmas_raw_text_prompt
0,168,118,The Guard,1540133,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Sergeant Gerry Boyle is a small-town Irish cop...,Final Shooting Draft - Containing all revisio...,HELICOPTER SHOT -- a red car speeds through t...,the guard by JOHN michael mcdonagh final shoot...,summarize: the guard by JOHN michael mcdonagh ...
1,129,320,Batman Year One,1672723,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Two men come to Gotham City: Bruce Wayne after...,\n BATMAN YEAR ONE By Frank Miller\n \n Sign ...,LIGHTNING RIPS across a jet-black SKY.\n LIGH...,batman year one by Frank Miller sign in sign -...,summarize: batman year one by Frank Miller sig...
2,147,497,Inherent Vice,1791528,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"During the psychedelic 60s and 70s Larry ""Doc""...",screenplay by\n Paul Thomas Anderson\n based ...,"A sweet, young woman’s voice narrates.\n half...",inherent vice screenplay by Paul Thomas Anders...,summarize: inherent vice screenplay by Paul Th...
3,272,375,Colombiana,1657507,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,A young woman grows up to be a stone-cold assa...,\n Written by\n Robert Mark Kamen &amp; Luc B...,ON A BLACK SCREEN\n CLOSE ON A PAGE OF A LARA...,colombiana written by Robert Mark kamen &amp ;...,summarize: colombiana written by Robert Mark k...
4,66,310,Barney s Version,1423894,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Take a ride through the life and memories of B...,Barney's Version\n by\n Michael Konyves\n Bas...,A tumbler of Scotch. A half empty bottle of M...,barney 's version by Michael konyve base on th...,summarize: barney 's version by Michael konyve...
5,338,264,Zootopia,2948356,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,From the largest elephant to the smallest shre...,\n Written by\n Jared Bush &amp; Phil Johnsto...,"We hear the feral, primeval sounds of a jungl...",zootopia written by Jared Bush &amp ; phil joh...,summarize: zootopia written by Jared Bush &amp...
6,255,62,Scary Stories to Tell in the Dark,3387520,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,It's 1968 in America. Change is blowing in the...,Written by\n Kevin & Dan Hageman\n Story by\n...,TO TELL IN THE DARK\n Settled on a fork of th...,scary storues to tell in the dark written by K...,summarize: scary storues to tell in the dark w...
7,349,119,The Edge of Seventeen,1878870,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"Everyone knows that growing up is hard, and li...","by\n Kelly Fremon Craig\n January 3, 2011\n I...",A murky green pond dotted with floating litte...,"bestie by Kelly fremon Craig January 3 , 2011 ...",summarize: bestie by Kelly fremon Craig Januar...
8,126,308,As Above So Below,2870612,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Miles of twisting catacombs lie beneath the st...,Written by\n John Erick Dowdle\n Drew Dowdle\...,"Camera powers up, jostles around until it set...","as above , so below written by John erick dowd...","summarize: as above , so below written by John..."
9,261,359,Celeste Jesse Forever,1405365,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Celeste and Jesse have been best friends forev...,\n \n Written by\n \n Rashida Jones &amp; Wil...,"A progression of images of CELESTE and JESSE,...",celeste and jesse forever written by rashida J...,summarize: celeste and jesse forever written b...


In [None]:
# Generate one summary per sampled script with beam search.
# Each entry of `list_of_candidates` is the list returned by `batch_decode`
# (a one-element list per prompt), not a bare string.
generation_kwargs = dict(
    max_length=256,
    min_length=94,
    no_repeat_ngram_size=3,
    num_beams=4,
)

list_of_candidates = []
for i, prompt in enumerate(subset_train_df['lemmas_raw_text_prompt']):
    print(i)
    # Tokenise a single prompt, truncating it to at most 1024 tokens.
    inputs = t5tokenizer(prompt, max_length=1024, truncation=True, return_tensors="tf")
    summary_ids = t5model.generate(inputs["input_ids"], **generation_kwargs)
    candidate = t5tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    list_of_candidates.append(candidate)
    pprint.pprint(candidate[0], compact=True)
# Play a short audio clip once the whole loop has finished.
output.eval_js('new Audio("https://upload.wikimedia.org/wikipedia/commons/0/05/Beep-09.ogg").play()')

0
("the guard's final shoot draft - contain all revision January 2010 reprisal "
 'film / element picture 21 mespil Road Dublin 4 Ireland . int . open title - '
 'day 11 overhead shot -- boyle asleep on he bed . wearing only blue-and- '
 'yellow Marks & Spencer y-front . and a white undershirt . close on -- a navy '
 'tunic jacket . with three chevron buttoned up over he paunch .')
1
('batman year one screenplay by frank miller fade in . close batman season one '
 ". bruce wayne's apartment is a claustrophobic . room of a obsessive "
 "compulsive . but he manages to find a job he'd like to do . little al, a "
 'gigantic , early middle - age black man .')
2
('inherent vice screenplay by Paul Thomas Anderson base on the novel by Thomas '
 'pynchon this script be the confidential and proprietary property of Warner '
 'bros . picture and no portion of it may be perform , distribute , reproduce '
 ', use , quote or publish without prior write permission . final shoot script '
 "August 7 , 2

In [None]:
# Store the generated summaries next to their source rows.
# (The previous version referenced the misspelled name `list_of_candidatesb`,
# which raises NameError.)
assert len(list_of_candidates) == len(subset_train_df), 'one candidate list is expected per sampled row'

subset_train_df['t5_generation'] = list_of_candidates

In [None]:
# NOTE: `head()` is not the last expression of the cell, so it displays nothing.
subset_train_df.head()
# Save the frame to the current working directory. The index is written as an
# extra unnamed column, which shows up as "Unnamed: 0.2" when the file is read
# back below.
subset_train_df.to_csv('subset_train_df_with_t5.csv')

In [None]:
# Read the saved CSV back to inspect the round trip.
# NOTE(review): the list-valued `t5_generation` column appears to come back as
# a plain string (see the quoted values in the output) — parse before reuse.
test = pd.read_csv('subset_train_df_with_t5.csv')


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,imdb_id,bert_data,lemma_data,plot_outline,bert_dialog,bert_text,lemmas_raw_text,lemmas_raw_text_prompt,t5_generation
0,168,118,The Guard,1540133,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Sergeant Gerry Boyle is a small-town Irish cop...,Final Shooting Draft - Containing all revisio...,HELICOPTER SHOT -- a red car speeds through t...,the guard by JOHN michael mcdonagh final shoot...,summarize: the guard by JOHN michael mcdonagh ...,"[""the guard's final shoot draft - contain all ..."
1,129,320,Batman Year One,1672723,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Two men come to Gotham City: Bruce Wayne after...,\n BATMAN YEAR ONE By Frank Miller\n \n Sign ...,LIGHTNING RIPS across a jet-black SKY.\n LIGH...,batman year one by Frank Miller sign in sign -...,summarize: batman year one by Frank Miller sig...,"[""batman year one screenplay by frank miller f..."
2,147,497,Inherent Vice,1791528,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"During the psychedelic 60s and 70s Larry ""Doc""...",screenplay by\n Paul Thomas Anderson\n based ...,"A sweet, young woman’s voice narrates.\n half...",inherent vice screenplay by Paul Thomas Anders...,summarize: inherent vice screenplay by Paul Th...,"[""inherent vice screenplay by Paul Thomas Ande..."
3,272,375,Colombiana,1657507,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,A young woman grows up to be a stone-cold assa...,\n Written by\n Robert Mark Kamen &amp; Luc B...,ON A BLACK SCREEN\n CLOSE ON A PAGE OF A LARA...,colombiana written by Robert Mark kamen &amp ;...,summarize: colombiana written by Robert Mark k...,"[""fabio and don luis have been together since ..."
4,66,310,Barney s Version,1423894,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Take a ride through the life and memories of B...,Barney's Version\n by\n Michael Konyves\n Bas...,A tumbler of Scotch. A half empty bottle of M...,barney 's version by Michael konyve base on th...,summarize: barney 's version by Michael konyve...,"[""barney 's version by michael konyve based on..."


In [None]:
# Show the in-memory frame, where `t5_generation` still holds Python lists.
subset_train_df.head()

Unnamed: 0.1,Unnamed: 0,title,imdb_id,bert_data,lemma_data,plot_outline,bert_dialog,bert_text,lemmas_raw_text,lemmas_raw_text_prompt,t5_generation
168,118,The Guard,1540133,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Sergeant Gerry Boyle is a small-town Irish cop...,Final Shooting Draft - Containing all revisio...,HELICOPTER SHOT -- a red car speeds through t...,the guard by JOHN michael mcdonagh final shoot...,summarize: the guard by JOHN michael mcdonagh ...,[the guard's final shoot draft - contain all r...
129,320,Batman Year One,1672723,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Two men come to Gotham City: Bruce Wayne after...,\n BATMAN YEAR ONE By Frank Miller\n \n Sign ...,LIGHTNING RIPS across a jet-black SKY.\n LIGH...,batman year one by Frank Miller sign in sign -...,summarize: batman year one by Frank Miller sig...,[batman year one screenplay by frank miller fa...
147,497,Inherent Vice,1791528,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"During the psychedelic 60s and 70s Larry ""Doc""...",screenplay by\n Paul Thomas Anderson\n based ...,"A sweet, young woman’s voice narrates.\n half...",inherent vice screenplay by Paul Thomas Anders...,summarize: inherent vice screenplay by Paul Th...,[inherent vice screenplay by Paul Thomas Ander...
272,375,Colombiana,1657507,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,A young woman grows up to be a stone-cold assa...,\n Written by\n Robert Mark Kamen &amp; Luc B...,ON A BLACK SCREEN\n CLOSE ON A PAGE OF A LARA...,colombiana written by Robert Mark kamen &amp ;...,summarize: colombiana written by Robert Mark k...,[fabio and don luis have been together since t...
66,310,Barney s Version,1423894,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Take a ride through the life and memories of B...,Barney's Version\n by\n Michael Konyves\n Bas...,A tumbler of Scotch. A half empty bottle of M...,barney 's version by Michael konyve base on th...,summarize: barney 's version by Michael konyve...,[barney 's version by michael konyve based on ...


In [None]:
def _join_candidates(candidates):
    '''Flatten one row's list of decoded candidates into a comma-joined string.'''
    return ','.join(str(candidate) for candidate in candidates)


# ROUGE needs plain strings, so collapse each one-element candidate list.
subset_train_df['t5_generation_str'] = subset_train_df['t5_generation'].apply(_join_candidates)

subset_train_df.head()


Unnamed: 0.1,Unnamed: 0,title,imdb_id,bert_data,lemma_data,plot_outline,bert_dialog,bert_text,lemmas_raw_text,lemmas_raw_text_prompt,t5_generation,t5_generation_str
168,118,The Guard,1540133,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Sergeant Gerry Boyle is a small-town Irish cop...,Final Shooting Draft - Containing all revisio...,HELICOPTER SHOT -- a red car speeds through t...,the guard by JOHN michael mcdonagh final shoot...,summarize: the guard by JOHN michael mcdonagh ...,[the guard's final shoot draft - contain all r...,the guard's final shoot draft - contain all re...
129,320,Batman Year One,1672723,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Two men come to Gotham City: Bruce Wayne after...,\n BATMAN YEAR ONE By Frank Miller\n \n Sign ...,LIGHTNING RIPS across a jet-black SKY.\n LIGH...,batman year one by Frank Miller sign in sign -...,summarize: batman year one by Frank Miller sig...,[batman year one screenplay by frank miller fa...,batman year one screenplay by frank miller fad...
147,497,Inherent Vice,1791528,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,"During the psychedelic 60s and 70s Larry ""Doc""...",screenplay by\n Paul Thomas Anderson\n based ...,"A sweet, young woman’s voice narrates.\n half...",inherent vice screenplay by Paul Thomas Anders...,summarize: inherent vice screenplay by Paul Th...,[inherent vice screenplay by Paul Thomas Ander...,inherent vice screenplay by Paul Thomas Anders...
272,375,Colombiana,1657507,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,A young woman grows up to be a stone-cold assa...,\n Written by\n Robert Mark Kamen &amp; Luc B...,ON A BLACK SCREEN\n CLOSE ON A PAGE OF A LARA...,colombiana written by Robert Mark kamen &amp ;...,summarize: colombiana written by Robert Mark k...,[fabio and don luis have been together since t...,fabio and don luis have been together since th...
66,310,Barney s Version,1423894,/content/drive/MyDrive/W266_Movie_Data/BERT_an...,/content/drive/MyDrive/W266_Movie_Data/raw_tex...,Take a ride through the life and memories of B...,Barney's Version\n by\n Michael Konyves\n Bas...,A tumbler of Scotch. A half empty bottle of M...,barney 's version by Michael konyve base on th...,summarize: barney 's version by Michael konyve...,[barney 's version by michael konyve based on ...,barney 's version by michael konyve based on t...


In [None]:
# Score the T5 summaries against the reference plot outlines with ROUGE.
rouge = evaluate.load('rouge')
references = subset_train_df['plot_outline'].tolist()
pred = subset_train_df['t5_generation_str'].tolist()
results = rouge.compute(references=references, predictions=pred)
print('base t5 with n=30 compared with outline, ', results)

base t5 with n=30 compared with outline,  {'rouge1': 0.1852579479351284, 'rouge2': 0.01797985622544701, 'rougeL': 0.11554045940424346, 'rougeLsum': 0.11445694008626571}


pred_base_ada_94,  {'rouge1': 0.1852579479351284, 'rouge2': 0.01797985622544701, 'rougeL': 0.11554045940424346, 'rougeLsum': 0.11445694008626571}


In [None]:
# Trivial comparison point: use each movie's title as its "summary" and score
# it against the same reference plot outlines.
rouge = evaluate.load('rouge')
references = subset_train_df['plot_outline'].tolist()
pred = subset_train_df['title'].tolist()
results = rouge.compute(references=references, predictions=pred)
print('title, ', results)

title,  {'rouge1': 0.041811426049666034, 'rouge2': 0.014025394577675314, 'rougeL': 0.04032699063468995, 'rougeLsum': 0.039798010939398915}
