# Imports

In [1]:
from absl import logging
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
from pathlib import Path
import nltk
from ast import literal_eval

In [2]:
print(tf.__version__)

2.8.0


In [3]:
!python --version

Python 3.7.13


# Universal Sentence Encoder (USE)

In [4]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)
  
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

message_embeddings = embed(messages)

for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
  print("Message: {}".format(messages[i]))
  print("Embedding size: {}".format(len(message_embedding)))
  message_embedding_snippet = ", ".join(
      (str(x) for x in message_embedding[:3]))
  print("Embedding: [{}, ...]\n".format(message_embedding_snippet))

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded
Message: Elephant
Embedding size: 512
Embedding: [0.008344466798007488, 0.00048083445290103555, 0.06595246493816376, ...]

Message: I am a sentence for which I would like to get its embedding.
Embedding size: 512
Embedding: [0.0508086271584034, -0.01652432419359684, 0.01573781482875347, ...]

Message: Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.
Embedding size: 512
Embedding: [-0.028332682326436043, -0.05586216226220131, -0.012941485270857811, ...]



# Explanation
Let's get embeddings for each sentence in our [scripts](https://github.com/ppapalampidi/TRIPOD) and just like in [Papalampidi et al](https://arxiv.org/pdf/2004.12727.pdf) represent a scene by the mean of its sentence representations and measure scene similarity $e_i{_j}$ using cosine similarity

In [5]:
%%bash
git clone https://github.com/ppapalampidi/TRIPOD.git
cd TRIPOD
python screenplays_scene_segmentation.py

The Mummy (1999 film)
The Kids Are All Right (film)
17 Again (film)
Drive (2011 film)
Funny People
One Eight Seven
What Women Want
Saw (film)
Superman (1978 film)
We Own the Night (film)
The Searchers (film)
Bridesmaids (2011 film)
Sherlock Holmes (2009 film)
Arbitrage (film)
The Apartment
10 Things I Hate About You
Bonnie and Clyde (film)
The Dark Knight (film)
The Ugly Truth
Ring (film)
Young Frankenstein
The Breakfast Club
Reservoir Dogs
Indiana Jones and the Kingdom of the Crystal Skull
Total Recall (1990 film)
Top Gun
The Time Machine (1960 film)
Pirates of the Caribbean: The Curse of the Black Pearl
While She Was Out
Gran Torino
Minority Report (film)
The Sixth Sense
Juno (film)
Pride & Prejudice (2005 film)
Star Wars Episode I: The Phantom Menace
Panic Room
Seven (film)
The Back-up Plan
Slumdog Millionaire
Youve Got Mail
Vertigo (film)
Meet Joe Black
30 Minutes or Less
My Girl (film)
Men in Black (film)
Titanic (1997 film)
The Truman Show
Kalifornia
Jaws (film)
Sleepless in Seat

Cloning into 'TRIPOD'...


# NLTK Sentence Splitter

In [6]:
nltk.download('punkt')
def sentence(text):
   return nltk.sent_tokenize(text) 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Turn into Dataframe

In [7]:
def padding(array,x,y):
  '''
  appends zeros to end of array to create array of desired shape
  '''
  a = x - array.shape[0]
  b = y - array.shape[1]
  return np.pad(array, pad_width=((0,a),(0,b)))

In [8]:
rows = []
screenplays = os.listdir('/content/TRIPOD/Segmented_screenplays/')
for sp in screenplays:
  scenes_raw = Path('/content/TRIPOD/Segmented_screenplays/'+sp).read_text().split('========================================')[:-1]
  for sr in scenes_raw:
    row = {}
    row['scene'] = sr.split('====================')[2].strip()
    row['number'] = int(sr.split('====================')[1])
    row['screenplay'] = sp.split('_segmented')[0]
    rows.append(row)
scenes = pd.DataFrame(rows)

scenes['movie'] = [i[0] for i in scenes['screenplay'].str.split('_script')]
scenes_as_embeddings = [np.array(embed(sentence(scene))).mean(axis=0) for scene in scenes['scene'].iloc]
scenes['embedding'] = scenes_as_embeddings
scenes

Unnamed: 0,scene,number,screenplay,movie,embedding
0,EXT. ROAD -- TEXAS/MEXICO BORDER -- NIGHT\n\nA...,0,Men in Black (film)_script,Men in Black (film),"[-0.0063585127, 0.012928954, 0.019146848, -0.0..."
1,INT. VAN -- TEXAS/MEXICO BORDER -- NIGHT\n\n\n...,1,Men in Black (film)_script,Men in Black (film),"[0.004618446, 0.022468232, 0.01473625, -0.0033..."
2,EXT. ROAD -- TEXAS/MEXICO BORDER -- NIGHT\n\n\...,2,Men in Black (film)_script,Men in Black (film),"[-0.008483304, 0.0077187833, 0.0056559523, -0...."
3,EXT. DESERT CLEARING - NIGHT\n\n\n\nKay and De...,3,Men in Black (film)_script,Men in Black (film),"[-0.016636118, 0.03233801, -0.004568749, -0.00..."
4,EXT. ROAD -- TEXAS/MEXICO BORDER -- NIGHT\n\n\...,4,Men in Black (film)_script,Men in Black (film),"[0.018280473, -0.01764558, 0.017716356, -0.026..."
...,...,...,...,...,...
13607,INT. POGUE HOME - DAY\n\n\n\nWe're in the mids...,109,Angel Eyes (film)_script,Angel Eyes (film),"[-0.0069848467, 0.027921446, 0.012036828, -0.0..."
13608,"INT. KITCHEN - DAY\n\n\n\nSharon enters, waits...",110,Angel Eyes (film)_script,Angel Eyes (film),"[-0.00444262, 0.009855239, 0.017026015, 0.0019..."
13609,INT. LIVING ROOM - DAY\n\n\n\nSharon comes out...,111,Angel Eyes (film)_script,Angel Eyes (film),"[-0.0022966473, 0.00900619, 0.00882751, -0.000..."
13610,EXT. POGUE HOUSE - DAY\n\n\n\nSharon exits ...,112,Angel Eyes (film)_script,Angel Eyes (film),"[-0.022781255, 0.018216146, 0.005998154, -0.00..."


In [9]:
#extracting golf-standard turning points from TRIPOD Dataset
gold = pd.read_csv('/content/TRIPOD/Synopses_and_annotations/TRIPOD_screenplays_test.csv')
full_gold_rows = []
for movie in gold.iloc:
  tp1=literal_eval(movie['tp1'])
  tp2=literal_eval(movie['tp2'])
  tp3=literal_eval(movie['tp3'])
  tp4=literal_eval(movie['tp4'])
  tp5=literal_eval(movie['tp5'])
  for i in tp1:
    for j in tp2:
      for k in tp3:
        for l in tp4:
          for m in tp5:
            row = {}
            row['movie'] = movie['movie_name']
            row['tp1'] = i
            row['tp2'] = j
            row['tp3'] = k
            row['tp4'] = l
            row['tp5'] = m
            full_gold_rows.append(row)
full_gold = pd.DataFrame(full_gold_rows)
full_gold

Unnamed: 0,movie,tp1,tp2,tp3,tp4,tp5
0,The Back-up Plan,9,40,82,106,131
1,The Back-up Plan,9,40,82,107,131
2,The Back-up Plan,9,40,82,111,131
3,The Back-up Plan,9,41,82,106,131
4,The Back-up Plan,9,41,82,107,131
...,...,...,...,...,...,...
198,The Last Temptation of Christ (film),21,49,65,74,77
199,The Last Temptation of Christ (film),21,50,64,74,76
200,The Last Temptation of Christ (film),21,50,64,74,77
201,The Last Temptation of Christ (film),21,50,65,74,76


In [10]:
#incomplete code below.  
#Still used to pull the scene encodings which are needed for our baseline code

rrows=[]
for row in full_gold.iloc:
  rrow={}
  rrow['movie'] = row['movie']
  
  # X for each movie tpid is the embeddings for all scenes 
  rrow['X'] = padding(np.stack(scenes[scenes['movie']==row['movie']]['embedding']),256,512) # embeddings for all scenes padded with zeros to have uniform length of 256 scene embeddings
  rrow['Y'] = np.array(row[['tp1','tp2','tp3','tp4','tp5']])

# ============================
# ENCODING (ONE HOT OR NOT?)
# ============================

  # y for each movie tpid is [prob of scene being tp1, ... , prob of scene being tp5]
  y = np.zeros(shape=[256,])

  # Get index from gold df and apply it to the array 
  y[row['tp1']] = np.array([1])
  # y[row['tp1']] = np.array([1,0,0,0,0])
  # y[row['tp2']] = np.array([0,1,0,0,0])
  # y[row['tp3']] = np.array([0,0,1,0,0])
  # y[row['tp4']] = np.array([0,0,0,1,0])
  # y[row['tp5']] = np.array([0,0,0,0,1])
  rrow['y']=y
  rrows.append(rrow)
  
train_df = pd.DataFrame(rrows)
#create a copy to work on scene average baseline

train_df = train_df.sample(frac=1,random_state=25)


In [None]:
#oversampling
#Needed oversampling due to a bug in model.fit not allowing us to use class_weight
os = 256
for i in range(len(train_df)):
  x_tp = np.array(train_df.iloc[i]['X'][train_df.iloc[i]['y']==1])
  x_fill = np.tile(x_tp,(os,1))
  y_fill = np.ones(os)

  train_df.iloc[i]['X'] = np.concatenate((train_df.iloc[i]['X'], x_fill))
  train_df.iloc[i]['y'] = np.concatenate((train_df.iloc[i]['y'], y_fill))

In [None]:
train_df.iloc[0]['X'].shape

(512, 512)

In [11]:
# Define train and test data

X_train = np.stack([i for i in train_df['X'][:-43]])
y_train = np.stack([i for i in train_df['y'][:-43]])

X_test = np.stack([i for i in train_df['X'][-43:]])
y_test = np.stack([i for i in train_df['y'][-43:]])



In [None]:
X_train.shape

(160, 768, 512)

In [None]:
y_train.shape

(160, 768)

In [None]:
X_test.shape

(43, 256, 512)

# Class weights
Omitted due to Keras bug: https://github.com/keras-team/keras/issues/16311

In [None]:
# Removed due to errors with class_weight arg in model.fit
# Issue being actively tracked here: https://github.com/keras-team/keras/issues/16311

#Calculate Class Weights
# #Useless since class_weight doesn't seem to work

# from sklearn.utils import class_weight

# class_weights = class_weight.compute_class_weight(
#                                      class_weight = "balanced",
#                                      classes = np.unique(y_train),
#                                      y = y_train.flatten())

# class_dict = dict(zip(np.unique(y_train), class_weights))
# class_dict

{0.0: 0.5019607843137255, 1.0: 128.0}

# Tensorflow

The Tensorflow section is not used in this notebook. That model is covered in our other notebook.  Please proceed to the baseline section

In [None]:
# Import the libraries

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, MultiHeadAttention, LayerNormalization
from keras.activations import softmax

import tensorflow as tf

# Create our custom Softmax:
def sm(x):
    return softmax(x,axis=1)

# Creating the layers
input_layer = Input(shape=(512,512)) # number of scenes in movie
context_layer = MultiHeadAttention(num_heads=2, key_dim=512,)(input_layer,input_layer,input_layer,) 
norm_layer = LayerNormalization()(context_layer)

# tpid_layer = MultiHeadAttention(num_heads=2, key_dim=512,)(norm_layer,norm_layer) 
# norm_layer_2 = LayerNormalization()(tpid_layer)
# ff_layer_2 = Dense(256, activation='relu')(norm_layer_2)
ff_layer_2 = Dense(256, activation='relu')(norm_layer)

output_layer = Dense(1, activation=sm)(ff_layer_2)


# Create the model
model_context = Model(inputs=input_layer, outputs=output_layer,)

# Defining the optimiser and loss function
# s = Semantic_loss_functions()
model_context.compile(optimizer='adam',
              loss='binary_crossentropy',
              #loss = focal_loss(gamma=0.001),
              #loss=BinaryFocalLoss(gamma=2),
              #loss=s.weighted_cross_entropyloss,
              metrics = ['accuracy'])

print(model_context.summary())

# To treat every instance of class 1 as 50 instances of class 0


# Training the model
model_context.fit(X_train, y_train, epochs=3, batch_size=2, validation_data=(X_test,y_test))#, class_weight=class_dict)

# Evaluating the model
print()
results = model_context.evaluate(X_test,y_test, batch_size=8)
print("test loss, test acc:", results)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 512, 512)]   0           []                               
                                                                                                  
 multi_head_attention (MultiHea  (None, 512, 512)    2100736     ['input_1[0][0]',                
 dAttention)                                                      'input_1[0][0]',                
                                                                  'input_1[0][0]']                
                                                                                                  
 layer_normalization (LayerNorm  (None, 512, 512)    1024        ['multi_head_attention[0][0]']   
 alization)                                                                                   

# Baseline Model

In [12]:
#get average turning point baseline
#this is essentially the average position in a script that one would expect a turning point to occur

#get a copy of the dataframe for turning point calculations
tp_df = train_df.copy()
#clean up df
x_train_tp = tp_df[:-43]
x_train_tp = x_train_tp[['movie','X','Y']]

x_train_tp['num_scenes'] = x_train_tp['X']

lengths = []
for i in x_train_tp['num_scenes']:
  y = len(i[:256][i[:256] != np.zeros(512)])/512
  lengths.append(y)

#add movie lenghts into df
x_train_tp['num_scenes'] = lengths

#scene ratio averages where the turning points show up in relation to the length of the script
x_train_tp['scene_ratio'] = x_train_tp['Y'] / x_train_tp['num_scenes']

#get average turning point of all the train data
avg_tp = x_train_tp['scene_ratio'].mean()
avg_tp

array([0.1397162 , 0.34208487, 0.62469028, 0.84348432, 0.96439568])

In [13]:
#calculate turning point based on scene length
#avg_tp = [0.1397162 , 0.34208487, 0.62469028, 0.84348432, 0.96439568]
def baseline_tp(scenes):
  tp_scenes = [0,0,0,0,0]

  for i in range(len(avg_tp)):
    tp_scenes[i] = int(scenes * avg_tp[i])

  return tp_scenes

In [14]:
#get movie data to generate baseline scenes
b_scenes = scenes[['scene','number','movie']]

In [15]:
#list of movies to be summarized
b_list = b_scenes['movie'].unique()

In [16]:
#Pull all the human-written synopses for comparison
synopses1 = pd.read_csv('/content/TRIPOD/Synopses_and_annotations/TRIPOD_synopses_train.csv')
synopses2 = pd.read_csv('/content/TRIPOD/Synopses_and_annotations/TRIPOD_synopses_test.csv')

synopses = pd.concat((synopses1,synopses2))
#clean up synopses movie names
synopses['movie_name'] = synopses['movie_name'].str.replace("_0","")
# synopses['movie_name'] = synopses['movie_name'].str.replace("_1","")
# synopses['movie_name'] = synopses['movie_name'].str.replace("_2","")
synopses = synopses[['movie_name','synopsis_raw','synopsis_segmented']]
synopses = synopses.rename(columns={'movie_name':'movie'})
synopses

Unnamed: 0,movie,synopsis_raw,synopsis_segmented
0,Beloved (film)_1,"Set shortly after the Civil War, the film revo...","[STR_SENT] Set shortly after the Civil War, th..."
1,Beloved (film),"Set shortly after the Civil War, the film revo...","[STR_SENT] Set shortly after the Civil War, th..."
2,Beloved (film)_2,"Set shortly after the Civil War, the film revo...","[STR_SENT] Set shortly after the Civil War, th..."
3,Jaws (film),A girl named Chrissie Watkins leaves a beach p...,[STR_SENT] A girl named Chrissie Watkins leave...
4,Angel Eyes (film)_1,"On a wet rainy night in Chicago, police office...","[STR_SENT] On a wet rainy night in Chicago, po..."
...,...,...,...
10,One Eight Seven,Trevor Garfield is an African American high sc...,[STR_SENT] Trevor Garfield is an African Ameri...
11,The Shining (film),Jack Torrance arrives at the Overlook Hotel to...,[STR_SENT] Jack Torrance arrives at the Overlo...
12,Die Hard,"On Christmas Eve, New York City Police officer...","[STR_SENT] On Christmas Eve, New York City Pol..."
13,Arbitrage (film),Sixty-year-old magnate Robert Miller manages a...,[STR_SENT] Sixty-year-old magnate Robert Mille...


In [55]:
#Get the list of baseline turning points needed for the model

final_scenes = []
#loop through each movie
for i in b_list:
  scene_x = []
  #calculate number of scenes in movie
  b_movie = b_scenes[b_scenes['movie'] == i]
  b_movie = b_movie.set_index(b_movie['number'])
  #reset index to match movie scene number
  b_scriptlength = b_movie['number'].argmax()
  #identify turning point scenes
  b_tps = baseline_tp(b_scriptlength)

  #extract text from each scene
  for j in b_tps:
    newscene = b_movie['scene'][j].replace('\n',' ')
    scene_x.append(newscene)
    #scene_x.append(b_movie[b_movie['number']==j]['scene'].replace('\n',''))

  final_scenes.append(scene_x)

# Extractive Summaries
Using BERT

In [18]:
!pip install torch
!pip install transformers
!pip install sentencepiece
!pip install bert-extractive-summarizer

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: p

In [19]:
#basic BERT ext_sum

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PegasusForConditionalGeneration, PegasusTokenizer, logging
from summarizer import Summarizer #bert-extractive-summarizer

logging.set_verbosity_error()

def sequence_length_estimate(text):
  '''
  estimates applicable to script text only
  '''
  return int(len(text.split(' '))+2+len(text.split(' '))*0.25)

def abs_sum(text, architecture, min_length=10, max_length=512, length_penalty=1, repetition_penalty=1):
  """
  Abstractive summarization of text using input Huggingface architecture 
  Token sequences with length > 512 are retokenized after extractive summarization
  See https://huggingface.co/models for available models
  """
  text = text.strip()
  device = "cuda" if torch.cuda.is_available() else "cpu"
  model = AutoModelForSeq2SeqLM.from_pretrained(architecture).to(device)
  tokenizer = AutoTokenizer.from_pretrained(architecture)
  if 't5' in architecture:
    text = f"summarize: {text}"
  tokens_input = tokenizer.encode(text, return_tensors='pt', truncation=False).to(device)
  while tokens_input.shape[1] > 512:
    ratio=512/tokens_input.shape[1]
    print('Token sequence length > 512')
    print(f'Performing extractive summarization and retokenization with ratio={ratio}')
    text = ext_sum(text, ratio=ratio).strip()
    if 't5' in architecture:
      text = f"summarize: {text}"
    tokens_input = tokenizer.encode(text, return_tensors='pt', truncation=False).to(device)
  summary_ids = model.generate(tokens_input, min_length=min_length, max_length=max_length, length_penalty=length_penalty, repetition_penalty=repetition_penalty)
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
  return summary

def ext_sum(text, ratio=None):
  """
  Uses Huggingface BERT to do extractive summarization
  See https://github.com/dmmiller612/bert-extractive-summarizer for more info
  """
  text = text.strip()
  model = Summarizer()
  if ratio is None:
    num_sentences = model.calculate_optimal_k(text)
    summary = model(text, num_sentences=num_sentences)
  else:
    summary = model(text, ratio=ratio) # Specified with Ratio
  return summary

In [20]:
#get test movies
#The dataset has a lot of repeating movies due to multiple turning point labels
#We limit this set to the unique films in the Test set

x_test_tp = tp_df[-43:]
x_test_tp = np.unique(x_test_tp['movie'])
x_test_tp = pd.DataFrame(x_test_tp)
x_test_tp = x_test_tp.rename(columns={0:'movie'})
x_test_tp

Unnamed: 0,movie
0,Arbitrage (film)
1,Die Hard
2,Juno (film)
3,Moon (film)
4,One Eight Seven
5,Panic Room
6,The Back-up Plan
7,The Crying Game
8,The Last Temptation of Christ (film)
9,Total Recall (1990 film)


In [62]:
#create data frame of the extracted scenes
final_scenes_df = pd.DataFrame()
final_scenes_df['movie'] = b_list
final_scenes_df['summary'] = final_scenes

In [63]:
#create combined dataframe of all the required data:
# * movie name
# * written synopsis
# * identified tp scenes
synopsis_list = []
scene_list = []
for i in x_test_tp['movie']:
  synopsis_list.append(synopses[synopses['movie'] == i]['synopsis_raw'].item())
  scene_list.append(final_scenes_df[final_scenes_df['movie'] == i]['summary'].item())

x_test_tp['synopsis'] = synopsis_list
x_test_tp['summary'] = scene_list
x_test_tp

Unnamed: 0,movie,synopsis,summary
0,Arbitrage (film),Sixty-year-old magnate Robert Miller manages a...,[INT. GREENBERG & COMPANY - OFFICE HALL - THE ...
1,Die Hard,"On Christmas Eve, New York City Police officer...",[37 INT. ELLIS' BATHROOM - 30th FLOOR - S...
2,Juno (film),Sixteen-year-old Minnesota high-schooler Juno ...,[EXT. BLEEKER HOUSE - CONTINUED 17 ...
3,Moon (film),Sam Bell nears the end of a three-year work co...,[21 INT. DOCK ...
4,One Eight Seven,Trevor Garfield is an African American high sc...,"[20 INT. JOHN QUINCY ADAMS - ""A"" BUILDING EN..."
5,Panic Room,Recently divorced Meg Altman (Foster) and her ...,[INT. FOURTH FLOOR HALLWAY - NIGHT 23 ...
6,The Back-up Plan,Zoe (Jennifer Lopez) has given up on finding t...,[INT. CAROL'S APARTMENT - DAY 19 Seven WOME...
7,The Crying Game,The film opens as a psychological thriller – I...,"[INT. GREENHOUSE - NIGHT. Jody, still laugh..."
8,The Last Temptation of Christ (film),The film begins with a man whispering in despa...,[EXT. MARY MAGDALENE'S COURTYARD - DAY Jesu...
9,Total Recall (1990 film),"In 2084, Douglas Quaid is an Earthbound constr...",[29 INT. QUAID'S LIVING ROOM/KITCHEN - NIGHT 2...


In [69]:
#start pulling summaries!
ext_sums = []
for movie in x_test_tp['summary']:
  final_sum = ""
  loopnum=0

  for scene in movie:
    print(loopnum)
    summ = ext_sum(scene)
    final_sum += summ
    #track progress
    loopnum += 1

  ext_sums.append(final_sum)

0
1
2
3
4
0
1
2
3
4
0
1
2
3
4
0
1
2
3
4
0
1
2
3
4
0
1
2
3
4
0
1
2
3
4
0
1
2
3
4
0
1
2
3
4
0
1
2
3
4
0
1
2
3
4


In [70]:
#add resulted extractions to data frame
x_test_tp['ext_sum'] = ext_sums

In [71]:
x_test_tp

Unnamed: 0,movie,synopsis,summary,ext_sum
0,Arbitrage (film),Sixty-year-old magnate Robert Miller manages a...,[INT. GREENBERG & COMPANY - OFFICE HALL - THE ...,GREENBERG & COMPANY - OFFICE HALL - THE NEXT D...
1,Die Hard,"On Christmas Eve, New York City Police officer...",[37 INT. ELLIS' BATHROOM - 30th FLOOR - S...,ELLIS' BATHROOM - 30th FLOOR - SAME ...
2,Juno (film),Sixteen-year-old Minnesota high-schooler Juno ...,[EXT. BLEEKER HOUSE - CONTINUED 17 ...,BLEEKER HOUSE - CONTINUED 17 ...
3,Moon (film),Sam Bell nears the end of a three-year work co...,[21 INT. DOCK ...,DOCK ...
4,One Eight Seven,Trevor Garfield is an African American high sc...,"[20 INT. JOHN QUINCY ADAMS - ""A"" BUILDING EN...","JOHN QUINCY ADAMS - ""A"" BUILDING ENTRANCE - FE..."
5,Panic Room,Recently divorced Meg Altman (Foster) and her ...,[INT. FOURTH FLOOR HALLWAY - NIGHT 23 ...,FOURTH FLOOR HALLWAY - NIGHT 23 O...
6,The Back-up Plan,Zoe (Jennifer Lopez) has given up on finding t...,[INT. CAROL'S APARTMENT - DAY 19 Seven WOME...,CAROL'S APARTMENT - DAY 19 Seven WOMEN sit ...
7,The Crying Game,The film opens as a psychological thriller – I...,"[INT. GREENHOUSE - NIGHT. Jody, still laugh...",Suddenly the hood is slammed back over his he...
8,The Last Temptation of Christ (film),The film begins with a man whispering in despa...,[EXT. MARY MAGDALENE'S COURTYARD - DAY Jesu...,MARY MAGDALENE'S COURTYARD - DAY Jesus and ...
9,Total Recall (1990 film),"In 2084, Douglas Quaid is an Earthbound constr...",[29 INT. QUAID'S LIVING ROOM/KITCHEN - NIGHT 2...,QUAID'S LIVING ROOM/KITCHEN - NIGHT 23 ...


# Rouge Score

In [75]:
!pip install rouge-score
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [88]:
from rouge import Rouge

#hypotheses
hyp = x_test_tp['ext_sum']
#references
ref = x_test_tp['synopsis']

#run scores
rouge = Rouge()
scores = rouge.get_scores(hyp, ref, avg=True)
scores


{'rouge-1': {'f': 0.11736880981871768,
  'p': 0.23993863772355659,
  'r': 0.08164685032472217},
 'rouge-2': {'f': 0.016174478945387243,
  'p': 0.03668644253124451,
  'r': 0.010565081859742665},
 'rouge-l': {'f': 0.10877176739276546,
  'p': 0.22297146517621302,
  'r': 0.0756309210136863}}

# BLEURT Scores

There were problems running BLEURT in this notebook.  As a result, had to download the reference and hypothesis files to my local machine to run BLEURT scores.  I've included a copy of the code and results here

In [109]:
from google.colab import files
#save files to computer
ref.to_csv("ref.csv")
hyp.to_csv("hyp.csv")

files.download('ref.csv')
files.download('hyp.csv')

In [127]:
#commands run to pull BLEURT scores for extractive summaries:
#installation instructions per:

git clone https://github.com/google-research/bleurt.git
cd bleurt
pip install .

#run scores on extracted data
#did have to do some cleanup as BLEURT expects an unformatted file with all text on a single line.

python -m bleurt.score_files \
-candidate_file=test_data/candidates \
-reference_file=test_data/references \
-bleurt_checkpoint=test_checkpoint \ 
-scores_file=scores

In [131]:
#scores from BLEURT are in the below list, and the mean is taken

import statistics
statistics.mean([-0.8517532348632812,
-0.9455795288085938,
-0.8833158016204834,
-0.9186558723449707,
-0.8752586841583252,
-0.8716404438018799,
-0.8093118667602539,
-1.1411449909210205,
-0.8251528143882751,
-0.8309003114700317,
-0.8986597061157227])

-0.8955793868411671