In [30]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import nlp
import tensorflow as tf
import random
import os
%matplotlib inline
import re
from transformers import BertTokenizer
import tensorflow as tf
from transformers import TFBertForSequenceClassification


In [None]:
# Load the model.
# Note that we didn't put the trained model in the repository itself due to size constraints 
model_path = "./bert_finetuned_model_1"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)

Some layers from the model checkpoint at ./bert_finetuned_model_1 were not used when initializing TFBertForSequenceClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./bert_finetuned_model_1.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
file_path = "../../data/plot_summaries.txt"
df = pd.read_csv(file_path, sep='\t', header=None, names=['Wikipedia movie ID', 'summary'])

def split_sentences(summary):
    # Split the text into sentences using common sentence delimiters (:, !, ;, .)
    sentences = [s.strip() for s in re.split(r'[,:!;.]', summary) if s.strip()]
    
    # Filter out sentences with fewer than 4 words
    filtered_sentences = [sentence for sentence in sentences if sentence.count(" ") > 3]
    return filtered_sentences

# Apply the split_sentences function to the 'summary' column and store the result in a new column
df['sentences'] = df['summary'].apply(split_sentences)

# Expand the list of sentences into separate columns
sentence_df = pd.DataFrame(df['sentences'].to_list())

# Concatenate the original DataFrame (without the 'sentences' column) with the new sentence columns
df = pd.concat([df.drop(columns=['sentences']), sentence_df], axis=1)

# Merge all sentence columns into a single column as a list
df["Merged Content"] = df.iloc[:, 2:].apply(lambda row: [item for item in row if item != "None"], axis=1)

# Extract only the columns 'Wikipedia movie ID' and 'Merged Content' from the DataFrame
result_df = df[["Wikipedia movie ID", "Merged Content"]]

file_path = "data/cleaned_data.csv"

# Read the cleaned data from the CSV file into a DataFrame
higher = pd.read_csv("data/higher.csv")
lower = pd.read_csv("data/lower.csv")

higher_df = result_df[result_df["Wikipedia movie ID"].isin(higher["Wikipedia movie ID"])]
lower_df = result_df[result_df["Wikipedia movie ID"].isin(lower["Wikipedia movie ID"])]

In [None]:
# Filter out None values from the 'Merged Content' column
higher_df["Merged Content"] = higher_df["Merged Content"].apply(lambda x: [i for i in x if i is not None])

# Expand the 'Merged Content' column, which contains lists, into multiple rows
hexpanded_df = higher_df.explode("Merged Content", ignore_index=True)
hexpanded_df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  higher_df["Merged Content"] = higher_df["Merged Content"].apply(lambda x: [i for i in x if i is not None])


Unnamed: 0,Wikipedia movie ID,Merged Content
0,31186339,The nation of Panem consists of a wealthy Capi...
1,31186339,As punishment for a past rebellion
2,31186339,each district must provide a boy and girl bet...
3,31186339,The tributes must fight to the death in an arena
4,31186339,the sole survivor is rewarded with fame and we...
...,...,...
73100,277160,Mac slips Gin a gun and she holds Mac hostage
73101,277160,threatening to shoot him if the agents follow her
73102,277160,She boards a train and the FBI heads to the ne...
73103,277160,Gin jumps trains mid-station and arrives back ...


In [None]:
def label_texts(texts, batch_size=256):
    predicted_labels = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        tokenized_batch = tokenizer(batch_texts, padding=False, truncation=False)
        max_length = max(len(tokens) for tokens in tokenized_batch["input_ids"])
        
        # Encode texts using tokenizer
        encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=max_length, return_tensors="tf")
        
        # Predict the labels of each row
        predictions = model(encodings).logits
        batch_labels = tf.argmax(predictions, axis=1).numpy()
        print(f"Batch {i // batch_size + 1} Max Length: {max_length}, Labels: {batch_labels}")
        
        predicted_labels.extend(batch_labels)
    return predicted_labels

# Labelling merged content
hexpanded_df["Predicted Label"] = label_texts(hexpanded_df["Merged Content"].tolist(), batch_size=256)

hexpanded_df

Batch 1 Max Length: 42, Labels: [ 0  7  0  3  8  0  5  0  0  0  0  0  0  5  3  0 10  3  3  3  0  3  0  3
  3  4  3  3 10  0  3  3  3  0  0  0  3  3  3  3  0  0  3  0  0  5  0 10
  3  5  8 10  3  0  0  0  3  0  3  0  0  3  4  0  3  3  3  8  1  3  0  3
  1  3  0  0  0  3  1  3  0 10  0  3  0  8  3  0  3  3  0  0  3  3  4  3
  1  8  3  0  0  0  3  3  3  3  1 10  0  0  0  7  3  3  0  3 10  3  3  3
  0  3 10  8  0  3  3  3  1  7  1  2 10  3  2  3  3  1  5  3 10  0  0  3
  0  3 10  0  8  3  3  0  0  4  3  3  3  1  0  0  3  0  0  3  3  3  3  3
 10  0  3  3  0  0  3  3  3  3  3  4  8  3  3 10  4  3  7  7  0  3 10  3
  7  4  7  3 10 10  1  3  1  3  0  0  0  0  0  0  0  0  0  1  0  0  3  0
  1 10  3  0  3  3 10  1  1  8  3  7 10  3  0  3  0 10  1  0 10  3  3  1
 10  0  0  0  3  3  3  0  8  3  3  1  3  1  3  0]
Batch 2 Max Length: 41, Labels: [ 0  3  0  8  0  5  1  0  5  0  0  3  1  0  3  2  8  4  3  8  3  3  3  3
 10  0  3  0  0  3  3 10  0  3 10  3  0  3  3  7  3 10 10 10  3 10  0  3
  0  3 10 

Unnamed: 0,Wikipedia movie ID,Merged Content,Predicted Label
0,31186339,The nation of Panem consists of a wealthy Capi...,0
1,31186339,As punishment for a past rebellion,7
2,31186339,each district must provide a boy and girl bet...,0
3,31186339,The tributes must fight to the death in an arena,3
4,31186339,the sole survivor is rewarded with fame and we...,8
...,...,...,...
73100,277160,Mac slips Gin a gun and she holds Mac hostage,7
73101,277160,threatening to shoot him if the agents follow her,3
73102,277160,She boards a train and the FBI heads to the ne...,0
73103,277160,Gin jumps trains mid-station and arrives back ...,3


In [None]:
# We output this as part of the process to check labelling. Final results are currently stored just as "higher_labels.csv"
result = hexpanded_df.groupby("Wikipedia movie ID")["Predicted Label"].apply(list).reset_index()
result.to_csv("data/higher_labels1.csv", index=False)


In [29]:
# Filter out None values from the 'Merged Content' column
lower_df["Merged Content"] = lower_df["Merged Content"].apply(lambda x: [i for i in x if i is not None])

# Expand the 'Merged Content' column, which contains lists, into multiple rows
lexpanded_df = lower_df.explode("Merged Content", ignore_index=True)
lexpanded_df 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lower_df["Merged Content"] = lower_df["Merged Content"].apply(lambda x: [i for i in x if i is not None])


Unnamed: 0,Wikipedia movie ID,Merged Content
0,6167400,The film is similar in some ways to Seth Micha...
1,6167400,the plot of Oliver Twist is updated to the pre...
2,6167400,and moved out of the London poor house onto th...
3,6167400,the tale is told not from Oliver's point of view
4,6167400,but rather that of Dodge
...,...,...
26959,30352708,Eun-soo is nowhere to be seen and Joong-shik a...
26960,30352708,While believing her brother-in-law killed her ...
26961,30352708,Eun-mo finds herself falling in love with him
26962,30352708,the sole guardian and grownup in the lonely gi...


In [None]:
# Labelling merged content
lexpanded_df["Predicted Label"] = label_texts(lexpanded_df["Merged Content"].tolist(), batch_size=256)

lexpanded_df

Batch 1 Max Length: 42, Labels: [ 0  8  1  3  0  3  7  3  3  3  1  3  7  3  0  3  0  3  3  3  0  3 10  3
  0  3  3  3  4  0  3  3  0  3  3  3  3  3  0  3  3  3  3  4  3 10  1  1
  3  0  3  0  0  3  3  3  0  4  7  4  3  3  3  0  3  4  3  3  0  0  4  3
  0  3  0  0  0  8  0  3  3  7  3  0  7  3  7  3  4  3  0  0  3  0  0  3
  0  3 10  0  3 10  3  4  3  3  3  0  3  3  3  3  3  0 10  4  3  3  3  3
  3  3  3  3  3  3  4 10  3  3  3 10  1  0  0  3 10  3  0  0 10  3  3  1
  3 10  3  8 10 10  3  0  4  5  5  3  4  3 10  3  0  3  2  5  4  0  3  0
  0  3  5  0  3  3  3  0  5  1  0  0  0  3 10  3  3  3 10  3  5  5  3  0
  4  3  0  0  3  0  0  0  3  3  3  4  4  3  3  0  3  3  0  0  4  3 10  0
  3  0  3  7  0  8  8  4  5  3  3  5  0  0  7  4  3  0  3  0  3  0  3  0
  4  1  3  1  0  4  3 10  8  0  3  3  4  0  3  3]
Batch 2 Max Length: 47, Labels: [ 3  3  0  3  3  3  3  7  0  3  8  3  3  3  0  3  0  7  1  3  0  8  1  3
  3  3  1  0  0  0  0  3  1  3 10 10  4  0  3  0  0  3 10  3  3  0  3  4
  3  1  0 

Unnamed: 0,Wikipedia movie ID,Merged Content,Predicted Label
0,6167400,The film is similar in some ways to Seth Micha...,0
1,6167400,the plot of Oliver Twist is updated to the pre...,8
2,6167400,and moved out of the London poor house onto th...,1
3,6167400,the tale is told not from Oliver's point of view,3
4,6167400,but rather that of Dodge,0
...,...,...,...
26959,30352708,Eun-soo is nowhere to be seen and Joong-shik a...,1
26960,30352708,While believing her brother-in-law killed her ...,3
26961,30352708,Eun-mo finds herself falling in love with him,5
26962,30352708,the sole guardian and grownup in the lonely gi...,3


In [None]:
# We output this as part of the process to check labelling. Final results are currently stored just as "lower_labels.csv"
result = lexpanded_df.groupby("Wikipedia movie ID")["Predicted Label"].apply(list).reset_index()
result.to_csv("data/lower_labels_1.csv", index=False)

In [None]:
# Output labelling per sentence if we want to check for ourselves
hexpanded_df.to_csv("data/higher_labels_check1.csv", index=False)
#lexpanded_df.to_csv("data/lower_labels_check.csv", index=False)