In [11]:
import numpy as np
from transformers import BertTokenizer, AutoTokenizer
from model import BertForMultiLabelClassification
from multilabel_pipeline import MultiLabelPipeline
from pprint import pprint
import torch
import pandas as pd
import ast
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import re
import nltk
from collections import Counter
from transformers import pipeline
from tqdm.auto import tqdm

# Requirements:
# create environment with python=3.8 or 3.7 (required for tokenizers 0.7.0 below)
# conda install pytorch==1.4.0 cpuonly -c pytorch 
# conda install -c scw torchvision 
# conda install -c conda-forge attrdict  
# From https://pypi.org/project/tokenizers/0.7.0/#files download the wheel for tokenizers==0.7.0 (required for transformers==2.11.0), then run: pip install tokenizers-0.7.0-cp38-cp38-win_amd64.whl (using the appropriate wheel file name for your platform)
# pip install transformers==2.11.0


In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Read the entire plot-summaries file into a single string.
summaries_path = R"C:\Users\Berta\Desktop\EPFL\ADA\PROJECT\MovieSummaries\plot_summaries.txt"
with open(summaries_path, mode='r', encoding='utf-8') as summaries_file:
    content_summaries = summaries_file.read()

In [5]:
# Break the raw text into lines and keep the stripped, non-blank ones.
summaries = content_summaries.split('\n')
plot_summaries = [stripped for stripped in (line.strip() for line in summaries) if stripped]

# Each record is "<movie id>\t<plot text>": split once at the first tab and
# ignore records that do not contain a tab.
id_plot_pairs = [record.split('\t', 1) for record in plot_summaries]
id_plot_pairs = [pair for pair in id_plot_pairs if len(pair) == 2]

movie_ids = [int(pair[0]) for pair in id_plot_pairs]
summary_texts = [pair[1] for pair in id_plot_pairs]

summaries_df = pd.DataFrame({'Movie ID': movie_ids, 'Plot': summary_texts})

# Column names for the metadata file, which is read without a header row.
column_names = [
    'Movie ID', 'Freebase ID', 'Movie Title',
    'Release Date', 'Box Office', 'Runtime',
    'Language', 'Country', 'Genre',
]

metadata_path = R"C:\Users\Berta\Desktop\EPFL\ADA\PROJECT\MovieSummaries\movie.metadata.tsv"
df = pd.read_csv(metadata_path, sep='\t', header=None, names=column_names)

# Discard the movies whose genre field is the empty dict literal '{}'.
df = df[df['Genre'] != '{}']

def extract_first_genre(genre_str):
    """Return the first genre name found in a dict-literal string.

    ``genre_str`` is parsed with ``ast.literal_eval`` and the value of the
    first entry of the resulting dict is returned. An empty dict raises
    ``StopIteration``.
    """
    genre_names = iter(ast.literal_eval(genre_str).values())
    return next(genre_names)

# Reduce every genre dict to its first listed genre name.
df['Genre'] = df['Genre'].apply(extract_first_genre)

# Languages: first word of every language name, joined with ', '.
df['Language'] = df['Language'].apply(
    lambda raw: ', '.join(name.split()[0] for name in ast.literal_eval(raw).values())
)
# Countries: all country names, joined with ', '.
df['Country'] = df['Country'].apply(
    lambda raw: ', '.join(ast.literal_eval(raw).values())
)

# Restrict both tables to the movies they have in common, then join them on 'Movie ID'.
metadata = df[df['Movie ID'].isin(movie_ids)]
summaries_df = summaries_df[summaries_df['Movie ID'].isin(metadata['Movie ID'])]

movie_data = metadata.merge(summaries_df, on='Movie ID')

In [6]:
# Preparing data for tokenisation.
#
# Rows set aside for further analysis and removed for the moment:
#   - 9724, 25327, 29447, 33197: these entries share the same summary and title
#     but have different release years; they also caused problems with the
#     manual tokenisation.
#   - 16550, 29414, 30229, 34493: other entries that caused problems with the
#     tokenizer.
rows_to_exclude = [9724, 16550, 25327, 29414, 29447, 30229, 33197, 34493]

plots_need_analsys = movie_data.iloc[rows_to_exclude]  # selected by position
movie_data = movie_data.drop(rows_to_exclude)  # removed by index label

In [7]:
# Keep only the movies that belong to the 10 most frequent genres.
genre_occurrences = movie_data['Genre'].value_counts()  # most frequent genre first
top_10_genres = genre_occurrences[:10].index.to_numpy()
# .copy() makes movie_data_top10 an independent frame: a later cell adds an
# 'Emotions' column to it, which on a plain boolean-mask selection of
# movie_data raises SettingWithCopyWarning.
movie_data_top10 = movie_data[movie_data['Genre'].isin(top_10_genres)].copy()

In [259]:
# Load the pretrained GoEmotions checkpoint: the tokenizer first, then the
# sequence-classification model, which is moved onto `device`.
model_name = "monologg/bert-base-cased-goemotions-original"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

In [9]:
# Manual, sentence-aware chunking of the plot summaries (max_len=512): sentences are never split in half, and every chunk can be mapped back to its plot summary.

# Fetch the NLTK 'punkt' package (presumably the data used by nltk.tokenize.sent_tokenize below — confirm).
nltk.download('punkt')

def sentence_aware_split(text, max_length, tokenizer):
    """Split ``text`` into chunks of whole sentences that fit a token budget.

    Sentences (from ``nltk.tokenize.sent_tokenize``) are greedily joined with
    single spaces for as long as ``tokenizer.tokenize`` of the joined text
    yields at most ``max_length - 2`` tokens; two positions are reserved for
    the [CLS] and [SEP] tokens.

    Args:
        text: The text to split.
        max_length: Maximum sequence length, special tokens included.
        tokenizer: Object exposing ``tokenize(str)`` that returns a sequence
            of tokens.

    Returns:
        The list of non-empty chunk strings, in order. Sentences are never
        cut, so a single sentence that alone exceeds the budget is kept whole
        as its own chunk and that chunk can still be longer than
        ``max_length``.
    """
    token_budget = max_length - 2  # -2 for [CLS] and [SEP]

    chunks = []
    current_chunk = ""
    for sentence in nltk.tokenize.sent_tokenize(text):
        potential_chunk = current_chunk + " " + sentence if current_chunk else sentence
        if len(tokenizer.tokenize(potential_chunk)) <= token_budget:
            current_chunk = potential_chunk
            continue

        # The sentence does not fit: close the running chunk and start a new
        # one. The running chunk is still empty when the very first sentence
        # is already over budget; emitting it would produce a blank chunk.
        if current_chunk:
            chunks.append(current_chunk)
        current_chunk = sentence

    # Don't forget the last chunk.
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

max_length = 512  # Adjust based on your model's max length

# Chunk every plot summary. `tokenized` collects all chunks in order and
# `chunk_to_text_mapping` maps a chunk's text to the position of its plot in `texts`.
texts = movie_data['Plot'].values.tolist()
tokenized = []
chunk_to_text_mapping = {}
for i, text in enumerate(texts):

    chunked_text = sentence_aware_split(text, max_length, tokenizer)

    # NOTE(review): the mapping is keyed by chunk text, so identical chunks
    # coming from different plots collapse into one entry (the last plot
    # wins) and the mapping can hold fewer entries than `tokenized`.
    for chunk in chunked_text:
        chunk_to_text_mapping[chunk] = i

    # extend() appends in place; rebuilding the list with `+` copied every
    # chunk collected so far on each iteration (quadratic in total).
    tokenized.extend(chunked_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Berta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors


In [12]:
# Save summary chunks and chunks to plot mapping for future runs:

def save_list_to_file(list_of_strings, filename):
    """Write every string of ``list_of_strings`` to ``filename``, one per line.

    The file is opened for writing with UTF-8 encoding and a newline is
    written after each string.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(entry + '\n' for entry in list_of_strings)

def save_dict_to_file(dictionary, filename):
    """Write the entries of ``dictionary`` to ``filename`` as ``key: value`` lines.

    The file is opened for writing with UTF-8 encoding; entries are written
    in the dictionary's iteration order, one per line.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(f'{key}: {value}\n' for key, value in dictionary.items())

# Persist the chunks and the chunk -> plot mapping in the GoEmotions-pytorch folder.
# NOTE(review): the loading cell reads these file names from a `final` subfolder,
# not from the paths written here — confirm the files are moved/copied there.
save_list_to_file(
    list_of_strings=tokenized,
    filename=R'C:\Users\Berta\Desktop\EPFL\ADA\PROJECT\ada-2023-project-badafixm01\GoEmotions-pytorch\tokenized_plots.txt',
)
save_dict_to_file(
    dictionary=chunk_to_text_mapping,
    filename=R'C:\Users\Berta\Desktop\EPFL\ADA\PROJECT\ada-2023-project-badafixm01\GoEmotions-pytorch\chunk_mapping.txt',
)


In [8]:
# Reload the saved chunks and chunk mapping so that the manual tokenisation
# cell does not have to be re-run.

tokenized_plots_path = R"C:\Users\Berta\Desktop\EPFL\ADA\PROJECT\ada-2023-project-badafixm01\GoEmotions-pytorch\final\tokenized_plots.txt"
with open(tokenized_plots_path, mode='r', encoding='utf-8') as file:
    file_content = file.read()

# One chunk per line.
tokenized = file_content.strip().split('\n')

chunk_mapping_path = R"C:\Users\Berta\Desktop\EPFL\ADA\PROJECT\ada-2023-project-badafixm01\GoEmotions-pytorch\final\chunk_mapping.txt"
with open(chunk_mapping_path, mode='r', encoding='utf-8') as file:
    chunk_mapping = file.read()

# One raw mapping line per entry: this is a list of strings, not a dict
# (presumably "chunk: plot index" lines as written by save_dict_to_file — confirm).
chunk_to_text_mapping = chunk_mapping.strip().split('\n')

In [9]:
# Build the GoEmotions text-classification pipeline from the pretrained checkpoint.
goemotions_checkpoint = "monologg/bert-base-cased-goemotions-original"
tokenizer = BertTokenizer.from_pretrained(goemotions_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(goemotions_checkpoint)

# Every label's score is requested (return_all_scores) with a sigmoid applied to the model outputs.
goemotions = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    function_to_apply='sigmoid',
    return_all_scores=True,
)



In [80]:
# First analysis on a subset only: at most the first batch_size * 10 chunks,
# grouped into consecutive batches of batch_size chunks.
batch_size = 10  # or another size that suits your system
subset_length = len(tokenized[:batch_size * 10])
batches = [tokenized[start:start + batch_size] for start in range(0, subset_length, batch_size)]

In [81]:
# Classify batch by batch, collecting one result per chunk. A failing batch is
# reported and skipped, so `emotions` then holds fewer results than chunks.
emotions = []
total_batches = len(batches)
for batch_number, batch in enumerate(tqdm(batches, desc="Processing Batches"), start=1):
    try:
        emotions += goemotions(batch)
        print(f"Batch {batch_number}/{total_batches} processed successfully.")
    except Exception as e:
        print(f"Error processing batch {batch_number}: {e}")

Processing Batches:   0%|          | 0/10 [00:00<?, ?it/s]

Batch 1/10 processed successfully.
Batch 2/10 processed successfully.
Batch 3/10 processed successfully.
Batch 4/10 processed successfully.
Batch 5/10 processed successfully.
Batch 6/10 processed successfully.
Batch 7/10 processed successfully.
Batch 8/10 processed successfully.
Batch 9/10 processed successfully.
Batch 10/10 processed successfully.


In [82]:
# Remove the neutral emotion from every result. Filtering by label (rather
# than dropping the last entry by position) keeps this cell idempotent:
# re-running it no longer strips one more emotion each time.
emotions = [[emotion for emotion in text if emotion['label'] != 'neutral'] for text in emotions]
# Store scores for all remaining emotions
scores = [[emotion['score'] for emotion in sublist] for sublist in emotions]

# Keep only the k highest-scoring emotions per text. The comprehension index
# gets its own name so it no longer shadows the constant `k`.
k = 3
ranked_emotions = [
    [text_emotions[idx]['label'] for idx in np.argsort(text_scores)[::-1][:k]]
    for text_emotions, text_scores in zip(emotions, scores)
]

In [83]:
# Recover the plot index of every processed chunk from its saved
# "chunk text: plot index" line. The index is everything after the LAST ': ';
# taking only the final character kept just the last digit of the index and
# merged e.g. plots 3, 13 and 23 into one group.
# NOTE(review): this assumes line i of the mapping describes chunk i of
# `tokenized`, and that every batch was processed — verify.
chunk_maps = [int(line.rsplit(': ', 1)[-1]) for line in chunk_to_text_mapping[:batch_size*10]]
df_emotions = pd.DataFrame({'Emotions': ranked_emotions, 'Plot ID': chunk_maps})
# Concatenate the emotion lists of all chunks that belong to the same plot.
grouped_emotions = df_emotions.groupby('Plot ID')['Emotions'].apply(np.sum).reset_index()

In [84]:
# Attach the per-plot emotions to the matching movies.
# 'Plot ID' is the position of the plot in movie_data (the frame the chunks
# were built from — confirm if the chunk files were produced elsewhere), so it
# is translated into that frame's index label and then aligned with
# movie_data_top10 by label. Aligning on the row number of grouped_emotions
# with the inclusive label slice `.loc[:n]` ignored 'Plot ID' and pulled one
# extra, emotion-less row into the per-genre sums.
plot_positions = grouped_emotions['Plot ID'].astype(int).to_numpy()
emotions_by_movie = pd.Series(
    grouped_emotions['Emotions'].to_numpy(),
    index=movie_data.index[plot_positions],
)
movie_data_top10 = movie_data_top10.copy()  # independent frame: no SettingWithCopyWarning
movie_data_top10['Emotions'] = emotions_by_movie.reindex(movie_data_top10.index)

# Aggregate per genre using only the movies that actually received emotions.
has_emotions = movie_data_top10['Emotions'].notna()
genre_emotions = movie_data_top10[has_emotions].groupby('Genre')['Emotions'].apply(np.sum)

In [85]:
# For every genre, keep its two most frequent emotions.
genre_top_emotions = {}
# The loop variable is deliberately not called `emotions`: that name would
# overwrite the list of pipeline results built by the inference cell.
for genre, genre_emotion_list in genre_emotions.items():
    # A genre without any emotion list aggregates to a number (the previous
    # guard compared against 0) instead of a list; skip it.
    if not isinstance(genre_emotion_list, list):
        continue
    top_emotions = Counter(genre_emotion_list).most_common(2)
    genre_top_emotions[genre] = [label for label, _count in top_emotions]

df_genre_top_emotions = pd.DataFrame({
    'Genre': list(genre_top_emotions.keys()),
    'Emotions': list(genre_top_emotions.values()),
})