In [41]:
import pandas as pd
import numpy as np 
import ast
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import pickle
from scipy.interpolate import interp1d
import seaborn as sns
# Apply seaborn's "darkgrid" style to plots drawn after this point
sns.set(style="darkgrid")


# Download necessary NLTK data (only needs to be done once)
# NOTE(review): presumably required by sent_tokenize / word_tokenize imported above —
# confirm the package name against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\march\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
# Relative path of the dataset CSV that is loaded into `df` below
DATA_PATH = "../../../data/final_dataset.csv"

In [20]:
# Read the dataset from DATA_PATH, print its column names, and display its
# (rows, columns) shape as the cell output
df = pd.read_csv(DATA_PATH)
print(df.columns)
df.shape

Index(['Unnamed: 0', 'Wikipedia_movie_ID', 'summary', 'Freebase_movie_ID',
       'Movie_name', 'Movie_release_date', 'Movie_box_office_revenue',
       'Movie_runtime', 'Movie_languages', 'Movie_countries', 'Movie_genres',
       'category', 'continent', 'Rating'],
      dtype='object')


(42718, 14)

In [22]:
def count_words(text):
    """Return the number of word tokens in ``text``.

    Args:
        text: Text to tokenize. Non-string values are converted with ``str()``
            before tokenizing.

    Returns:
        int: Number of tokens produced by NLTK's ``word_tokenize``, or ``0``
        when ``text`` is a missing value (``None`` / NaN / ``pd.NA``).
    """
    # A missing value would otherwise be stringified to "None"/"nan" and
    # counted as a one-word text.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return 0
    return len(word_tokenize(str(text)))

def count_sentences(text):
    """Return the number of sentences in ``text``.

    Args:
        text: Text to split. Non-string values are converted with ``str()``
            before splitting.

    Returns:
        int: Number of sentences produced by NLTK's ``sent_tokenize``, or ``0``
        when ``text`` is a missing value (``None`` / NaN / ``pd.NA``).
    """
    # A missing value would otherwise be stringified to "None"/"nan" and
    # counted as a one-sentence text.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return 0
    return len(sent_tokenize(str(text)))

In [23]:
# Drop the unnamed index column that pandas creates when reading a CSV that was
# saved with its index. Dropping by name (instead of by position) keeps this
# cell safe to re-run: a second execution no longer removes the first real column.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Keep ratings inside the inclusive 0-10 range; every other value becomes NaN
df['Rating'] = df['Rating'].where(df['Rating'].between(0, 10))


# Filter rows where summary has at least 100 words and 2 sentences.
# `.copy()` makes the result an independent frame, so assigning columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df[df['summary'].apply(lambda x: count_words(x) >= 100 and count_sentences(x) >= 2)].copy()

In [24]:
# Display the (rows, columns) shape of `df` at this point in the notebook
df.shape

(34342, 13)

In [25]:
# Parse each category value from its string form (a Python list literal) into a
# real list. Values that are not strings (e.g. lists parsed by a previous run of
# this cell) are left untouched, so re-running the cell does not raise.
df["category"] = df["category"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [27]:
# NOTE(review): this writes to the same path that DATA_PATH pointed at when `df`
# was loaded, so the original CSV is replaced by the filtered frame. With the
# pandas default (index=True) the row index is written as an extra unnamed
# first column.
df.to_csv("../../../data/final_dataset.csv")

In [28]:
# Also store the frame as a pickle, which keeps Python objects such as the
# parsed `category` lists intact
df.to_pickle("../../../data/final_dataset.pkl")

In [32]:
# Load the emotions table from its pickle and display its (rows, columns) shape
df_emotions = pd.read_pickle("../../../data/emotions_data.pkl")
df_emotions.shape

(745637, 10)

In [34]:
# Keep only the emotion rows whose movie ID also appears in the metadata frame
# `df`, then display the shape of the result
new_df_emotions = df_emotions.loc[
    lambda frame: frame['Wikipedia_movie_ID'].isin(df['Wikipedia_movie_ID'])
]
new_df_emotions.shape

(723658, 10)

In [35]:
# Persist the filtered emotions table as a pickle
new_df_emotions.to_pickle("../../../data/emotions_data_raw.pkl")

In [36]:
# Write the filtered emotions table as a real CSV file. This line previously
# called `to_pickle`, which stored pickle bytes under a ".csv" file name.
new_df_emotions.to_csv("../../../data/emotions_data_raw.csv")

Interpolating each movie's emotion scores to a fixed number of timesteps

In [38]:
DATA_PATH = "../../../data/emotions_data_raw.pkl"  # Path to the pickle file containing sentence emotions data
DATA_PATH_MOVIE_METADATA = "../../../data/final_dataset.pkl"  # Path to the pickle file containing movie metadata

# Both files are pickled DataFrames, so both are read with pandas' own loader
# (the same call the rest of the notebook uses) instead of mixing in raw
# `pickle.load`.
df_emotions = pd.read_pickle(DATA_PATH)
df_metadata = pd.read_pickle(DATA_PATH_MOVIE_METADATA)

# Show three random metadata rows as a quick sanity check
df_metadata.sample(3)

Unnamed: 0,Wikipedia_movie_ID,summary,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,category,continent,Rating
26057,17778990,Laida Magtalas is a modern-day Belle who work...,/m/047gs0x,A Very Special Love,2008.0,"₱185,235,324.00(US$3,836,701.00)",66.0,"Tagalog language, Filipino language, English L...",Philippines,"Romance Film, Comedy film","[Comedy, Romance]",Asia,6.5
18871,10507423,"Thomas ""Babe"" Levy is a history Ph.D. candida...",/m/02qg5zq,Marathon Man,1976.0,28204261.0,126.0,English Language,United States of America,"Thriller, Crime Fiction, Psychological thrille...","[Action/Adventure, Drama, Thriller]",North America,7.4
19414,10888233,In ancient times in India there was a time whe...,/m/02qt2qq,Balika Badhu,,,130.0,Hindi Language,India,"Romance Film, Drama, Family Film","[Drama, Family/Animation, Romance]",Asia,7.0


In [53]:
target_timesteps = 20  # Number of timesteps to which we want to interpolate the data
# Names of the emotion columns that interpolate_emotions resamples for each movie
emotions = ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"]

def interpolate_emotions(movie_data, target_timesteps, emotion_columns=None):
    """
    Resample one movie's emotion values onto a fixed number of timesteps.

    The rows of `movie_data` are treated as equally spaced positions
    0, 1, ..., len(movie_data) - 1 in their existing order. Each emotion column
    is linearly interpolated at `target_timesteps` evenly spaced positions
    spanning that same range, so movies with different numbers of rows end up
    with the same number of timesteps and can be compared directly.

    Args:
        movie_data (pd.DataFrame): Emotion values for a single movie, one row
            per original timestep. Must contain:
            - "Wikipedia_movie_ID": the movie identifier (the first row's value is used).
            - One numeric column per emotion listed in `emotion_columns`.
        target_timesteps (int): Number of rows in the returned DataFrame.
        emotion_columns (list[str] | None): Emotion columns to interpolate.
            Defaults to the notebook-level `emotions` list when omitted.

    Returns:
        pd.DataFrame: A DataFrame with `target_timesteps` rows containing one
        column per emotion (interpolated values, in the order given by
        `emotion_columns`) followed by "Wikipedia_movie_ID" (the movie ID
        repeated on every row).

    Raises:
        ValueError: If `movie_data` has no rows.

    Notes:
        - Interpolation is linear (`numpy.interp`).
        - The target positions always lie inside [0, len(movie_data) - 1], so no
          extrapolation takes place.
        - A single-row input yields that row's values repeated on every timestep.
        - NOTE(review): rows are assumed to already be in chronological order;
          this function does not sort them.
    """
    if emotion_columns is None:
        # Fall back to the notebook-level list of emotion column names
        emotion_columns = emotions

    n_rows = len(movie_data)
    if n_rows == 0:
        # Fail with a clear message instead of an IndexError from `.iloc[0]`
        raise ValueError("movie_data must contain at least one row")

    # Original sample positions and the evenly spaced positions to evaluate at
    original_positions = np.arange(n_rows)
    target_positions = np.linspace(0, n_rows - 1, target_timesteps)

    # Interpolate each emotion's values over the target positions.
    # np.interp also accepts a single sample point, which covers one-row movies.
    interpolated_data = {
        emotion: np.interp(
            target_positions,
            original_positions,
            movie_data[emotion].to_numpy(dtype=float),
        )
        for emotion in emotion_columns
    }
    # Repeat the movie ID for each target timestep (kept as the last column)
    interpolated_data["Wikipedia_movie_ID"] = [movie_data["Wikipedia_movie_ID"].iloc[0]] * target_timesteps

    return pd.DataFrame(interpolated_data)

In [54]:
# Resample every movie's emotion rows to the same fixed number of timesteps,
# producing one interpolated frame per Wikipedia_movie_ID group
interpolated_movies = [
    interpolate_emotions(group, target_timesteps)
    for _, group in df_emotions.groupby("Wikipedia_movie_ID")
]

# Stack the per-movie frames into one table with a fresh 0..N-1 index
interpolated_df = pd.concat(interpolated_movies, ignore_index=True)

# Number the rows of each movie 0, 1, 2, ... to record the timestep index
timestep_index = interpolated_df.groupby("Wikipedia_movie_ID").cumcount()
interpolated_df['timestep'] = timestep_index

In [55]:
# Display the (rows, columns) shape of the interpolated table
interpolated_df.shape

(686840, 9)

In [56]:
# Save the interpolated table in both pickle and CSV form.
# With the pandas default (index=True) the CSV also contains the row index as
# an unnamed first column.
interpolated_df.to_pickle("../../../data/emotions_interpolated_20.pkl")
interpolated_df.to_csv("../../../data/emotions_interpolated_20.csv")