In [1]:
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Importing src scripts:
sys.path.insert(0, 'src')
# from misc_scripts import scripts
from utils import utils
from data import data

# from models import models

## Loading datasets:

In [2]:
# Load the two CMU corpus tables through the project's `data` helpers.
metadata = data.load_metadata()
summaries_df = data.load_summaries()

# Join metadata and summaries on the shared Wikipedia movie ID.
# `how` is left at the pandas default (inner), so only movies present in
# both tables are kept.
cmu_df = pd.merge(metadata, summaries_df, on="Wikipedia_movie_ID")


In [3]:
# Show the first merged row to inspect the columns produced by the join.
cmu_df.head(n=1)

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countxries,Movie_genres,Movie_Summary
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","[Thriller, Science Fiction, Horror, Adventure,...","Set in the second half of the 22nd century, th..."


In [4]:
# Column layout of character.metadata.tsv, in file order. The file is read
# with header=None, so the names have to be supplied here.
CHARACTER_COLUMNS = [
    "Wikipedia_movie_ID",
    "Freebase movie ID",
    "Movie release date",
    "Character name",
    "Actor date of birth",
    "Actor gender",
    "Actor height (in meters)",
    "Actor ethnicity (Freebase ID)",
    "Actor name",
    "Actor age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]

# Name the columns at read time and parse only the three that are used
# downstream, instead of loading all 13 and discarding most of them.
characters = pd.read_table(
    "data/MovieSummaries/character.metadata.tsv",
    header=None,
    names=CHARACTER_COLUMNS,
    usecols=["Wikipedia_movie_ID", "Actor gender", "Character name"],
)

# Collapse to one row per movie: each movie gets the list of its actors'
# genders and the list of its character names (row order within a movie is
# preserved, so the two lists stay aligned). Missing values are kept as
# list elements rather than dropped.
characters = characters.groupby("Wikipedia_movie_ID").agg({
    "Actor gender": list,
    "Character name": list,
}).reset_index()


In [5]:
# Columns kept for the analysis, in display order.
# NOTE: "Movie_countxries" is spelled exactly as the column appears in the
# loaded metadata; do not "fix" it here without renaming it at the source.
relevant_columns = [
    "Wikipedia_movie_ID",
    "Movie_name",
    "Actor gender",
    "Movie_countxries",
    "Movie_release_date",
    "Character name",
    "Movie_Summary",
]

# Inner-join movies with their per-movie cast lists, then keep only the
# relevant columns. Movies without any character rows are dropped by the join.
char_sum_CMU = cmu_df.merge(
    characters, how="inner", on="Wikipedia_movie_ID"
)[relevant_columns]
char_sum_CMU.head(n=1)

Unnamed: 0,Wikipedia_movie_ID,Movie_name,Actor gender,Movie_countxries,Movie_release_date,Character name,Movie_Summary
0,975900,Ghosts of Mars,"[F, F, M, M, F, F, F, M, M, M, M, M, M, M, M, ...","{""/m/09c7w0"": ""United States of America""}",2001-08-24,"[Akooshay, Lieutenant Melanie Ballard, Desolat...","Set in the second half of the 22nd century, th..."


In [6]:
# Compute one value per movie from its list of actor genders.
# NOTE(review): the helper's exact formula lives in utils; from the filter
# below it presumably returns -1 when no percentage can be computed -- confirm.
female_pct = char_sum_CMU['Actor gender'].apply(utils.calculate_female_percentage)
char_sum_CMU['Female Percentage'] = female_pct

# Drop the movies flagged with the -1 sentinel.
has_valid_pct = char_sum_CMU['Female Percentage'] != -1
char_sum_CMU = char_sum_CMU[has_valid_pct]

In [7]:
# Show the first row to check the newly added 'Female Percentage' column.
char_sum_CMU.head(n=1)

Unnamed: 0,Wikipedia_movie_ID,Movie_name,Actor gender,Movie_countxries,Movie_release_date,Character name,Movie_Summary,Female Percentage
0,975900,Ghosts of Mars,"[F, F, M, M, F, F, F, M, M, M, M, M, M, M, M, ...","{""/m/09c7w0"": ""United States of America""}",2001-08-24,"[Akooshay, Lieutenant Melanie Ballard, Desolat...","Set in the second half of the 22nd century, th...",35.294118


In [11]:
# Report how many movies remain after the merge and the sentinel filter.
print(f"Total movies combined: {len(char_sum_CMU)}")

Total movies combined: 37489


## Preprocessing movie summaries


In [9]:
# Run the project's text preprocessing on every plot summary and store the
# result next to the raw text.
# NOTE(review): what preprocess_text does is defined in utils -- see there.
clean_summaries = char_sum_CMU['Movie_Summary'].apply(utils.preprocess_text)
char_sum_CMU['Clean_Summary'] = clean_summaries

Preprocessing movie summaries...
Preprocessing completed.


In [16]:
# Destination for the preprocessed movie table (kept as a plain string so any
# later cell that uses this variable sees the same type as before).
preprocessed_file_path = 'data/pickles/preprocessed_movies.parquet'

# to_parquet does not create missing directories; make sure the target
# folder exists so the notebook also runs on a fresh checkout.
Path(preprocessed_file_path).parent.mkdir(parents=True, exist_ok=True)

print(f"Saving preprocessed data to {preprocessed_file_path}...")
# index=False: the row index is not written to the file.
char_sum_CMU.to_parquet(preprocessed_file_path, index=False)
print("Preprocessed data saved successfully.")


Saving preprocessed data to data/pickles/preprocessed_movies.parquet...
Preprocessed data saved successfully.
