In [1]:
import re
import json
import nltk
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from tqdm import tqdm

### Preprocess

In [111]:
# CHECK BEFORE RUNNING!!!
flag_preprocess = False # If false, skip the generation of a preprocessed corpus
flag_cluster_labels = False 
flag_cluster = False # If false, stop overwrite of a cluster matrix

In [3]:
# Define column names for movie.metadata.tsv
movie_metadata_columns = ["Wikipedia Movie ID", "Freebase Movie ID", "Movie name", "Movie release date",
                          "Movie box office revenue", "Movie runtime", "Movie languages", "Movie countries",
                          "Movie genres"]

# Read movie.metadata.tsv into a DataFrame
movie_metadata_df = pd.read_csv("data/movie.metadata.tsv", sep='\t', header=None, names=movie_metadata_columns)

In [4]:
def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    # Convert to lowercase and tokenize
    words = nltk.word_tokenize(text.lower())
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

# Open 'plot_summaries.txt'
with open('data/plot_summaries.txt', 'r', encoding='utf-8') as file:
    summaries = file.readlines()

# Extract movie IDs 
movie_ids = [int(summary.split()[0]) for summary in summaries]

# Preprocess the summaries 
preprocessed_summaries = []
if flag_preprocess:
    for summary in tqdm(summaries, desc="Processing summaries"):
        preprocessed_summaries.append(preprocess_text(summary.split('\t')[1]))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mathiaskroismoller/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mathiaskroismoller/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Save preprocessed summaries to a pkl file
import csv
if flag_preprocess:
    # Save preprocessed_summaries list to a file
    with open('preprocessed_summaries.pkl', 'wb') as file:
        pickle.dump(preprocessed_summaries, file)

In [6]:
# Combined stemmed summaries with labels 
summaries_preprocessed_dist = []
if flag_preprocess:
    for i in range(len(preprocessed_summaries)):
        summaries_preprocessed_dist.append({"id": movie_ids[i], "text": preprocessed_summaries[i]})
    
    # Save summaries_preprocessed_dist to a JSON file
    with open('summaries_preprocessed_dist.json', 'w') as f:
        json.dump(summaries_preprocessed_dist, f)

# Home made version TF/IDF vectorization

In [7]:
# Load preprocessed_summaries list from the saved file
with open('preprocessed_summaries.pkl', 'rb') as file:
    preprocessed_summaries = pickle.load(file)

In [8]:
# Convert text data into numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_summaries)


In [11]:
# Apply KMeans clustering
if flag_cluster_labels:
    num_clusters = 100  # You can adjust the number of clusters -> the more clusters the heavier the computation
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(tfidf_matrix) # This takes a loooong time...
    
    # Save cluster_labels to a file
    with open('cluster_labels.pkl', 'wb') as f:
        pickle.dump(cluster_labels, f)


  super()._check_params_vs_input(X, default_n_init=10)


In [26]:
# Load cluster_labels from a file
with open('cluster_labels.pkl', 'rb') as f:
    cluster_labels = pickle.load(f)

In [106]:
# Create a dictionary to store movie IDs and their corresponding cluster labels
movie_clusters = dict(zip(movie_ids, cluster_labels))

# Save the dictionary to a file using pickle
with open('movie_clusters_id.pkl', 'wb') as file:
    pickle.dump(movie_clusters, file)

In [107]:
# Create a DataFrame to store cluster information
cluster_df = pd.DataFrame(columns=[f'Cluster {i}' for i in range(num_clusters)])

# Iterate through movie clusters and match with movie_metadata_df
if flag_cluster:
    for movie_id, cluster_label in tqdm(movie_clusters.items()):
        # Find the corresponding row in movie_metadata_df based on Wikipedia Movie ID
        movie_row = movie_metadata_df[movie_metadata_df['Wikipedia Movie ID'] == movie_id]
        
        # Extract movie name from the row
        movie_name = movie_row['Movie name'].values[0] if not movie_row.empty else f'Movie {movie_id}'
        
        # Update cluster_df with movie name in the appropriate cluster column
        cluster_df.loc[len(cluster_df), f'Cluster {cluster_label}'] = movie_name
    
    
    # Fill NaN values with empty string for better representation
    cluster_df.fillna('', inplace=True)

100%|██████████| 42306/42306 [20:07<00:00, 35.04it/s]  


In [110]:
# Save cluster_df as CSV
if flag_cluster:
    cluster_df.to_csv('clustered_movies.csv', index=True)

In [82]:
# Load cluster_df from a file
cluster_df = pd.read_csv('clustered_movies.csv')

  cluster_df = pd.read_csv('clustered_movies.csv')


### Top 10 words pr cluster

In [0]:
def get_top_words_for_clusters(tfidf_matrix, cluster_labels, vectorizer, top_n=10):
    cluster_words = {}
    terms = vectorizer.get_feature_names_out()
    
    for cluster_label in range(max(cluster_labels) + 1):
        cluster_indices = np.where(cluster_labels == cluster_label)[0]
        cluster_tfidf_scores = np.sum(tfidf_matrix[cluster_indices], axis=0)
        sorted_indices = np.argsort(cluster_tfidf_scores)[0, ::-1][:top_n]
        top_words = [terms[i] for i in sorted_indices]
        cluster_words[cluster_label] = top_words
        
    return cluster_words

# Call the function to get cluster words
cluster_words = get_top_words_for_clusters(tfidf_matrix, cluster_labels, vectorizer)

In [63]:
cluster_words[0][0][0][:10]

array(['peter', 'love', 'man', 'wendy', 'spider', 'father', 'jack',
       'back', 'home', 'one'], dtype=object)

In [71]:
# Dataframe for top 1000 words ranked for every cluster
cluster_words_ranked = pd.DataFrame()

for i in range(len(cluster_words)):
    cluster_words_ranked[f'{i}'] = cluster_words[i][0][0][:1000]
    
cluster_words_ranked

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,peter,jesse,raju,school,henry,cite,agent,bill,one,love,...,vijay,jim,charles,joan,man,sylvester,harry,martin,bob,creature
1,love,james,karan,students,anne,web,president,new,police,falls,...,love,silver,katie,joseph,young,tweety,one,police,larry,monster
2,man,willy,amar,high,catherine,synopsis,united,life,two,marriage,...,ravi,father,edward,jack,old,cat,voldemort,wife,kevin,godzilla
3,wendy,ethan,vicky,teacher,sir,film,states,connie,life,girl,...,kumar,new,bella,life,one,granny,two,family,song,frankenstein
4,spider,celine,love,student,family,based,soviet,film,new,daughter,...,father,stifler,sir,father,wife,speedy,man,blomkvist,however,dragon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,staff,moving,sell,yuko,shakespeare,classic,opens,ever,youth,aristocrat,...,birth,cop,ceremony,encounter,jealous,vacation,feet,words,detective,close
996,lomax,radio,koya,post,accepts,sik,especially,honesty,pass,satya,...,indeed,eye,quickly,famine,dave,exhibit,pc,engineer,tree,cold
997,mcnally,pull,millionaire,reluctant,befriends,murderer,married,goldie,program,present,...,motley,protecting,neighbor,dinner,action,holding,elevator,sexual,search,liquid
998,nurse,shadow,microdot,naomi,realize,train,interviews,jackie,miller,american,...,kallu,suddenly,health,bloodthirsty,stage,mission,states,highwaymen,rob,weapons


In [72]:
cluster_words_ranked.to_csv('cluster_words.csv', index=False, header=True)

# Rating

In [117]:
# Load cluster_labels from a file
with open('movie_clusters_id.pkl', 'rb') as f:
    movie_clusters_id = pickle.load(f)
    
# Load cluster_df from a file
TMDB = pd.read_csv('data/movie_metadata_TMDB.csv')
TMDB

Unnamed: 0.1,Unnamed: 0,Wikipedia Movie ID,Freebase Movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages,Movie countries,Movie genres,TMDB_id,TMDB_original_language,TMDB_original_title,TMDB_overview,TMDB_popularity,TMDB_release_date,TMDB_title,TMDB_vote_average,TMDB_vote_count
0,0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",10016.0,en,Ghosts of Mars,"In 2176, a Martian police unit is sent to pick...",17.280,2001-08-24,Ghosts of Mars,5.123,980.0
1,1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...",784579.0,en,Getting Away with Murder: The JonBenét Ramsey ...,Dramatization of the story behind the murder o...,0.750,2000-02-16,Getting Away with Murder: The JonBenét Ramsey ...,8.000,1.0
2,2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",396302.0,no,Brun bitter,A stolen bicycle case ends with drunken detect...,0.600,1988-11-17,Hair of the Dog,0.000,0.0
3,3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...",33592.0,en,White of the Eye,"In a wealthy and isolated desert community, a ...",7.336,1987-06-19,White of the Eye,5.742,64.0
4,4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}",11192.0,de,Die flambierte Frau,"Eva, an upper-class housewife, frustratedly le...",2.397,1983-05-11,A Woman in Flames,5.300,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81736,81736,35228177,/m/0j7hxnt,Mermaids: The Body Found,2011-03-19,,120.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}",117124.0,en,Mermaids: The Body Found,A story that imagines how these real-world phe...,5.098,2011-03-19,Mermaids: The Body Found,4.500,20.0
81737,81737,34980460,/m/0g4pl34,Knuckle,2011-01-21,,96.0,"{""/m/02h40lc"": ""English Language""}","{""/m/03rt9"": ""Ireland"", ""/m/07ssc"": ""United Ki...","{""/m/03bxz7"": ""Biographical film"", ""/m/07s9rl0...",44946.0,en,Knucklehead,A fight promoter deeply in debt to his crooked...,9.789,2010-10-22,Knucklehead,5.500,50.0
81738,81738,9971909,/m/02pygw1,Another Nice Mess,1972-09-22,,66.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/06nbt"": ""Satire"", ""/m/01z4y"": ""Comedy""}",285337.0,en,Another Nice Mess,Nixon and Agnew played as Laurel and Hardy.,1.960,1972-08-23,Another Nice Mess,0.000,0.0
81739,81739,913762,/m/03pcrp,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ...",,,,,,,,,


In [123]:
movie_ratings_cleand = TMDB['TMDB_vote_average']
movie_ratings_cleand

0        5.123
1        8.000
2        0.000
3        5.742
4        5.300
         ...  
81736    4.500
81737    5.500
81738    0.000
81739      NaN
81740    6.000
Name: TMDB_vote_average, Length: 81741, dtype: float64

In [122]:
cluster_counts = {}

for cluster_label in movie_clusters.values():
    cluster_counts[cluster_label] = cluster_counts.get(cluster_label, 0) + 1

cluster_counts

{9: 2215,
 8: 9340,
 74: 4716,
 85: 191,
 70: 104,
 3: 865,
 27: 275,
 34: 82,
 12: 2088,
 75: 1183,
 0: 174,
 25: 1219,
 86: 90,
 39: 223,
 33: 141,
 96: 168,
 32: 91,
 29: 91,
 18: 87,
 59: 399,
 84: 201,
 67: 124,
 83: 95,
 36: 596,
 37: 125,
 56: 127,
 58: 119,
 23: 1122,
 42: 262,
 19: 126,
 44: 923,
 53: 723,
 14: 785,
 73: 91,
 93: 77,
 43: 208,
 76: 148,
 69: 454,
 97: 101,
 6: 691,
 22: 137,
 72: 476,
 49: 78,
 15: 665,
 99: 265,
 94: 995,
 48: 266,
 47: 158,
 2: 163,
 24: 655,
 66: 214,
 13: 205,
 60: 184,
 5: 467,
 65: 307,
 21: 205,
 28: 275,
 87: 189,
 95: 60,
 63: 121,
 98: 107,
 64: 273,
 45: 102,
 31: 243,
 50: 159,
 61: 91,
 92: 187,
 20: 223,
 89: 426,
 79: 106,
 80: 64,
 46: 205,
 62: 244,
 51: 98,
 7: 138,
 40: 111,
 78: 104,
 68: 60,
 55: 144,
 41: 145,
 82: 70,
 4: 106,
 17: 102,
 11: 139,
 91: 142,
 16: 108,
 88: 155,
 52: 149,
 71: 189,
 54: 65,
 77: 102,
 35: 95,
 1: 58,
 38: 83,
 90: 96,
 81: 122,
 57: 146,
 26: 107,
 30: 33,
 10: 84}

In [None]:
cluster_rating_df = 

# NPL version (Nothing yet)

In [85]:
# Load summaries_preprocessed_dist from a JSON file
with open('summaries_preprocessed_dist.json', 'r') as f:
    summaries_preprocessed_dist = json.load(f)

In [63]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [64]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [86]:
tokenized_sum = summaries_preprocessed_dist.map(preprocess_function, batched=True)

AttributeError: 'list' object has no attribute 'map'

# Enriching data
#### 1. Adding data from www.themoviedb.org (TMDB)

In this section we are going to be using the API of www.themoviedb.org (TMDB) to populate the existing dataframe with more information about the movie. The script is going to query the website with the movie name and match with the movie, where the release date is the closest. If no match is found the code skips the movie and goes to the next.

This 1. step is most importantly adding the TMDB_ID which is a unique identifier for the movie which can be used to further enrich the data.

This script was run separately in another file in /scratch/add_TMDB_movie_metadata.py as it needed to be run several times since the API was limited by 40 requests/s.
 


In [10]:
import tmdbsimple as tmdb # Wrapper library for the API of themoviedb.org  (TMDB)
from tqdm import tqdm # Progress bar for the script
import pandas as pd
from datetime import datetime # Used for comparing movie release dates
from dotenv import load_dotenv # Makes keeping the API-key as local environment file simpler
import os # Used for loading the .env file

load_dotenv() # Loads .env files aka. the TMDB_API_KEY

headers_movie_metadata = ["Wikipedia Movie ID", "Freebase Movie ID", "Movie name", "Movie release date",
                          "Movie box office revenue", "Movie runtime", "Movie languages", "Movie countries",
                          "Movie genres"]
movie_metadata = pd.read_csv('/data/movie.metadata.tsv', sep="\t", names=headers_movie_metadata)


# Load API key
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
tmdb.API_KEY = TMDB_API_KEY
tmdb.REQUESTS_TIMEOUT = 5  # Seconds, for both connect and read

# Create a list to save progress
saved_progress = []

# Determine where to resume
start_index = 28000

# Progress file that can be used to resume
#saved_progress = pd.read_json('progress.json')['index'].tolist()
#start_index = saved_progress[-1] + 1  # Start from the next index

# Create a DataFrame to store the data
movie_metadata_TMDB = movie_metadata.copy()

for index, row in tqdm(movie_metadata_TMDB.iterrows(), total=len(movie_metadata_TMDB), desc="Processing"): # Wraps for loop in progress bar.
    
    # Skip previously processed indices - Commented out in favor of manual start_index
    #if index in saved_progress:
    #    continue
    if index < start_index:
        continue
    try:
        if not pd.isna(row["Movie release date"]):
            search = tmdb.Search()
            response = search.movie(query=row["Movie name"])

            # Convert dataframe release date to datetime
            movie_release_date_str = row["Movie release date"]
            if len(movie_release_date_str) == 4:  # Handle "YYYY" format
                movie_release_date = datetime.strptime(movie_release_date_str, "%Y").date()
            elif len(movie_release_date_str) == 7:  # Handle "YYYY-DD" format
                movie_release_date = datetime.strptime(movie_release_date_str, "%Y-%m").date()
            else:  # Assume it's in the format "YYYY-MM-DD"
                movie_release_date = datetime.strptime(movie_release_date_str, "%Y-%m-%d").date()

            # Handle if release_date is empty.
            date_list_converted = [datetime.strptime(each_date['release_date'], "%Y-%m-%d").date() for each_date in
                                   search.results if each_date.get('release_date')]

            # Create list of differences in time
            differences = [abs(movie_release_date - each_date) for each_date in date_list_converted]
            
            # If differences are empty = skip
            if not differences:
                continue
            minimum_index = differences.index(min(differences))  # Index of the closest match
            match = search.results[minimum_index]
            # print(f"Closest match: {match['title']} (Release Date: {match['release_date']})")

            # Add info in dataframe about the movie
            movie_metadata_TMDB.loc[index, 'TMDB_id'] = match['id']
            movie_metadata_TMDB.loc[index, 'TMDB_original_language'] = match['original_language']
            movie_metadata_TMDB.loc[index, 'TMDB_original_title'] = match['original_title']
            movie_metadata_TMDB.loc[index, 'TMDB_overview'] = match['overview']
            movie_metadata_TMDB.loc[index, 'TMDB_popularity'] = match['popularity']
            movie_metadata_TMDB.loc[index, 'TMDB_release_date'] = match['release_date']
            movie_metadata_TMDB.loc[index, 'TMDB_title'] = match['title']
            movie_metadata_TMDB.loc[index, 'TMDB_vote_average'] = match['vote_average']
            movie_metadata_TMDB.loc[index, 'TMDB_vote_count'] = match['vote_count']
            
            # Save the index as progress
            saved_progress.append(index)

            # Save progress periodically (in case of interruption)
            if index % 50 == 0:
                progress_df = pd.DataFrame({'index': saved_progress})
                progress_df.to_json('progress.json')
                movie_metadata_TMDB.to_csv('movie_metadata_TMDB.csv', index=False)

    except Exception as e:
        print(f"Error at index {index}: {str(e)}")

# Save final progress
progress_df = pd.DataFrame({'index': saved_progress})
progress_df.to_json('progress.json')

# Save your final DataFrame
movie_metadata_TMDB.to_csv('modified_data/movie_metadata_TMDB.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/data/movie.metadata.tsv'

In [3]:
# Comparing how many rows were populated in the enriching.
import pandas as pd
df = pd.read_csv('modified_data/movie_metadata_TMDB.csv')
print(f"Added TMDB ID to {df['TMDB_id'].count()} movies. Total movies: {df['Wikipedia Movie ID'].count()}. \n Percentage populated: {round(df['TMDB_id'].count()/df['Wikipedia Movie ID'].count()*100,2)}%")


Added TMDB ID to 68944 movies. Total movies: 81741. 
 Percentage populated: 84.34%


#### 2. Adding Movie Revenue
Many of the movies are missing their revenue and TMDB has that information.
E.g.: https://www.themoviedb.org/movie/51284