In [1]:
import re
import json
import nltk
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from tqdm import tqdm

### Preprocess

In [74]:
# CHECK BEFORE RUNNING!!!
flag_preprocess = False # If false, skip the generation of a preprocessed corpus
flag_cluster_labels = False 
flag_cluster = False # If false, stop overwrite of a cluster matrix

In [3]:
# Define column names for movie.metadata.tsv
movie_metadata_columns = ["Wikipedia Movie ID", "Freebase Movie ID", "Movie name", "Movie release date",
                          "Movie box office revenue", "Movie runtime", "Movie languages", "Movie countries",
                          "Movie genres"]

# Read movie.metadata.tsv into a DataFrame
movie_metadata_df = pd.read_csv("data/movie.metadata.tsv", sep='\t', header=None, names=movie_metadata_columns)

In [4]:
# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Remove special characters and digits
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)
    # Convert to lowercase and tokenize
    words = nltk.word_tokenize(text.lower())
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

# Assuming your movie plot summaries are stored in 'summaries.txt'
with open('data/plot_summaries.txt', 'r', encoding='utf-8') as file:
    summaries = file.readlines()

# Extract movie IDs 
movie_ids = [int(summary.split()[0]) for summary in summaries]

# Preprocess the summaries with a progress bar
preprocessed_summaries = []
if flag_preprocess:
    for summary in tqdm(summaries, desc="Processing summaries"):
        preprocessed_summaries.append(preprocess_text(summary.split('\t')[1]))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mathiaskroismoller/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mathiaskroismoller/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Save preprocessed summaries to a pkl file
import csv
if flag_preprocess:
    # Save preprocessed_summaries list to a file
    with open('preprocessed_summaries.pkl', 'wb') as file:
        pickle.dump(preprocessed_summaries, file)

In [6]:
# Combined stemmed summaries with labels 
summaries_dist = []
if flag_preprocess:
    for i in range(len(preprocessed_summaries)):
        summaries_dist.append({"label": movie_ids[i], "text": preprocessed_summaries[i]})
    
    # Save summaries_dist to a JSON file
    with open('summaries_dist.json', 'w') as f:
        json.dump(summaries_dist, f)

# Home made version TF/IDF vectorization

In [7]:
# Load preprocessed_summaries list from the saved file
with open('preprocessed_summaries.pkl', 'rb') as file:
    preprocessed_summaries = pickle.load(file)

In [8]:
# Convert text data into numerical features using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_summaries)


In [11]:
# Apply KMeans clustering
if flag_cluster_labels:
    num_clusters = 100  # You can adjust the number of clusters as needed
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(tfidf_matrix)
    
    # Save cluster_labels to a file
    with open('cluster_labels.pkl', 'wb') as f:
        pickle.dump(cluster_labels, f)


  super()._check_params_vs_input(X, default_n_init=10)


In [26]:
# Load cluster_labels from a file
with open('cluster_labels.pkl', 'rb') as f:
    cluster_labels = pickle.load(f)

In [14]:
# Create a dictionary to store movie IDs and their corresponding cluster labels
movie_clusters = dict(zip(movie_ids, cluster_labels))

# Print movie IDs and their corresponding cluster labels
for movie_id, cluster_label in movie_clusters.items():
    print(f"Movie ID: {movie_id}, Cluster Label: {cluster_label}")

Movie ID: 23890098, Cluster Label: 9
Movie ID: 31186339, Cluster Label: 8
Movie ID: 20663735, Cluster Label: 8
Movie ID: 2231378, Cluster Label: 74
Movie ID: 595909, Cluster Label: 85
Movie ID: 5272176, Cluster Label: 70
Movie ID: 1952976, Cluster Label: 74
Movie ID: 24225279, Cluster Label: 3
Movie ID: 2462689, Cluster Label: 27
Movie ID: 20532852, Cluster Label: 34
Movie ID: 15401493, Cluster Label: 74
Movie ID: 18188932, Cluster Label: 8
Movie ID: 2940516, Cluster Label: 12
Movie ID: 1335380, Cluster Label: 75
Movie ID: 1480747, Cluster Label: 74
Movie ID: 24448645, Cluster Label: 74
Movie ID: 15072401, Cluster Label: 8
Movie ID: 4018288, Cluster Label: 3
Movie ID: 4596602, Cluster Label: 8
Movie ID: 15224586, Cluster Label: 75
Movie ID: 15585766, Cluster Label: 0
Movie ID: 1760737, Cluster Label: 8
Movie ID: 29062594, Cluster Label: 3
Movie ID: 29326153, Cluster Label: 12
Movie ID: 9252321, Cluster Label: 25
Movie ID: 8388648, Cluster Label: 9
Movie ID: 10644072, Cluster Label: 8
M

In [78]:
# Create a DataFrame to store cluster information
cluster_df = pd.DataFrame(columns=[f'Cluster {i}' for i in range(num_clusters)])

# Iterate through movie clusters and match with movie_metadata_df
if flag_cluster:
    for movie_id, cluster_label in tqdm(movie_clusters.items()):
        # Find the corresponding row in movie_metadata_df based on Wikipedia Movie ID
        movie_row = movie_metadata_df[movie_metadata_df['Wikipedia Movie ID'] == movie_id]
        
        # Extract movie name from the row
        movie_name = movie_row['Movie name'].values[0] if not movie_row.empty else f'Movie {movie_id}'
        
        # Update cluster_df with movie name in the appropriate cluster column
        cluster_df.loc[len(cluster_df), f'Cluster {cluster_label}'] = movie_name
    
    
    # Fill NaN values with empty string for better representation
    cluster_df.fillna('', inplace=True)

100%|██████████| 42306/42306 [18:24<00:00, 38.32it/s]


In [79]:
# Save cluster_df as CSV
if flag_cluster:
    cluster_df.to_csv('clustered_movies.csv', index=True)

In [96]:
# Load cluster_df from a file
cluster_df = pd.read_csv('clustered_movies.csv')

def transform_dataframe(df):
    # Replace NaN values with np.nan (optional if you want to keep np.nan instead of removing them)
    df = df.replace(np.nan, pd.NA)
    
    # Drop columns with NaN values
    df_cleaned = df.dropna(axis=1)
    
    return df_cleaned

cluster_df = transform_dataframe(cluster_df)


  cluster_df = pd.read_csv('clustered_movies.csv')


In [90]:
# Save cluster_df as CSV
if flag_cluster:
    cluster_df.to_csv('clustered_movies_test.csv', index=True)

In [82]:
# Load cluster_df from a file
cluster_df = pd.read_csv('clustered_movies.csv')

  cluster_df = pd.read_csv('clustered_movies.csv')


### Top 10 words pr cluster

In [0]:
def get_top_words_for_clusters(tfidf_matrix, cluster_labels, vectorizer, top_n=10):
    cluster_words = {}
    terms = vectorizer.get_feature_names_out()
    
    for cluster_label in range(max(cluster_labels) + 1):
        cluster_indices = np.where(cluster_labels == cluster_label)[0]
        cluster_tfidf_scores = np.sum(tfidf_matrix[cluster_indices], axis=0)
        sorted_indices = np.argsort(cluster_tfidf_scores)[0, ::-1][:top_n]
        top_words = [terms[i] for i in sorted_indices]
        cluster_words[cluster_label] = top_words
        
    return cluster_words

# Call the function to get cluster words
cluster_words = get_top_words_for_clusters(tfidf_matrix, cluster_labels, vectorizer)

In [63]:
cluster_words[0][0][0][:10]

array(['peter', 'love', 'man', 'wendy', 'spider', 'father', 'jack',
       'back', 'home', 'one'], dtype=object)

In [71]:
# Dataframe for top 1000 words ranked for every cluster
cluster_words_ranked = pd.DataFrame()

for i in range(len(cluster_words)):
    cluster_words_ranked[f'{i}'] = cluster_words[i][0][0][:1000]
    
cluster_words_ranked

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,peter,jesse,raju,school,henry,cite,agent,bill,one,love,...,vijay,jim,charles,joan,man,sylvester,harry,martin,bob,creature
1,love,james,karan,students,anne,web,president,new,police,falls,...,love,silver,katie,joseph,young,tweety,one,police,larry,monster
2,man,willy,amar,high,catherine,synopsis,united,life,two,marriage,...,ravi,father,edward,jack,old,cat,voldemort,wife,kevin,godzilla
3,wendy,ethan,vicky,teacher,sir,film,states,connie,life,girl,...,kumar,new,bella,life,one,granny,two,family,song,frankenstein
4,spider,celine,love,student,family,based,soviet,film,new,daughter,...,father,stifler,sir,father,wife,speedy,man,blomkvist,however,dragon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,staff,moving,sell,yuko,shakespeare,classic,opens,ever,youth,aristocrat,...,birth,cop,ceremony,encounter,jealous,vacation,feet,words,detective,close
996,lomax,radio,koya,post,accepts,sik,especially,honesty,pass,satya,...,indeed,eye,quickly,famine,dave,exhibit,pc,engineer,tree,cold
997,mcnally,pull,millionaire,reluctant,befriends,murderer,married,goldie,program,present,...,motley,protecting,neighbor,dinner,action,holding,elevator,sexual,search,liquid
998,nurse,shadow,microdot,naomi,realize,train,interviews,jackie,miller,american,...,kallu,suddenly,health,bloodthirsty,stage,mission,states,highwaymen,rob,weapons


In [72]:
cluster_words_ranked.to_csv('cluster_words.csv', index=False, header=True)

# NPL version (Nothing yet)

In [85]:
# Load summaries_dist from a JSON file
with open('summaries_dist.json', 'r') as f:
    summaries_dist = json.load(f)

In [63]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [64]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [86]:
tokenized_sum = summaries_dist.map(preprocess_function, batched=True)

AttributeError: 'list' object has no attribute 'map'