In [1]:
import pandas as pd
import os

In [2]:
df = pd.read_csv(os.path.abspath("../data/processed/movie_dataset_processed.csv"))

In [10]:
# Count the missing values in every column.
# The feature-engineering script ("feature eng.py") shouldn't save the row index
# anymore -- on reload that index shows up as the "Unnamed: 0" column.
nulls_per_col = df.isna().sum()
print(nulls_per_col)

id                    0
title                 0
vote_average          0
vote_count            0
runtime               0
overview           7644
popularity            0
tagline          151349
genres            26826
keywords         116259
poster_file       43094
backdrop_file    122301
release_year      17176
month_sin         17176
month_cos         17176
dtype: int64


In [4]:
df.columns

Index(['Unnamed: 0', 'id', 'title', 'vote_average', 'vote_count', 'runtime',
       'overview', 'popularity', 'tagline', 'genres', 'keywords',
       'poster_file', 'backdrop_file', 'release_year', 'month_sin',
       'month_cos'],
      dtype='object')

In [6]:
df['Unnamed: 0']

0              0
1              1
2              2
3              3
4              4
           ...  
230269    230269
230270    230270
230271    230271
230272    230272
230273    230273
Name: Unnamed: 0, Length: 230274, dtype: int64

In [7]:
# Remove the stray "Unnamed: 0" column (a copy of the row index, 0..N-1).
# errors='ignore' keeps this cell idempotent: re-running it once the column is
# already gone (e.g. after reloading the cleaned CSV) no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [8]:
df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'runtime', 'overview',
       'popularity', 'tagline', 'genres', 'keywords', 'poster_file',
       'backdrop_file', 'release_year', 'month_sin', 'month_cos'],
      dtype='object')

In [11]:
df.to_csv(os.path.abspath("../data/processed/movie_dataset_processed.csv"), index=False)

In [12]:
df = pd.read_csv(os.path.abspath("../data/processed/movie_dataset_processed.csv"))

In [13]:
df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'runtime', 'overview',
       'popularity', 'tagline', 'genres', 'keywords', 'poster_file',
       'backdrop_file', 'release_year', 'month_sin', 'month_cos'],
      dtype='object')

In [5]:
nulls_per_col  = df.isnull().sum()
print(nulls_per_col)

id                    0
title                 0
vote_average          0
vote_count            0
runtime               0
overview           7644
popularity            0
tagline          151349
genres            26826
keywords         116259
poster_file       43094
backdrop_file    122301
release_year      17176
month_sin         17176
month_cos         17176
dtype: int64


In [6]:
# For every column, count the entries that repeat a value already seen
# earlier in that same column.
def _count_repeats(column):
    """Return how many entries of ``column`` are flagged by ``duplicated()``."""
    return column.duplicated().sum()


duplicates_per_col = df.apply(_count_repeats)
print(duplicates_per_col)


id                    0
title             14300
vote_average     225539
vote_count       226776
runtime          229735
overview           8880
popularity       212695
tagline          152626
genres           221630
keywords         146069
poster_file       43093
backdrop_file    122300
release_year     230133
month_sin        230262
month_cos        230263
dtype: int64


In [7]:
# 'poster_file' values of every row that has at least one missing field.
has_any_null = df.isna().any(axis=1)
null_rows = df.loc[has_any_null, 'poster_file']

In [9]:
print(null_rows) # sanity check: something felt fishy about these rows, but the values look fine

109       C:\Users\ds3\Desktop\mth4370_group_project\ml\...
141       C:\Users\ds3\Desktop\mth4370_group_project\ml\...
181       C:\Users\ds3\Desktop\mth4370_group_project\ml\...
221       C:\Users\ds3\Desktop\mth4370_group_project\ml\...
234       C:\Users\ds3\Desktop\mth4370_group_project\ml\...
                                ...                        
230269                                                  NaN
230270                                                  NaN
230271                                                  NaN
230272                                                  NaN
230273                                                  NaN
Name: poster_file, Length: 189648, dtype: object


In [5]:
df.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'runtime', 'overview',
       'popularity', 'tagline', 'genres', 'keywords', 'poster_file',
       'backdrop_file', 'release_year', 'month_sin', 'month_cos'],
      dtype='object')

In [12]:
df['genres'].unique()

array(['Action, Science Fiction, Adventure',
       'Adventure, Drama, Science Fiction',
       'Drama, Action, Crime, Thriller', ...,
       'Comedy, Mystery, Crime, Family, Adventure',
       'Mystery, Thriller, Adventure, Romance',
       'Western, Action, Adventure, Science Fiction'],
      shape=(8644,), dtype=object)

In [19]:
# Keep only the rows whose 'genres' string contains no comma, i.e. rows that
# list exactly one genre, then tally how often each of those genres occurs.
# Rows with a missing 'genres' value compare as False and are excluded.
comma_counts = df['genres'].str.count(',')
single_word_genres = df[comma_counts.eq(0)]
single_word_counts = single_word_genres['genres'].value_counts()
print(single_word_counts)


genres
Documentary        40528
Drama              18132
Comedy             13517
Music               8363
Horror              7620
Animation           4873
Action              2876
Thriller            2451
Western             2348
Science Fiction     1331
Family               776
Crime                707
Romance              646
Adventure            492
Mystery              478
Fantasy              448
TV Movie             191
War                  159
History              143
Name: count, dtype: int64


In [20]:
len(single_word_counts)

19

In [26]:
df['keywords'].value_counts()

keywords
short film                                                                                                                                                                                                             2778
woman director                                                                                                                                                                                                         2381
stand-up comedy                                                                                                                                                                                                        1399
concert                                                                                                                                                                                                                 761
boxing                                                                                                         

In [33]:
import re

def clean_text(x):
    """Normalise a free-text field into a single tidy line.

    The cleaning steps are:
      1. Treat missing values (falsy values, or anything whose string form
         is "nan" in any letter case) as empty.
      2. Collapse every run of whitespace -- spaces, tabs, newlines -- into
         one space.
      3. Collapse repeated '.', ',', '!' or '?' into a single character.
      4. Strip leading/trailing spaces and periods.

    Args:
        x: Raw value. May be None, NaN, an empty string, or any object that
            can be converted with ``str()``.

    Returns:
        str: The cleaned text, or "" when the value is missing.
    """
    # None, "" and 0 are falsy; a float NaN is truthy, so it is caught below
    # through its string form instead.
    if not x:
        return ""

    text = str(x)
    # Ignore surrounding whitespace when looking for the "nan" sentinel, the
    # same way split_keywords does.
    if text.strip().lower() == "nan":
        return ""

    # "\s+" already matches tabs and newlines, so no separate replace is needed.
    text = re.sub(r"\s+", " ", text)

    # "...." -> ".", ",," -> ",", "!!" -> "!", "??" -> "?"
    text = re.sub(r"([.,!?])\1+", r"\1", text)

    # Strip spaces and periods together. Stripping them one after the other
    # left stray whitespace behind ("Hello ." -> "Hello ", "... Wait" -> " Wait").
    return text.strip(" .")

In [34]:
import pickle

# Load the pickled lookup tables.
# NOTE: pickle.load can execute arbitrary code -- only open files you produced yourself.
with open(os.path.abspath('../data/processed/mappings.pkl'), 'rb') as f:
    mappings = pickle.load(f)

movie_db = mappings['movie_database']

# Print the combined "title. tagline. overview." text of the first 7 movies.
for r, (tmdb_id, movie) in enumerate(movie_db.items()):
    idx = mappings['tmdb_to_idx'][tmdb_id]  # Get array index

    title = clean_text(movie.get("title"))
    tagline = clean_text(movie.get("tagline"))
    overview = clean_text(movie.get("overview"))

    # Keep only the non-empty fields, each terminated by a period.
    parts = [f"{field}." for field in (title, tagline, overview) if field]
    text = " ".join(parts)

    print(text)
    print()

    if r > 5:
        break

Shadows in Paradise. Nikander, a rubbish collector and would-be entrepreneur finds his plans for success dashed when his business associate dies. One evening, he meets Ilona, a down-on-her luck cashier in a local supermarket—and, falteringly, a bond begins to develop between them.

Four Rooms. Twelve outrageous guests. Four scandalous requests. And one lone bellhop, in his first day on the job, who's in for the wildest New year's Eve of his life. It's Ted the Bellhop's first night on the job.and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another.

Judgment Night. Don't move. Don't whisper. Don't even breathe. While racing to a boxing match, Frank, Mike, John and Rey get more than they bargained for. A wrong turn lands them directly in the path of Fallon, a vicious, wise-cracking drug lord. After accidentally witnessing Fallon murder a disloyal henchman, 

In [40]:
df['keywords'].value_counts()

keywords
short film                                                                                                                                                                                                             2778
woman director                                                                                                                                                                                                         2381
stand-up comedy                                                                                                                                                                                                        1399
concert                                                                                                                                                                                                                 761
boxing                                                                                                         

In [41]:
def split_keywords(x):
    """Turn a comma-separated keyword string into a list of trimmed keywords.

    Args:
        x: Raw keyword value; a falsy value or anything whose stripped string
            form is "nan" (any letter case) counts as missing.

    Returns:
        list: The non-empty keywords with surrounding whitespace removed, in
        their original order; an empty list when the value is missing.
    """
    is_missing = (not x) or str(x).strip().lower() == "nan"
    if is_missing:
        return []

    trimmed = (piece.strip() for piece in x.split(","))
    return [keyword for keyword in trimmed if keyword]

In [42]:
# Show the parsed keyword list for the rows labelled 0 through 4.
for i in range(5):
    raw_keywords = df['keywords'][i]
    parsed_keywords = split_keywords(raw_keywords)
    print(parsed_keywords)
    print()


['rescue', 'mission', 'dream', 'airplane', 'paris', 'france', 'virtual reality', 'kidnapping', 'philosophy', 'spy', 'allegory', 'manipulation', 'car crash', 'heist', 'memory', 'architecture', 'los angeles', 'california', 'dream world', 'subconscious']

['rescue', 'future', 'spacecraft', 'race against time', 'artificial intelligence (a.i.)', 'nasa', 'time warp', 'dystopia', 'expedition', 'space travel', 'wormhole', 'famine', 'black hole', 'quantum mechanics', 'family relationships', 'space', 'robot', 'astronaut', 'scientist', 'single father', 'farmer', 'space station', 'curious', 'space adventure', 'time paradox', 'thoughtful', 'time-manipulation', 'father daughter relationship', '2060s', 'cornfield', 'time manipulation', 'complicated']

['joker', 'sadism', 'chaos', 'secret identity', 'crime fighter', 'superhero', 'anti hero', 'scarecrow', 'based on comic', 'vigilante', 'organized crime', 'tragic hero', 'anti villain', 'criminal mastermind', 'district attorney', 'super power', 'super vi

In [3]:
import torch
from typing import Optional
from ml.src.processing.keywords_encoder import WordEmbedding


def precompute_keywords(movie, word2vec : WordEmbedding) -> Optional[torch.Tensor]:
    """Mean-pool the embeddings of a movie's keywords into one vector.

    Args:
        movie: Mapping that may contain a "keywords" entry.
        word2vec: Embedding helper; its ``split_keywords`` and
            ``get_embedding`` methods are used here.

    Returns:
        The mean over dim 0 of the stacked keyword embeddings, or None when
        ``word2vec.split_keywords`` yields no keywords.
    """
    # NOTE(review): per the original author's comment, split_keywords keeps only
    # keywords found in GloVe 840B.300d (so each vector would be 300-d) --
    # not verifiable from this notebook; confirm in keywords_encoder.
    keyword_list = word2vec.split_keywords(movie.get("keywords"))

    vectors = []
    for keyword in keyword_list:
        vectors.append(word2vec.get_embedding(keyword))

    if len(vectors) == 0:
        return None

    # Stack to [num_keywords, embedding_dim], then average over the keywords.
    return torch.vstack(vectors).mean(dim=0)

In [4]:
import pickle

# Load the pickled lookup tables.
# NOTE: pickle.load can execute arbitrary code -- only open files you produced yourself.
with open(os.path.abspath('../data/processed/mappings.pkl'), 'rb') as f:
    mappings = pickle.load(f)

movie_db = mappings['movie_database']

# Create the embedding helper once. It was previously constructed inside the
# loop, i.e. rebuilt from scratch for every single movie.
word2vec = WordEmbedding()

# Print the pooled keyword embedding of the first 7 movies.
for r, (tmdb_id, movie) in enumerate(movie_db.items()):
    idx = mappings['tmdb_to_idx'][tmdb_id]  # Get array index

    print(precompute_keywords(movie, word2vec))

    if r > 5:
        break

tensor([ 0.2332, -0.1933, -0.0902, -0.0883,  0.2440,  0.0048,  0.0014, -0.4147,
        -0.1488, -0.0182, -0.2916, -0.3798, -0.1828,  0.2009, -0.1921, -0.1870,
         0.0059, -0.0425,  0.6854,  0.0533,  0.2627,  0.1330,  0.1853, -0.0688,
         0.0696, -0.1272, -0.1721, -0.1609, -0.1641, -0.0333,  0.2355, -0.2496,
        -0.1106,  0.2737, -0.0756,  0.3016,  0.2197, -0.0476, -0.1034,  0.2099,
        -0.0607, -0.0055, -0.3560,  0.0256, -0.0143, -0.0582,  0.2256, -0.3117,
        -0.1128,  0.0598, -0.2542,  0.0606,  0.0863, -0.3029,  0.2423, -0.1183,
         0.1379, -0.2920,  0.0785, -0.1632, -0.3983, -0.4470, -0.0895, -0.5394,
        -0.1614,  0.2292,  0.1488,  0.2111, -0.0151,  0.0536, -0.1808,  0.3831,
        -0.2616, -0.0400,  0.2142,  0.1587, -0.0895, -0.1163,  0.1232,  0.2642,
         0.0335, -0.1594,  0.0580, -0.0579, -0.1183,  0.0337,  0.6440, -0.1798,
        -0.3185, -0.0087, -0.1266, -0.2166, -0.1211, -0.1020,  0.0969,  0.0182,
        -0.2959,  0.0847,  0.0911,  0.10

In [47]:
df['genres'].value_counts()

genres
Documentary                                          40528
Drama                                                18132
Comedy                                               13517
Music                                                 8363
Horror                                                7620
                                                     ...  
Animation, Adventure, Comedy, Romance, Thriller          1
Music, Comedy, Family, Animation                         1
Animation, Comedy, Drama, Action                         1
Animation, Science Fiction, Adventure, Mystery           1
Mystery, Western, Action, Crime, Thriller, Comedy        1
Name: count, Length: 8643, dtype: int64

In [48]:
def _split_genre_string(genre_string):
    """Split a comma-separated genre string into whitespace-trimmed names."""
    return [name.strip() for name in genre_string.split(',')]


# One list of genre names per row that has a 'genres' value.
all_genres = df['genres'].dropna().apply(_split_genre_string)

# Flatten the per-row lists into a single list, then de-duplicate.
flat_genres = []
for genre_list in all_genres:
    flat_genres.extend(genre_list)
unique_genres = set(flat_genres)

print(f"Number of unique genres: {len(unique_genres)}")
print(unique_genres)

Number of unique genres: 19
{'Science Fiction', 'Thriller', 'Mystery', 'History', 'Adventure', 'Music', 'Western', 'Crime', 'Horror', 'TV Movie', 'War', 'Animation', 'Drama', 'Action', 'Comedy', 'Fantasy', 'Family', 'Romance', 'Documentary'}


In [52]:
import numpy as np
import pandas as pd

# The 19 distinct genre names found in the dataset, in sorted order.
# A genre's position in this list is its slot in the one-hot vector.
ALL_GENRES = [
    'Action',
    'Adventure',
    'Animation',
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Family',
    'Fantasy',
    'History',
    'Horror',
    'Music',
    'Mystery',
    'Romance',
    'Science Fiction',
    'TV Movie',
    'Thriller',
    'War',
    'Western',
]

# Lookup table: genre name -> index into ALL_GENRES.
GENRE_IDX = dict(zip(ALL_GENRES, range(len(ALL_GENRES))))

# Reload the pickled lookup tables and pull out the per-movie records.
# NOTE: pickle.load can execute arbitrary code -- only open files you produced yourself.
mappings_path = os.path.abspath('../data/processed/mappings.pkl')
with open(mappings_path, 'rb') as f:
    mappings = pickle.load(f)

movie_db = mappings['movie_database']

def one_hot_encode_genres(movie) -> Optional[torch.Tensor]:
    """Encode a movie's genres as a multi-hot vector over ``GENRE_IDX``.

    Args:
        movie: Mapping that may contain a 'genres' entry holding a
            comma-separated string of genre names.

    Returns:
        A float32 tensor of length ``len(GENRE_IDX)`` with 1.0 at the slot of
        every recognised genre, or None when the value is missing (None,
        empty, NaN) or none of the listed names is in ``GENRE_IDX``.
    """
    genres = movie.get('genres')

    # A float NaN is truthy, so a plain `not genres` check lets it through and
    # the following .split() would raise AttributeError. Detect it through its
    # string form, the same way split_keywords does.
    if not genres or str(genres).strip().lower() == "nan":
        return None

    names = [g.strip() for g in str(genres).split(',') if g.strip()]

    genre_features = torch.zeros(len(GENRE_IDX), dtype=torch.float32)
    for name in names:
        # Names that are not in the lookup table are skipped.
        if name in GENRE_IDX:
            genre_features[GENRE_IDX[name]] = 1.0

    # Nothing recognised -> report "no genre information" rather than all zeros.
    if not genre_features.any():
        return None

    return genre_features

# Print the genre vector of the first 7 movies.
for r, (tmdb_id, movie) in enumerate(movie_db.items()):
    idx = mappings['tmdb_to_idx'][tmdb_id]  # Get array index

    print(one_hot_encode_genres(movie))

    if r > 5:
        break


tensor([0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0.])
tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.])
tensor([1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0.])
tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.])
tensor([1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0.])
tensor([0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.])
tensor([0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0.])


In [54]:
df['release_year']

0         0.341561
1         0.503471
2         0.260606
3         0.301084
4         0.422516
            ...   
230269    0.908247
230270    0.908247
230271    0.908247
230272         NaN
230273    0.908247
Name: release_year, Length: 230274, dtype: float64

In [58]:
from ml.src.processing.cnn_encoder import ResNet50Encoder
from PIL import Image
from torchvision.transforms import transforms, InterpolationMode


def precompute_images(movie, resnet : ResNet50Encoder) -> Optional[torch.Tensor]:
    """Encode a movie's poster image with ``resnet``.

    Args:
        movie: Mapping that may contain a 'poster_file' entry holding the
            path of the poster image.
        resnet: Encoder whose ``forward`` receives the preprocessed image as
            a batch of one.

    Returns:
        The output of ``resnet.forward`` with dim 0 squeezed away, or None
        when the movie has no poster path (None, empty, or NaN).

    Raises:
        Whatever ``Image.open`` raises when the path cannot be opened as an image.
    """
    path = movie.get('poster_file')
    # A missing path can be NaN as well as None; NaN passed the old `is None`
    # check and then crashed inside Image.open.
    if not path or str(path).strip().lower() == "nan":
        return None

    # NOTE(review): RandomCrop makes the result non-deterministic -- the same
    # poster yields a different embedding on every call. For precomputed
    # features a deterministic crop (e.g. CenterCrop) may be intended; confirm
    # before changing, since it alters the embeddings.
    transform = transforms.Compose([
        transforms.RandomCrop((280, 280), pad_if_needed=True),
        transforms.Resize(224, interpolation=InterpolationMode.BILINEAR),
        transforms.ToTensor(),
        # Per-channel mean / std applied after ToTensor.
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ])

    # The context manager closes the underlying file promptly; convert() returns
    # a fully loaded copy, so the image stays usable after the file is closed.
    with Image.open(path) as img:
        rgb_img = img.convert("RGB")

    img_tensor = transform(rgb_img).unsqueeze(0)  # add the batch dimension

    # Feature extraction only: skip building the autograd graph.
    with torch.no_grad():
        poster_emb = resnet.forward(img_tensor).squeeze(0)

    return poster_emb

In [59]:
# Create the encoder once. It was previously constructed inside the loop,
# i.e. the network was rebuilt for every single movie.
poster_encoder = ResNet50Encoder()

# Print the poster embedding of the first 7 movies.
for r, (tmdb_id, movie) in enumerate(movie_db.items()):
    idx = mappings['tmdb_to_idx'][tmdb_id]  # Get array index

    print(precompute_images(movie, poster_encoder))

    if r > 5:
        break

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to C:\Users\ds3/.cache\torch\hub\checkpoints\resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:31<00:00, 3.29MB/s]


tensor([ 0.0377,  0.0017,  0.1555, -0.0613, -0.0338,  0.0760,  0.1320, -0.1841,
        -0.0560,  0.0082, -0.0676,  0.0700, -0.0077, -0.0805, -0.0124,  0.0592,
        -0.0389,  0.0491,  0.0479, -0.1361, -0.0008,  0.0087,  0.0281, -0.0973,
         0.0481, -0.0848,  0.0070, -0.0249,  0.0372, -0.0136,  0.1034,  0.0534,
        -0.1350,  0.0889,  0.0205, -0.1432,  0.0637,  0.0770, -0.1153,  0.0627,
         0.0246, -0.0622,  0.0850, -0.1322, -0.0935,  0.1440,  0.0739,  0.0775,
        -0.1487,  0.0795,  0.1742,  0.0217,  0.1171, -0.0470,  0.0318, -0.0278,
        -0.0299,  0.0491,  0.0382,  0.0556,  0.0234,  0.1008,  0.1332, -0.0060,
        -0.0149,  0.0583, -0.1394, -0.0112,  0.0363,  0.1132, -0.0263, -0.0541,
        -0.0387, -0.0445, -0.0242, -0.0844, -0.1068, -0.0119,  0.0980, -0.0616,
         0.1448,  0.0299,  0.0849,  0.0986,  0.1001, -0.0924,  0.1635,  0.1199,
         0.1645,  0.0624, -0.1085, -0.0505,  0.1084,  0.0032,  0.0087,  0.1347,
         0.2518, -0.0771, -0.0040,  0.03

In [65]:
# These trailing rows simply have no poster (NaN).
# Good to know: pandas does not turn duplicates into nulls -- rather,
# Series.duplicated() counts repeated NaN values as duplicates of each other,
# which is why 'poster_file' shows 43093 duplicates against 43094 nulls.
df['poster_file'][230269:230273]

230269    NaN
230270    NaN
230271    NaN
230272    NaN
Name: poster_file, dtype: object