In [None]:
import pandas as pd
import numpy as np
# import altair as alt

In [2]:
ratings = pd.read_csv("data/ml-1m/ratings.dat", sep="::", engine="python", names=["userId", "movieId", "rating", "timestamp"])
movies = pd.read_csv("data/ml-1m/movies.dat", sep="::", engine="python", names=["movieId", "title", "genres"], encoding='latin-1')
users = pd.read_csv("data/ml-1m/users.dat", sep="::", engine="python", names=["userId", "gender", "age", "occupation", "zip"])

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(3883, 3)

In [5]:
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
movies['title'] = movies['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [6]:
movies['title'].nunique()  # some movies have same title but different years

3841

In [7]:
movies.isna().sum()

movieId    0
title      0
genres     0
year       0
dtype: int64

In [8]:
users.head()

Unnamed: 0,userId,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [9]:
users.shape

(6040, 5)

In [10]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [11]:
ratings['userId'].nunique(), ratings['movieId'].nunique()

(6040, 3706)

In [12]:
movies = movies[movies['movieId'].isin(ratings['movieId'].unique())]
movies.shape, movies['movieId'].max()

((3706, 4), np.int64(3952))

In [13]:
n_available_ratings = ratings.shape[0]
n_possible_ratings = users.shape[0] * movies.shape[0]
sparsity = 1 - (n_available_ratings / n_possible_ratings)
n_available_ratings, n_possible_ratings, sparsity

(1000209, 22384240, 0.9553163743776871)

In [14]:
combined_df = ratings.merge(movies, on='movieId').merge(users, on='userId')
combined_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,gender,age,occupation,zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest,Drama,1975,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach,Animation|Children's|Musical,1996,F,1,10,48067
2,1,914,3,978301968,My Fair Lady,Musical|Romance,1964,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich,Drama,2000,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A",Animation|Children's|Comedy,1998,F,1,10,48067


In [15]:
combined_df['userId'] = combined_df['userId'].astype('category')
combined_df['movieId'] = combined_df['movieId'].astype('category')

In [16]:
# from scipy.sparse import csr_matrix

# num_users = combined_df["userId"].max() + 1
# num_movies = combined_df["movieId"].max() + 1

# csr = csr_matrix(
#     (combined_df["rating"], (combined_df["userId"], combined_df["movieId"])),
#     shape=(num_users, num_movies)
# )

# print("CSR matrix:\n", csr.shape)
# print("\nDense matrix:\n", csr.toarray().shape)


In [17]:
def calculate_pmi(df, attribute="movieId"):
    """
    Calculate PMI (Pointwise Mutual Information) between user and a given attribute (e.g., movieId).
    """
    # Total score (sum of all ratings)
    N = df["rating"].sum()

    # User total ratings
    user_totals = df.groupby("userId")["rating"].sum().rename("user_total_score")

    # Attribute total ratings (movie total ratings)
    attr_totals = df.groupby(attribute)["rating"].sum().rename(f"{attribute}_score")

    # Merge user totals and attribute totals into original df
    df = df.join(user_totals, on="userId")
    df = df.join(attr_totals, on=attribute)

    # p(attribute | user) = score / user_total_score
    df[f"p_{attribute}_user"] = df["rating"] / df["user_total_score"]

    # p(attribute) = attribute_score / N
    df[f"p_{attribute}"] = df[f"{attribute}_score"] / N

    # PMI = log( p(attr|user) / p(attr) )
    df["pmi"] = np.log(df[f"p_{attribute}_user"]) - np.log(df[f"p_{attribute}"])

    return df#[["userId", attribute, "pmi", f"p_{attribute}_user", f"p_{attribute}"]]



pmi_df = calculate_pmi(combined_df, attribute="movieId")
pmi_df.head()

  user_totals = df.groupby("userId")["rating"].sum().rename("user_total_score")
  attr_totals = df.groupby(attribute)["rating"].sum().rename(f"{attribute}_score")


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,gender,age,occupation,zip,user_total_score,movieId_score,p_movieId_user,p_movieId,pmi
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest,Drama,1975,F,1,10,48067,222,7574,0.022523,0.002114,2.365803
1,1,661,3,978302109,James and the Giant Peach,Animation|Children's|Musical,1996,F,1,10,48067,222,1819,0.013514,0.000508,3.281412
2,1,914,3,978301968,My Fair Lady,Musical|Romance,1964,F,1,10,48067,222,2642,0.013514,0.000738,2.908163
3,1,3408,4,978300275,Erin Brockovich,Drama,2000,F,1,10,48067,222,5081,0.018018,0.001418,2.541873
4,1,2355,5,978824291,"Bug's Life, A",Animation|Children's|Comedy,1998,F,1,10,48067,222,6564,0.022523,0.001832,2.508924


In [18]:
pmi_df.shape

(1000209, 16)

In [19]:
pmi_df.groupby('title')['movieId'].nunique().sort_values(ascending=False).head(10)

title
Hamlet                      5
Mummy, The                  3
Jungle Book, The            2
Parent Trap, The            2
Godzilla (Gojira)           2
Dracula                     2
King Kong                   2
Thomas Crown Affair, The    2
General, The                2
My Man Godfrey              2
Name: movieId, dtype: int64

In [20]:
movie_id_to_title_year = movies.set_index('movieId').apply(lambda row: f"{row['title']} ({row['genres']})", axis=1).to_dict()
movie_id_to_title_year[1]

"Toy Story (Animation|Children's|Comedy)"

In [21]:
from ydata_profiling import ProfileReport

profile = ProfileReport(pmi_df, title="Profiling Report", minimal=True)


In [22]:
profile.to_file("data/data_profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 16/16 [00:01<00:00, 10.10it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

# SVD

In [None]:
# this is not suitable for implicit feedback data

from scipy.sparse.linalg import svds

def intrinsic_eval(matrix, k=32):
    """
    Intrinsic evaluation of embeddings from SVD.
    
    Args:
        matrix (ndarray or sparse): user-item interaction matrix (ratings or PMI)
        k (int): number of latent factors
    
    Returns:
        dict: reconstruction error (Frobenius norm), explained variance ratio
    """
    # Perform truncated SVD
    U, s, Vt = svds(matrix, k=k)
    
    # Sort singular values in descending order (svds returns ascending)
    s = np.sort(s)[::-1]
    U = U[:, ::-1]
    Vt = Vt[::-1, :]
    
    # Reconstruct
    S = np.diag(s)
    reconstructed = U @ S @ Vt
    
    # Reconstruction error (Frobenius norm)
    error = np.linalg.norm(matrix - reconstructed, 'fro')
    
    # Explained variance ratio
    total_variance = np.linalg.norm(matrix, 'fro')**2
    explained_variance = np.sum(s**2)
    explained_variance_ratio = explained_variance / total_variance
    
    return {
        "reconstruction_error": error, # lower the better
        "explained_variance_ratio": explained_variance_ratio # close to 1 is better
    }


In [102]:
for k in [32, 64, 128, 256, 512]:
    print(f"Evaluating for k={k} latent factors:")
    for pmi in [False, True]:
        print(f"Using {'PMI' if pmi else 'ratings'} matrix:")
        value_col = 'rating'
        if pmi:
            value_col = 'pmi'
        
        ratings_matrix = pmi_df.pivot(index='userId', columns='movieId', values=value_col).fillna(0)
        eval_results = intrinsic_eval(ratings_matrix.to_numpy(), k=k)
        print(f"  Reconstruction Error: {eval_results['reconstruction_error']:.4f}")
        print(f"  Explained Variance Ratio: {eval_results['explained_variance_ratio']:.4f}")
        print()

Evaluating for k=32 latent factors:
Using ratings matrix:
  Reconstruction Error: 2746.6180
  Explained Variance Ratio: 0.4642

Using PMI matrix:
  Reconstruction Error: 1945.6656
  Explained Variance Ratio: 0.2435

Evaluating for k=64 latent factors:
Using ratings matrix:
  Reconstruction Error: 2611.8100
  Explained Variance Ratio: 0.5155

Using PMI matrix:
  Reconstruction Error: 1880.7822
  Explained Variance Ratio: 0.2931

Evaluating for k=128 latent factors:
Using ratings matrix:
  Reconstruction Error: 2421.1262
  Explained Variance Ratio: 0.5836

Using PMI matrix:
  Reconstruction Error: 1796.4077
  Explained Variance Ratio: 0.3551

Evaluating for k=256 latent factors:
Using ratings matrix:
  Reconstruction Error: 2124.7130
  Explained Variance Ratio: 0.6793

Using PMI matrix:
  Reconstruction Error: 1666.3644
  Explained Variance Ratio: 0.4451

Evaluating for k=512 latent factors:
Using ratings matrix:
  Reconstruction Error: 1669.4361
  Explained Variance Ratio: 0.8020

Using

In [36]:
import scipy.sparse as sp
from implicit.als import AlternatingLeastSquares

for value_col in ['rating', 'pmi']:
    print(f"Training ALS model using {value_col} matrix:")
    # Step 1: Create mapping from userId/movieId to indices (ALS requires 0..N-1 indices)
    user_mapping = {u: i for i, u in enumerate(pmi_df["userId"].unique())}
    movie_mapping = {m: i for i, m in enumerate(pmi_df["movieId"].unique())}

    pmi_df["user_idx"] = pmi_df["userId"].map(user_mapping)
    pmi_df["movie_idx"] = pmi_df["movieId"].map(movie_mapping)

    # Step 2: Build sparse user-item matrix
    num_users = len(user_mapping)
    num_movies = len(movie_mapping)

    # implicit expects item-user matrix (movies × users), so transpose is common
    ratings_matrix = sp.coo_matrix(
        (pmi_df[value_col], (pmi_df["movie_idx"], pmi_df["user_idx"])),
        shape=(num_movies, num_users)
    ).tocsr()

    # Step 3: Train ALS model
    k = 64  # embedding dimension
    als_model = AlternatingLeastSquares(
        factors=k,
        regularization=0.1,
        iterations=50,
        use_gpu=False,  # set False if no GPU
        calculate_training_loss=True
    )

    als_model.fit(ratings_matrix)



Training ALS model using rating matrix:


  0%|          | 0/50 [00:00<?, ?it/s]

Training ALS model using pmi matrix:


  0%|          | 0/50 [00:00<?, ?it/s]

In [99]:
# Step 4: Extract embeddings (reverse due to transposed matrix)
movie_embeddings = als_model.user_factors  # shape: [num_movies, k]
user_embeddings = als_model.item_factors   # shape: [num_users, k]

movie_embeddings.shape, user_embeddings.shape

((3706, 64), (6040, 64))

In [111]:
predicted_matrix = user_embeddings @ movie_embeddings.T
predicted_matrix.shape

  predicted_matrix = user_embeddings @ movie_embeddings.T
  predicted_matrix = user_embeddings @ movie_embeddings.T
  predicted_matrix = user_embeddings @ movie_embeddings.T


(6040, 3706)

In [112]:
predicted_matrix[0, :10]  # first user's scores for first 10 movies

array([0.53000754, 0.5583489 , 0.44985285, 0.42583156, 0.78119   ,
       0.50396585, 0.1622364 , 0.4514541 , 0.74202394, 0.82502043],
      dtype=float32)

In [118]:
predicted_df = pd.DataFrame(predicted_matrix, index=[u for u, idx in sorted(user_mapping.items(), key=lambda x: x[1])], columns=[m for m, idx in sorted(movie_mapping.items(), key=lambda x: x[1])])
predicted_df = predicted_df.rename_axis('userId').reset_index()
predicted_df = predicted_df.melt(id_vars='userId', var_name='movieId', value_name='predicted_score')
predicted_df = predicted_df.merge(movies, on='movieId', how='left')
predicted_df = predicted_df.merge(users, on='userId', how='left')
predicted_df.head()

Unnamed: 0,userId,movieId,predicted_score,title,genres,year,gender,age,occupation,zip
0,1,1193,0.530008,One Flew Over the Cuckoo's Nest,Drama,1975,F,1,10,48067
1,2,1193,0.766901,One Flew Over the Cuckoo's Nest,Drama,1975,M,56,16,70072
2,3,1193,0.416457,One Flew Over the Cuckoo's Nest,Drama,1975,M,25,15,55117
3,4,1193,0.164497,One Flew Over the Cuckoo's Nest,Drama,1975,M,45,7,2460
4,5,1193,0.026296,One Flew Over the Cuckoo's Nest,Drama,1975,M,25,20,55455


In [119]:
predicted_df.isna().sum()

userId             0
movieId            0
predicted_score    0
title              0
genres             0
year               0
gender             0
age                0
occupation         0
zip                0
dtype: int64

In [122]:
def get_top_k_per_group(df, value_col='rating', k=5):
    result = []
    ages = sorted(df['age'].unique())
    for gender in ['M', 'F']:
        for age in ages:
            group = df[(df['age'] == age) & (df['gender'] == gender)]
            if group.empty:
                continue
            top_genres = group.groupby('genres')[value_col].sum().sort_values(ascending=False).head(k).index.tolist()
            top_titles = group.groupby('title')[value_col].sum().sort_values(ascending=False).head(k).index.tolist()
            result.append({'age': age, 'gender': gender, f'top_genres_{value_col}': top_genres, f'top_titles_{value_col}': top_titles})
    return pd.DataFrame(result)

df_rating = get_top_k_per_group(pmi_df, value_col='rating', k=5)
df_pmi = get_top_k_per_group(pmi_df, value_col='pmi', k=5)
df_predicted = get_top_k_per_group(predicted_df, value_col='predicted_score', k=5)

top_k_combined = df_rating.merge(df_pmi, on=['age', 'gender'], how='outer').merge(df_predicted, on=['age','gender'], how='outer')

In [123]:
top_k_combined

Unnamed: 0,age,gender,top_genres_rating,top_titles_rating,top_genres_pmi,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,1,F,"[Comedy, Drama, Comedy|Romance, Comedy|Drama, ...","[Toy Story, Toy Story 2, Shakespeare in Love, ...","[Comedy, Drama, Comedy|Romance, Children's|Com...","[Parent Trap, The, 101 Dalmatians, Jungle Book...","[Comedy, Drama, Comedy|Romance, Children's|Com...","[101 Dalmatians, Parent Trap, The, Jungle Book..."
1,1,M,"[Comedy, Drama, Comedy|Drama, Action|Thriller,...","[Matrix, The, Sixth Sense, The, Star Wars: Epi...","[Comedy, Drama, Horror, Comedy|Romance, Comedy...","[Toy Story 2, X-Men, Big Daddy, Mission: Impos...","[Comedy, Drama, Comedy|Drama, Comedy|Romance, ...","[Star Wars: Episode I - The Phantom Menace, St..."
2,18,F,"[Comedy, Drama, Comedy|Romance, Comedy|Drama, ...","[American Beauty, Shakespeare in Love, Princes...","[Comedy, Drama, Comedy|Romance, Drama|Romance,...","[Hamlet, Titanic, American Beauty, 10 Things I...","[Comedy, Drama, Comedy|Romance, Drama|Romance,...","[Hamlet, American Beauty, Shakespeare in Love,..."
3,18,M,"[Comedy, Drama, Comedy|Drama, Comedy|Romance, ...","[American Beauty, Star Wars: Episode V - The E...","[Comedy, Drama, Comedy|Drama, Horror, Comedy|R...","[American Pie, X-Men, Fight Club, Braveheart, ...","[Comedy, Drama, Comedy|Drama, Comedy|Romance, ...","[American Beauty, Matrix, The, Star Wars: Epis..."
4,25,F,"[Drama, Comedy, Comedy|Romance, Comedy|Drama, ...","[American Beauty, Silence of the Lambs, The, S...","[Drama, Comedy, Comedy|Romance, Drama|Romance,...","[Sabrina, American Beauty, Shakespeare in Love...","[Drama, Comedy, Comedy|Romance, Comedy|Drama, ...","[American Beauty, Shakespeare in Love, Silence..."
5,25,M,"[Comedy, Drama, Comedy|Drama, Comedy|Romance, ...","[American Beauty, Star Wars: Episode IV - A Ne...","[Comedy, Drama, Comedy|Drama, Horror, Comedy|R...","[Fly, The, American Pie, X-Men, Thomas Crown A...","[Comedy, Drama, Comedy|Drama, Comedy|Romance, ...",[Star Wars: Episode V - The Empire Strikes Bac...
6,35,F,"[Drama, Comedy, Comedy|Romance, Drama|Romance,...","[American Beauty, Shakespeare in Love, Silence...","[Drama, Comedy, Comedy|Romance, Drama|Romance,...","[Gone with the Wind, 101 Dalmatians, Babe, Sen...","[Drama, Comedy, Comedy|Romance, Drama|Romance,...","[Shakespeare in Love, American Beauty, Babe, C..."
7,35,M,"[Drama, Comedy, Comedy|Drama, Comedy|Romance, ...","[Star Wars: Episode IV - A New Hope, Star Wars...","[Drama, Comedy, Comedy|Drama, Horror, Comedy|R...","[Fly, The, King Kong, Butch Cassidy and the Su...","[Drama, Comedy, Comedy|Drama, Comedy|Romance, ...","[Star Wars: Episode IV - A New Hope, Fly, The,..."
8,45,F,"[Drama, Comedy, Comedy|Romance, Drama|Romance,...","[American Beauty, Shakespeare in Love, Schindl...","[Drama, Comedy, Drama|Romance, Comedy|Romance,...","[Sabrina, Titanic, Hamlet, Casablanca, Annie H...","[Drama, Comedy, Comedy|Romance, Drama|Romance,...","[Shakespeare in Love, American Beauty, Titanic..."
9,45,M,"[Drama, Comedy, Comedy|Drama, Comedy|Romance, ...","[Star Wars: Episode IV - A New Hope, American ...","[Drama, Comedy, Comedy|Drama, Comedy|Romance, ...","[Fly, The, Butch Cassidy and the Sundance Kid,...","[Drama, Comedy, Comedy|Drama, Comedy|Romance, ...","[Star Wars: Episode IV - A New Hope, Fly, The,..."


In [125]:
for gender in ['M', 'F']:
    for age in sorted(top_k_combined['age'].unique()):
        row = top_k_combined[(top_k_combined['age'] == age) & (top_k_combined['gender'] == gender)]
        print(f"Age: {age}, Gender: {gender}")
        display(pd.DataFrame({"top_genres_rating": row['top_genres_rating'].values[0],
                            "top_genres_pmi": row['top_genres_pmi'].values[0],
                            "top_titles_rating": row['top_titles_rating'].values[0],
                            "top_titles_pmi": row['top_titles_pmi'].values[0],
                            "top_genres_predicted_score": row['top_genres_predicted_score'].values[0],
                            "top_titles_predicted_score": row['top_titles_predicted_score'].values[0],
                            }))
        

Age: 1, Gender: M


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Comedy,Comedy,"Matrix, The",Toy Story 2,Comedy,Star Wars: Episode I - The Phantom Menace
1,Drama,Drama,"Sixth Sense, The",X-Men,Drama,Star Wars: Episode IV - A New Hope
2,Comedy|Drama,Horror,Star Wars: Episode IV - A New Hope,Big Daddy,Comedy|Drama,Star Wars: Episode VI - Return of the Jedi
3,Action|Thriller,Comedy|Romance,Star Wars: Episode VI - Return of the Jedi,Mission: Impossible 2,Comedy|Romance,"Matrix, The"
4,Comedy|Romance,Comedy|Drama,Star Wars: Episode V - The Empire Strikes Back,Austin Powers: The Spy Who Shagged Me,Action|Thriller,Toy Story 2


Age: 18, Gender: M


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Comedy,Comedy,American Beauty,American Pie,Comedy,American Beauty
1,Drama,Drama,Star Wars: Episode V - The Empire Strikes Back,X-Men,Drama,"Matrix, The"
2,Comedy|Drama,Comedy|Drama,"Matrix, The",Fight Club,Comedy|Drama,Star Wars: Episode VI - Return of the Jedi
3,Comedy|Romance,Horror,Star Wars: Episode IV - A New Hope,Braveheart,Comedy|Romance,Saving Private Ryan
4,Action|Thriller,Comedy|Romance,Star Wars: Episode VI - Return of the Jedi,Gladiator,Horror,Star Wars: Episode V - The Empire Strikes Back


Age: 25, Gender: M


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Comedy,Comedy,American Beauty,"Fly, The",Comedy,Star Wars: Episode V - The Empire Strikes Back
1,Drama,Drama,Star Wars: Episode IV - A New Hope,American Pie,Drama,American Beauty
2,Comedy|Drama,Comedy|Drama,Star Wars: Episode V - The Empire Strikes Back,X-Men,Comedy|Drama,Terminator 2: Judgment Day
3,Comedy|Romance,Horror,"Matrix, The","Thomas Crown Affair, The",Comedy|Romance,Star Wars: Episode IV - A New Hope
4,Action|Thriller,Comedy|Romance,Terminator 2: Judgment Day,Hamlet,Horror,"Matrix, The"


Age: 35, Gender: M


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,Star Wars: Episode IV - A New Hope,"Fly, The",Drama,Star Wars: Episode IV - A New Hope
1,Comedy,Comedy,Star Wars: Episode V - The Empire Strikes Back,King Kong,Comedy,"Fly, The"
2,Comedy|Drama,Comedy|Drama,American Beauty,Butch Cassidy and the Sundance Kid,Comedy|Drama,Star Wars: Episode V - The Empire Strikes Back
3,Comedy|Romance,Horror,Terminator 2: Judgment Day,Alien,Comedy|Romance,Terminator 2: Judgment Day
4,Action|Thriller,Comedy|Romance,Saving Private Ryan,One Flew Over the Cuckoo's Nest,Horror,Jurassic Park


Age: 45, Gender: M


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,Star Wars: Episode IV - A New Hope,"Fly, The",Drama,Star Wars: Episode IV - A New Hope
1,Comedy,Comedy,American Beauty,Butch Cassidy and the Sundance Kid,Comedy,"Fly, The"
2,Comedy|Drama,Comedy|Drama,Star Wars: Episode V - The Empire Strikes Back,One Flew Over the Cuckoo's Nest,Comedy|Drama,Butch Cassidy and the Sundance Kid
3,Comedy|Romance,Comedy|Romance,"Godfather, The","African Queen, The",Comedy|Romance,"Godfather, The"
4,Drama|Romance,Horror,Raiders of the Lost Ark,Annie Hall,Drama|Romance,2001: A Space Odyssey


Age: 50, Gender: M


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,American Beauty,"Fly, The",Drama,Butch Cassidy and the Sundance Kid
1,Comedy,Comedy,Star Wars: Episode IV - A New Hope,Butch Cassidy and the Sundance Kid,Comedy,"Fly, The"
2,Comedy|Drama,Comedy|Drama,"Godfather, The",King Kong,Comedy|Drama,American Beauty
3,Comedy|Romance,Comedy|Romance,Star Wars: Episode V - The Empire Strikes Back,"Guns of Navarone, The",Comedy|Romance,Saving Private Ryan
4,Drama|Romance,Drama|Romance,Terminator 2: Judgment Day,"Mummy, The",Drama|Romance,Star Wars: Episode IV - A New Hope


Age: 56, Gender: M


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,American Beauty,"Maltese Falcon, The",Drama,American Beauty
1,Comedy,Comedy,Schindler's List,"African Queen, The",Comedy,Schindler's List
2,Comedy|Drama,Comedy|Drama,"Godfather, The",Butch Cassidy and the Sundance Kid,Comedy|Drama,Shakespeare in Love
3,Comedy|Romance,Comedy|Romance,Saving Private Ryan,Gone with the Wind,Comedy|Romance,Saving Private Ryan
4,Drama|Romance,Drama|Romance,Fargo,American Beauty,Drama|Romance,Casablanca


Age: 1, Gender: F


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Comedy,Comedy,Toy Story,"Parent Trap, The",Comedy,101 Dalmatians
1,Drama,Drama,Toy Story 2,101 Dalmatians,Drama,"Parent Trap, The"
2,Comedy|Romance,Comedy|Romance,Shakespeare in Love,"Jungle Book, The",Comedy|Romance,"Jungle Book, The"
3,Comedy|Drama,Children's|Comedy,Aladdin,10 Things I Hate About You,Children's|Comedy,Toy Story 2
4,Children's|Comedy,Comedy|Drama,"Sixth Sense, The","Kid in King Arthur's Court, A",Comedy|Drama,Toy Story


Age: 18, Gender: F


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Comedy,Comedy,American Beauty,Hamlet,Comedy,Hamlet
1,Drama,Drama,Shakespeare in Love,Titanic,Drama,American Beauty
2,Comedy|Romance,Comedy|Romance,"Princess Bride, The",American Beauty,Comedy|Romance,Shakespeare in Love
3,Comedy|Drama,Drama|Romance,"Sixth Sense, The",10 Things I Hate About You,Drama|Romance,"Princess Bride, The"
4,Drama|Romance,Comedy|Drama,"Shawshank Redemption, The",Sabrina,Comedy|Drama,Titanic


Age: 25, Gender: F


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,American Beauty,Sabrina,Drama,American Beauty
1,Comedy,Comedy,"Silence of the Lambs, The",American Beauty,Comedy,Shakespeare in Love
2,Comedy|Romance,Comedy|Romance,Shakespeare in Love,Shakespeare in Love,Comedy|Romance,"Silence of the Lambs, The"
3,Comedy|Drama,Drama|Romance,"Princess Bride, The",Titanic,Comedy|Drama,"Princess Bride, The"
4,Drama|Romance,Comedy|Drama,"Sixth Sense, The",Hamlet,Drama|Romance,Fargo


Age: 35, Gender: F


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,American Beauty,Gone with the Wind,Drama,Shakespeare in Love
1,Comedy,Comedy,Shakespeare in Love,101 Dalmatians,Comedy,American Beauty
2,Comedy|Romance,Comedy|Romance,"Silence of the Lambs, The",Babe,Comedy|Romance,Babe
3,Drama|Romance,Drama|Romance,"Sixth Sense, The",Sense and Sensibility,Drama|Romance,Casablanca
4,Comedy|Drama,Comedy|Drama,Star Wars: Episode IV - A New Hope,Sabrina,Comedy|Drama,Groundhog Day


Age: 45, Gender: F


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,American Beauty,Sabrina,Drama,Shakespeare in Love
1,Comedy,Comedy,Shakespeare in Love,Titanic,Comedy,American Beauty
2,Comedy|Romance,Drama|Romance,Schindler's List,Hamlet,Comedy|Romance,Titanic
3,Drama|Romance,Comedy|Romance,Star Wars: Episode IV - A New Hope,Casablanca,Drama|Romance,Schindler's List
4,Comedy|Drama,Comedy|Drama,"Shawshank Redemption, The",Annie Hall,Comedy|Drama,Annie Hall


Age: 50, Gender: F


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,American Beauty,Annie Hall,Drama,American Beauty
1,Comedy,Comedy,Shakespeare in Love,"African Queen, The",Comedy,Shakespeare in Love
2,Comedy|Romance,Drama|Romance,Fargo,Breakfast at Tiffany's,Comedy|Romance,Hamlet
3,Drama|Romance,Comedy|Romance,"Godfather, The",Casablanca,Drama|Romance,Annie Hall
4,Comedy|Drama,Comedy|Drama,Star Wars: Episode IV - A New Hope,Gone with the Wind,Comedy|Drama,Casablanca


Age: 56, Gender: F


Unnamed: 0,top_genres_rating,top_genres_pmi,top_titles_rating,top_titles_pmi,top_genres_predicted_score,top_titles_predicted_score
0,Drama,Drama,American Beauty,Gone with the Wind,Drama,American Beauty
1,Comedy,Comedy,Schindler's List,One Flew Over the Cuckoo's Nest,Comedy,Shakespeare in Love
2,Comedy|Romance,Drama|Romance,One Flew Over the Cuckoo's Nest,Schindler's List,Drama|Romance,Schindler's List
3,Drama|Romance,Comedy|Romance,Shakespeare in Love,Hamlet,Comedy|Romance,Gone with the Wind
4,Comedy|Drama,Comedy|Drama,Gone with the Wind,"Maltese Falcon, The",Comedy|Drama,Casablanca


In [110]:
# tensorboard embedding visualization

from torch.utils.tensorboard import SummaryWriter
import os

os.makedirs("tensorboard_embeddings", exist_ok=True)

def prepare_metadata(df, path):
    df_str = df.astype(str)
    df_str.to_csv(path, sep='\t', index=False, header=True)  # Save TSV with header
    metadata_list = df_str.values.tolist()  # Do NOT join with tab, keep as list of lists
    return metadata_list, list(df_str.columns)  # Return header too

# --- User embeddings ---
assert user_embeddings.shape[0] == len(users), "User embeddings and metadata size mismatch"
user_metadata_list, user_metadata_header = prepare_metadata(users[['userId','gender','age','occupation','zip']],
                                                           "tensorboard_embeddings/user_meta.tsv")

writer = SummaryWriter("tensorboard_embeddings/user")
writer.add_embedding(user_embeddings, metadata=user_metadata_list, tag="user_embeddings", metadata_header=user_metadata_header)
writer.close()

# --- Movie embeddings ---
assert movie_embeddings.shape[0] == len(movies), "Movie embeddings and metadata size mismatch"
movie_metadata_list, movie_metadata_header = prepare_metadata(movies[['movieId','title','genres','year']],
                                                             "tensorboard_embeddings/movie_meta.tsv")

writer = SummaryWriter("tensorboard_embeddings/movie")
writer.add_embedding(movie_embeddings, metadata=movie_metadata_list, tag="movie_embeddings", metadata_header=movie_metadata_header)
writer.close()

# --- Combined embeddings ---
combined_emb = np.vstack([user_embeddings, movie_embeddings])
combined_meta = pd.concat([
    users[['userId','gender','age','occupation','zip']].assign(type='user'),
    movies[['movieId','title','genres','year']].assign(type='movie')
], ignore_index=True)

assert combined_emb.shape[0] == combined_meta.shape[0], "Combined embeddings and metadata mismatch"

combined_metadata_list, combined_metadata_header = prepare_metadata(combined_meta, "tensorboard_embeddings/combined_meta.tsv")

writer = SummaryWriter("tensorboard_embeddings/combined")
writer.add_embedding(combined_emb, metadata=combined_metadata_list, tag="combined_embeddings", metadata_header=combined_metadata_header)
writer.close()




In [95]:
import random
# find related items based on item-item similarity from ALS model
for i in range(5):
    item_id = random.choice(list(movie_mapping.keys()))
    print("Query item: ", movie_id_to_title_year[movie_mapping[item_id]])
    related = als_model.similar_items(item_id, N=5)
    for similar_item in related[0][1:]:
        similar_item = movie_mapping.get(int(similar_item))
        if similar_item:
            print(movie_id_to_title_year[similar_item])
    print('\n')

Query item:  I Shot Andy Warhol (Drama)
Disclosure (Drama|Thriller)
Mirage (Action|Thriller)
Strike! (a.k.a. All I Wanna Do, The Hairy Bird) (Comedy)


Query item:  Drop Zone (Action)
Field of Dreams (Drama)
Coma (Thriller)
Best Years of Our Lives, The (Drama|War)


Query item:  Darby O'Gill and the Little People (Adventure|Children's|Fantasy)
Dunston Checks In (Children's|Comedy)
Swamp Thing (Horror|Sci-Fi)
Fright Night Part II (Horror)


Query item:  Blow-Out (La Grande Bouffe) (Drama)
Mummy's Hand, The (Horror)
20 Dates (Comedy)
Ghost of Frankenstein, The (Horror)


Query item:  Dadetown (Documentary)
My Science Project (Adventure|Sci-Fi)
French Connection, The (Action|Crime|Drama|Thriller)


