# Import

In [1]:
import os
import requests
import zipfile
import gzip
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import random
import math
from typing import List, Tuple

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

from scipy.sparse import hstack



# Data Collection and Preprocessing

## Data Collection

### Downloading the Data

In [2]:
URL_MOVIELENS = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
URL_IMBD_NAMES_BASICS = "https://datasets.imdbws.com/name.basics.tsv.gz"
URL_IMBD_TITLE_BASICS = "https://datasets.imdbws.com/title.basics.tsv.gz"
URL_IMBD_TITLE_RATINGS = "https://datasets.imdbws.com/title.ratings.tsv.gz"
# Local path where the file will be saved
LOCAL_MOVIELENS_PATH = "ml-1m.zip"
LOCAL_IMBD_NAMES_BASICS_PATH = "name.basics.tsv.gz"
LOCAL_IMBD_TITLE_BASICS_PATH = "title.basics.tsv.gz"
LOCAL_IMBD_TITLE_RATINGS_PATH = "title.ratings.tsv.gz"
# Directory where the dataset will be extracted
EXTRACT_DIR = "dataset"

In [3]:
# Function to download the file
def download_file(url, local_filename):
    print(f"Downloading {url} to {local_filename}")
    # Check if the file already exists
    if os.path.exists(local_filename):
        print(f"File {local_filename} already exists")
        return local_filename
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    print(f"Downloaded {url} to {local_filename}")
    return local_filename

# Function to unzip the file
def unzip_file(zip_path, extract_to):
    print(f"Unzipping {zip_path} to {extract_to}")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Unzipped {zip_path} to {extract_to}")

def gunzip_file(gz_path, extract_to):
    print(f"Gunzipping {gz_path} to {extract_to}")
    with gzip.open(gz_path, 'rb') as f_in:
        with open(extract_to, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f"Gunzipped {gz_path} to {extract_to}")

In [4]:
# Ensure the dataset directory exists
os.makedirs(EXTRACT_DIR, exist_ok=True)
# Download the file
download_file(URL_MOVIELENS, LOCAL_MOVIELENS_PATH)
download_file(URL_IMBD_NAMES_BASICS, LOCAL_IMBD_NAMES_BASICS_PATH)
download_file(URL_IMBD_TITLE_BASICS, LOCAL_IMBD_TITLE_BASICS_PATH)
download_file(URL_IMBD_TITLE_RATINGS, LOCAL_IMBD_TITLE_RATINGS_PATH)

Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip to ml-1m.zip
File ml-1m.zip already exists
Downloading https://datasets.imdbws.com/name.basics.tsv.gz to name.basics.tsv.gz
File name.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.basics.tsv.gz to title.basics.tsv.gz
File title.basics.tsv.gz already exists
Downloading https://datasets.imdbws.com/title.ratings.tsv.gz to title.ratings.tsv.gz
File title.ratings.tsv.gz already exists


'title.ratings.tsv.gz'

In [5]:
# Extract the files
print("Unzipping file...")
unzip_file(LOCAL_MOVIELENS_PATH, EXTRACT_DIR)

print("Gunzipping files...")
gunzip_file(LOCAL_IMBD_NAMES_BASICS_PATH, os.path.join(EXTRACT_DIR, "name.basics.tsv"))
gunzip_file(LOCAL_IMBD_TITLE_BASICS_PATH, os.path.join(EXTRACT_DIR, "title.basics.tsv"))
gunzip_file(LOCAL_IMBD_TITLE_RATINGS_PATH, os.path.join(EXTRACT_DIR, "title.ratings.tsv"))

print("Extraction complete.")

Unzipping file...
Unzipping ml-1m.zip to dataset
Unzipped ml-1m.zip to dataset
Gunzipping files...
Gunzipping name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipped name.basics.tsv.gz to dataset\name.basics.tsv
Gunzipping title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipped title.basics.tsv.gz to dataset\title.basics.tsv
Gunzipping title.ratings.tsv.gz to dataset\title.ratings.tsv
Gunzipped title.ratings.tsv.gz to dataset\title.ratings.tsv
Extraction complete.


## Preprocessing

In [6]:
movies_dat_df = pd.read_csv('dataset/ml-1m/movies.dat', sep='::', header=None, engine='python', names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')
ratings_dat_df = pd.read_csv('dataset/ml-1m/ratings.dat', sep='::', header=None, engine='python', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1')
users_dat_df = pd.read_csv('dataset/ml-1m/users.dat', sep='::', header=None, engine='python', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='ISO-8859-1')

name_basics_df = pd.read_csv('dataset/name.basics.tsv', sep='\t', header=0)
title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)
title_ratings_df = pd.read_csv('dataset/title.ratings.tsv', sep='\t', header=0)

# Filter only movies from title_basics_df
title_basics_df = title_basics_df[title_basics_df['titleType'] == 'movie']

# Preprocess the Title columns
movies_dat_df['Title'] = movies_dat_df['Title'].str.lower()
movies_dat_df['Title'] = movies_dat_df['Title'].str.replace(r"\(.*\)", "", regex=True).str.strip()

title_basics_df['primaryTitle'] = title_basics_df['primaryTitle'].str.lower().str.strip()

ratings_dat_df.drop(columns=['Timestamp'], inplace=True)



  title_basics_df = pd.read_csv('dataset/title.basics.tsv', sep='\t', header=0)


In [54]:
features_df = pd.merge(movies_dat_df, title_basics_df, left_on='Title', right_on='primaryTitle', how='left')

# Replace NaN values with empty strings
features_df['Genres'] = features_df['Genres'].fillna('')
features_df['genres'] = features_df['genres'].fillna('')

# Replace '|' with ',' in Genres column and convert to lowercase
features_df['Genres'] = features_df['Genres'].str.replace('|', ',')
features_df['Genres'] = features_df['Genres'].str.lower()

# Convert genres column to lowercase
features_df['genres'] = features_df['genres'].str.lower()

#Delete the \\N values in the genres column
features_df = features_df[features_df['genres'] != '\\n']

# Function to combine and clean genre columns
def combine_genres(row):
    genres = set(row['genres'].split(',')) | set(row['Genres'].split(','))
    genres.discard('')  # Remove empty strings
    return ','.join(sorted(genres))

# Apply the function to combine the genres
features_df['combined_genres'] = features_df.apply(combine_genres, axis=1)

# Drop the original genre columns
features_df = features_df.drop(columns=['genres', 'Genres', 'primaryTitle'])
features_df = pd.merge(features_df, title_ratings_df, on='tconst', how='left')
features_df = features_df.drop(columns=['tconst'])
# Drop the column endYear as it is always NaN
features_df = features_df.drop(columns=['endYear'])
# Drop also the column titleType as we will keep only the rated movies
features_df = features_df.drop(columns=['titleType'])
# Drop the original Title column as we will use the cleaned one
features_df = features_df.drop(columns=['Title'])
# For the runtimeMinutes column, we will replace the NaN values with the median
features_df['runtimeMinutes'] = pd.to_numeric(features_df['runtimeMinutes'], errors='coerce')
features_df['runtimeMinutes'] = features_df['runtimeMinutes'].fillna(features_df['runtimeMinutes'].median())
features_df

Unnamed: 0,MovieID,originalTitle,isAdult,startYear,runtimeMinutes,combined_genres,averageRating,numVotes
0,1,Toy Story,0,1995,81.0,"adventure,animation,children's,comedy",8.3,1074033.0
1,2,Jumanji,0,1995,104.0,"adventure,children's,comedy,family,fantasy",7.1,379284.0
2,3,Grumpier Old Men,0,1995,101.0,"comedy,romance",6.6,29842.0
3,4,Waiting to Exhale,0,1995,124.0,"comedy,drama,romance",6.0,12281.0
4,5,Father of the Bride Part II,0,1995,106.0,"comedy,family,romance",6.1,41883.0
...,...,...,...,...,...,...,...,...
8372,3950,Tigerland,0,2000,101.0,"drama,war",6.9,43453.0
8373,3950,Taken by the Tiger,0,2019,91.0,"documentary,drama",6.9,72.0
8374,3950,Tigerland,0,\N,97.0,drama,,
8375,3951,Two Family House,0,2000,108.0,"comedy,drama,romance",7.2,1732.0


In [55]:

# Merge ratings with movie features
ratings_with_movies = ratings_dat_df.merge(features_df, on='MovieID', how='inner')

# Check for duplicates and aggregate if necessary
ratings_with_movies = ratings_with_movies.groupby(['UserID', 'MovieID'], as_index=False).agg({'Rating': 'mean'})

# Pivot the data to create user-item interaction matrix
user_item_matrix = ratings_with_movies.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)
# Set the 0 values in the rating column to NaN
user_item_matrix = user_item_matrix.replace(0, np.nan)
print(user_item_matrix)

MovieID  1     2     3     4     5     6     7     8     9     10    ...  \
UserID                                                               ...   
1         5.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   2.0   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
6036      NaN   NaN   NaN   2.0   NaN   3.0   NaN   NaN   NaN   NaN  ...   
6037      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6038      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6039      NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6040      3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

MovieID  39

In [56]:
user_item_long = user_item_matrix.stack().reset_index()
user_item_long.columns = ['UserID', 'MovieID', 'Rating']

# Merge with user data
user_item_long = user_item_long.merge(users_dat_df, on='UserID', how='left')

# Merge with movie data
user_item_long = user_item_long.merge(features_df, on='MovieID', how='left')

# Drop rows with NaN values
user_item_long = user_item_long.dropna()

# Handle missing values and convert data types
user_item_long['runtimeMinutes'] = pd.to_numeric(user_item_long['runtimeMinutes'], errors='coerce')
user_item_long['averageRating'] = pd.to_numeric(user_item_long['averageRating'], errors='coerce')
user_item_long['numVotes'] = pd.to_numeric(user_item_long['numVotes'], errors='coerce')
user_item_long['startYear'] = pd.to_numeric(user_item_long['startYear'], errors='coerce')
user_item_long['isAdult'] = user_item_long['isAdult'].astype(int)

In [69]:
#print the list of every genre
genres = set()
for genre in user_item_long['combined_genres']:
    genres.update(genre.split(','))
def calculate_genre_rating(genre, user_movies):
    ratings = user_movies['Rating']
    genre_mask = user_movies['combined_genres'].apply(lambda x: genre in x.split(','))
    if genre_mask.sum() == 0:
        return 3  # Default rating if no movies of this genre are rated by the user
    else:
        return ratings[genre_mask].mean()

In [70]:
cat_cols = ['Gender', 'Age', 'Occupation', 'Zip-code']
for col in cat_cols:
    encoder = LabelEncoder()
    user_item_long[col] = encoder.fit_transform(user_item_long[col].astype(str))

# Normalize numerical columns
num_cols = ['startYear', 'runtimeMinutes', 'averageRating', 'numVotes']
scaler = StandardScaler()
user_item_long[num_cols] = scaler.fit_transform(user_item_long[num_cols])

# Aggregate ratings and features per user
user_agg = user_item_long.groupby('UserID').agg({
    'Gender': 'mean',
    'Age': 'mean',
    'Occupation': 'mean',
    'Zip-code': 'mean',
    'originalTitle': 'count',  # Count number of rated movies
    'isAdult': 'mean',
    'startYear': 'mean',
    'runtimeMinutes': 'mean',
    'combined_genres': lambda x: ','.join(x),  # Concatenate genres
    'averageRating': 'mean',
    'numVotes': 'mean'
}).reset_index()

# Convert concatenated genres to count of genres
genre_counts = user_item_long['combined_genres'].str.get_dummies(sep=',').sum()
genre_cols = genre_counts.index.tolist()

for col in genre_cols:
    user_agg[col] = user_item_long.groupby('UserID')['combined_genres'].apply(lambda x: x.str.contains(col).sum())

user_agg.drop(columns=['combined_genres'], inplace=True)
#Remove the NaN rows
user_agg = user_agg.dropna()
for genre in genres:
    user_agg[f'rating_{genre}'] = user_agg.apply(lambda row: calculate_genre_rating(genre, user_item_long[user_item_long['UserID'] == row['UserID']]), axis=1)
user_agg.fillna(3, inplace=True)  # Fill remaining NaN values with 3


In [71]:
user_agg

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,originalTitle,isAdult,startYear,runtimeMinutes,averageRating,...,rating_horror,rating_family,rating_adult,rating_musical,rating_sport,rating_reality-tv,rating_fantasy,rating_romance,rating_crime,rating_adventure
1,2,1.0,6.0,11.0,434.0,145,0.006897,-0.078256,0.311508,0.187093,...,3.000000,4.000000,3.000000,3.000000,5.000000,3.0,3.500000,3.900000,3.107143,3.621622
2,3,1.0,2.0,10.0,3397.0,46,0.000000,-0.000860,-0.057104,0.485189,...,4.000000,3.000000,3.000000,3.000000,4.500000,3.0,3.857143,4.200000,4.500000,4.103448
3,4,1.0,4.0,2.0,2826.0,28,0.000000,-0.169659,0.541304,0.580972,...,3.750000,4.000000,3.000000,3.000000,5.000000,3.0,3.500000,4.571429,4.500000,3.875000
4,5,1.0,2.0,16.0,52.0,255,0.003922,0.280207,-0.117712,0.144005,...,3.000000,3.000000,4.000000,2.000000,2.000000,3.0,2.937500,3.061728,3.178571,2.958333
5,6,0.0,5.0,5.0,3397.0,123,0.016260,0.010205,-0.005454,-0.227197,...,4.000000,4.000000,4.000000,4.285714,4.000000,3.0,3.782609,4.060606,3.846154,4.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,6036,0.0,2.0,10.0,802.0,1269,0.003152,-0.024708,-0.079467,-0.028892,...,3.295775,3.304348,3.000000,3.833333,3.357143,3.0,3.231579,3.470219,3.448980,3.177273
6036,6037,0.0,4.0,1.0,582.0,335,0.005970,-0.228401,0.028057,0.130907,...,3.550000,3.428571,4.000000,4.200000,4.000000,3.0,3.600000,3.853333,3.636364,3.625000
6037,6038,0.0,6.0,1.0,2253.0,19,0.000000,-0.294695,0.530594,0.514167,...,2.666667,3.000000,3.000000,3.000000,3.000000,3.0,3.800000,3.714286,3.000000,3.500000
6038,6039,0.0,4.0,0.0,2613.0,186,0.000000,-0.534146,-0.089568,0.074540,...,4.000000,3.769231,3.000000,3.843373,3.000000,3.0,3.862069,3.878788,3.958333,3.852941


In [72]:
ratings_with_movies

Unnamed: 0,UserID,MovieID,Rating
0,1,1,5.0
1,1,48,5.0
2,1,150,5.0
3,1,260,4.0
4,1,527,5.0
...,...,...,...
1000203,6040,3683,4.0
1000204,6040,3703,4.0
1000205,6040,3735,4.0
1000206,6040,3751,4.0


In [73]:
features_df

Unnamed: 0,MovieID,originalTitle,isAdult,startYear,runtimeMinutes,combined_genres,averageRating,numVotes
0,1,Toy Story,0,1995,81.0,"adventure,animation,children's,comedy",8.3,1074033.0
1,2,Jumanji,0,1995,104.0,"adventure,children's,comedy,family,fantasy",7.1,379284.0
2,3,Grumpier Old Men,0,1995,101.0,"comedy,romance",6.6,29842.0
3,4,Waiting to Exhale,0,1995,124.0,"comedy,drama,romance",6.0,12281.0
4,5,Father of the Bride Part II,0,1995,106.0,"comedy,family,romance",6.1,41883.0
...,...,...,...,...,...,...,...,...
8372,3950,Tigerland,0,2000,101.0,"drama,war",6.9,43453.0
8373,3950,Taken by the Tiger,0,2019,91.0,"documentary,drama",6.9,72.0
8374,3950,Tigerland,0,\N,97.0,drama,,
8375,3951,Two Family House,0,2000,108.0,"comedy,drama,romance",7.2,1732.0


In [47]:
# Define and train user embedding model (autoencoder)
input_shape = X_train.shape[1]

input_layer = Input(shape=(input_shape,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(64, activation='relu')(encoded)
encoded = Dense(32, activation='relu')(encoded)  # User embedding size

decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(decoded)
decoded = Dense(input_shape, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder_model = Model(input_layer, encoded)

In [48]:
# Example of potential adjustments
from tensorflow.keras.optimizers import Adam

# Ensure there are no NaNs in input and output data
print(X_train.isnull().sum())
print(X_val.isnull().sum())
autoencoder.compile(optimizer='adam', loss='mse')

# Example of retraining with revised optimizer
autoencoder.fit(X_train, X_train, epochs=10, batch_size=64, validation_data=(X_val, X_val))

Gender            0
Age               0
Occupation        0
Zip-code          0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
averageRating     0
numVotes          0
\n                0
action            0
adult             0
adventure         0
animation         0
biography         0
children's        0
comedy            0
crime             0
documentary       0
drama             0
family            0
fantasy           0
film-noir         0
history           0
horror            0
music             0
musical           0
mystery           0
news              0
reality-tv        0
romance           0
sci-fi            0
sport             0
thriller          0
war               0
western           0
dtype: int64
Gender            0
Age               0
Occupation        0
Zip-code          0
originalTitle     0
isAdult           0
startYear         0
runtimeMinutes    0
averageRating     0
numVotes          0
\n                0
action            0
adult  

<keras.src.callbacks.history.History at 0x170b073f410>

In [None]:


autoencoder.compile(optimizer='adam', loss='mse')

# Ensure there are no NaNs in the data
if np.any(np.isnan(X_train)):
    raise ValueError("Training data contains NaN values.")
if np.any(np.isnan(X_val)):
    raise ValueError("Validation data contains NaN values.")

# Train the autoencoder
autoencoder.fit(X_train, X_train, epochs=10, batch_size=64, validation_data=(X_val, X_val))

In [39]:
X

Unnamed: 0,Gender,Age,Occupation,Zip-code,originalTitle,isAdult,startYear,runtimeMinutes,averageRating,numVotes,...,musical,mystery,news,reality-tv,romance,sci-fi,sport,thriller,war,western
0,0.0,0.0,2.0,1588.0,100,0.010000,-0.183069,-0.134215,0.124850,0.187078,...,,,,,,,,,,
1,1.0,6.0,8.0,2248.0,147,0.006803,-0.072855,0.321871,0.188350,0.242374,...,54.0,1.0,0.0,0.0,18.0,3.0,0.0,4.0,2.0,1.0
2,1.0,2.0,7.0,1863.0,48,0.000000,-0.046561,-0.064882,0.395565,0.713089,...,1.0,10.0,0.0,0.0,40.0,20.0,3.0,52.0,17.0,6.0
3,1.0,4.0,18.0,140.0,29,0.000000,-0.186165,0.517421,0.493320,1.033738,...,0.0,3.0,0.0,0.0,5.0,7.0,2.0,7.0,2.0,10.0
4,1.0,2.0,13.0,1938.0,259,0.003861,0.285133,-0.115522,0.137583,-0.040331,...,0.0,1.0,0.0,0.0,7.0,8.0,1.0,7.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0.0,2.0,7.0,1152.0,1286,0.003110,-0.020617,-0.078335,-0.029254,-0.218102,...,4.0,18.0,0.0,0.0,96.0,107.0,6.0,57.0,16.0,8.0
6036,0.0,4.0,1.0,2367.0,339,0.005900,-0.225698,0.030579,0.134674,-0.038163,...,73.0,110.0,0.0,0.0,324.0,248.0,14.0,339.0,60.0,19.0
6037,0.0,6.0,1.0,626.0,19,0.000000,-0.287001,0.540114,0.523785,0.596538,...,10.0,51.0,0.0,0.0,76.0,64.0,1.0,140.0,17.0,5.0
6038,0.0,4.0,0.0,13.0,189,0.000000,-0.539968,-0.087407,0.073606,-0.145803,...,0.0,1.0,0.0,0.0,7.0,1.0,1.0,2.0,4.0,0.0


In [None]:
user_item_long

In [None]:
# Define and train user embedding model (autoencoder)
input_shape = X_train.shape[1]

input_layer = Input(shape=(input_shape,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(64, activation='relu')(encoded)
encoded = Dense(32, activation='relu')(encoded)  # User embedding size

decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(decoded)
decoded = Dense(input_shape, activation='linear')(decoded)

autoencoder = Model(input_layer, decoded)
encoder_model = Model(input_layer, encoded)

autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train, X_train, epochs=10, batch_size=64, validation_data=(X_val, X_val))

In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split

# Load the data
# Encode categorical variables
cat_cols = ['Gender', 'Age', 'Occupation', 'Zip-code', 'originalTitle', 'combined_genres']
for col in cat_cols:
    encoder = LabelEncoder()
    user_item_long[col] = encoder.fit_transform(user_item_long[col].astype(str))

# Normalize numerical columns
num_cols = ['startYear', 'runtimeMinutes', 'averageRating', 'numVotes']
scaler = StandardScaler()
user_item_long[num_cols] = scaler.fit_transform(user_item_long[num_cols])

# Separate features and target (assuming we predict 'Rating')
X = user_item_long.drop(['UserID', 'MovieID', 'Rating'], axis=1)
y = user_item_long['Rating']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
# Input layer
input_shape = X_train.shape[1]
input_layer = Input(shape=(input_shape,))

# Encoder layers
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(64, activation='relu')(encoded)
encoded = Dense(32, activation='relu')(encoded)  # User embedding size

# Decoder layers
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(decoded)
decoded = Dense(input_shape, activation='linear')(decoded)

# Autoencoder model
autoencoder = Model(input_layer, decoded)

# Encoder model (for user embeddings)
encoder_model = Model(input_layer, encoded)

# Compile and train the autoencoder
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.fit(X_train, X_train, epochs=10, batch_size=64, validation_data=(X_val, X_val))


Epoch 1/10
[1m16718/16718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 826us/step - loss: 5201.0645 - val_loss: 5.2514
Epoch 2/10
[1m16718/16718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 820us/step - loss: 16.5022 - val_loss: 5.2030
Epoch 3/10
[1m16718/16718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 806us/step - loss: 12.3349 - val_loss: 13.5645
Epoch 4/10
[1m16718/16718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 814us/step - loss: 10.5516 - val_loss: 4.3839
Epoch 5/10
[1m16718/16718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 827us/step - loss: 8.7193 - val_loss: 3.4009
Epoch 6/10
[1m16718/16718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 847us/step - loss: 8.3498 - val_loss: 1.9642
Epoch 7/10
[1m16718/16718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 821us/step - loss: 6.6715 - val_loss: 0.4495
Epoch 8/10
[1m16718/16718[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 803us/step - loss: 5.9

<keras.src.callbacks.history.History at 0x1721550a330>

In [21]:
user_data = {
    'UserID': [1]*5,  # Assuming UserID is 1
    'MovieID': [1, 48, 150, 3671, 3751],  # MovieIDs the user has rated
    'Rating': [5.0, 5.0, 5.0, 4.0, 4.0],
    'Gender': ['F']*5,  # Example gender
    'Age': [1]*5,  # Example age
    'Occupation': [10]*5,  # Example occupation
    'Zip-code': ['48067']*5,  # Example zip-code
    'originalTitle': ['Toy Story', 'Pocahontas', 'Apollo 13', 'Blazing Saddles', 'Chicken Run'],
    'isAdult': [0]*5,
    'startYear': [1995, 1995, 1995, 1974, 2000],
    'runtimeMinutes': [81.0, 81.0, 140.0, 93.0, 84.0],
    'combined_genres': ["adventure,animation,children's,comedy",
                        "adventure,animation,children's,drama,musical,romance",
                        "adventure,drama,history",
                        "comedy,western",
                        "adventure,animation,children's,comedy"],
    'averageRating': [8.3, 6.7, 7.7, 7.7, 7.1],
    'numVotes': [1074033.0, 203974.0, 317159.0, 153338.0, 213067.0]
}

# Encode categorical variables
cat_cols = ['Gender', 'Age', 'Occupation', 'Zip-code']
label_encoders = {}
for col in cat_cols:
    label_encoders[col] = LabelEncoder()
    user_item_long[col] = label_encoders[col].fit_transform(user_item_long[col])
    user_data[col] = label_encoders[col].transform(user_data[col])

# Define input dimensions
num_users = user_item_long['UserID'].nunique()
num_movies = user_item_long['MovieID'].nunique()
num_cat_features = len(cat_cols)
num_numeric_features = len(['startYear', 'runtimeMinutes', 'averageRating', 'numVotes'])

# Define model architecture
# Embedding layers for categorical features
input_cat = Input(shape=(num_cat_features,))
embeddings = []
for i in range(num_cat_features):
    vocab_size = len(label_encoders[cat_cols[i]].classes_)
    embed_dim = int(np.ceil(np.log2(vocab_size)))
    emb_layer = Embedding(input_dim=vocab_size, output_dim=embed_dim)(input_cat[:, i])
    embeddings.append(emb_layer)

# Numeric features input
input_num = Input(shape=(num_numeric_features,))
numeric_dense = Dense(8, activation='relu')(input_num)

# Concatenate embeddings and numeric features
concatenated = Concatenate(axis=1)(embeddings + [numeric_dense])

# Fully connected layers
fc1 = Dense(64, activation='relu')(concatenated)
fc2 = Dense(32, activation='relu')(fc1)

# Output layer
output = Dense(1, activation='linear')(fc2)  # Regression output

# Model creation
model = Model(inputs=[input_cat, input_num], outputs=output)
model.compile(optimizer=Adam(), loss='mse')

# Prepare input data
X_cat = [user_item_long[cat_cols].values]
X_num = user_item_long[['startYear', 'runtimeMinutes', 'averageRating', 'numVotes']].values
y = user_item_long['Rating'].values

# Train the model
model.fit(X_cat + [X_num], y, epochs=10, batch_size=32, validation_split=0.2)

# After training, use user_data to predict user embeddings
# Prepare input for user_data
X_user_cat = [np.array(user_data[col]).reshape(-1, 1) for col in cat_cols]
X_user_num = np.array(user_data[['startYear', 'runtimeMinutes', 'averageRating', 'numVotes']])

# Predict user embeddings
user_embedding_model = Model(inputs=[input_cat, input_num], outputs=fc2)  # Output until second-to-last layer
user_embeddings = user_embedding_model.predict(X_user_cat + [X_user_num])

# Print or use user_embeddings for further processing (e.g., recommending movies)
print("User Embedding Shape:", user_embeddings.shape)
print("User Embedding:", user_embeddings)

ValueError: invalid literal for int() with base 10: 'F'

In [18]:
# Predict user embedding
user_embedding = encoder_model.predict(X_synthetic)

print("User Embedding Shape:", user_embedding.shape)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
User Embedding Shape: (5, 32)


In [19]:
# Example of recommending top-N movies based on similarity to user embedding
# This requires having a movie embedding model and similarity computation (not fully implemented here)

# Assuming you have a function to compute cosine similarity between vectors
def cosine_similarity(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# Example of computing similarity with movie embeddings (not implemented here)
# You would typically retrieve movie embeddings and compute similarity with user embedding

# Example recommendation based on similarity (not fully implemented here)
movie_embeddings = np.array([[0.1, 0.2, 0.3], [0.2, 0.3, 0.4], [0.5, 0.6, 0.7]])  # Example movie embeddings
user_embedding = np.array([0.8, 0.9, 1.0])  # Example user embedding

# Compute similarities with user embedding
similarities = [cosine_similarity(user_embedding, movie_emb) for movie_emb in movie_embeddings]

# Select top-N movies based on highest similarities
top_n_indices = np.argsort(similarities)[::-1][:5]  # Select top 5 movies with highest similarity
recommended_movies = [movie_embeddings[idx] for idx in top_n_indices]

print("Recommended Movie Embeddings:", recommended_movies)


Recommended Movie Embeddings: [array([0.5, 0.6, 0.7]), array([0.2, 0.3, 0.4]), array([0.1, 0.2, 0.3])]
