In [22]:
import pandas as pd

In [24]:
# Load the movie catalogue. The file has no header row, so column names are
# supplied here. A multi-character separator such as "::" is interpreted as a
# regular expression, which is why the Python parsing engine is requested.
movies = pd.read_csv(
    './Data/movies.dat',
    sep="::",
    engine='python',
    header=None,
    names=["MovieID", "title", "genres"],
    encoding="ISO-8859-1",
)

In [25]:
# Load the rating events. The file has no header row, so column names are
# supplied here; the "::" separator needs the Python parsing engine.
ratings = pd.read_csv(
    './Data/ratings.dat',
    sep="::",
    engine='python',
    header=None,
    names=["UserID", "MovieID", "Ratings", "Timestamp"],
    encoding="ISO-8859-1",
)


In [26]:
# Interpret the stored timestamp values as a count of seconds (unit='s') and
# replace the column with the corresponding pandas datetimes.
ratings['Timestamp'] = pd.to_datetime(
    arg=ratings['Timestamp'],
    unit='s',
)

In [27]:
# Lookup from the numeric age code to a readable age bracket; applied to the
# users table's 'age' column further down.
age_mapping = dict(zip(
    (1, 18, 25, 35, 45, 50, 56),
    ("Under 18", "18-24", "25-34", "35-44", "45-49", "50-55", "56+"),
))

# Lookup from the numeric occupation code to its label. The codes are the
# consecutive integers 0..20, so the position in this list is the code.
occupation_mapping = dict(enumerate([
    "other",
    "academic/educator",
    "artist",
    "clerical/admin",
    "college/grad student",
    "customer service",
    "doctor/health care",
    "executive/managerial",
    "farmer",
    "homemaker",
    "K-12 student",
    "lawyer",
    "programmer",
    "retired",
    "sales/marketing",
    "scientist",
    "self-employed",
    "technician/engineer",
    "tradesman/craftsman",
    "unemployed",
    "writer",
]))


In [28]:
# Load the user profiles. The file has no header row, so column names are
# supplied here; the "::" separator needs the Python parsing engine.
users = pd.read_csv(
    './Data/users.dat',
    sep="::",
    engine='python',
    header=None,
    names=["UserID", "gender", "age", "occupation", "zip_code"],
    encoding="ISO-8859-1",
)

In [29]:
# Discard the zip code column; none of the later cells visible here read it.
users = users.drop(['zip_code'], axis=1)

In [30]:
# Replace the numeric age / occupation codes with their readable labels.
#
# Each mapping is guarded so this cell is safe to re-run: once a column holds
# labels it is no longer numeric, and mapping it a second time would silently
# turn every value into NaN (none of the label strings are keys of the lookup
# dicts). On the first run the behaviour is unchanged, including NaN for any
# code that is missing from the lookup.
if pd.api.types.is_numeric_dtype(users['age']):
    users['age'] = users['age'].map(age_mapping)
if pd.api.types.is_numeric_dtype(users['occupation']):
    users['occupation'] = users['occupation'].map(occupation_mapping)

In [31]:
# Preview the first five movie rows to check that the columns parsed correctly.
movies.head(n=5)

Unnamed: 0,MovieID,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [32]:
# Preview the first five ratings, including the converted Timestamp column.
ratings.head(n=5)

Unnamed: 0,UserID,MovieID,Ratings,Timestamp
0,1,1193,5,2000-12-31 22:12:40
1,1,661,3,2000-12-31 22:35:09
2,1,914,3,2000-12-31 22:32:48
3,1,3408,4,2000-12-31 22:04:35
4,1,2355,5,2001-01-06 23:38:11


In [33]:
# Preview the first five users after the age / occupation codes were relabelled.
users.head(n=5)

Unnamed: 0,UserID,gender,age,occupation
0,1,F,Under 18,K-12 student
1,2,M,56+,self-employed
2,3,M,25-34,scientist
3,4,M,45-49,executive/managerial
4,5,M,25-34,writer


In [34]:
# Attach each user's profile columns to that user's ratings. The inner join
# keeps only UserID values present in both tables.
users_rating = users.merge(ratings, how="inner", on="UserID")

In [35]:
# Preview the first five rows of the user + rating join.
users_rating.head(n=5)

Unnamed: 0,UserID,gender,age,occupation,MovieID,Ratings,Timestamp
0,1,F,Under 18,K-12 student,1193,5,2000-12-31 22:12:40
1,1,F,Under 18,K-12 student,661,3,2000-12-31 22:35:09
2,1,F,Under 18,K-12 student,914,3,2000-12-31 22:32:48
3,1,F,Under 18,K-12 student,3408,4,2000-12-31 22:04:35
4,1,F,Under 18,K-12 student,2355,5,2001-01-06 23:38:11


In [52]:
# Add the movie title and genres to every user/rating row. The inner join
# keeps only MovieID values present in both tables.
merged_data = users_rating.merge(movies, how="inner", on="MovieID")

In [53]:
# Preview the first five rows of the fully joined table.
merged_data.head(n=5)

Unnamed: 0,UserID,gender,age,occupation,MovieID,Ratings,Timestamp,title,genres
0,1,F,Under 18,K-12 student,1193,5,2000-12-31 22:12:40,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,F,Under 18,K-12 student,661,3,2000-12-31 22:35:09,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,F,Under 18,K-12 student,914,3,2000-12-31 22:32:48,My Fair Lady (1964),Musical|Romance
3,1,F,Under 18,K-12 student,3408,4,2000-12-31 22:04:35,Erin Brockovich (2000),Drama
4,1,F,Under 18,K-12 student,2355,5,2001-01-06 23:38:11,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [93]:
import tensorflow as tf


In [94]:
from tensorflow.keras.layers import Input, Embedding, Concatenate ,Dense , Flatten, Dot

In [95]:
from tensorflow.keras.models import Model

In [96]:
# Number of distinct values per categorical feature in the joined table.
# These counts are used below to size the embedding tables.
(
    num_users,
    num_movies,
    num_genders,
    num_ages,
    num_occupations,
) = (
    merged_data[column].nunique()
    for column in ('UserID', 'MovieID', 'gender', 'age', 'occupation')
)

In [117]:
# Embedding widths: half of the feature's distinct-value count, capped at 50.
user_embedding_size = min(num_users // 2, 50)
# Gender is the exception: it uses a fixed width instead of the rule above.
gender_embedding_size = 3
age_embedding_size = min(num_ages // 2, 50)
occupation_embedding_size = min(num_occupations // 2, 50)  # 10 in the recorded run
movie_embedding_size = min(num_movies // 2, 50)  # 50 in the recorded run (the cap applies)

In [118]:
# One scalar input per categorical feature (a single value per sample).
# NOTE(review): the layer names are inconsistent ('input_layer_5/7/8' next to
# descriptive ones). They are kept verbatim because code outside this view may
# feed the model by layer name; confirm before renaming.
user_input = Input(name='input_layer_5', shape=(1,))
gender_input = Input(name='input_layer_7', shape=(1,))
age_input = Input(name='input_layer_8', shape=(1,))
occupation_input = Input(name='input_layer_occupation', shape=(1,))
movie_input = Input(name='input_layer_movie', shape=(1,))

In [119]:
# One learned lookup table per feature: input_dim is the number of distinct
# values counted above, output_dim is the width chosen for that feature.
# NOTE(review): with input_dim equal to the distinct count, valid lookups are
# the ids 0..n-1. The raw UserID values shown in the previews start at 1, so
# the ids presumably get re-indexed before training; confirm in the code that
# prepares the model inputs (not visible here).
user_embedding = Embedding(
    input_dim=num_users, output_dim=user_embedding_size
)(user_input)
gender_embedding = Embedding(
    input_dim=num_genders, output_dim=gender_embedding_size
)(gender_input)
age_embedding = Embedding(
    input_dim=num_ages, output_dim=age_embedding_size
)(age_input)
occupation_embedding = Embedding(
    input_dim=num_occupations, output_dim=occupation_embedding_size
)(occupation_input)
movie_embedding = Embedding(
    input_dim=num_movies, output_dim=movie_embedding_size
)(movie_input)

In [120]:
# Flatten every embedding output, each through its own Flatten layer, created
# in the same user / gender / age / occupation / movie order as before. The
# printed shapes that follow show the result is (None, width).
(
    user_embedding,
    gender_embedding,
    age_embedding,
    occupation_embedding,
    movie_embedding,
) = (
    Flatten()(embedded)
    for embedded in (
        user_embedding,
        gender_embedding,
        age_embedding,
        occupation_embedding,
        movie_embedding,
    )
)

In [121]:
# Sanity check of the flattened widths. The recorded run printed (None, 10) for
# occupation and (None, 50) for movie, i.e. the movie width hit the cap of 50.
print(
    f"Occupation embedding shape: {occupation_embedding.shape}",
    f"Movie embedding shape: {movie_embedding.shape}",
    sep="\n",
)

Occupation embedding shape: (None, 10)
Movie embedding shape: (None, 50)
