In [None]:
# Names assigned to the trailing flag columns of u.item when it is loaded.
genres = [
    'unknown',
    'Action',
    'Adventure',
    'Animation',
    "Children's",
    'Comedy',
    'Crime',
    'Documentary',
    'Drama',
    'Fantasy',
    'Film-Noir',
    'Horror',
    'Musical',
    'Mystery',
    'Romance',
    'Sci-Fi',
    'Thriller',
    'War',
    'Western',
]
# Occupation labels; presumably the values found in u.user — verify against the file.
occupations = [
    'administrator',
    'artist',
    'doctor',
    'educator',
    'engineer',
    'entertainment',
    'executive',
    'healthcare',
    'homemaker',
    'lawyer',
    'librarian',
    'marketing',
    'none',
    'other',
    'programmer',
    'retired',
    'salesman',
    'scientist',
    'student',
    'technician',
    'writer',
]

from google.colab import drive
import pandas as pd

# Mount Google Drive (forcing a remount) so the dataset files are reachable.
drive.mount('/content/drive', force_remount=True)

# Drive folder that holds the ml-100k files
save_path = "/content/drive/MyDrive/ml-100k/"

# Column names for the three header-less files
rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
movie_columns = ['movieId', 'title', 'release_date', 'video_release_date', 'IMDB'] + genres
user_columns = ['userId', 'age', 'gender', 'occupation', 'zip_code']

# load dataset
ratings_df = pd.read_csv(
    save_path + 'u.data',
    sep='\t',
    header=None,
    names=rating_columns,
    engine='python',
    encoding='latin',
)
movies_df = pd.read_csv(
    save_path + 'u.item',
    sep='|',
    header=None,
    names=movie_columns,
    engine='python',
    encoding='latin',
)
users_df = pd.read_csv(
    save_path + 'u.user',
    sep='|',
    header=None,
    names=user_columns,
    engine='python',
)


Mounted at /content/drive


In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Drop the trailing " (<year>)" part of each title. `n` is passed by keyword:
# the positional form is deprecated (this cell's recorded output shows the
# warning) and is rejected by newer pandas releases.
movies_df['title'] = movies_df['title'].str.rsplit(' (', n=1).str[0]
users_df.drop(columns=['zip_code'], inplace=True)
# Gender becomes 0 for 'M' and 1 for every other value.
users_df['gender'] = users_df['gender'].apply(lambda x: 0 if x == 'M' else 1)

# NOTE(review): columns[2:] also contains release_date, video_release_date and
# IMDB, so the first three entries of `genres` are not genres. It is left as is
# because the genre indices and n_genres used further down are derived from
# this exact list.
genres = list(movies_df.columns[2:])
# Per movie, the names of the columns whose value equals 1.
movies_df['genres'] = movies_df[genres].apply(lambda x: list(x.index[x.values == 1]), axis=1)
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])
movies_df['movie_age'] = (pd.Timestamp.now() - movies_df['release_date']).dt.days / 365.25
# Min-Max scaling for movie_age
movies_df['movie_age'] = (movies_df['movie_age'] - movies_df['movie_age'].min()) / (movies_df['movie_age'].max() - movies_df['movie_age'].min())

# Order ratings by time so the per-user movie lists below follow timestamp order.
ratings_df = ratings_df.sort_values(by='timestamp')
# A rating of 3 or more counts as "liked".
ratings_df['liked'] = ratings_df['rating'].apply(lambda x: 1 if x >= 3 else 0)

liked_movies_dict = ratings_df[ratings_df['liked'] == 1].groupby('userId')['movieId'].apply(list).to_dict()
disliked_movies_dict = ratings_df[ratings_df['liked'] == 0].groupby('userId')['movieId'].apply(list).to_dict()
watched_movies_dict = ratings_df.groupby('userId')['movieId'].apply(list).to_dict()

# Users missing from a dict get NaN in the corresponding column.
users_df['liked_movies'] = users_df['userId'].map(liked_movies_dict)
users_df['disliked_movies'] = users_df['userId'].map(disliked_movies_dict)
users_df['watched_movies'] = users_df['userId'].map(watched_movies_dict)

movies_df.set_index('movieId', inplace=True)

# Look the genre lists up on the single column instead of building a full row
# Series for every (user, movie) pair.
movie_genres = movies_df['genres']
watched_genres_dict = {
    user_id: list({genre for movie in movies for genre in movie_genres.loc[movie]})
    for user_id, movies in watched_movies_dict.items()
}
users_df['watched_genres'] = users_df['userId'].map(watched_genres_dict)

# The watched_movie_age / liked_movie_age columns are built further down, once
# avg_with_nan_replacement is defined. The earlier unfilled versions that used
# to be computed here were overwritten there without ever being read.

def avg_with_nan_replacement(lst):
    """Fill NaN entries of a numeric list with the mean of its non-NaN entries.

    An empty list, or a list holding anything other than int/float values, is
    returned untouched. Otherwise a new list is returned in which every NaN is
    replaced by ``np.nanmean`` of the input.
    """
    if not lst:
        return lst
    if not all(isinstance(value, (int, float)) for value in lst):
        return lst
    fill_value = np.nanmean(lst)
    return [fill_value if np.isnan(value) else value for value in lst]

# Read ages from the single movie_age column; indexing movies_df row by row
# built a complete row Series for every rating just to read one value.
movie_age_by_id = movies_df['movie_age']

# Per user: scaled ages of the watched / liked movies, NaNs filled with the
# user's own mean by avg_with_nan_replacement.
watched_movie_age_dict = {
    user_id: avg_with_nan_replacement([movie_age_by_id.loc[movie] for movie in movies])
    for user_id, movies in watched_movies_dict.items()
}
users_df['watched_movie_age'] = users_df['userId'].map(watched_movie_age_dict)

liked_movie_age_dict = {
    user_id: avg_with_nan_replacement([movie_age_by_id.loc[movie] for movie in movies])
    for user_id, movies in liked_movies_dict.items()
}
users_df['liked_movie_age'] = users_df['userId'].map(liked_movie_age_dict)

# Min-Max scaling
users_df['age'] = (users_df['age'] - users_df['age'].min()) / (users_df['age'].max() - users_df['age'].min())

# Replace each occupation string by its position in order of first appearance.
occupations = users_df['occupation'].unique().tolist()
occupation_dict = {occupation: index for index, occupation in enumerate(occupations)}
users_df['occupation'] = users_df['occupation'].map(occupation_dict)

# Replace each genre name by its position in `genres`.
genre_dict = {genre: index for index, genre in enumerate(genres)}
users_df['watched_genres'] = users_df['watched_genres'].apply(lambda x: [genre_dict[genre] for genre in x])

WINDOW_SIZE = 10
TARGET = 'liked_movies'


def extract_windows_and_labels(row):
    """Slide a WINDOW_SIZE-long window over ``row[TARGET]``.

    For every position ``i`` the window is ``row[TARGET][i:i + WINDOW_SIZE]``
    and the label is the element that follows it.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing ``TARGET``,
            ``'liked_movies'`` and ``'liked_movie_age'``.

    Returns:
        A ``pd.Series`` of four equally long lists: the windows, the label for
        each window, a 1/0 flag telling whether the label is in
        ``row['liked_movies']``, and ``row['liked_movie_age']`` at the label's
        position. All four are empty when ``row[TARGET]`` is not a list or has
        at most WINDOW_SIZE elements.
    """
    windows = []
    labels = []
    labels_like = []
    avg_liked_movie_ages = []

    sequence = row[TARGET]
    if isinstance(sequence, list) and len(sequence) > WINDOW_SIZE:
        liked = row['liked_movies']
        # Build the lookup once; testing membership on the list inside the
        # loop rescanned it for every window.
        liked_lookup = set(liked) if isinstance(liked, list) else set()
        # NOTE(review): ages are indexed with positions of row[TARGET], which
        # only lines up with 'liked_movie_age' while TARGET is 'liked_movies'.
        movie_ages = row['liked_movie_age']

        for start in range(len(sequence) - WINDOW_SIZE):
            end = start + WINDOW_SIZE
            next_movie = sequence[end]
            windows.append(sequence[start:end])
            avg_liked_movie_ages.append(movie_ages[end])
            labels.append(next_movie)
            labels_like.append(1 if next_movie in liked_lookup else 0)

    return pd.Series([windows, labels, labels_like, avg_liked_movie_ages])

# The four per-user lists produced by extract_windows_and_labels; they have the
# same length within a row.
window_columns = ['windows', 'labels', 'labels_like', 'avg_liked_movie_age']
users_df[window_columns] = users_df.apply(extract_windows_and_labels, axis=1)

# Explode the four lists together so that each output row pairs a window with
# its own label, flag and age. Exploding 'labels' (etc.) separately from the
# frame that was already exploded on 'windows' yields k*k rows for a user with
# k windows, and assigning those back by position gives every user after the
# first one labels that belong to somebody else.
# Exploding several columns at once requires pandas >= 1.3.
expanded_df = users_df.explode(window_columns).reset_index(drop=True)

# explode() leaves object dtype; the age is used as a float feature.
expanded_df['avg_liked_movie_age'] = expanded_df['avg_liked_movie_age'].astype('float64')

# Users without any disliked movie carry NaN (a float) here; use an empty list.
expanded_df['disliked_movies'] = expanded_df['disliked_movies'].apply(lambda x: [] if isinstance(x, float) else x)

def handle_nan(x):
    """Coerce a cell value into a list-like.

    Lists and NumPy arrays are returned unchanged, a null value (as judged by
    ``pd.isnull``) becomes an empty list, and any other value is wrapped in a
    one-element list.
    """
    if isinstance(x, (list, np.ndarray)):
        return x
    return [] if pd.isnull(x) else [x]

expanded_df['disliked_movies'] = expanded_df['disliked_movies'].apply(handle_nan)

# Adding user features
gender_ohe = OneHotEncoder()
gender_encoded = gender_ohe.fit_transform(users_df[['gender']]).toarray()

occupation_ohe = OneHotEncoder()
occupation_encoded = occupation_ohe.fit_transform(users_df[['occupation']]).toarray()

# Per user: one-hot gender, scaled age, one-hot occupation, in that order.
combined_features = np.hstack((gender_encoded, users_df[['age']].values, occupation_encoded))
users_df['user_feat'] = combined_features.tolist()

# Adding 'user_feat' to the 'expanded_df'
expanded_df['user_feat'] = expanded_df['userId'].map(users_df.set_index('userId')['user_feat'])
# # When 'userId' and 'movieId' are the common keys
# # When 'userId' and 'labels' are the merge keys
# expanded_df = expanded_df.merge(ratings_df[['userId', 'movieId', 'rating']],
#                                 left_on=['userId', 'labels'],
#                                 right_on=['userId', 'movieId'],
#                                 how='left').drop(columns=['movieId'])

# Rows of users that produced no window hold NaN in the exploded columns.
expanded_df = expanded_df.dropna()

# Keep users with more than 10 disliked and more than 15 liked movies.
# `.str.len()` is applied to the list cells and gives each list's length.
condition = (expanded_df['disliked_movies'].str.len() > 10) & (expanded_df['liked_movies'].str.len() > 15)
expanded_df = expanded_df[condition]

# Number of rows (windows) per user; reused for the statistics and the filter.
row_counts_per_user = expanded_df.groupby('userId').size()

# Compute summary statistics
statistics = row_counts_per_user.describe()

# Display the statistics
print(f"Max: {statistics['max']}")
print(f"75th percentile: {statistics['75%']}")
print(f"Median: {statistics['50%']}")
print(f"25th percentile: {statistics['25%']}")
print(f"Min: {statistics['min']}")

# Filter out users with fewer than MIN_ROWS_PER_USER rows
MIN_ROWS_PER_USER = 100
users_to_keep = row_counts_per_user[row_counts_per_user >= MIN_ROWS_PER_USER].index

# Keep only the rows with 'userId' in 'users_to_keep'. Take a copy so that
# later column assignments do not write into a slice of the unfiltered frame.
filtered_df = expanded_df[expanded_df['userId'].isin(users_to_keep)].copy()
expanded_df = filtered_df


# Users without any disliked movie carry NaN (a float) here; use an empty list
# so that len() works below.
users_df['disliked_movies'] = users_df['disliked_movies'].apply(lambda x: [] if isinstance(x, float) else x)

# handle_nan is already defined earlier in this cell and has already been
# applied to expanded_df['disliked_movies']; the identical redefinition and the
# second application that used to sit here changed nothing and were removed.


def _print_length_summary(title, lengths):
    """Print max, quartiles, median and min of `lengths` under the heading `title`."""
    print(title)
    print(f"Max: {lengths.max()}")
    print(f"75%: {lengths.quantile(0.75)}")
    print(f"Median: {lengths.median()}")
    print(f"25%: {lengths.quantile(0.25)}")
    print(f"Min: {lengths.min()}")


liked_movies_lengths = users_df['liked_movies'].apply(len)
disliked_movies_lengths = users_df['disliked_movies'].apply(len)

_print_length_summary("Liked Movies Lengths:", liked_movies_lengths)
_print_length_summary("\nDisliked Movies Lengths:", disliked_movies_lengths)


num_unique_users = expanded_df['userId'].nunique()
print(f"Number of unique users: {num_unique_users}")



  movies_df['title'] = movies_df['title'].str.rsplit(' (', 1).str[0]


Max: 499.0
75th percentile: 186.0
Median: 116.0
25th percentile: 68.0
Min: 6.0
Liked Movies Lengths:
Max: 509
75%: 123.5
Median: 55.0
25%: 27.0
Min: 6

Disliked Movies Lengths:
Max: 558
75%: 21.0
Median: 9.0
25%: 4.0
Min: 0
Number of unique users: 243


In [None]:
expanded_df

Unnamed: 0,userId,age,gender,occupation,liked_movies,disliked_movies,watched_movies,watched_genres,watched_movie_age,liked_movie_age,windows,labels,labels_like,avg_liked_movie_age,user_feat
0,1,0.257576,0,0,"[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...","[0.3229842446709916, 0.24488486490340053, 0.16...","[0.3229842446709916, 0.24488486490340053, 0.16...","[168, 172, 165, 156, 166, 196, 187, 14, 250, 127]",181,1,0.020960,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,..."
1,1,0.257576,0,0,"[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...","[0.3229842446709916, 0.24488486490340053, 0.16...","[0.3229842446709916, 0.24488486490340053, 0.16...","[172, 165, 156, 166, 196, 187, 14, 250, 127, 181]",117,1,0.030940,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,..."
2,1,0.257576,0,0,"[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...","[0.3229842446709916, 0.24488486490340053, 0.16...","[0.3229842446709916, 0.24488486490340053, 0.16...","[165, 156, 166, 196, 187, 14, 250, 127, 181, 117]",109,1,0.032687,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,..."
3,1,0.257576,0,0,"[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...","[0.3229842446709916, 0.24488486490340053, 0.16...","[0.3229842446709916, 0.24488486490340053, 0.16...","[156, 166, 196, 187, 14, 250, 127, 181, 117, 109]",1,1,0.049583,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,..."
4,1,0.257576,0,0,"[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[168, 172, 165, 156, 166, 196, 187, 14, 250, 1...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...","[0.3229842446709916, 0.24488486490340053, 0.16...","[0.3229842446709916, 0.24488486490340053, 0.16...","[166, 196, 187, 14, 250, 127, 181, 117, 109, 1]",246,1,0.023526,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73098,943,0.227273,0,5,"[64, 181, 28, 174, 100, 127, 508, 763, 50, 475...","[1067, 412, 1028, 756, 1047, 284, 831, 1011, 1...","[64, 181, 28, 174, 100, 1067, 127, 508, 763, 5...","[9, 18, 16, 4, 11, 17, 12, 19, 15, 7, 14, 20, ...","[0.06259356954444996, 0.02095957795679758, 0.0...","[0.06259356954444996, 0.02095957795679758, 0.0...","[722, 38, 765, 1074, 1188, 41, 1228, 373, 796,...",211,1,0.375062,"[1.0, 0.0, 0.22727272727272727, 0.0, 0.0, 0.0,..."
73099,943,0.227273,0,5,"[64, 181, 28, 174, 100, 127, 508, 763, 50, 475...","[1067, 412, 1028, 756, 1047, 284, 831, 1011, 1...","[64, 181, 28, 174, 100, 1067, 127, 508, 763, 5...","[9, 18, 16, 4, 11, 17, 12, 19, 15, 7, 14, 20, ...","[0.06259356954444996, 0.02095957795679758, 0.0...","[0.06259356954444996, 0.02095957795679758, 0.0...","[38, 765, 1074, 1188, 41, 1228, 373, 796, 237,...",469,1,0.075604,"[1.0, 0.0, 0.22727272727272727, 0.0, 0.0, 0.0,..."
73100,943,0.227273,0,5,"[64, 181, 28, 174, 100, 127, 508, 763, 50, 475...","[1067, 412, 1028, 756, 1047, 284, 831, 1011, 1...","[64, 181, 28, 174, 100, 1067, 127, 508, 763, 5...","[9, 18, 16, 4, 11, 17, 12, 19, 15, 7, 14, 20, ...","[0.06259356954444996, 0.02095957795679758, 0.0...","[0.06259356954444996, 0.02095957795679758, 0.0...","[765, 1074, 1188, 41, 1228, 373, 796, 237, 133...",525,1,0.687531,"[1.0, 0.0, 0.22727272727272727, 0.0, 0.0, 0.0,..."
73101,943,0.227273,0,5,"[64, 181, 28, 174, 100, 127, 508, 763, 50, 475...","[1067, 412, 1028, 756, 1047, 284, 831, 1011, 1...","[64, 181, 28, 174, 100, 1067, 127, 508, 763, 5...","[9, 18, 16, 4, 11, 17, 12, 19, 15, 7, 14, 20, ...","[0.06259356954444996, 0.02095957795679758, 0.0...","[0.06259356954444996, 0.02095957795679758, 0.0...","[1074, 1188, 41, 1228, 373, 796, 237, 1330, 15...",197,1,0.414130,"[1.0, 0.0, 0.22727272727272727, 0.0, 0.0, 0.0,..."


In [None]:
# Keep a copy of the per-example columns and expose avg_liked_movie_age under
# the name example_age.
feature_columns = ['userId', 'windows', 'disliked_movies', 'watched_genres', 'labels', 'user_feat']
selected_df = expanded_df.loc[:, feature_columns].copy()
selected_df['example_age'] = expanded_df['avg_liked_movie_age']
selected_df

Unnamed: 0,userId,windows,disliked_movies,watched_genres,labels,user_feat,example_age
0,1,"[168, 172, 165, 156, 166, 196, 187, 14, 250, 127]","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...",181,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,...",0.020960
1,1,"[172, 165, 156, 166, 196, 187, 14, 250, 127, 181]","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...",117,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,...",0.030940
2,1,"[165, 156, 166, 196, 187, 14, 250, 127, 181, 117]","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...",109,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,...",0.032687
3,1,"[156, 166, 196, 187, 14, 250, 127, 181, 117, 109]","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...",1,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,...",0.049583
4,1,"[166, 196, 187, 14, 250, 127, 181, 117, 109, 1]","[245, 260, 264, 126, 237, 11, 8, 143, 94, 145,...","[18, 11, 12, 10, 5, 8, 9, 19, 15, 7, 14, 20, 1...",246,"[1.0, 0.0, 0.25757575757575757, 1.0, 0.0, 0.0,...",0.023526
...,...,...,...,...,...,...,...
72313,932,"[389, 709, 228, 566, 222, 663, 161, 1065, 617,...","[890, 640, 64, 841, 235, 385, 550, 541, 679, 1...","[9, 16, 18, 11, 4, 17, 12, 19, 15, 13, 7, 14, ...",528,"[1.0, 0.0, 0.7727272727272727, 0.0, 0.0, 0.0, ...",0.192807
72315,932,"[228, 566, 222, 663, 161, 1065, 617, 167, 1411...","[890, 640, 64, 841, 235, 385, 550, 541, 679, 1...","[9, 16, 18, 11, 4, 17, 12, 19, 15, 13, 7, 14, ...",491,"[1.0, 0.0, 0.7727272727272727, 0.0, 0.0, 0.0, ...",0.791687
72318,932,"[663, 161, 1065, 617, 167, 1411, 708, 121, 676...","[890, 640, 64, 841, 235, 385, 550, 541, 679, 1...","[9, 16, 18, 11, 4, 17, 12, 19, 15, 13, 7, 14, ...",506,"[1.0, 0.0, 0.7727272727272727, 0.0, 0.0, 0.0, ...",0.570364
72320,932,"[1065, 617, 167, 1411, 708, 121, 676, 399, 103...","[890, 640, 64, 841, 235, 385, 550, 541, 679, 1...","[9, 16, 18, 11, 4, 17, 12, 19, 15, 13, 7, 14, ...",517,"[1.0, 0.0, 0.7727272727272727, 0.0, 0.0, 0.0, ...",0.257895


In [None]:



import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


class CandidateGeneration(nn.Module):
    """Network that maps a user's context to a vector in movie-embedding space.

    The forward pass averages the embeddings of the movies in the window and of
    the watched genres, concatenates them with the user feature vector and one
    age value, and passes the result through three dense layers with batch
    normalisation. ``loss`` scores that vector against a target movie and a set
    of negative movies by dot product.

    Args:
        n_movies: number of movies; the embedding table has ``n_movies + 1``
            rows, row 0 being the padding row.
        n_genres: number of rows of the genre embedding table.
        embedding_dim: size of the movie embeddings and of the output vector.
    """

    def __init__(self, n_movies, n_genres, embedding_dim):
        super().__init__()

        self.genre_embedding_dim = 16
        # Width of the user feature vector expected by dense_1.
        self.user_embedding_dim = 24
        self.n_movies = n_movies
        # Sub-modules are not moved to a device here; move the whole model with
        # `.to(device)` after construction, which covers every layer.
        self.movie_embedding = nn.Embedding(n_movies + 1, embedding_dim, padding_idx=0)
        self.genre_embedding = nn.Embedding(n_genres, self.genre_embedding_dim)
        # Despite the attribute names these are LayerNorm layers.
        self.l2_norm = nn.LayerNorm(embedding_dim)
        self.l2_norm2 = nn.LayerNorm(self.genre_embedding_dim)

        # Input: window embedding + genre embedding + user features + 1 age value.
        self.dense_1 = nn.Linear(embedding_dim + self.genre_embedding_dim + self.user_embedding_dim + 1, 512)
        self.batch_norm_1 = nn.BatchNorm1d(512)
        self.dense_2 = nn.Linear(512, 256)
        self.batch_norm_2 = nn.BatchNorm1d(256)
        self.dense_3 = nn.Linear(256, 128)
        self.batch_norm_3 = nn.BatchNorm1d(128)
        self.dense_output = nn.Linear(128, embedding_dim)

    def forward(self, windows, watched_genres, user_feat, watched_movie_age):
        """Return one ``embedding_dim``-sized vector per example.

        Args:
            windows: integer movie ids, one row of ids per example.
            watched_genres: integer genre ids, one row of ids per example.
            user_feat: user feature vectors; dense_1 expects 24 values per row.
            watched_movie_age: one value per example (a single column).
        """
        # Normalise each movie embedding, then average over the window.
        windows_embedded = self.movie_embedding(windows)
        windows_embedded = self.l2_norm(windows_embedded)
        windows_embedded = windows_embedded.mean(dim=1)

        # Same treatment for the genre ids.
        watched_genres = self.genre_embedding(watched_genres)
        watched_genres = self.l2_norm2(watched_genres)
        watched_genres = watched_genres.mean(dim=1)

        windows_embedded = windows_embedded.float()
        watched_genres = watched_genres.float()
        watched_movie_age = watched_movie_age.float()
        user_feat = user_feat.float()

        # concatenate all the embeddings
        x = torch.cat([windows_embedded, watched_genres, user_feat, watched_movie_age], dim=1)
        # Pass through dense layers with ReLU and Batch Norm
        x = torch.relu(self.batch_norm_1(self.dense_1(x)))
        x = torch.relu(self.batch_norm_2(self.dense_2(x)))
        x = torch.relu(self.batch_norm_3(self.dense_3(x)))

        # Output layer
        user_embedding = self.dense_output(x)
        return user_embedding

    def loss(self, user_embedding, target_movies, negative_samples):
        """Cross-entropy of picking the target movie among target + negatives.

        Args:
            user_embedding: ``(batch, embedding_dim)`` output of ``forward``.
            target_movies: one movie id per example, given as ``(batch,)`` or
                ``(batch, 1)``.
            negative_samples: ``(batch, n_negatives)`` movie ids.

        Returns:
            Scalar loss tensor.
        """
        batch = user_embedding.shape[0]
        # Normalise the target ids to (batch, 1) so target and negatives are
        # scored the same way.
        target_movies = target_movies.reshape(batch, -1)

        target_movie_embeddings = self.movie_embedding(target_movies)       # (batch, 1, dim)
        negative_movie_embeddings = self.movie_embedding(negative_samples)  # (batch, n_neg, dim)

        # Multiplying the (batch, dim) user vectors directly with the
        # (batch, 1, dim) target embeddings broadcasts to (batch, batch, dim)
        # and produced `dim` meaningless "target" logits per example. Add the
        # candidate axis first so each example is scored against its own target.
        query = user_embedding.unsqueeze(1)                                 # (batch, 1, dim)
        target_scores = (query * target_movie_embeddings).sum(dim=-1)       # (batch, 1)
        negative_scores = (query * negative_movie_embeddings).sum(dim=-1)   # (batch, n_neg)

        # Column 0 holds the target, so the correct class is always 0.
        # NOTE(review): zero-padded negatives use the all-zero padding row and
        # therefore enter the softmax with a score of 0.
        logits = torch.cat([target_scores, negative_scores], dim=1)
        targets = torch.zeros(logits.shape[0], dtype=torch.long, device=logits.device)

        loss_fn = nn.CrossEntropyLoss()
        loss = loss_fn(logits, targets)

        return loss

cpu


In [None]:

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.model_selection import GroupShuffleSplit, KFold
from torch.optim.lr_scheduler import ReduceLROnPlateau
from copy import deepcopy

class MovieDataset(Dataset):
    """Dataset yielding one fixed-length training example per DataFrame row.

    Args:
        user_df: DataFrame providing the columns 'windows', 'watched_genres',
            'disliked_movies', 'userId', 'labels', 'labels_like',
            'avg_liked_movie_age' and 'user_feat'.
        window_size: length the 'windows' sequence is padded/truncated to;
            defaults to the notebook-level WINDOW_SIZE.
        n_negatives: length the 'disliked_movies' sequence is padded/truncated to.
        n_genre_slots: length the 'watched_genres' sequence is padded/truncated to.
    """

    def __init__(self, user_df, window_size=None, n_negatives=10, n_genre_slots=19):
        self.user_df = user_df
        # Only fall back to the global when no explicit size is given.
        self.window_size = WINDOW_SIZE if window_size is None else window_size
        self.n_negatives = n_negatives
        self.n_genre_slots = n_genre_slots

    def __len__(self):
        return len(self.user_df)

    @staticmethod
    def _pad_or_truncate(tensor, max_length):
        """Return `tensor` cut to, or zero-padded at the end up to, `max_length`."""
        if len(tensor) < max_length:
            padding = torch.zeros(max_length - len(tensor), *tensor.shape[1:], dtype=tensor.dtype)
            return torch.cat((tensor, padding))
        return tensor[:max_length]

    def __getitem__(self, idx):
        """Return the tensors for row `idx`.

        Returns:
            Tuple ``(user_id, windows, watched_genres, watched_movie_age,
            user_feat, labels, labels_like, disliked_movies)``.

        Raises:
            ValueError: if the row's 'windows' value cannot be converted to an
                integer tensor.
        """
        user = self.user_df.iloc[idx]
        try:
            windows = torch.tensor(user['windows'], dtype=torch.long)
        except (TypeError, ValueError, RuntimeError) as err:
            # Printing and carrying on left `windows` undefined and failed a
            # few lines later with an unrelated error; fail here with context.
            raise ValueError(
                f"Row {idx}: cannot convert 'windows' to a tensor: {user['windows']!r}"
            ) from err
        watched_genres = torch.tensor(user['watched_genres'], dtype=torch.long)
        disliked_movies = torch.tensor(user['disliked_movies'], dtype=torch.long)

        windows = self._pad_or_truncate(windows, self.window_size)
        disliked_movies = self._pad_or_truncate(disliked_movies, self.n_negatives)
        watched_genres = self._pad_or_truncate(watched_genres, self.n_genre_slots)

        user_id = torch.tensor([user['userId']])
        labels = torch.tensor([user['labels']])
        labels_like = torch.tensor([user['labels_like']])
        # The value stored as 'avg_liked_movie_age' is returned under the
        # watched_movie_age slot of the tuple.
        watched_movie_age = torch.tensor([user['avg_liked_movie_age']])
        user_feat = torch.tensor(user['user_feat'])
        return user_id, windows, watched_genres, watched_movie_age, user_feat, labels, labels_like, disliked_movies





def split_data(df, test_ratio=0.05):
    """Split every user's rows into a leading train part and a trailing test part.

    For each 'userId' group the first ``int((1 - test_ratio) * len(group))``
    rows go to the training frame and the rest to the test frame.

    Returns:
        ``(train_df, test_df)``, each the concatenation of the per-user parts.
    """
    head_parts, tail_parts = [], []

    for _, user_rows in df.groupby('userId'):
        boundary = int((1.0 - test_ratio) * len(user_rows))
        head_parts.append(user_rows.iloc[:boundary])
        tail_parts.append(user_rows.iloc[boundary:])

    return pd.concat(head_parts), pd.concat(tail_parts)



# Hyperparameters
embedding_dim = 128
batch_size = 32
epochs = 100
lr = 0.01
max_seq_length = 400

# Sizes taken from the preprocessed data
n_movies = len(movies_df)
n_genres = len(genres)
n_genders = 2
n_occupations = len(occupations)


# Per-user split: leading rows train, trailing rows test
train_df, test_df = split_data(expanded_df)
train_dataset, test_dataset = MovieDataset(train_df), MovieDataset(test_df)

# Only the training batches are shuffled
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Model and optimizer
model = CandidateGeneration(n_movies, n_genres, embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Snapshot of the movie embedding table per epoch, filled by the training loop
saved_weights = {}

# Training loop
for epoch in range(epochs):
    # The evaluation pass at the end of each epoch switches to eval mode, so
    # training mode has to be re-enabled here (BatchNorm then uses batch
    # statistics and updates its running averages).
    model.train()
    total_loss = 0

    for batch_num, (user_ids, windows, watched_genres, watched_movie_age, user_feat, labels, labels_like, disliked_movies) in enumerate(train_dataloader, 1):
        # Move data to the selected device
        user_ids, windows, watched_genres, watched_movie_age, user_feat, labels, disliked_movies = user_ids.to(device), windows.to(device), watched_genres.to(device), watched_movie_age.to(device), user_feat.to(device), labels.to(device), disliked_movies.to(device)

        optimizer.zero_grad()

        output = model(windows, watched_genres, user_feat, watched_movie_age)
        # The user's disliked movies serve as the negatives.
        loss = model.loss(output, labels, disliked_movies)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if batch_num % 100 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Batch {batch_num}/{len(train_dataloader)}, Batch Loss: {loss.item()}")

    epoch_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {epoch_loss}")
    # Keep a copy of the embedding table to report which rows moved this epoch.
    saved_weights[epoch] = model.movie_embedding.weight.detach().clone()
    if epoch > 0:
        # Find the rows that have changed
        changed_rows = (saved_weights[epoch] != saved_weights[epoch-1]).any(dim=1)
        # Print the indices of the changed rows
        print(f"Changed rows in epoch {epoch}: {changed_rows.nonzero(as_tuple=True)[0]}")
        print(model.movie_embedding.weight)

    # Testing loop. Evaluate in eval mode: in training mode BatchNorm would
    # normalise with the test batch's own statistics and fold them into the
    # running averages.
    model.eval()
    total_test_loss = 0
    with torch.no_grad():
        for batch_num, (user_ids, windows, watched_genres, watched_movie_age, user_feat, labels, labels_like, disliked_movies) in enumerate(test_dataloader, 1):
            # Move data to the selected device
            user_ids, windows, watched_genres, watched_movie_age, user_feat, labels, disliked_movies = user_ids.to(device), windows.to(device), watched_genres.to(device), watched_movie_age.to(device), user_feat.to(device), labels.to(device), disliked_movies.to(device)

            output = model(windows, watched_genres, user_feat, watched_movie_age)
            loss = model.loss(output, labels, disliked_movies)
            total_test_loss += loss.item()

    average_test_loss = total_test_loss / len(test_dataloader)
    print(f"Test Loss: {average_test_loss}")




Epoch 1/100, Batch 100/1338, Batch Loss: 4.4243621826171875
Epoch 1/100, Batch 200/1338, Batch Loss: 4.864561080932617
Epoch 1/100, Batch 300/1338, Batch Loss: 2.8402206897735596
Epoch 1/100, Batch 400/1338, Batch Loss: 1.9792455434799194
Epoch 1/100, Batch 500/1338, Batch Loss: 1.8518508672714233
Epoch 1/100, Batch 600/1338, Batch Loss: 0.6283324956893921
Epoch 1/100, Batch 700/1338, Batch Loss: 0.0013867157977074385
Epoch 1/100, Batch 800/1338, Batch Loss: 0.004879412706941366
Epoch 1/100, Batch 900/1338, Batch Loss: 0.6647571325302124
Epoch 1/100, Batch 1000/1338, Batch Loss: 0.021728891879320145
Epoch 1/100, Batch 1100/1338, Batch Loss: 0.0007691372884437442
Epoch 1/100, Batch 1200/1338, Batch Loss: 0.000269660959020257
Epoch 1/100, Batch 1300/1338, Batch Loss: 8.648981747683138e-05
Epoch 1/100, Average Loss: 2.072298743354871
Test Loss: 0.03190034653112283
Epoch 2/100, Batch 100/1338, Batch Loss: 0.00010571196617092937
Epoch 2/100, Batch 200/1338, Batch Loss: 0.0006189148407429457

KeyboardInterrupt: ignored

In [None]:


# Initialize DataLoader for testing (one example per step)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Initialize variables for mAP
mAP_N = 0
rmAP_N = 0
display_freq = 100
top_N_size = 100  # Specify the maximum size for top_N
# Every N at which precision is accumulated.
cutoffs = range(10, top_N_size + 1)

# Per user: (number of liked movies, set of liked movies), filled on first use
# so the DataFrame is filtered once per user rather than once per example.
liked_cache = {}

# Evaluate in eval mode. In training mode BatchNorm normalises with the batch's
# own statistics; feeding one example repeated batch_size times gives zero
# variance, so the first normalised layer outputs the same values for every
# input and all users end up with the same embedding.
was_training = model.training
model.eval()

with torch.no_grad():
    # Iterate over all examples in the test set
    for batch_num, (user_ids, windows, watched_genres, watched_movie_age, user_feat, labels, labels_like, disliked_movies) in enumerate(test_dataloader, 1):
        user_ids, windows, watched_genres, watched_movie_age, user_feat, labels, disliked_movies = user_ids.to(device), windows.to(device), watched_genres.to(device), watched_movie_age.to(device), user_feat.to(device), labels.to(device), disliked_movies.to(device)

        # Get the user embedding from the model
        user_embedding = model(windows, watched_genres, user_feat, watched_movie_age)

        # Compute scores for all movies
        all_movie_scores = torch.matmul(user_embedding, model.movie_embedding.weight.T)
        # Row 0 of the embedding table is the padding row, not a movie.
        all_movie_scores[:, 0] = float('-inf')

        # Sort the movie IDs by their scores in descending order
        sorted_movie_ids = all_movie_scores.argsort(descending=True)
        ranked_movie_ids = sorted_movie_ids[0].tolist()

        # Prepare liked movie data for this example
        user_id = user_ids.item()  # get the user_id as a single number, not a list
        if user_id not in liked_cache:
            liked_movies = expanded_df[expanded_df['userId'] == user_id]['liked_movies'].values[0]
            liked_cache[user_id] = (len(liked_movies), set(liked_movies))
        n_positive, liked_lookup = liked_cache[user_id]

        n_table_rows = len(model.movie_embedding.weight)

        for top_N in cutoffs:
            # Get the top N movie IDs from the sorted list
            top_movie_ids = ranked_movie_ids[:top_N]

            # Find the true positive movies
            true_positive = [movie_id for movie_id in top_movie_ids if movie_id in liked_lookup]

            # Random baseline: N ids drawn from the real movie ids (1 and up)
            random_movie_ids = torch.randint(1, n_table_rows, (top_N,)).tolist()
            random_positive = [movie_id for movie_id in random_movie_ids if movie_id in liked_lookup]

            if top_N % display_freq == 0:
                print(f'top N  = {top_N} -------------------\n')
                print(f'model  = Rank {top_N} Recall    : {len(true_positive)/n_positive} ({len(true_positive)}/{n_positive})')
                print(f'model  = Rank {top_N} Precision : {len(true_positive)/top_N} ({len(true_positive)}/{top_N})\n')
                print(f'random = Rank {top_N} Recall    : {len(random_positive)/n_positive} ({len(random_positive)}/{n_positive})')
                print(f'random = Rank {top_N} Precision : {len(random_positive)/top_N} ({len(random_positive)}/{top_N})\n')

            # Add to the mAP
            mAP_N += len(true_positive) / top_N
            rmAP_N += len(random_positive) / top_N

# Put the model back into the mode it was in before this cell ran.
model.train(was_training)

# Average over the cutoffs that were actually accumulated (10..top_N_size),
# not over top_N_size of them.
print(f'mAP@{top_N_size} = {mAP_N/(len(cutoffs) * len(test_dataset))}')
print(f'rmAP@{top_N_size} = {rmAP_N/(len(cutoffs) * len(test_dataset))}')



[Streaming output truncated: the output was too long, so the last 5000 lines were deleted.]
model  = Rank 100 Recall    : 0.11258278145695365 (17/151)
model  = Rank 100 Precision : 0.17 (17/100)

random = Rank 100 Recall    : 0.039735099337748346 (6/151)
random = Rank 100 Precision : 0.06 (6/100)

top N  = 100 -------------------

model  = Rank 100 Recall    : 0.09933774834437085 (15/151)
model  = Rank 100 Precision : 0.15 (15/100)

random = Rank 100 Recall    : 0.046357615894039736 (7/151)
random = Rank 100 Precision : 0.07 (7/100)

top N  = 100 -------------------

model  = Rank 100 Recall    : 0.08609271523178808 (13/151)
model  = Rank 100 Precision : 0.13 (13/100)

random = Rank 100 Recall    : 0.052980132450331126 (8/151)
random = Rank 100 Precision : 0.08 (8/100)

top N  = 100 -------------------

model  = Rank 100 Recall    : 0.08609271523178808 (13/151)
model  = Rank 100 Precision : 0.13 (13/100)

random = Rank 100 Recall    : 0.059602649006622516 (9/151)
random = Rank 100 Precision : 0.09 (9/100)

top N  = 100 -------