<a href="https://colab.research.google.com/github/codesongs/codestates_TP2/blob/main/AutoEncoder_%EC%B6%94%EC%B2%9C%EC%8B%9C%EC%8A%A4%ED%85%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime so the dataset files can be read.
drive.mount('/content/drive')

# Directory handed to the load_* helpers below; they read their .dat files from it.
path = '/content/drive/MyDrive/tp2'

Mounted at /content/drive


In [None]:
# Load the data
import os
import pandas as pd

def load_ratings(path):
    """Read ``ratings.dat`` located in ``path`` into a DataFrame.

    The file is parsed as header-less text whose fields are separated by
    ``::``; the columns are named userId, movieId, rating and timestamp.

    :param path: directory that contains ``ratings.dat``
    :return: pandas DataFrame with one row per line of the file
    """
    ratings_file = os.path.join(path, "ratings.dat")
    return pd.read_csv(
        ratings_file,
        sep='::',
        header=None,
        engine='python',
        names=['userId', 'movieId', 'rating', 'timestamp'],
    )

def load_movies(path):
    """Read ``movies.dat`` located in ``path`` into a DataFrame.

    The file is decoded as ISO-8859-1 and parsed as header-less text whose
    fields are separated by ``::``; the columns are named movieId, title
    and genres.

    :param path: directory that contains ``movies.dat``
    :return: pandas DataFrame with one row per line of the file
    """
    movies_file = os.path.join(path, "movies.dat")
    return pd.read_csv(
        movies_file,
        sep='::',
        header=None,
        engine='python',
        names=['movieId', 'title', 'genres'],
        encoding='ISO-8859-1',
    )

def load_users(path):
    """Read ``users.dat`` located in ``path`` into a DataFrame.

    The file is parsed as header-less text whose fields are separated by
    ``::``; the columns are named userId, gender, age, Occupation and
    zip_code.

    :param path: directory that contains ``users.dat``
    :return: pandas DataFrame with one row per line of the file
    """
    users_file = os.path.join(path, "users.dat")
    return pd.read_csv(
        users_file,
        sep='::',
        header=None,
        engine='python',
        names=['userId', 'gender', 'age', 'Occupation', 'zip_code'],
    )

In [None]:
# df_ratings = load_ratings('/content')
# df_movies = load_movies('/content')
# df_users = load_users('/content')

In [None]:
# Load the three raw tables (ratings, movies, users) from the directory in `path`.
df_ratings = load_ratings(path)
df_movies = load_movies(path)
df_users = load_users(path)

In [None]:
# Attach movie metadata (joined on 'movieId') and user attributes (joined on
# 'userId') to every rating row; left joins keep all rating rows.
df_total = (
    df_ratings
    .merge(df_movies, on='movieId', how='left')
    .merge(df_users, on='userId', how='left')
)

# Show the merged frame.
df_total

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,gender,age,Occupation,zip_code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,Weekend at Bernie's (1989),Comedy,M,25,6,11106
1000205,6040,1094,5,956704887,"Crying Game, The (1992)",Drama|Romance|War,M,25,6,11106
1000206,6040,562,5,956704746,Welcome to the Dollhouse (1995),Comedy|Drama,M,25,6,11106
1000207,6040,1096,4,956715648,Sophie's Choice (1982),Drama,M,25,6,11106


In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.nn.init as weight_init

class Encoder(nn.Module):
    """
    Symmetrical fully-connected autoencoder network.

    :param L: List of int, contains sizes of encoding layers and starts with feature size
    For example: [500, 20, 10] will result in:
      - encoder 2 layers: 500x20 and 20x10. Representation layer (z) will be 10
      - decoder 2 layers: 10x20 and 20x500, output size is 500, reconstructed.
    :param activation_fn: (default 'sigmoid') Type of activation function;
      one of 'relu', 'lrelu' or 'sigmoid'
    :param drop_prob: (default: 0.0) Dropout probability, applied after each encoder layer
    :raises ValueError: if ``L`` holds fewer than two sizes (no layer can be built)
    """
    def __init__(self, L, activation_fn='sigmoid', drop_prob=0.0):
        super(Encoder, self).__init__()
        if len(L) < 2:
            raise ValueError('L must contain the feature size and at least one layer size')
        layers = self.create_nn_structure(L)
        # number of sizes in the spec; the encoder therefore has num_layers - 1 linear layers
        self.num_layers = len(L)
        # name of the activation applied after every layer except the last one
        self.activation_fn_nm = activation_fn
        # the dropout module is only created when it is actually used
        self._drop_prob = drop_prob
        if drop_prob > 0.0:
            self.dropout = nn.Dropout(drop_prob)
        # encoder layers followed by the mirrored decoder layers
        self.linears = nn.ModuleList([])
        self.linears.extend([nn.Linear(i[0], i[1]) for i in layers])

    def get_activation_fn(self):
        """Return a new activation module matching ``activation_fn_nm``.

        :raises ValueError: if the configured name is not 'relu', 'lrelu' or 'sigmoid'
        """
        if self.activation_fn_nm == 'relu':
            return nn.ReLU()
        elif self.activation_fn_nm == 'lrelu':
            return nn.LeakyReLU()
        elif self.activation_fn_nm == 'sigmoid':
            return nn.Sigmoid()
        else:
            raise ValueError('Activation function type not defined')

    def forward(self, x):
        """Encode then decode ``x``.

        Every linear layer except the last is followed by the activation;
        dropout (when enabled) follows the encoder layers only. The last
        decoding layer is applied without activation.
        """
        act_fn = self.get_activation_fn()
        # The bounds are derived from the actual layer list, so specs of any
        # depth work (the previous bound on len(L) was only right for 3 sizes).
        n_encoder_layers = self.num_layers - 1
        *hidden_layers, output_layer = self.linears
        for i, layer in enumerate(hidden_layers):
            x = act_fn(layer(x))
            if self._drop_prob > 0.0 and i < n_encoder_layers:
                # apply dropout only on encoder layers
                x = self.dropout(x)
        # No activation on the last decoding layer
        return output_layer(x)

    def create_nn_structure(self, L):
        """Return the [in, out] size pairs of all layers: encoder, then mirrored decoder."""
        encoder_layers = [[L[i], L[i + 1]] for i in range(len(L) - 1)]
        # the decoder mirrors the encoder: reversed order, swapped in/out sizes
        decoder_layers = [layer[::-1] for layer in reversed(encoder_layers)]
        return encoder_layers + decoder_layers


In [None]:
import pandas as pd

def create_index_mapping(L):
    '''
    return reindexed dict on user and items
    encoded indices starts from 1 and follow the sorted order of the distinct
    values, so the mapping is reproducible across runs (iterating a raw set
    depends on hash order, which is not stable for strings)
    input:
    * L: list of hashable values (e.g. str or int); duplicates are allowed
    outputs:
    * ind_2_item,item_2_ind: tuple of dictionary
    '''
    try:
        unique_items = sorted(set(L))
    except TypeError:
        # values that cannot be ordered against each other: fall back to
        # first-appearance order, which is deterministic as well
        unique_items = list(dict.fromkeys(L))

    #index start from 1
    ind_2_item = dict(enumerate(unique_items, start=1))
    #invert the map
    item_2_ind = {v: k for k, v in ind_2_item.items()}
    return ind_2_item,item_2_ind

def reindexer(ratings_df,user_col,item_col,rating_col):
    '''
    Replace raw user/item identifiers by the indices from create_index_mapping.
    inputs:
    * ratings_df: pandas df containing ratings/affinity for user-item pairs
    * user_col: actual col name for users
    * item_col: actual col name for items
    * rating_col: actual col name for ratings
    output:
    * pandas df with the columns 'encoded_users', 'encoded_items', 'rating_col'
    '''
    # only the value -> index direction of each mapping is needed here
    _, user_2_ind = create_index_mapping(ratings_df[user_col].tolist())
    _, item_2_ind = create_index_mapping(ratings_df[item_col].tolist())

    # work on a renamed copy with fixed column names
    renamed = ratings_df.rename(columns={user_col:'user_col',
                                         item_col:'item_col',
                                         rating_col:'rating_col'})

    # look every raw id up in its mapping
    renamed['encoded_users'] = renamed['user_col'].apply(user_2_ind.__getitem__)
    renamed['encoded_items'] = renamed['item_col'].apply(item_2_ind.__getitem__)

    return renamed[['encoded_users','encoded_items','rating_col']]

# Rename the columns on a copy: `ratings = df_ratings` would only alias the
# frame, so assigning `.columns` would rename df_ratings itself and break a
# re-run of the earlier merge on 'movieId' / 'userId'.
ratings = df_ratings.copy()
ratings.columns = ['userid','itemid','rating','timestamp']
ratings_reindex = reindexer(ratings,'userid','itemid','rating')

# Split into train and test sets, stratified on the encoded user id
from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings_reindex,
                               stratify=ratings_reindex['encoded_users'],
                               test_size=0.2,
                               random_state=42)

training_set = np.array(train, dtype = 'int')
test_set = np.array(test, dtype = 'int')

# Largest encoded user / item id found in either split (encoded ids start at 1).
nb_users = int(max(training_set[:,0].max(), test_set[:,0].max()))
nb_movies = int(max(training_set[:,1].max(), test_set[:,1].max()))

# def convert(data):
#     new_data = []
#     for id_users in range(nb_users+1): # 유저id는 1부터 시작
#         # each user's watched movies
#         # data[:,0], first column, all rows column users
#         id_items = data[:,1][data[:,0] == id_users]
#         # each user's rating for that item
#         id_ratings = data[:,2][data[:,0] == id_users]
#         ratings = np.zeros(nb_movies)
#         # the positions of these items are filled with ratings, creating the matrix
#         ratings[id_items-1] = id_ratings
#         new_data.append(list(ratings))
#     return new_data

def convert(data, n_users=None, n_items=None):
    """Pivot (user, item, rating) rows into a dense user x item rating matrix.

    :param data: 2-D integer array whose columns are user id, item id and
        rating; both ids start from 1
    :param n_users: number of matrix rows (default: the global ``nb_users``)
    :param n_items: number of matrix columns (default: the global ``nb_movies``)
    :return: list of ``n_users`` lists with ``n_items`` floats each; entry
        ``[u - 1][i - 1]`` holds the rating of user ``u`` for item ``i`` and
        0.0 where no rating exists
    """
    if n_users is None:
        n_users = nb_users
    if n_items is None:
        n_items = nb_movies

    data = np.asarray(data)
    matrix = np.zeros((n_users, n_items))
    if data.size:
        users, items, values = data[:, 0], data[:, 1], data[:, 2]
        # rows whose user id lies outside 1..n_users are ignored
        in_range = (users >= 1) & (users <= n_users)
        # one vectorised scatter instead of rescanning the whole array per user
        matrix[users[in_range] - 1, items[in_range] - 1] = values[in_range]
    return matrix.tolist()

# Pivot each split into a dense user x item rating matrix and wrap it as a float tensor.
training_set = torch.FloatTensor(convert(training_set))
test_set = torch.FloatTensor(convert(test_set))

In [None]:
import time

autoencoder_network = Encoder([nb_movies,256,128],'sigmoid',0.1)
criterion = nn.MSELoss()
# criterion = nn.BPRLoss()
#optimizer = optim.RMSprop(autoencoder_network.parameters(), lr = 0.01, weight_decay = 0.5)
# NOTE(review): weight_decay=0.5 is a very strong L2 penalty - confirm it is intended.
optimizer = optim.Adam(autoencoder_network.parameters(), lr=0.01, weight_decay=0.5)

nb_epoch = 5
for epoch in range(1, nb_epoch + 1):
    start_time = time.time()
    autoencoder_network.train()  # make sure dropout is active while training

    train_loss = 0
    s = 0.
    # s is the number of users who rated at least 1 movie
    for id_user in range(nb_users):
        # one user per step, shaped as a batch of size 1
        user_ratings = training_set[id_user].unsqueeze(0)
        target = user_ratings.clone()
        n_rated = int(torch.sum(target > 0))
        if n_rated > 0:
            # gradients accumulate across backward() calls, so clear them
            # before every step; otherwise each update mixes in stale gradients
            optimizer.zero_grad()
            output = autoencoder_network(user_ratings)
            #output = 1 + 4 * output  # optional rescaling, disabled
            # only rated movies contribute to the loss
            output[target == 0] = 0
            loss = criterion(output, target)
            # MSELoss averages over all nb_movies entries; rescale so the
            # error is averaged over the rated movies only (epsilon guards the denominator)
            mean_corrector = nb_movies/float(n_rated + 1e-10)
            loss.backward()   # compute the gradients
            optimizer.step()  # update the weights
            # accumulate a plain float (RMSE over the rated movies)
            train_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.

    print('epoch: '+str(epoch)+' loss: '+ str(train_loss/s))
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken for epoch {epoch}: {elapsed_time:.2f} seconds")


test_loss = 0
s = 0.

res = []
targets = []

# averaged difference between real rating and predicted rating
# Evaluate with dropout switched off and without recording autograd graphs.
autoencoder_network.eval()
with torch.no_grad():
    for id_user in range(nb_users):
        user_ratings = training_set[id_user].unsqueeze(0) # the training ratings are the model input
        target = test_set[id_user].unsqueeze(0) # held-out ratings to compare the predictions against
        n_rated = int(torch.sum(target > 0))

        if n_rated > 0:
            # make predictions
            output = autoencoder_network(user_ratings)
            #output = 1 + 4 * output  # optional rescaling, disabled
            targets.append(target.numpy())
            # store a copy: the array would otherwise share memory with `output`
            # and be overwritten by the masking below
            res.append(output.numpy().copy())
            # entries without a held-out rating are forced to 0 so they add no error
            output[target == 0] = 0
            loss = criterion(output, target)

            # average the error over the movies rated in the test set only
            mean_corrector = nb_movies/float(n_rated + 1e-10)
            test_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.
print('test loss: '+str(test_loss/s))


#making top k recommendation
def make_top_k_recommendations(encoder,evidence,k,filter_seen=True):
    '''
    Score all items for every user and keep the k best scored ones.

    :param encoder: autoencoder instance
    :param evidence: full set of seen ratings from all users (2-D, users x items)
    :param k: top k items (by output score)
    :param filter_seen: (default True) filter controller to remove seen items from top k list
    :return: one list per user holding up to k (item_index, score) tuples,
        best score first; item_index is the column position in ``evidence``
    '''
    evidence = torch.as_tensor(evidence, dtype=torch.float32)
    final_itemsets = []

    # Score in eval mode (dropout off) so results are deterministic, then
    # restore whatever mode the caller had set.
    was_training = encoder.training
    encoder.eval()
    try:
        with torch.no_grad():
            for id_user in range(evidence.shape[0]):
                user_ratings = evidence[id_user].unsqueeze(0)
                scores = encoder(user_ratings)[0].clone()
                if filter_seen:
                    # -inf (rather than 0) keeps seen items behind every
                    # unseen item, even when unseen scores are negative
                    scores[evidence[id_user] != 0] = float('-inf')
                scores = scores.numpy()
                # stable sort on the negated scores: descending, ties by item index
                top_indices = np.argsort(-scores, kind='stable')[:k]
                final_itemsets.append([(int(i), scores[i]) for i in top_indices])
    finally:
        encoder.train(was_training)

    return final_itemsets

epoch: 1 loss: tensor(1.0966)
Time taken for epoch 1: 95.95 seconds
epoch: 2 loss: tensor(1.0084)
Time taken for epoch 2: 85.34 seconds
epoch: 3 loss: tensor(0.9969)
Time taken for epoch 3: 89.86 seconds
epoch: 4 loss: tensor(0.9884)
Time taken for epoch 4: 88.81 seconds
epoch: 5 loss: tensor(0.9805)
Time taken for epoch 5: 90.28 seconds
test loss: tensor(0.9752)


In [None]:
import time
import numpy as np
import torch
from torch.autograd import Variable
from torch import nn, optim

# Requires Encoder, nb_movies, nb_users, training_set and test_set to be defined already.

# Evaluation metric functions
def ndcg_at_k(r, k):
    """Compute NDCG@k for a single list of ratings. Higher is better.

    :param r: relevance scores in ranked order (first element = top result)
    :param k: number of leading results to evaluate
    :return: DCG of the first k scores divided by the DCG of those same
        scores sorted in descending order; 0.0 when there is nothing to
        evaluate or when the ideal DCG is 0
    """
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype is equivalent
    r = np.asarray(r, dtype=float)[:k]
    if r.size == 0:
        return 0.

    def _dcg(gains):
        # rank 1 is undiscounted, rank i >= 2 is divided by log2(i + 1)
        return gains[0] + np.sum(gains[1:] / np.log2(np.arange(3, gains.size + 2)))

    # ideal DCG: the same gains in the best possible order
    ideal_dcg_val = _dcg(np.sort(r)[::-1])
    if ideal_dcg_val == 0:
        return 0.

    return _dcg(r) / ideal_dcg_val

def recall_at_k(predicted, ground_truth, k):
    """Compute recall@k. Higher is better.

    :param predicted: recommended items in ranked order; only the first k are counted
    :param ground_truth: collection of relevant (hashable) items
    :param k: cut-off rank
    :return: hits among the first k predictions divided by
        ``min(k, len(ground_truth))``; NaN when ``ground_truth`` is empty or
        ``k`` is not positive
    """
    if len(ground_truth) == 0 or k <= 0:
        return np.nan
    relevant = set(ground_truth)  # constant-time membership tests
    # cut to k so that a longer prediction list cannot push recall above 1
    hits = sum(1 for item in list(predicted)[:k] if item in relevant)
    return hits / min(k, len(ground_truth))


# Fresh model (layer sizes nb_movies -> 128 -> 64 and back), loss and optimizer for this run.
autoencoder_network = Encoder(L=[nb_movies, 128, 64], activation_fn='sigmoid', drop_prob=0.1)
criterion = nn.MSELoss()
optimizer = optim.Adam(
    params=autoencoder_network.parameters(),
    lr=0.001,
    weight_decay=0.5,
)

def make_top_k_recommendations(encoder, evidence, k, filter_seen=False):
    """Return the indices of the k highest scored items for every user.

    :param encoder: autoencoder instance used to score the items
    :param evidence: ratings as a NumPy array or tensor; either one user
        (1-D) or users x items (2-D)
    :param k: number of items to keep per user
    :param filter_seen: (default False) when True, items with a non-zero
        entry in ``evidence`` are ranked behind all other items
    :return: list with one NumPy array of item indices per user, best score first
    """
    evidence = torch.as_tensor(evidence, dtype=torch.float32)
    # Check if evidence is 1D (i.e., data for one user)
    if evidence.dim() == 1:
        evidence = evidence.reshape(1, -1)

    res = []
    # Score in eval mode (dropout off) so results are deterministic, then
    # restore whatever mode the caller had set.
    was_training = encoder.training
    encoder.eval()
    try:
        with torch.no_grad():
            for id_user in range(evidence.shape[0]):
                encoder_input = evidence[id_user].unsqueeze(0)
                scores = encoder(encoder_input)[0].clone().numpy()
                if filter_seen:
                    scores[evidence[id_user].numpy() != 0] = -np.inf

                # Get top-k recommendations; reversing before slicing also
                # yields an empty result for k == 0 (a [-0:] slice would return everything)
                top_k_indices = scores.argsort()[::-1][:k]

                res.append(top_k_indices)
    finally:
        encoder.train(was_training)

    return res

nb_epoch = 10
k = 20
# epochs are numbered 1..nb_epoch so exactly nb_epoch passes are run
for epoch in range(1, nb_epoch + 1):
    start_time = time.time()
    train_loss = 0
    s = 0.

    autoencoder_network.train()  # re-enable dropout after the evaluation below
    for id_user in range(nb_users):
        user_ratings = training_set[id_user].unsqueeze(0)
        target = user_ratings.clone()
        n_rated = int(torch.sum(target > 0))
        if n_rated > 0:
            # gradients accumulate across backward() calls; clear them before each step
            optimizer.zero_grad()
            output = autoencoder_network(user_ratings)
            # only rated movies contribute to the loss
            output[target == 0] = 0
            loss = criterion(output, target)
            # rescale the mean over all movies to a mean over the rated ones
            mean_corrector = nb_movies/float(n_rated + 1e-10)
            loss.backward()
            optimizer.step()
            train_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1.

    print('epoch: '+str(epoch)+' loss: '+ str(train_loss/s))
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Time taken for epoch {epoch}: {elapsed_time:.2f} seconds")

    all_ndcgs, all_recalls = [], []

    # Ranking metrics on the held-out ratings: the training ratings are the
    # model input, already rated movies are excluded from the ranking, and
    # the movies rated in the test set are the ground truth.
    autoencoder_network.eval()
    with torch.no_grad():
        for id_user in range(nb_users):
            held_out = np.where(test_set[id_user].numpy() > 0)[0]
            if held_out.size == 0:
                # nothing to evaluate for this user
                continue
            held_out_set = set(held_out.tolist())

            scores = autoencoder_network(training_set[id_user].unsqueeze(0))[0].numpy()
            scores[training_set[id_user].numpy() > 0] = -np.inf

            # get the top-k recommendations
            predicted_indices = scores.argsort()[::-1][:k]
            ndcg_val = ndcg_at_k([int(int(idx) in held_out_set) for idx in predicted_indices], k)
            recall_val = recall_at_k(predicted_indices, held_out, k)

            all_ndcgs.append(ndcg_val)
            all_recalls.append(recall_val)

    mean_ndcg = np.mean(all_ndcgs)
    mean_recall = np.nanmean(all_recalls)

    print(f"Epoch {epoch} - mean NDCG@{k} on test set: {mean_ndcg:.4f} - mean Recall@{k} on test set: {mean_recall:.4f}")


epoch: 0 loss: tensor(1.0959)
Time taken for epoch 0: 66.71 seconds
Epoch 0 - mean NDCG@20 on test set: 0.3339 - mean Recall@20 on test set: 0.1286
epoch: 1 loss: tensor(1.0047)
Time taken for epoch 1: 47.38 seconds
Epoch 1 - mean NDCG@20 on test set: 0.2924 - mean Recall@20 on test set: 0.1002
epoch: 2 loss: tensor(0.9968)
Time taken for epoch 2: 45.74 seconds
Epoch 2 - mean NDCG@20 on test set: 0.2631 - mean Recall@20 on test set: 0.0801
epoch: 3 loss: tensor(0.9902)
Time taken for epoch 3: 47.73 seconds
Epoch 3 - mean NDCG@20 on test set: 0.2555 - mean Recall@20 on test set: 0.0757
epoch: 4 loss: tensor(0.9853)
Time taken for epoch 4: 47.28 seconds
Epoch 4 - mean NDCG@20 on test set: 0.2666 - mean Recall@20 on test set: 0.0757
epoch: 5 loss: tensor(0.9817)
Time taken for epoch 5: 46.51 seconds
Epoch 5 - mean NDCG@20 on test set: 0.2944 - mean Recall@20 on test set: 0.0799
epoch: 6 loss: tensor(0.9787)
Time taken for epoch 6: 48.99 seconds
Epoch 6 - mean NDCG@20 on test set: 0.2767 -

In [None]:
def recommend_for_user(encoder, user_id, top_k=20, index_to_movie_id=None):
    """Build a table of the top_k unseen movies for one user.

    :param encoder: autoencoder instance used to predict ratings
    :param user_id: row index into the global ``training_set`` (0-based row,
        not the original 1-based userId)
    :param top_k: number of recommendations to return
    :param index_to_movie_id: optional mapping from a rating-matrix column
        index to the original movieId. When omitted the column index itself
        is used as movieId (the previous behaviour).
        NOTE(review): columns are encoded item ids minus 1, which differ from
        the original movieIds, so titles looked up without this mapping are
        presumably wrong - confirm and pass a mapping.
    :return: DataFrame with the columns movieId, title and predicted_rating,
        best prediction first; unknown ids get the title "Unknown Movie"
    """
    # Predict with dropout switched off so repeated calls give the same
    # result, then restore the caller's mode.
    was_training = encoder.training
    encoder.eval()
    try:
        with torch.no_grad():
            input_data = training_set[user_id].unsqueeze(0)
            predicted_ratings = encoder(input_data).numpy().flatten()
    finally:
        encoder.train(was_training)

    # Indices of the movies the user has already rated.
    watched_indices = np.where(training_set[user_id].numpy() > 0)[0]

    # Push already rated movies to the very end of the ranking.
    predicted_ratings[watched_indices] = -np.inf

    # Indices of the top_k highest predictions, best first.
    top_k_indices = predicted_ratings.argsort()[::-1][:top_k]

    # Build all rows first and create the frame once; DataFrame.append is
    # deprecated (removed in pandas 2.0) and quadratic when used in a loop.
    title_by_id = df_movies.set_index('movieId')['title'].to_dict()
    records = []
    for idx in top_k_indices:
        movie_id = int(idx) if index_to_movie_id is None else index_to_movie_id[int(idx)]
        records.append({
            'movieId': movie_id,
            'title': title_by_id.get(movie_id, "Unknown Movie"),
            'predicted_rating': predicted_ratings[idx]
        })

    return pd.DataFrame(records, columns=['movieId', 'title', 'predicted_rating'])


In [None]:
# `n` is passed as `user_id`, which recommend_for_user uses directly as a row index into training_set.
n = 2222
recommended_movies = recommend_for_user(autoencoder_network, n)
print(recommended_movies)

   movieId                                       title  predicted_rating
0     1839                             My Giant (1998)          4.273378
1      843                           Lotto Land (1995)          4.118946
2     2698                              Zone 39 (1997)          4.118065
3     3081                        Sleepy Hollow (1999)          4.090534
4     1168                             Bad Moon (1996)          4.069482
5     1170  Best of the Best 3: No Turning Back (1995)          4.063285
6     1104            Streetcar Named Desire, A (1951)          4.051184
7     1134                     Johnny 100 Pesos (1993)          4.035306
8     1122                     Plutonium Circus (1995)          4.032307
9     3111                  Places in the Heart (1984)          4.028879
10      51                       Guardian Angel (1994)          4.012717
11    1032                  Alice in Wonderland (1951)          4.009475
12    3203                            Dead Calm (19

  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({
  results_df = results_df.append({


In [None]:
# Inspect the test-set rating row at index 6039 as a NumPy array with a leading batch axis.
Variable(test_set[6039]).unsqueeze(0).data.numpy()

array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Recommendations based on the training set: the training ratings are the
# evidence, as the cell's original comment states (the call used to pass test_set).
top_10_recommendations = make_top_k_recommendations(autoencoder_network, training_set, 20)
top_10_recommendations

In [None]:
# Recommendations based on the test set (may be closer to real usage).
top_10_recommendations_testset = make_top_k_recommendations(autoencoder_network, test_set, 20)
top_10_recommendations_testset

In [None]:
# Recommendations based on the entire set?
# NOTE(review): the evidence passed here is still test_set, and the fourth positional
# argument requires a make_top_k_recommendations definition that accepts a
# fourth (filter_seen) parameter.
top_10_recommendations_total = make_top_k_recommendations(autoencoder_network, test_set, 10, True)
top_10_recommendations_total

In [None]:
top_10_recommendations_testset[0]
# Recommended movies and predicted ratings for the first user of the test set (unseen movies only)

[(1839, 4.994234),
 (3081, 4.7889695),
 (1168, 4.771346),
 (1032, 4.770338),
 (1122, 4.7233424),
 (2698, 4.7194366),
 (1765, 4.7165046),
 (1100, 4.713937),
 (1160, 4.713625),
 (1109, 4.6846395),
 (1170, 4.6737757),
 (843, 4.672064),
 (1117, 4.667349),
 (3203, 4.6642013),
 (1090, 4.6581817),
 (1114, 4.655756),
 (309, 4.6487474),
 (861, 4.638333),
 (1134, 4.6361923),
 (847, 4.6224427)]

In [None]:
# Convert to DataFrame
# NOTE(review): the first element of each tuple is a column index of the rating
# matrix (encoded item id - 1, see reindexer/convert), presumably not the
# original movieId - confirm before trusting the titles joined below.
rec_df = pd.DataFrame(top_10_recommendations_testset[1211], columns=['movieId', 'pred_rating'])

# Merge with original movie data to get titles
result_df = pd.merge(rec_df, df_movies[['movieId', 'title']], on='movieId', how='left')

# Reorder columns for clarity
result_df = result_df[['movieId', 'title', 'pred_rating']]

print(result_df)

    movieId                                       title  pred_rating
0      1839                             My Giant (1998)     4.994428
1      3081                        Sleepy Hollow (1999)     4.789157
2      1168                             Bad Moon (1996)     4.771533
3      1032                  Alice in Wonderland (1951)     4.770522
4      1122                     Plutonium Circus (1995)     4.723526
5      2698                              Zone 39 (1997)     4.719625
6      1765             Letter From Death Row, A (1998)     4.716687
7      1100                      Days of Thunder (1990)     4.714121
8      1160                        Six of a Kind (1934)     4.713803
9      1109                    Charm's Incidents (1996)     4.684820
10     1170  Best of the Best 3: No Turning Back (1995)     4.673956
11      843                           Lotto Land (1995)     4.672243
12     1117  Eighth Day, The (Le Huitième jour ) (1996)     4.667525
13     3203                       

In [None]:
# Look up the df_movies row whose movieId equals 1839.
df_movies[df_movies.movieId==1839]

Unnamed: 0,movieId,title,genres
1770,1839,My Giant (1998),Comedy


In [None]:
import pandas as pd

# Function that prints the recommendation results for every user

def print_movie_names_from_df(recommendations, df_movies):
    """
    Print the recommended titles and scores of every user.

    recommendations: List of movie recommendations for each user.
                     Each element is a list of (movie_id, score) tuples.
    df_movies: DataFrame containing movie IDs ('movieId') and names ('title').
    """
    # Lookup table from movie ID to title.
    titles_by_id = df_movies.set_index('movieId')['title']
    movie_id_to_name = titles_by_id.to_dict()

    # Users are numbered from 1 in the printed output.
    for user_number, user_recommendations in enumerate(recommendations, start=1):
        print(f"User {user_number}:")
        for movieId, score in user_recommendations:
            # IDs missing from df_movies are reported with a placeholder title
            title = movie_id_to_name.get(movieId, "Unknown Movie")
            print(f"  {title} (Score: {score:.2f})")
        print()

# Print the recommendations of every user in top_10_recommendations_testset.
print_movie_names_from_df(top_10_recommendations_testset, df_movies)


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  Lotto Land (1995) (Score: 0.93)
  Sex, Lies, and Videotape (1989) (Score: 0.93)
  Radioland Murders (1994) (Score: 0.93)
  Platoon (1986) (Score: 0.92)
  Unknown Movie (Score: 0.91)
  Funhouse, The (1981) (Score: 0.91)
  Funeral, The (1996) (Score: 0.90)

User 5626:
  Guardian Angel (1994) (Score: 0.97)
  Sleepy Hollow (1999) (Score: 0.96)
  My Giant (1998) (Score: 0.93)
  Lotto Land (1995) (Score: 0.93)
  Sex, Lies, and Videotape (1989) (Score: 0.93)
  Radioland Murders (1994) (Score: 0.93)
  Platoon (1986) (Score: 0.92)
  Unknown Movie (Score: 0.91)
  Bad Moon (1996) (Score: 0.91)
  Funeral, The (1996) (Score: 0.91)

User 5627:
  Guardian Angel (1994) (Score: 0.97)
  Sleepy Hollow (1999) (Score: 0.96)
  My Giant (1998) (Score: 0.93)
  Lotto Land (1995) (Score: 0.93)
  Sex, Lies, and Videotape (1989) (Score: 0.93)
  Radioland Murders (1994) (Score: 0.92)
  Platoon (1986) (Score: 0.92)
  Unknown Movie (Score: 0.91)
  Bad Moon (1996) (

In [None]:
# Function that prints the recommendation results for a given user id

def print_recommendations_for_user(user_id, recommendations, df_movies):
    """
    Print the recommended titles and scores of a single user.

    user_id: ID of the user to print recommendations for (1-based; the
             entry at position user_id - 1 of `recommendations` is used).
    recommendations: List of movie recommendations for each user.
                     Each element is a list of (movieId, score) tuples.
    df_movies: DataFrame containing movie IDs ('movieId') and titles ('title').
    """
    # Lookup table from movie ID to title.
    titles_by_id = df_movies.set_index('movieId')['title']
    movie_id_to_name = titles_by_id.to_dict()

    # The list is 0-based while user IDs start at 1.
    user_recommendations = recommendations[user_id - 1]

    print(f"User {user_id}:")
    for movieId, score in user_recommendations:
        # IDs missing from df_movies are reported with a placeholder title
        title = movie_id_to_name.get(movieId, "Unknown Movie")
        print(f"  {title} (Score: {score:.2f})")
    print()

# Print the recommendations stored for user id 200.
print_recommendations_for_user(200, top_10_recommendations_testset, df_movies)


User 200:
  Guardian Angel (1994) (Score: 1.02)
  Funhouse, The (1981) (Score: 0.94)
  Red Firecracker, Green Firecracker (1994) (Score: 0.94)
  Unknown Movie (Score: 0.93)
  Radioland Murders (1994) (Score: 0.93)
  Zone 39 (1997) (Score: 0.93)
  My Giant (1998) (Score: 0.92)
  Sleepy Hollow (1999) (Score: 0.91)
  Lotto Land (1995) (Score: 0.91)
  Sex, Lies, and Videotape (1989) (Score: 0.91)



In [None]:
# Number of rows (one per user) in test_set.
len(test_set)

6041

In [None]:
# Mean rating of 'My Giant (1998)' over all of its rows in the merged frame.
np.mean(df_total[df_total.title=='My Giant (1998)'].rating)

2.260869565217391

In [None]:
# Show the re-indexed ratings frame returned by reindexer.
ratings_reindex

Unnamed: 0,encoded_users,encoded_items,rating_col
0,1,1105,5
1,1,640,3
2,1,854,3
3,1,3178,4
4,1,2163,5
...,...,...,...
1000204,6040,1020,1
1000205,6040,1023,5
1000206,6040,549,5
1000207,6040,1025,4


In [None]:
# Re-split with a 10% test share, again stratified on the encoded user id.
# NOTE(review): this rebinds `train` / `test`, which were created earlier with
# test_size=0.2; the training_set / test_set tensors are not rebuilt here.
train, test = train_test_split(ratings_reindex,
                               stratify=ratings_reindex['encoded_users'],
                               test_size=0.1,
                               random_state=42)

In [None]:
# Number of training rows per encoded user id.
train.encoded_users.value_counts()

4169    2083
1680    1665
4277    1569
1941    1435
1181    1369
        ... 
4775      18
2584      18
250       18
5159      18
217       18
Name: encoded_users, Length: 6040, dtype: int64

In [None]:
# Number of rating rows per encoded user id in the full re-indexed frame.
ratings_reindex.encoded_users.value_counts()

4169    2314
1680    1850
4277    1743
1941    1595
1181    1521
        ... 
5725      20
3407      20
1664      20
4419      20
3021      20
Name: encoded_users, Length: 6040, dtype: int64

In [None]:
# Run the autoencoder on the training-set row at index 222, shaped as a batch of one.
encoder_input2 = Variable(training_set[222]).unsqueeze(0) # should keep the training set
encoder_output2 = autoencoder_network(encoder_input2)

In [None]:
# Dimensions of the test-set tensor.
test_set.shape

torch.Size([6040, 3706])

In [None]:
# Count of distinct encoded user ids.
len(ratings_reindex.encoded_users.unique())

6040