In [1]:
!pip3 install torch
!pip3 install torchvision



In [2]:
!pip3 install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-1.6.3-py3-none-any.whl (584 kB)
[K     |████████████████████████████████| 584 kB 4.9 MB/s 
[?25hCollecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 72.2 MB/s 
Collecting PyYAML>=5.4
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.0 MB/s 
Collecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.8.2-py3-none-any.whl (409 kB)
[K     |████████████████████████████████| 409 kB 65.0 MB/s 
Collecting pyDeprecate<0.4.0,>=0.3.1
  Downloading pyDeprecate-0.3.2-py3-none-any.whl (10 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 59.4 MB/s 
Col

In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

np.random.seed(123)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

filename = '/content/drive/MyDrive/bigdata_analysis_final_project - movie recommender system(영화 추천 시스템)/Data Preprocessing/rating_small_dataset.csv'
ratings = pd.read_csv('/content/drive/MyDrive/bigdata_analysis_final_project - movie recommender system(영화 추천 시스템)/Data Preprocessing/rating_small_dataset.csv')
ratings.head()

Mounted at /content/drive


Unnamed: 0,userId,movieId,naver_code,rating,timestamp
0,1,31,17504,5.0,1260759144
1,1,1061,17766,6.0,1260759182
2,1,1343,11082,4.0,1260759131
3,1,2193,10619,4.0,1260759198
4,1,2294,19477,4.0,1260759108


In [5]:
#rand_userIds = np.random.choice(ratings['userId'].unique(), size=int(len(ratings['userId'].unique())*0.3), replace=False)

ratings = ratings.loc[ratings['userId'].isin(ratings['userId'])]

print('There are {} rows of data from {} users'.format(len(ratings), len(ratings['userId'])))

There are 40623 rows of data from 40623 users


In [6]:
ratings.sample(5)
#ratings[50:70]

Unnamed: 0,userId,movieId,naver_code,rating,timestamp
13928,240,3983,29013,9.0,1098940857
35307,582,3114,27578,5.0,1122167209
34481,574,153,17290,4.0,1232810237
36392,599,81562,72327,9.0,1306108963
1702,23,1917,19147,4.0,1148720837


In [7]:
ratings['rank_latest'] = ratings.groupby(['userId'])['timestamp'].rank(method='first', ascending=False)

train_ratings = ratings[ratings['rank_latest'] != 1]
test_ratings = ratings[ratings['rank_latest'] == 1]

# drop columns that we no longer need
train_ratings = train_ratings[['userId', 'movieId', 'rating']]
test_ratings = test_ratings[['userId', 'movieId', 'rating']]

In [8]:
train_ratings.loc[:, 'rating'] = 1

train_ratings.sample(5)
# train_ratings

Unnamed: 0,userId,movieId,rating
26847,461,5010,1
38591,628,342,1
34343,570,32029,1
29782,505,3160,1
13171,220,4370,1


In [9]:
# Get a list of all movie IDs
all_movieIds = ratings['movieId'].unique()

# Placeholders that will hold the training data
users, items, labels = [], [], []

# This is the set of items that each user has interaction with
user_item_set = set(zip(train_ratings['userId'], train_ratings['movieId']))

# 4:1 ratio of negative to positive samples
num_negatives = 4

for (u, i) in tqdm(user_item_set):
    users.append(u)
    items.append(i)
    labels.append(1) # items that the user has interacted with are positive
    for _ in range(num_negatives):
        # randomly select an item
        negative_item = np.random.choice(all_movieIds) 
        # check that the user has not interacted with this item
        while (u, negative_item) in user_item_set:
            negative_item = np.random.choice(all_movieIds)
        users.append(u)
        items.append(negative_item)
        labels.append(0) # items not interacted with are negative

  0%|          | 0/39512 [00:00<?, ?it/s]

In [10]:
class MovieLensTrainDataset(Dataset):
    """MovieLens PyTorch Dataset for Training
    
    Args:
        ratings (pd.DataFrame): Dataframe containing the movie ratings
        all_movieIds (list): List containing all movieIds
    
    """

    def __init__(self, ratings, all_movieIds):
        self.users, self.items, self.labels = self.get_dataset(ratings, all_movieIds)

    def __len__(self):
        return len(self.users)
  
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_movieIds):
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['userId'], ratings['movieId']))

        num_negatives = 4
        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            for _ in range(num_negatives):
                negative_item = np.random.choice(all_movieIds)
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(all_movieIds)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

In [11]:
class NCF(pl.LightningModule):
    """ Neural Collaborative Filtering (NCF)
    
        Args:
            num_users (int): Number of unique users
            num_items (int): Number of unique items
            ratings (pd.DataFrame): Dataframe containing the movie ratings for training
            all_movieIds (list): List containing all movieIds (train + test)
    """
    
    def __init__(self, num_users, num_items, ratings, all_movieIds):
        super().__init__()
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=8)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=8)
        self.fc1 = nn.Linear(in_features=16, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_movieIds = all_movieIds
        
    def forward(self, user_input, item_input):
        
        # Pass through embedding layers
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        # Concat the two embedding layers
        vector = torch.cat([user_embedded, item_embedded], dim=-1)

        # Pass through dense layer
        vector = nn.ReLU()(self.fc1(vector))
        vector = nn.ReLU()(self.fc2(vector))
        vector = nn.ReLU()(self.fc3(vector))

        # Output layer
        pred = nn.Sigmoid()(self.output(vector))

        return pred
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predicted_labels = self(user_input, item_input)
        loss = nn.BCELoss()(predicted_labels, labels.view(-1, 1).float())
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        return DataLoader(MovieLensTrainDataset(self.ratings, self.all_movieIds),
                          batch_size=512, num_workers=4)

In [12]:
num_users = ratings['userId'].max()+1
num_items = ratings['movieId'].max()+1

all_movieIds = ratings['movieId'].unique()

model = NCF(num_users, num_items, train_ratings, all_movieIds)

In [13]:
trainer = pl.Trainer(max_epochs=5, reload_dataloaders_every_n_epochs=0, progress_bar_refresh_rate=50, logger=False, checkpoint_callback=False)

trainer.fit(model)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 5.4 K 
1 | item_embedding | Embedding | 1.3 M 
2 | fc1            | Linear    | 544   
3 | fc2            | Linear    | 2.1 K 
4 | fc3            | Linear    | 2.1 K 
5 | output         | Linear    | 33    
---------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.175     Total estimated model params size (MB)
  cpuset_checked))


Training: 0it [00:00, ?it/s]

In [14]:
torch.save(model.state_dict(), "./model.pth")

In [15]:
model = NCF(num_users, num_items, train_ratings, all_movieIds)
model.load_state_dict(torch.load("./model.pth"))
model.eval()
# User-item pairs for testing
test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(np.random.choice(list(not_interacted_items), 99))
    test_items = selected_not_interacted + [i]
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*100), 
                                        torch.tensor(test_items)).detach().numpy())
    
    top10_items = [test_items[i] for i in np.argsort(predicted_labels)[::-1][0:10].tolist()]
    
    if i in top10_items:
        hits.append(1)
    else:
        hits.append(0)
        
print("The Hit Ratio @ 10 is {:.2f}".format(np.average(hits)))

  0%|          | 0/671 [00:00<?, ?it/s]

The Hit Ratio @ 10 is 0.44


In [16]:
user_ratings = ratings[['userId', 'movieId', 'rating']][ratings['userId'] == 64]
user_ratings

Unnamed: 0,userId,movieId,rating
3711,64,110,8.0
3712,64,150,6.0
3713,64,153,10.0
3714,64,168,6.0
3715,64,266,10.0
3716,64,296,10.0
3717,64,329,8.0
3718,64,344,4.0
3719,64,349,8.0
3720,64,356,8.0


In [17]:
model = NCF(num_users, num_items, train_ratings, all_movieIds)
model.load_state_dict(torch.load("./model.pth"))
model.eval()
# User-item pairs for testing
test_user_item_set = set(zip(user_ratings['userId'], user_ratings['movieId']))

# Dict of all items that are interacted with by each user
user_interacted_items = ratings.groupby('userId')['movieId'].apply(list).to_dict()

hits = []
for (u,i) in tqdm(test_user_item_set):
    interacted_items = user_interacted_items[u]
    not_interacted_items = set(all_movieIds) - set(interacted_items)
    selected_not_interacted = list(not_interacted_items)
    
    predicted_labels = np.squeeze(model(torch.tensor([u]*len(selected_not_interacted)), torch.tensor(selected_not_interacted)).detach().numpy())
    
    top5_items = [selected_not_interacted[i] for i in np.argsort(predicted_labels)[::-1][0:5].tolist()]
    
top5_items

  0%|          | 0/15 [00:00<?, ?it/s]

[1, 588, 608, 480, 589]

In [18]:
filename = '/content/drive/MyDrive/bigdata_analysis_final_project - movie recommender system(영화 추천 시스템)/Data Preprocessing/movie_dataset.csv'
movie_df = pd.read_csv('/content/drive/MyDrive/bigdata_analysis_final_project - movie recommender system(영화 추천 시스템)/Data Preprocessing/movie_dataset.csv')
movie_df.head(3)

Unnamed: 0,movieId,naver_code,title,title_kor,genres,keyword,director,cast,director_kor,cast_kor,...,naver_user_participate,naver_critics_rate,naver_critics_participate,nation,release_date,running_time,age,story,poster_url,url
0,1,17153,Toy Story,토이 스토리,Adventure Animation Children Comedy Fantasy,"jealousy,toy,boy,friendship,friends,rivalry,bo...",John Lasseter,"Tom Hanks,Tim Allen,Don Rickles",존 라세터,"톰 행크스, 팀 알렌",...,1393,0.0,0,미국,2010.05.05 1995.12.30,77분,"전체 관람가, G",우디(톰 행크스 목소리 분)는 6살짜리 남자 아이 앤디가 가장 아끼는 카우보이 인형...,https://movie-phinf.pstatic.net/20111222_26/13...,https://movie.naver.com/movie/bi/mi/basic.nave...
1,2,17440,Jumanji,쥬만지,Adventure Children Fantasy,"board game,disappearance,based on children's b...",Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst",조 존스톤,"로빈 윌리엄스, 커스틴 던스트, 데이빗 알란 그리어",...,1591,0.0,0,미국,1996.01.20,104분,"전체 관람가, PG",1969년. 커다란 공장을 운영하는 아버지를 둔 12세 소년 알랜 패리쉬(Young...,https://movie-phinf.pstatic.net/20111223_209/1...,https://movie.naver.com/movie/bi/mi/basic.nave...
2,4,17798,Waiting To Exhale,사랑을 기다리며,Comedy Drama Romance,"based on novel,interracial relationship,single...",Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine",포레스트 휘태커,"휘트니 휴스턴, 안젤라 바셋, 로레타 드바인",...,17,0.0,0,미국,1996.04.05,127분,"청소년 관람불가, R",사반나 잭슨(Savannah Jackson: 휘트니 휴스턴 분)은 TV 프로듀서로서...,https://movie-phinf.pstatic.net/20111221_192/1...,https://movie.naver.com/movie/bi/mi/basic.nave...


In [19]:
top10_items = [527, 296, 2959, 2028, 2997, 1089, 2858, 434, 6874, 10]
recomm_movies = movie_df[['movieId', 'title', 'title_kor', 'poster_url']][movie_df['movieId'].isin(top10_items)]
recomm_movies

Unnamed: 0,movieId,title,title_kor,poster_url
7,10,GoldenEye,007 골든 아이,https://movie-phinf.pstatic.net/20111222_264/1...
101,296,Pulp Fiction,펄프 픽션,https://movie-phinf.pstatic.net/20111221_33/13...
142,434,Cliffhanger,클리프행어,https://movie-phinf.pstatic.net/20111222_1/132...
176,527,Schindler's List,쉰들러 리스트,https://movie-phinf.pstatic.net/20190122_81/15...
265,1089,Reservoir Dogs,저수지의 개들,https://movie-phinf.pstatic.net/20111222_282/1...
420,2028,Saving Private Ryan,라이언 일병 구하기,https://movie-phinf.pstatic.net/20111222_4/132...
528,2858,American Beauty,아메리칸 뷰티,https://movie-phinf.pstatic.net/20111222_29/13...
541,2959,Fight Club,파이트 클럽,https://movie-phinf.pstatic.net/20161013_298/1...
549,2997,Being John Malkovich,존 말코비치 되기,https://movie-phinf.pstatic.net/20111222_249/1...
1087,6874,Kill Bill: Vol. 1,킬 빌 - 1부,https://movie-phinf.pstatic.net/20111223_27/13...


In [20]:
recomm_movies = movie_df[['movieId', 'title', 'title_kor', 'poster_url']][movie_df['movieId'].isin([296, 2959, 2028, 2997, 6874])]
recomm_movies

Unnamed: 0,movieId,title,title_kor,poster_url
101,296,Pulp Fiction,펄프 픽션,https://movie-phinf.pstatic.net/20111221_33/13...
420,2028,Saving Private Ryan,라이언 일병 구하기,https://movie-phinf.pstatic.net/20111222_4/132...
541,2959,Fight Club,파이트 클럽,https://movie-phinf.pstatic.net/20161013_298/1...
549,2997,Being John Malkovich,존 말코비치 되기,https://movie-phinf.pstatic.net/20111222_249/1...
1087,6874,Kill Bill: Vol. 1,킬 빌 - 1부,https://movie-phinf.pstatic.net/20111223_27/13...


In [21]:
recomm_movies = movie_df[['movieId', 'title', 'title_kor', 'poster_url']][movie_df['movieId'].isin(top5_items)]
recomm_movies

Unnamed: 0,movieId,title,title_kor,poster_url
0,1,Toy Story,토이 스토리,https://movie-phinf.pstatic.net/20111222_26/13...
156,480,Jurassic Park,쥬라기 공원,https://movie-phinf.pstatic.net/20130531_210/1...
194,588,Aladdin,알라딘,https://movie-phinf.pstatic.net/20170418_64/14...
195,589,Terminator 2: Judgment Day,터미네이터 2:오리지널,https://movie-phinf.pstatic.net/20191010_99/15...
203,608,Fargo,파고,https://movie-phinf.pstatic.net/20111221_53/13...


In [22]:
recomm_movies.to_csv("Deep Learning based Recommender Systems_list.csv", mode='w', index=False)