In [75]:
import os, zipfile
import pandas as pd
import torch
import numpy as np
from torch.autograd import Variable
from tqdm.notebook import tqdm

In [19]:
# download dataset
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   889k      0  0:00:01  0:00:01 --:--:--  890k


In [20]:
import zipfile
with zipfile.ZipFile('ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [87]:
# import dataset
ds_file = 'ml-latest-small'
movies_df = pd.read_csv(ds_file + '/movies.csv')
ratings_df = pd.read_csv(ds_file + '/ratings.csv')
print(movies_df.head(2))
print(ratings_df.head(2))

   movieId             title                                       genres
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy
1        2    Jumanji (1995)                   Adventure|Children|Fantasy
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247


In [88]:
movie_names = movies_df.set_index('movieId')['title'].to_dict()
n_users = len(ratings_df.userId.unique())
n_items = len(movies_df.movieId.unique())
print(n_users, n_items)

610 9742


In [93]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=8):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)  # user embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors)  # item embeddings
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)
    
    def predict(self, user, item):
        return self.forward(user, item)
     

In [94]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()
        
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()
        
        self.movieid2idx = {o:i for i,o in enumerate(movies)}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}
        self.userid2idx = {o:i for i,o in enumerate(users)}
        self.idx2userid = {i:o for o,i in self.userid2idx.items()}
        
        self.ratings.movieId = ratings_df.movieId.apply(lambda x: self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])
        self.x = self.ratings.drop(['rating', 'timestamp'], axis=1).values
        self.y = self.ratings['rating'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y)
    
    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.ratings)

In [95]:
n_epochs = 32
gpu = torch.cuda.is_available()

model = MatrixFactorization(n_users, n_items, n_factors=8)
print('model: ', model)

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

if gpu:
    model = model.cuda()

loss_func = torch.nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

model:  MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9742, 8)
)
user_factors.weight tensor([[0.0116, 0.0364, 0.0324,  ..., 0.0195, 0.0150, 0.0418],
        [0.0496, 0.0247, 0.0079,  ..., 0.0167, 0.0248, 0.0349],
        [0.0244, 0.0315, 0.0273,  ..., 0.0122, 0.0420, 0.0251],
        ...,
        [0.0072, 0.0259, 0.0187,  ..., 0.0198, 0.0472, 0.0152],
        [0.0291, 0.0315, 0.0319,  ..., 0.0383, 0.0250, 0.0173],
        [0.0259, 0.0080, 0.0340,  ..., 0.0193, 0.0148, 0.0045]])
item_factors.weight tensor([[0.0257, 0.0409, 0.0360,  ..., 0.0299, 0.0112, 0.0453],
        [0.0442, 0.0456, 0.0328,  ..., 0.0458, 0.0373, 0.0351],
        [0.0106, 0.0493, 0.0479,  ..., 0.0142, 0.0190, 0.0444],
        ...,
        [0.0463, 0.0311, 0.0098,  ..., 0.0166, 0.0436, 0.0066],
        [0.0373, 0.0299, 0.0138,  ..., 0.0016, 0.0371, 0.0419],
        [0.0069, 0.0298, 0.0316,  ..., 0.0243, 0.0300, 0.0065]])


In [96]:
for nth_ep in tqdm(range(n_epochs)):
    losses = []
    for x,y in train_loader:
        if gpu:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        loss = loss_func(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print('epoch {}'.format(nth_ep), ' Loss: ', sum(losses)/len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for nth_ep in tqdm(range(n_epochs)):


  0%|          | 0/32 [00:00<?, ?it/s]

epoch 0  Loss:  11.066052680088179
epoch 1  Loss:  4.745891238832232
epoch 2  Loss:  2.475745243166909
epoch 3  Loss:  1.7223003651890052
epoch 4  Loss:  1.3468749485342635
epoch 5  Loss:  1.129087515061882
epoch 6  Loss:  0.9918312261703656
epoch 7  Loss:  0.9006307883386685
epoch 8  Loss:  0.8374716678095347
epoch 9  Loss:  0.7925231027119051
epoch 10  Loss:  0.7596731693082049
epoch 11  Loss:  0.7350000817763623
epoch 12  Loss:  0.7162968554926402
epoch 13  Loss:  0.7019135373724898
epoch 14  Loss:  0.6906995851225054
epoch 15  Loss:  0.6818176688730414
epoch 16  Loss:  0.6752235205267286
epoch 17  Loss:  0.669942523281889
epoch 18  Loss:  0.6659615804550006
epoch 19  Loss:  0.6627579918367609
epoch 20  Loss:  0.6606086823843458
epoch 21  Loss:  0.6589910228769791
epoch 22  Loss:  0.6576290969845607
epoch 23  Loss:  0.6566704963215717
epoch 24  Loss:  0.655902912811882
epoch 25  Loss:  0.655137757283782
epoch 26  Loss:  0.6541149247872647
epoch 27  Loss:  0.6533689685733185
epoch 28

In [97]:
# latent factors for movies and users
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)
        if c == 0:
            uw = param.data
            c += 1
        else:
            iw = param.data

user_factors.weight
item_factors.weight


In [98]:
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [99]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=20, n_init=10, random_state=0).fit(trained_movie_embeddings)

In [100]:
for cluster in range(10):
    print('cluster: {}'.format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        movid = train_set.idx2movieid[movidx]
        rat_count = ratings_df.loc[ratings_df['movieId']==movid].count()[0]
        movs.append((movie_names[movid], rat_count))
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print(' ', mov[0])

cluster: 0
  Twister (1996)
  Net, The (1995)
  Cliffhanger (1993)
  Armageddon (1998)
  Mummy, The (1999)
  Broken Arrow (1996)
  Demolition Man (1993)
  Mission: Impossible II (2000)
  Charlie's Angels (2000)
  Dragonheart (1996)
cluster: 1
  Guess Who's Coming to Dinner (1967)
  Three Billboards Outside Ebbing, Missouri (2017)
  Gallipoli (1981)
  Band of Brothers (2001)
  Lifeboat (1944)
  Man Bites Dog (C'est arrivé près de chez vous) (1992)
  Reign Over Me (2007)
  Jules and Jim (Jules et Jim) (1961)
  Discreet Charm of the Bourgeoisie, The (Charme discret de la bourgeoisie, Le) (1972)
  Neon Genesis Evangelion: The End of Evangelion (Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni) (1997)
cluster: 2
  Spice World (1997)
  Catwoman (2004)
  Jason X (2002)
  Problem Child 2 (1991)
  When a Stranger Calls (2006)
  Police Academy: Mission to Moscow (1994)
  House Party 2 (1991)
  Amityville II: The Possession (1982)
  Jaws: The Revenge (1987)
  Epic Movie (2007)
cluster: 

KeyError: 9724