In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.utils import shuffle
import pickle

In [4]:
# load data 
df = pd.read_csv('./data/very_small_rating.csv')
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,movie_idx
0,960,960,7307,1,4.5,10
1,961,961,7307,10,2.5,68
2,962,962,7307,19,3.5,143
3,963,963,7307,32,5.0,19
4,964,964,7307,39,4.5,85
...,...,...,...,...,...,...
5392020,19998291,19998291,5442,4993,5.0,33
5392021,19998292,19998292,5442,5349,3.0,115
5392022,19998293,19998293,5442,5378,4.0,211
5392023,19998295,19998295,5442,5449,4.0,1487


In [3]:
N = df.userId.max() + 1 # number of users 
M = df.movie_idx.max() + 1 # number of movies 
N, M

(10000, 2000)

In [5]:
# split into train and test 
df = shuffle(df) 
cutoff = int(0.8 * len(df))
df_train = df.iloc[:cutoff] 
df_test = df.iloc[cutoff:] 

In [6]:
# a dictionary to tell us which users have rated which movies 
user2movie = {}

# a dictionary to tell us which movies have been rated by which users 
movie2user = {}

# a dictionary to look up ratings 
usermovie2rating = {}

In [7]:
count = 0 
def update_user2movie_and_movie2user(row): 
    global count 
    count += 1 

    if count % 100000 == 0: 
        print("Processed: ", count/cutoff) 
        
    i = int(row.userId) 
    j = int(row.movie_idx) 

    if i not in user2movie: 
        user2movie[i] = [j]
    else: 
        user2movie[i].append(j)
    
    if j not in movie2user: 
        movie2user[j] = [i]
    else: 
        movie2user[j].append(i) 
    
    usermovie2rating[(i, j)] = row.rating 

df_train.apply(update_user2movie_and_movie2user, axis = 1) 

Processed:  0.02318238509650827
Processed:  0.04636477019301654
Processed:  0.0695471552895248
Processed:  0.09272954038603308
Processed:  0.11591192548254134
Processed:  0.1390943105790496
Processed:  0.16227669567555789
Processed:  0.18545908077206616
Processed:  0.20864146586857443
Processed:  0.23182385096508268
Processed:  0.25500623606159095
Processed:  0.2781886211580992
Processed:  0.3013710062546075
Processed:  0.32455339135111577
Processed:  0.34773577644762405
Processed:  0.3709181615441323
Processed:  0.3941005466406406
Processed:  0.41728293173714887
Processed:  0.44046531683365714
Processed:  0.46364770193016536
Processed:  0.48683008702667363
Processed:  0.5100124721231819
Processed:  0.5331948572196902
Processed:  0.5563772423161985
Processed:  0.5795596274127067
Processed:  0.602742012509215
Processed:  0.6259243976057233
Processed:  0.6491067827022315
Processed:  0.6722891677987398
Processed:  0.6954715528952481
Processed:  0.7186539379917564
Processed:  0.74183632308

3151294    None
401655     None
4448238    None
125462     None
1816858    None
           ... 
4469318    None
425384     None
1061384    None
4604830    None
3919337    None
Length: 4313620, dtype: object

In [8]:
# test ratings dictionary 
usermovie2rating_test = {}

count = 0 
def update_usermovie2rating_test(row): 
    global count
    count += 1 
    if count % 100000 == 0: 
        print("Processed:", count/len(df_test))

    i = int(row.userId) 
    j = int(row.movie_idx) 

    usermovie2rating_test[(i, j)] = row.rating 
df_test.apply(update_usermovie2rating_test, axis = 1) 

with open('./data/user2movie.json', 'wb') as f: 
    pickle.dump(user2movie, f) 

with open('./data/movie2user.json', 'wb') as f: 
    pickle.dump(movie2user, f) 

with open('./data/usermovie2rating.json', 'wb') as f: 
    pickle.dump(usermovie2rating, f) 

with open('./data/usermovie2rating_test.json', 'wb') as f: 
    pickle.dump(usermovie2rating_test, f) 

Processed: 0.09272954038603308
Processed: 0.18545908077206616
Processed: 0.2781886211580992
Processed: 0.3709181615441323
Processed: 0.46364770193016536
Processed: 0.5563772423161985
Processed: 0.6491067827022315
Processed: 0.7418363230882646
Processed: 0.8345658634742977
Processed: 0.9272954038603307


In [9]:
user2movie

{6125: [1428,
  171,
  1313,
  222,
  441,
  1552,
  822,
  800,
  961,
  1000,
  1390,
  176,
  4,
  1801,
  934,
  718,
  1087,
  640,
  676,
  21,
  671,
  256,
  1351,
  59,
  757,
  643,
  680,
  1853,
  833,
  455,
  7,
  631,
  251,
  1400,
  39,
  916,
  1520,
  250,
  389,
  232,
  332,
  466,
  34,
  1716,
  972,
  331,
  17,
  746,
  200,
  453,
  457,
  792,
  798,
  1860,
  374,
  592,
  1271,
  678,
  599,
  1278,
  316,
  1404,
  16,
  685,
  141,
  93,
  387,
  159,
  183,
  79,
  1074,
  1844,
  555,
  1689,
  613,
  273,
  1337,
  122,
  1469,
  292,
  571,
  1270,
  1189,
  1005,
  327,
  86,
  725,
  1004,
  1915,
  1451,
  1057,
  1648,
  1137,
  784,
  1324,
  695,
  154,
  1566,
  1032,
  633,
  1701,
  715,
  445,
  1909,
  99,
  541,
  407,
  296,
  278,
  210,
  1941,
  62,
  935,
  198,
  526,
  1101,
  27,
  1491,
  531,
  452,
  1415,
  1264,
  1923,
  373,
  19,
  830,
  166,
  185,
  253,
  193,
  165,
  620,
  1319,
  461,
  163,
  1638,
  1574,
  641,
 