### Now, we'll do a "table to dict" conversion, with an eye at optimizing lookups

In [1]:
import pandas as pd
import pickle
from sklearn.utils import shuffle

In [2]:
df = pd.read_csv('../dataset/MovieLens20M/edited_rating_small_dataset.csv')
df.head()

Unnamed: 0,userId,movieId,rating,movie_idx
0,7325,1,4.5,10
1,7325,10,2.5,68
2,7325,19,3.5,143
3,7325,32,5.0,19
4,7325,39,4.5,85


In [3]:
N = df.userId.max() + 1 # number of users
M = df.movie_idx.max() + 1 # number of movies

In [4]:
# do the train-test split
df = shuffle(df)
cutoff = int(.8*(len(df)))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [5]:
# now we set up a dict to track which users've rated which movies
user2movie = {}
# another one to tell which movies've been rated by which users
movie2user = {}
# and finally, one to look up ratings
usermovie2rating = {}
count = 0

In [6]:
def update_dictionaries(row, count):

    count += 1

    if count % 100000 == 0:
        print("processed: " + str(count / cutoff))

    i = int(row.userId)
    j = int(row.movie_idx)

    if i not in user2movie:
        user2movie[i] = [j]
    else:
        user2movie[i].append(j)

    if j not in movie2user:
        movie2user[j] = [i]
    else:
        movie2user[j].append(i)

    usermovie2rating[(i, j)] = row.rating
    return 1


In [7]:
#takes about 3min
df_train.apply(update_dictionaries, count=count, axis=1)

3274005    1
2720640    1
2166338    1
1022868    1
1560127    1
          ..
4816456    1
4330532    1
3465105    1
1036574    1
2983396    1
Length: 4313584, dtype: int64

In [8]:
# we now do kind of the same thing, this time\ for the test dataset
usermovie2rating_test = {}
count = 0


def update_usermovie2rating_test(row, count):
    count += 1
    
    i = int(row.userId)
    j = int(row.movie_idx)

    usermovie2rating_test[(i, j)] = row.rating


In [9]:
# should take less than 45s
df_test.apply(update_usermovie2rating_test, count=count, axis=1)

2791769    None
3159888    None
3566175    None
3540693    None
1792057    None
           ... 
2871300    None
5382635    None
3869955    None
2508648    None
1607428    None
Length: 1078396, dtype: object

In [10]:
# we now save the data, of course (-: (takes about 30s)
with open('user2movie.json', 'wb') as f:
  pickle.dump(user2movie, f)
  
with open('movie2user.json', 'wb') as f:
  pickle.dump(movie2user, f)
  
with open('usermovie2rating.json', 'wb') as f:
  pickle.dump(usermovie2rating, f)
  
with open('usermovie2rating_test.json', 'wb') as f:
  pickle.dump(usermovie2rating_test, f)