In [1]:
import pandas as pd
import zipfile
import numpy as np 
from sklearn.model_selection import train_test_split

In [2]:
# loading ratings
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
csv_ratings='ml-latest-small/ratings.csv'
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open(csv_ratings), names=r_cols)

In [3]:
ratings=ratings[ratings.rating != 'rating']

In [4]:
ratings.user_id.unique().shape[0], ratings.movie_id.unique().shape[0], ratings.rating.min(), ratings.rating.max()

(610, 9724, '0.5', '5.0')

In [5]:
ratings.dtypes

user_id           object
movie_id          object
rating            object
unix_timestamp    object
dtype: object

In [6]:
ratings=ratings.astype({'user_id': 'int', 'movie_id': 'int', 'rating': 'float', 'unix_timestamp': 'int'})

In [7]:
ratings.dtypes

user_id             int64
movie_id            int64
rating            float64
unix_timestamp      int64
dtype: object

In [8]:
# loading movie info
csv_movies='ml-latest-small/movies.csv'
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open(csv_movies), names=m_cols)

In [9]:
movies=movies[movies.movie_id != 'movieId']

In [10]:
movies=movies.astype({'movie_id': 'int'})

In [11]:
# merging
df = pd.merge(ratings, movies, on='movie_id')

In [12]:
df.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [13]:
df.isnull().any()

user_id           False
movie_id          False
rating            False
unix_timestamp    False
title             False
genre             False
dtype: bool

In [14]:
n_movies = len(df.movie_id.unique())
n_users = len(df.user_id.unique())
print(
    "{1:,} distinct users rated {0:,} different movies (total ratings = {2:,})".format(
        n_movies, n_users, len(df),
    )
)

610 distinct users rated 9,724 different movies (total ratings = 100,836)


In [15]:
ratings.user_id.min(), ratings.user_id.max(), ratings.movie_id.min(), ratings.movie_id.max()

(1, 610, 1, 193609)

In [16]:
from sklearn.preprocessing import scale

In [17]:
ratings['y']=pd.DataFrame(scale(ratings['rating'], with_mean=True, with_std=False))

In [18]:
ratings.describe()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,y
count,100836.0,100836.0,100836.0,100836.0,100835.0
mean,326.127564,19435.295718,3.501557,1205946000.0,-5e-06
std,182.618491,35530.987199,1.042529,216261000.0,1.042533
min,1.0,1.0,0.5,828124600.0,-3.001557
25%,177.0,1199.0,3.0,1019124000.0,-0.501557
50%,325.0,2991.0,3.5,1186087000.0,-0.001557
75%,477.0,8122.0,4.0,1435994000.0,0.498443
max,610.0,193609.0,5.0,1537799000.0,1.498443


In [33]:
# Implementation 
from keras.layers import Input, Embedding, Concatenate, Flatten, Dense, Dot, Add, Multiply, Subtract, Average, Reshape
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping

In [30]:
n_movies = ratings.movie_id.unique().shape[0]
n_users = ratings.user_id.unique().shape[0]

In [None]:
ratings.to_pickle('/home/elena/Projects/DLRM-with-Keras-for-movielens-dataset/ratings.pkl')
df.to_pickle('/home/elena/Projects/DLRM-with-Keras-for-movielens-dataset/ratings_movie_info.pkl')

In [16]:
# train test split
# we split so that each user is both in the train and test set
# first we group by user_id followed by movie_id
ratings_grouped = ratings.set_index(['user_id', 'movie_id'])

In [17]:
def split_group(df_group, train_size=0.8):
    customers = df_group.index.unique()
    train_customers, test_customers = train_test_split(customers, train_size=train_size)
    train_df, test_df = df_group.loc[train_customers], df_group.loc[test_customers]
    return train_df, test_df

In [18]:
train, test = split_group(ratings_grouped)

In [19]:
# reset index
train = train.reset_index(drop=False)
test = test.reset_index(drop=False)

In [20]:
len(set(test.user_id.unique()).difference(set(train.user_id.unique())))

0

In [21]:
train.user_id.unique().shape[0], test.user_id.unique().shape[0]

(610, 610)

In [22]:
train.to_pickle('/home/elena/Projects/DLRM-with-Keras-for-movielens-dataset/ratings_train.pkl')
test.to_pickle('/home/elena/Projects/DLRM-with-Keras-for-movielens-dataset/ratings_test.pkl')

In [None]:
# scaling