In [1]:
import pandas as pd
import zipfile
import numpy as np 

In [2]:
# loading ratings
zf = zipfile.ZipFile('/home/elena/Downloads/ml-latest-small.zip')
csv_ratings='ml-latest-small/ratings.csv'
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(zf.open(csv_ratings), names=r_cols)

In [3]:
ratings.user_id.unique().shape[0], ratings.movie_id.unique().shape[0], ratings.rating.min(), ratings.rating.max()

(611, 9725, '0.5', 'rating')

In [5]:
ratings[ratings.rating=='rating']

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,userId,movieId,rating,timestamp


In [6]:
ratings=ratings[ratings.rating != 'rating']

In [7]:
ratings.rating.max()

'5.0'

In [8]:
ratings.dtypes

user_id           object
movie_id          object
rating            object
unix_timestamp    object
dtype: object

In [13]:
ratings=ratings.astype({'user_id': 'int', 'movie_id': 'int', 'rating': 'float', 'unix_timestamp': 'int'})

In [14]:
ratings.dtypes

user_id             int64
movie_id            int64
rating            float64
unix_timestamp      int64
dtype: object

In [15]:
# loading movie info
csv_movies='ml-latest-small/movies.csv'
m_cols=['movie_id', 'title', 'genre']
movies = pd.read_csv(zf.open(csv_movies), names=m_cols)

In [16]:
movies.head()

Unnamed: 0,movie_id,title,genre
0,movieId,title,genres
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,2,Jumanji (1995),Adventure|Children|Fantasy
3,3,Grumpier Old Men (1995),Comedy|Romance
4,4,Waiting to Exhale (1995),Comedy|Drama|Romance


In [17]:
movies=movies[movies.movie_id != 'movieId']

In [18]:
movies=movies.astype({'movie_id': 'int'})

In [19]:
# merging
df = pd.merge(ratings, movies, on='movie_id')

In [20]:
df.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title,genre
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [21]:
df.isnull().any()

user_id           False
movie_id          False
rating            False
unix_timestamp    False
title             False
genre             False
dtype: bool

In [23]:
df.to_pickle('/home/elena/Projects/DLRM-with-Keras-for-movielens-dataset/ratings.pkl')

In [24]:
df.user_id.unique().shape[0], df.movie_id.unique().shape[0], df.rating.min(), df.rating.max()

(610, 9724, 0.5, 5.0)

In [27]:
# The dataset has already been divided into train and test by GroupLens where the test data has 10 ratings for # each user, i.e. 9,430 rows in total!  Reading both these files into our Python environment
zf = zipfile.ZipFile('/home/elena/Downloads/ml-100k.zip')
r_cols =['movie_id', 'user_id', 'rating', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western']
items_train = pd.read_csv(zf.open('ml-100k/ua.base'), sep='\t', names=r_cols, encoding='latin-1')
items_test = pd.read_csv(zf.open('ml-100k/ua.test'), sep='\t', names=r_cols, encoding='latin-1')
items_train.shape, items_test.shape

((90570, 22), (9430, 22))

In [29]:
items_train.head()

Unnamed: 0,movie_id,user_id,rating,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,5,874965758,,,,,,,...,,,,,,,,,,
1,1,2,3,876893171,,,,,,,...,,,,,,,,,,
2,1,3,4,878542960,,,,,,,...,,,,,,,,,,
3,1,4,3,876893119,,,,,,,...,,,,,,,,,,
4,1,5,3,889751712,,,,,,,...,,,,,,,,,,


In [30]:
items_test.head()

Unnamed: 0,movie_id,user_id,rating,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,20,4,887431883,,,,,,,...,,,,,,,,,,
1,1,33,4,878542699,,,,,,,...,,,,,,,,,,
2,1,61,4,878542420,,,,,,,...,,,,,,,,,,
3,1,117,3,874965739,,,,,,,...,,,,,,,,,,
4,1,155,2,878542201,,,,,,,...,,,,,,,,,,


In [31]:
train=items_train[['movie_id', 'user_id','rating']]
test = items_test[['movie_id', 'user_id','rating']]

In [32]:
train.isnull().any()

movie_id    False
user_id     False
rating      False
dtype: bool

In [33]:
test.isnull().any()

movie_id    False
user_id     False
rating      False
dtype: bool

In [34]:
train.shape, test.shape

((90570, 3), (9430, 3))

In [35]:
train.movie_id.unique().shape[0], test.movie_id.unique().shape[0]

(943, 943)

In [36]:
train.user_id.unique().shape[0], test.user_id.unique().shape[0]

(1680, 1129)

In [37]:
train.to_pickle('/home/elena/Projects/DLRM-with-Keras-for-movielens-dataset/ratings_train.pkl')
test.to_pickle('/home/elena/Projects/DLRM-with-Keras-for-movielens-dataset/ratings_test.pkl')

In [44]:
len(set(train.user_id.unique()).intersection(set(test.user_id.unique())))

553

In [45]:
dd=pd.concat([train,test], ignore_index=True)
users_dd=dd.user_id.unique()

In [47]:
len(users_dd)

1682

In [49]:
len(set(dd.movie_id.unique()).intersection(set(df.movie_id.unique())))

724

In [51]:
rr=ratings.groupby(['user_id', 'movie_id']).count()

In [52]:
rr[rr.rating >1]

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,unix_timestamp
user_id,movie_id,Unnamed: 2_level_1,Unnamed: 3_level_1


In [53]:
from sklearn.model_selection import train_test_split

def split_group(df_group, train_size=0.8):
    customers = df_group.index.unique()
    train_customers, test_customers = train_test_split(customers, train_size=train_size)
    train_df, test_df = df_group.loc[train_customers], df_group.loc[test_customers]
    return train_df, test_df

In [54]:
train2, test2 = split_group(rr)

In [55]:
train2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,unix_timestamp
user_id,movie_id,Unnamed: 2_level_1,Unnamed: 3_level_1
202,589,1,1
15,70286,1,1
527,1566,1,1
256,60069,1,1
331,6863,1,1


In [57]:
train22=train2.reset_index(drop=False)

In [59]:
test22=test2.reset_index(drop=False)

In [60]:
train22.user_id.unique().shape[0], test22.user_id.unique().shape[0]

(610, 608)

In [61]:
len(set(test22.user_id.unique()).intersection(set(train22.user_id.unique())))

608

In [65]:
rrr=ratings.set_index(['user_id','movie_id'])

In [66]:
rrr

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,unix_timestamp
user_id,movie_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,964982703
1,3,4.0,964981247
1,6,4.0,964982224
1,47,5.0,964983815
1,50,5.0,964982931
...,...,...,...
610,166534,4.0,1493848402
610,168248,5.0,1493850091
610,168250,5.0,1494273047
610,168252,5.0,1493846352


In [67]:
train, test = split_group(rrr)

In [69]:
train=train.reset_index(drop=False)
test = test.reset_index(drop=False)

In [70]:
train.user_id.unique().shape[0], test.user_id.unique().shape[0]

(610, 610)