In [1]:
# MovieLens 100K dataset: 100,000 movie ratings from 943 users on a selection of 1682 movies.
# The data was downloaded as a zip file and is read directly from that archive.
import os
import pandas as pd
import zipfile
import numpy as np 
# Location of the downloaded MovieLens 100K archive. Set the ML100K_ZIP
# environment variable to point at another copy; when it is unset, the
# original download location is used so existing runs behave as before.
ML100K_ZIP = os.environ.get('ML100K_ZIP', '/home/elena/Downloads/ml-100k.zip')
# The archive is deliberately left open: the cells below read its members
# through zf.open(...).
zf = zipfile.ZipFile(ML100K_ZIP)

In [2]:
# Load the user table from the archive; the file is pipe-delimited and the
# column names are supplied explicitly through u_cols.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    zf.open('ml-100k/u.user'),
    names=u_cols,
    sep='|',
    encoding='latin-1',
)
# Quick sanity check: size of the table and its first rows.
print("\nUser Data :")
print("shape : ", users.shape)
print(users.head())


User Data :
shape :  (943, 5)
   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213


In [4]:
# Number of distinct user ids in the user table.
len(users['user_id'].unique())

943

In [5]:
# Load the ratings table from the archive; the file is tab-delimited and the
# column names are supplied explicitly through r_cols.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    zf.open('ml-100k/u.data'),
    names=r_cols,
    sep='\t',
    encoding='latin-1',
)
# Quick sanity check: size of the table and its first rows.
print("\nRatings Data :")
print("shape : ", ratings.shape)
print(ratings.head())


Ratings Data :
shape :  (100000, 4)
   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596


In [6]:
# Number of distinct movie ids that appear in the ratings table.
len(ratings['movie_id'].unique())

1682

In [4]:
# Load the movie (item) table from the archive: the movie id and its
# descriptive fields come first, followed by the genre columns.
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv(zf.open('ml-100k/u.item'), names=i_cols, sep='|', encoding='latin-1')
# The title, both release dates and the IMDb URL are dropped because they
# probably do not affect a recommendation model; the id and the genre
# columns are kept.
items_used = items.drop(
    columns=['movie title', 'release date', 'video release date', 'IMDb URL']
)

In [5]:
# Relabel the columns of items_used: the key becomes 'movie_id' (the name the
# ratings table uses), while the genre columns keep their labels.
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items_used.columns = ['movie_id'] + genre_cols
items_used.columns

Index(['movie_id', 'unknown', 'Action', 'Adventure', 'Animation', 'Children's',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'],
      dtype='object')

In [6]:
# Attach the genre columns to every rating by joining on movie_id, then keep
# the two ids and the rating up front, followed by the genre columns.
cols = [
    'movie_id', 'user_id', 'rating',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
data = items_used.merge(ratings, on='movie_id')[cols]
data.head()
# (100000, 22) shape

Unnamed: 0,movie_id,user_id,rating,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,308,4,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,287,5,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,148,4,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,280,4,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,66,3,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# GroupLens already ships a train/test split of the ratings: ua.base (train)
# and ua.test (test), where the test part holds 10 ratings for each user,
# i.e. 9,430 rows in total.
#
# Both files are rating files with the same four tab-separated fields as
# u.data. Reading them with the 22 column names of `data` mislabels those
# fields (user ids end up under 'movie_id', movie ids under 'user_id', the
# timestamp under 'unknown') and leaves every other column empty. Instead,
# read the four real fields and join the genre columns from items_used, so
# both frames have the same layout as `data`.
split_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']


def read_split(member):
    """Read one GroupLens split file and return it in the layout of `data`.

    Parameters
    ----------
    member : str
        Path of the split file inside the archive, e.g. 'ml-100k/ua.base'.

    Returns
    -------
    pandas.DataFrame
        One row per rating in the split file, in file order, with the
        columns of `data` (movie_id, user_id, rating and the genre columns).
    """
    # Close the archive member as soon as it has been parsed.
    with zf.open(member) as fh:
        split_ratings = pd.read_csv(fh, sep='\t', names=split_cols, encoding='latin-1')
    # Left join keeps every rating in its original order; validate guards
    # against duplicated movie ids silently multiplying rows.
    merged = split_ratings.merge(
        items_used, on='movie_id', how='left', validate='many_to_one'
    )
    return merged[data.columns]


# Kept for backward compatibility: the original cell rebinds r_cols to the
# column index of `data`, and later cells may rely on that.
r_cols = data.columns
items_train = read_split('ml-100k/ua.base')
items_test = read_split('ml-100k/ua.test')
items_train.shape, items_test.shape

((90570, 22), (9430, 22))