In [1]:
import pandas as pd
import graphlab

In [2]:
#reading users file
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols,
                    encoding='latin-1', parse_dates=True) 

#reading ratings file
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,
                      encoding='latin-1')

#reading items file
m_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=m_cols, usecols=range(5),
                     encoding='latin-1')

In [3]:
#Explore users file:
print users.shape
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
#Explore rating file
print ratings.shape
ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
#Explore item file
print(movies.shape)
movies.head()

(1682, 5)


Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [6]:
# Dropping columns that are not required
movies.drop(['title', 'release_date', 'video_release_date', 'imdb_url'], inplace = True, axis=1)
movies.head()

Unnamed: 0,movie_id
0,1
1,2
2,3
3,4
4,5


In [7]:
# Dropping columns that are not required
users.drop(['age', 'sex', 'occupation', 'zip_code'], inplace = True, axis=1)
users.head()

Unnamed: 0,user_id
0,1
1,2
2,3
3,4
4,5


In [8]:
# Merging Dataframes
movie_ratings = pd.merge(movies, ratings)
df = pd.merge(movie_ratings, users)

df.head()

Unnamed: 0,movie_id,user_id,rating,unix_timestamp
0,1,308,4,887736532
1,4,308,5,887737890
2,5,308,4,887739608
3,7,308,4,887738847
4,8,308,5,887736696


In [9]:
df = df[['user_id', 'movie_id', 'rating', 'unix_timestamp']]
df.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,308,1,4,887736532
1,308,4,5,887737890
2,308,5,4,887739608
3,308,7,4,887738847
4,308,8,5,887736696


In [10]:
# to check if there is empty values
df.isnull().any()

user_id           False
movie_id          False
rating            False
unix_timestamp    False
dtype: bool

In [11]:
df.shape

(100000, 4)

In [12]:
sf = graphlab.SFrame(df)
train_data, test_data = sf.random_split(.8, seed=5)
print(len(train_data), len(test_data))

This non-commercial license of GraphLab Create for academic use is assigned to gess.fathan@mail.ugm.ac.id and will expire on May 07, 2019.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\geszn\AppData\Local\Temp\graphlab_server_1545287760.log.0


(80033, 19967)


In [13]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

In [14]:
#Train Model
item_sim_model = graphlab.item_similarity_recommender.create(train_data, user_id='user_id', item_id='movie_id', similarity_type='pearson')
#Make Recommendations:
item_sim_recomm = item_sim_model.recommend(users=range(1,6),k=5)
item_sim_recomm.print_rows(num_rows=25)

+---------+----------+----------------+------+
| user_id | movie_id |     score      | rank |
+---------+----------+----------------+------+
|    1    |    50    |      1.0       |  1   |
|    1    |   100    | 0.903719912473 |  2   |
|    1    |   286    | 0.842450765864 |  3   |
|    1    |   294    | 0.833698030635 |  4   |
|    1    |   288    | 0.827133479212 |  5   |
|    2    |   181    | 0.90590809628  |  1   |
|    2    |   100    | 0.903719912473 |  2   |
|    2    |   121    | 0.776805251641 |  3   |
|    2    |   174    | 0.74398249453  |  4   |
|    2    |    7     | 0.682713347921 |  5   |
|    3    |    50    |      1.0       |  1   |
|    3    |   100    | 0.903719912473 |  2   |
|    3    |   286    | 0.842450765864 |  3   |
|    3    |   288    | 0.827133479212 |  4   |
|    3    |    1     | 0.818380743982 |  5   |
|    4    |   181    | 0.90590809628  |  1   |
|    4    |   100    | 0.903719912473 |  2   |
|    4    |   258    | 0.894967177243 |  3   |
|    4    |  

In [15]:
#Train Model
matrix_model = graphlab.factorization_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating')

#Make Recommendations:
matrix_recomm = matrix_model.recommend(users=range(1,6),k=5)
matrix_recomm.print_rows(num_rows=25)

+---------+----------+---------------+------+
| user_id | movie_id |     score     | rank |
+---------+----------+---------------+------+
|    1    |   1368   | 5.66720762601 |  1   |
|    1    |   641    | 5.65021851411 |  2   |
|    1    |   497    | 5.63617738833 |  3   |
|    1    |   1398   | 5.59447362771 |  4   |
|    1    |   1449   | 5.56478973975 |  5   |
|    2    |   641    | 5.99873774251 |  1   |
|    2    |   1281   | 5.96192141374 |  2   |
|    2    |   1500   |  5.8133935233 |  3   |
|    2    |   133    | 5.79908465346 |  4   |
|    2    |   900    | 5.74802695712 |  5   |
|    3    |   1022   | 8.25075461468 |  1   |
|    3    |   902    | 7.59077771505 |  2   |
|    3    |   919    | 7.57618383256 |  3   |
|    3    |   1110   | 7.51833327612 |  4   |
|    3    |   1286   | 7.49065141877 |  5   |
|    4    |   902    | 6.60579963526 |  1   |
|    4    |   697    | 6.53838165601 |  2   |
|    4    |   854    | 6.53235962366 |  3   |
|    4    |   904    | 6.525413652

In [16]:
#Train Model
matrix_model2 = graphlab.factorization_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating', regularization=1e-6)

#Make Recommendations:
matrix_recomm2 = matrix_model2.recommend(users=range(1,6),k=5)
matrix_recomm2.print_rows(num_rows=25)

+---------+----------+---------------+------+
| user_id | movie_id |     score     | rank |
+---------+----------+---------------+------+
|    1    |   647    | 5.55117612084 |  1   |
|    1    |   100    | 5.50005521377 |  2   |
|    1    |   519    | 5.34635581573 |  3   |
|    1    |   493    |  5.2316690286 |  4   |
|    1    |   530    | 5.20406838259 |  5   |
|    2    |   750    | 5.50939947756 |  1   |
|    2    |   197    |  5.3746920863 |  2   |
|    2    |    22    | 5.36436629685 |  3   |
|    2    |   174    | 5.32323795946 |  4   |
|    2    |   887    | 5.28517833742 |  5   |
|    3    |    86    | 4.96886958679 |  1   |
|    3    |   207    | 4.79848315319 |  2   |
|    3    |   854    | 4.73870115479 |  3   |
|    3    |   657    | 4.73233913382 |  4   |
|    3    |   707    | 4.65758361419 |  5   |
|    4    |   1449   | 5.26501521965 |  1   |
|    4    |   488    | 5.02461523315 |  2   |
|    4    |   1019   | 4.99558425447 |  3   |
|    4    |   1642   | 4.992033188

In [17]:
#Train Model
matrix_model3 = graphlab.factorization_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating', regularization=1e-5)

#Make Recommendations:
matrix_recomm3 = matrix_model3.recommend(users=range(1,6),k=5)
matrix_recomm3.print_rows(num_rows=25)

+---------+----------+---------------+------+
| user_id | movie_id |     score     | rank |
+---------+----------+---------------+------+
|    1    |   313    | 5.22009231647 |  1   |
|    1    |    50    |  5.146131893  |  2   |
|    1    |   408    | 5.12705494961 |  3   |
|    1    |   302    | 5.10910268625 |  4   |
|    1    |   318    | 4.98507245382 |  5   |
|    2    |   173    | 5.06399468457 |  1   |
|    2    |   1449   |  5.0618010064 |  2   |
|    2    |   174    | 5.05349273359 |  3   |
|    2    |   318    | 5.03934589898 |  4   |
|    2    |   408    |  5.0070862857 |  5   |
|    3    |   1449   | 4.87894976264 |  1   |
|    3    |    64    | 4.74379932632 |  2   |
|    3    |   483    | 4.66715376017 |  3   |
|    3    |   113    | 4.63450006148 |  4   |
|    3    |   811    | 4.62353186061 |  5   |
|    4    |   1449   | 5.09425528845 |  1   |
|    4    |   1367   | 4.89270710697 |  2   |
|    4    |   483    | 4.87509702271 |  3   |
|    4    |   318    | 4.849284653

In [18]:
item_sim_model.evaluate_rmse(test_data, target='rating')

{'rmse_by_item': Columns:
 	movie_id	int
 	count	int
 	rmse	float
 
 Rows: 1413
 
 Data:
 +----------+-------+---------------+
 | movie_id | count |      rmse     |
 +----------+-------+---------------+
 |   1611   |   1   |      3.0      |
 |   118    |   64  | 3.04240990583 |
 |   660    |   29  |  3.7837995732 |
 |   699    |   26  | 3.68568133704 |
 |   567    |   7   | 2.17951518322 |
 |   773    |   1   | 3.96717724289 |
 |   1029   |   2   | 1.55832259186 |
 |   435    |   41  | 3.67198048143 |
 |   1289   |   2   | 1.99124726477 |
 |   1517   |   1   | 3.99343544858 |
 +----------+-------+---------------+
 [1413 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	user_id	int
 	count	int
 	rmse	float
 
 Rows: 943
 
 Data:
 +---------+-------+---------------+
 | user_id | count |      rmse     |
 +---------+-------+---------------+
 |   118   |   17  | 4.31

In [19]:
matrix_model.evaluate_rmse(test_data, target='rating')

{'rmse_by_item': Columns:
 	movie_id	int
 	count	int
 	rmse	float
 
 Rows: 1413
 
 Data:
 +----------+-------+----------------+
 | movie_id | count |      rmse      |
 +----------+-------+----------------+
 |   1611   |   1   | 0.177715279465 |
 |   118    |   64  | 0.927170190574 |
 |   660    |   29  | 0.824451886362 |
 |   699    |   26  |  1.1040663156  |
 |   567    |   7   | 1.83242032871  |
 |   773    |   1   | 0.792097665735 |
 |   1029   |   2   | 0.457519003572 |
 |   435    |   41  | 0.847591469792 |
 |   1289   |   2   |  1.320356135   |
 |   1517   |   1   | 0.315442980121 |
 +----------+-------+----------------+
 [1413 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	user_id	int
 	count	int
 	rmse	float
 
 Rows: 943
 
 Data:
 +---------+-------+----------------+
 | user_id | count |      rmse      |
 +---------+-------+----------------+
 |   118

In [20]:
matrix_model2.evaluate_rmse(test_data, target='rating')

{'rmse_by_item': Columns:
 	movie_id	int
 	count	int
 	rmse	float
 
 Rows: 1413
 
 Data:
 +----------+-------+-----------------+
 | movie_id | count |       rmse      |
 +----------+-------+-----------------+
 |   1611   |   1   | 0.0892611614698 |
 |   118    |   64  |  0.89067284862  |
 |   660    |   29  |  0.807700683632 |
 |   699    |   26  |  1.04439352492  |
 |   567    |   7   |  0.786195709521 |
 |   773    |   1   |  0.662729246204 |
 |   1029   |   2   |  0.544211951676 |
 |   435    |   41  |  0.674549684426 |
 |   1289   |   2   |  0.688879316884 |
 |   1517   |   1   |  1.04254535963  |
 +----------+-------+-----------------+
 [1413 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	user_id	int
 	count	int
 	rmse	float
 
 Rows: 943
 
 Data:
 +---------+-------+----------------+
 | user_id | count |      rmse      |
 +---------+-------+------------

In [21]:
matrix_model3.evaluate_rmse(test_data, target='rating')

{'rmse_by_item': Columns:
 	movie_id	int
 	count	int
 	rmse	float
 
 Rows: 1413
 
 Data:
 +----------+-------+----------------+
 | movie_id | count |      rmse      |
 +----------+-------+----------------+
 |   1611   |   1   | 0.308787127529 |
 |   118    |   64  | 0.945302250509 |
 |   660    |   29  | 0.848387299532 |
 |   699    |   26  | 1.00072656515  |
 |   567    |   7   | 0.894702348774 |
 |   773    |   1   | 0.79259065031  |
 |   1029   |   2   | 1.47151187574  |
 |   435    |   41  | 0.750938125278 |
 |   1289   |   2   | 2.14582030803  |
 |   1517   |   1   | 0.446526049884 |
 +----------+-------+----------------+
 [1413 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	user_id	int
 	count	int
 	rmse	float
 
 Rows: 943
 
 Data:
 +---------+-------+----------------+
 | user_id | count |      rmse      |
 +---------+-------+----------------+
 |   118