# 1. Load Data

In [1]:
# import all packages needed
import pandas as pd
import json
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt 
import os
import sys
import time
import pickle
from scipy.sparse import csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline

### 1.1 Load Ratings Data

In [2]:
# Count the reviews first so tqdm can render a percentage. Iterating the handle
# inside `with` closes the file afterwards and never holds all lines in memory
# (the previous `open(...).readlines()` did both: leaked the handle and
# materialised the whole file just to count it).
with open("yelp_dataset/review.json", encoding="utf-8") as f:
    line_count = sum(1 for _ in f)

# One list per output column; every line of review.json is parsed as its own
# JSON object.
user_ids, business_ids, stars, dates, reviews = [], [], [], [], []
with open("yelp_dataset/review.json", encoding="utf-8") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_ids.append(blob["user_id"])
        business_ids.append(blob["business_id"])
        stars.append(blob["stars"])
        dates.append(blob["date"])
        reviews.append(blob["text"])

ratings = pd.DataFrame(
    {
        "UserId": user_ids,
        "ItemId": business_ids,
        "Rating": stars,
        "date": dates,
        "reviews": reviews,
    }
)

# Reviews written per user, and the ids of users with at least 5 reviews.
user_counts = ratings["UserId"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

100%|██████████| 6685900/6685900 [00:47<00:00, 142217.40it/s]


In [3]:
# Preview the first rows of the parsed ratings frame.
ratings.head()

Unnamed: 0,UserId,ItemId,Rating,date,reviews
0,hG7b0MtEbXx5QzbzE6C_VA,ujmEBvifdJM6h6RLv4wQIg,1.0,2013-05-07 04:34:36,Total bill for this horrible service? Over $8G...
1,yXQM5uF2jS6es16SJzNHfg,NZnhc2sEQy3RmzKTZnqtwQ,5.0,2017-01-14 21:30:33,I *adore* Travis at the Hard Rock's new Kelly ...
2,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0,2016-11-09 20:09:03,I have to say that this office really has it t...
3,dacAIZ6fTM6mqwW5uxkskg,ikCg8xy5JIg_NGPx-MSIDA,5.0,2018-01-09 20:56:38,Went in for a lunch. Steak sandwich was delici...
4,ssoyf2_x0EQMed6fgHeMyQ,b1b1eb3uo-w561D0ZfCEiQ,1.0,2018-01-30 23:07:38,Today was my second out of three sessions I ha...


In [4]:
#TEST
# active_users = user_counts.loc[user_counts >= 200].index.tolist()
# len(active_users)

In [5]:
# Keep only the reviews written by users listed in `active_users`.
is_active_review = ratings["UserId"].isin(active_users)
ratings_active = ratings[is_active_review]

#### Split Test and Train Data

In [6]:
# Order each user's reviews by the `date` column so the last row per user is
# that user's latest review.
ratings_sort = ratings_active.sort_values(["UserId", "date"])

In [7]:
# Hold out the final (latest, given the sort above) review of every user.
reviews_by_user = ratings_sort.groupby("UserId")
test = reviews_by_user.tail(n=1)
test.shape

(286130, 5)

In [8]:
# Preview the held-out rows (one per user).
test.head()

Unnamed: 0,UserId,ItemId,Rating,date,reviews
3928452,---1lKK3aKOuomHnwAkAow,Hqs4YNST_ZHbshwyi4bnsQ,5.0,2018-10-11 23:29:57,The customer service of the owner made me give...
2794680,--0kuuLmuYBe3Rmu0Iycww,PYe_FDw6QTbTf66WcGE_tw,2.0,2014-04-21 16:58:28,I'd like to keep it short and sweet today....\...
3722634,--2HUmLkcNHZp0xw6AMBPg,KW9RNyBPmc77f9FsO92qYw,5.0,2018-10-04 02:02:28,This is a beautiful gym. Lots of equipment op...
1213261,--2vR0DIsmQ6WfcSzKWigw,BLIJ-p5wYuAhw6Pp6mh6mw,3.0,2018-01-11 04:24:17,Bei unserem morgendlichen Spaziergang über den...
4737894,--3WaS23LcIXtxyFULJHTA,UKrfUw8quQiQM2N9i1nH0g,4.0,2018-09-03 19:32:11,Toucan play at this game! Great family activi...


In [9]:
# Train = every sorted review except the held-out row of each user.
# `test` is a row subset of `ratings_sort`, so its index labels identify
# exactly the rows to remove. The previous concat + drop_duplicates(keep=False)
# also discarded *every* copy of any review that appeared more than once in the
# data (not just the test rows) and had to hash the full review text to do so.
train = ratings_sort.drop(index=test.index)
train.shape

(4252142, 5)

In [10]:
# Preview the training rows.
train.head()

Unnamed: 0,UserId,ItemId,Rating,date,reviews
5178360,---1lKK3aKOuomHnwAkAow,5cbsjFtrntUAeUx51FaFTg,4.0,2008-11-11 04:31:46,"I like it, and so far I think it is one of the..."
5934075,---1lKK3aKOuomHnwAkAow,--9e1ONYQuAa-CB_Rrw7Tw,4.0,2008-11-11 04:40:05,So when you go to a restaurant like this pleas...
4312893,---1lKK3aKOuomHnwAkAow,ifEHr-ZnGFSKgJVsywiAFg,5.0,2009-01-16 21:49:36,"The Wild Boar was amazing, so good my husband ..."
2915094,---1lKK3aKOuomHnwAkAow,kosTPb88O4Q0XGbVbEOGCA,4.0,2010-10-16 23:27:02,While its not Lotus it was tasty an the women ...
809678,---1lKK3aKOuomHnwAkAow,rq5dgoksPHkJwJNQKlGQ7w,5.0,2010-10-16 23:31:28,"Best coffee in town, they brew each cup. If yo..."


### 1.2 Load User Side Information

In [11]:
# Count the user records first so tqdm can render a percentage. The `with`
# block closes the handle and avoids loading every line into memory (the
# previous `open(...).readlines()` leaked the handle and read the whole file).
with open("yelp_dataset/user.json", encoding="utf-8") as f:
    line_count = sum(1 for _ in f)

# One list per output column; every line of user.json is parsed as its own
# JSON object.
user_id, review_count, fans, average_stars = [], [], [], []
# NOTE: `elite` is initialised but never filled or used in this cell; it is
# kept in case a later cell refers to it.
useful, funny, cool, elite = [], [], [], []
with open("yelp_dataset/user.json", encoding="utf-8") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_id.append(blob["user_id"])
        review_count.append(blob["review_count"])
        fans.append(blob["fans"])
        average_stars.append(blob["average_stars"])
        useful.append(blob["useful"])
        funny.append(blob["funny"])
        cool.append(blob["cool"])

users = pd.DataFrame(
    {"UserId": user_id, "review_count": review_count, "fans": fans,
     "average_stars": average_stars, "useful": useful, "funny": funny, "cool": cool}
)

100%|██████████| 1637138/1637138 [00:18<00:00, 89796.59it/s] 


In [14]:
# Side information restricted to the users listed in `active_users`.
is_active_user = users["UserId"].isin(active_users)
user_info_active = users[is_active_user]
user_info_active.shape

(286130, 7)

In [15]:
# Preview the side-information rows of the active users.
user_info_active.head()

Unnamed: 0,UserId,review_count,fans,average_stars,useful,funny,cool
0,l6BmjZMeQD3rDxWUbiAiow,95,5,4.03,84,17,25
1,4XChL029mKr5hydo79Ljxg,33,4,3.63,48,22,16
2,bc8C_eETBWL0olvFSJJd0w,16,0,3.71,28,8,10
4,MM4RJAeH6yuaN8oZDSt0RA,361,39,4.08,1114,279,665
6,TEtzbpgA2BFBrC0y0sCbfw,1122,696,4.39,13311,19356,15319



# 2. Basic model - only business ratings

Non-collective factorization model - including user and item biases + regularization:


## 2.1 Fitting the model

In [16]:
%%time
from copy import deepcopy
from cmfrec import CMF

model_no_side_info = CMF(k=40, reg_param=1e-4, random_seed=1)
model_no_side_info.fit(deepcopy(train))
test_no_side_info = deepcopy(test)
test_no_side_info['Predicted'] = model_no_side_info.predict(test_no_side_info.UserId, 
                                                            test_no_side_info.ItemId)






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.





INFO:tensorflow:Optimization terminated with:
  Message: b'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
  Objective function value: 1.696654
  Number of iterations: 36
  Number of functions evaluations: 46
CPU times: user 7min 53s, sys: 2min 12s, total: 10min 5s
Wall time: 2min 42s


## 2.2 Evaluating results

For this model and the ones that will follow, I will evaluate the recommendations by computing:
* Root mean squared error (RMSE), i.e. sqrt( mean( (real - predicted)^2 ) ) - which can be thought of as the typical star-rating error of a predicted rating. This is the most common measure, but it has some drawbacks: it does not tend to be a good measure of ranking quality, and it can be substantially improved without changing the relative order of predictions.

There are other, more appropriate evaluation criteria, but RMSE is easy to understand and provides reasonable insight into model performance.

In [17]:
# RMSE of the baseline model on the held-out ratings.
print("RMSE (no side info): ",
      np.sqrt(((test_no_side_info["Predicted"] - test_no_side_info["Rating"]) ** 2).mean()))

RMSE (no side info):  1.4563397206455984


# 3. Model with user side information

Now I'll add only the user information.

## 3.1 Fitting the model

In [18]:
%%time
model_user_info = CMF(k=40, reg_param=1e-4, offsets_model=True, random_seed=1)
model_user_info.fit(deepcopy(train),
                     user_info = deepcopy(user_info_active))
test_with_user_info = deepcopy(test)
test_with_user_info['Predicted'] = model_user_info.predict(test_with_user_info.UserId, 
                                                           test_with_user_info.ItemId)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 985.159973
  Number of iterations: 1000
  Number of functions evaluations: 1078
CPU times: user 3h 2min 55s, sys: 50min 8s, total: 3h 53min 4s
Wall time: 53min 24s


## 3.2 Evaluating results

In [19]:
# RMSE of the user-side-information model on the held-out ratings.
print("RMSE (user side info): ",
      np.sqrt(((test_with_user_info["Predicted"] - test_with_user_info["Rating"]) ** 2).mean()))

RMSE (user side info):  6.474480331600907
