## Step 1: Load the data into a sparse matrix to build the recommender matrix

In [2]:
import numpy as np
import pandas as pd
import json
from pandas.io.json import json_normalize
import pickle

from scipy.sparse import lil_matrix as sparse_matrix

---

### Loading JSON files into Python objects and Pickling

In [3]:
# Parse the user dump: every line of the file is decoded as one JSON record.
# The `with` block guarantees the file handle is closed once reading is done.
with open('yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_user.json', 'r') as users_f:
    users_json = [json.loads(user_dump) for user_dump in users_f]

# Flatten the records into a DataFrame and keep only the columns used later.
users_pd_json = json_normalize(users_json)
users = users_pd_json[['user_id', 'average_stars', 'fans', 'review_count']]

In [4]:
# Map every user id to its position in the `users` table (0, 1, 2, ...).
# If an id occurs more than once, the last position wins.
users_map = {user: index for index, user in enumerate(users['user_id'])}
# `count` ends up as the number of ids visited.
count = len(users['user_id'])

In [7]:
# Parse the business dump: every line of the file is decoded as one JSON
# record. The `with` block closes the file handle once reading is done.
with open('yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json', 'r') as business_f:
    business_json = [json.loads(business_dump) for business_dump in business_f]

businesses = json_normalize(business_json)
# Fixed: the DataFrame built above is called `businesses`; the previously
# referenced `business_pd` is never defined and raised a NameError.
business_ids = businesses['business_id']

In [8]:
# Map every business id to its position in `business_ids` (0, 1, 2, ...).
# If an id occurs more than once, the last position wins.
business_map = {business: index for index, business in enumerate(business_ids)}
# `count` ends up as the number of ids visited.
count = len(business_ids)

In [17]:
# Only these fields of each review record are kept; everything else in the
# raw JSON is discarded to keep the resulting table small.
kept_fields = ('user_id', 'review_id', 'business_id', 'stars', 'text')

reviews_json = []
# Every line of the file is decoded as one JSON record. The `with` block
# closes the file handle once reading is done.
with open('yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json', 'r') as reviews_f:
    for review_dump in reviews_f:
        review_json_full = json.loads(review_dump)
        review = {field: review_json_full[field] for field in kept_fields}
        reviews_json.append(review)

reviews = json_normalize(reviews_json)
# reviews has everything that we need, will drop review text for first model

In [18]:
# Persist the id maps and the reviews table.
# Pickle streams are bytes, so every file is opened in binary mode ('wb');
# text mode ('w') fails on Python 3 and does not match the 'rb' mode used when
# these files are read back. `with` closes (and flushes) each file.
for obj, path in ((users_map, 'users_map.p'),
                  (business_map, 'business_map.p'),
                  (reviews, 'reviews.p')):
    with open(path, 'wb') as out_f:
        pickle.dump(obj, out_f)

In [9]:
# Persist the users and businesses DataFrames. `with` closes each file so the
# pickled bytes are flushed to disk before the next cell runs.
with open('users_pd.p', 'wb') as users_out:
    pickle.dump(users, users_out)
with open('businesses_pd.p', 'wb') as businesses_out:
    pickle.dump(businesses, businesses_out)

-----
### Load objects directly from Pickle files

In [3]:
# Reload the objects written by the pickling cells. `with` closes each file
# handle after loading.
# NOTE: pickle.load executes arbitrary code from the file; only use it on
# pickle files produced by this notebook.
with open('users_map.p', 'rb') as in_f:
    users_map = pickle.load(in_f)
with open('business_map.p', 'rb') as in_f:
    business_map = pickle.load(in_f)

# pandas objects
with open('reviews.p', 'rb') as in_f:
    reviews = pickle.load(in_f)
with open('users_pd.p', 'rb') as in_f:
    users = pickle.load(in_f)
with open('businesses_pd.p', 'rb') as in_f:
    businesses = pickle.load(in_f)

In [34]:
# Number of distinct users and businesses; these become the dimensions of the
# ratings matrix.
l_users, l_business = len(users_map), len(business_map)

In [38]:
# Sparse ratings table with one row per user and one column per business
# (the orientation can be flipped without problems).
ratings_matrix = sparse_matrix((l_users, l_business))

def add_to_ratings(record):
    """Write one review's star rating into ``ratings_matrix``.

    ``record`` must provide 'user_id', 'business_id' and 'stars'. The two ids
    are translated into row/column positions through ``users_map`` and
    ``business_map``; a later record for the same pair overwrites the
    earlier value. Returns None.
    """
    row = users_map[record['user_id']]
    col = business_map[record['business_id']]
    ratings_matrix[row, col] = record['stars']

# Visit every review row by row; the resulting Series holds only None values.
reviews.apply(add_to_ratings, axis=1)

0          None
1          None
2          None
3          None
4          None
5          None
6          None
7          None
8          None
9          None
10         None
11         None
12         None
13         None
14         None
15         None
16         None
17         None
18         None
19         None
20         None
21         None
22         None
23         None
24         None
25         None
26         None
27         None
28         None
29         None
           ... 
2225183    None
2225184    None
2225185    None
2225186    None
2225187    None
2225188    None
2225189    None
2225190    None
2225191    None
2225192    None
2225193    None
2225194    None
2225195    None
2225196    None
2225197    None
2225198    None
2225199    None
2225200    None
2225201    None
2225202    None
2225203    None
2225204    None
2225205    None
2225206    None
2225207    None
2225208    None
2225209    None
2225210    None
2225211    None
2225212    None
dtype: object

In [41]:
# Persist the ratings matrix; `with` closes the file so the pickled bytes are
# flushed to disk before the file is read back.
with open('ratings_matrix.p', 'wb') as ratings_out:
    pickle.dump(ratings_matrix, ratings_out)

In [42]:
# Reload the ratings matrix from disk; `with` closes the file handle afterwards.
# NOTE: only unpickle files produced by this notebook.
with open('ratings_matrix.p', 'rb') as ratings_in:
    ratings_matrix = pickle.load(ratings_in)

In [53]:
# Sanity check: matrix dimensions, built as (number of users, number of businesses).
ratings_matrix.shape

(552339, 77445)

In [43]:
# Spot check: look up the rating stored for one specific user/business pair.
ratings_matrix[users_map['PUFPaY9KxDAcGqfsorJp3Q'], business_map['5UmKMjUEUNdYWqANhGckJw']]

4.0

### Computing Baselines on Data

In [4]:
# alpha: mean of the 'stars' column over all reviews (stored later as the
# 'alpha' baseline).
alpha = reviews['stars'].mean()

In [5]:
# Per-user offset: each user's 'average_stars' minus the global mean `alpha`,
# in the row order of `users`. Iterating the column directly avoids building
# a full row Series for every user, which the index-based `.iloc` loop did.
beta_users = [average_stars - alpha for average_stars in users['average_stars']]

In [6]:
# Per-business offset: each business's 'stars' value minus the global mean
# `alpha`, in the row order of `businesses`. Iterating the column directly
# avoids building a full row Series for every business.
beta_business = [stars - alpha for stars in businesses['stars']]

In [7]:
# Bundle the global mean and the per-user / per-business offsets in one dict.
baselines = {
    'alpha': alpha,
    'beta_users': beta_users,
    'beta_business': beta_business,
}

In [8]:
# Persist the baselines; `with` closes the file so the pickled bytes are
# flushed to disk before the file is read back.
with open('baselines.p', 'wb') as baselines_out:
    pickle.dump(baselines, baselines_out)

In [9]:
# Reload the baselines from disk; `with` closes the file handle afterwards.
# NOTE: only unpickle files produced by this notebook.
with open('baselines.p', 'rb') as baselines_in:
    baselines = pickle.load(baselines_in)