In [None]:
import gzip
import json

import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

In [None]:
def parse(path):
    """Yield one decoded JSON value per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle when the generator is exhausted,
    # closed, or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not l.strip():
                continue
            yield json.loads(l)

In [None]:
def get_df(path):
    """Collect every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and those keys
    become the row index of the returned frame (``orient='index'``).
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

In [None]:
def describe(df):
    """Print basic size and null-count statistics for a review DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'reviewerID'``, ``'asin'`` and
        ``'overall'``.

    Prints, one per line: the number of rows, the number of distinct
    ``reviewerID`` values, the number of distinct ``asin`` values, the product
    of those two counts, and the null count of each of the three columns.
    Returns ``None``.
    """
    # Use the frame that was passed in, not a notebook-level global.
    print('rating:', df.shape[0])
    num_user = len(set(df['reviewerID']))
    print('user:', num_user)
    num_item = len(set(df['asin']))
    print('item:', num_item)
    print('user x item:', num_user * num_item)
    print('rating null:', df['overall'].isnull().sum())
    print('user null:', df['reviewerID'].isnull().sum())
    print('item null:', df['asin'].isnull().sum())

In [None]:
def get_rating_matrix(df):
    """Build a sparse user x item rating matrix from a review DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'overall'``, ``'reviewerID'`` and
        ``'asin'``.

    Returns
    -------
    matrix : scipy.sparse.coo_matrix
        Shape ``(len(row_idx), len(col_idx))``; entry ``(i, j)`` holds the
        ``'overall'`` value for user ``row_idx[i]`` and item ``col_idx[j]``.
    row_idx : list
        Sorted distinct ``reviewerID`` values (row labels).
    col_idx : list
        Sorted distinct ``asin`` values (column labels).
    """
    # When a user reviewed the same item several times, keep only the last row.
    data = df[['overall', 'reviewerID', 'asin']].drop_duplicates(
        subset=['reviewerID', 'asin'], keep='last')

    # Sorted labels define the row/column order of the matrix.
    row_idx = sorted(set(data['reviewerID']))
    col_idx = sorted(set(data['asin']))

    # Label -> position lookup tables: one dict probe per review instead of a
    # linear list.index() scan per review.
    row_pos = {user: i for i, user in enumerate(row_idx)}
    col_pos = {item: j for j, item in enumerate(col_idx)}
    row = [row_pos[user] for user in data['reviewerID']]
    col = [col_pos[item] for item in data['asin']]

    # Pass a plain ndarray so the (non-contiguous) pandas index left behind by
    # drop_duplicates plays no role in matrix construction.
    num_user = len(row_idx)
    num_item = len(col_idx)
    matrix = coo_matrix((data['overall'].to_numpy(), (row, col)),
                        shape=(num_user, num_item))

    return matrix, row_idx, col_idx

In [None]:
def split_rating_matrix(rating_matrix, seed=0, train_ratio=0.8):
    """Randomly split the stored ratings into disjoint train and test matrices.

    Each stored entry gets an independent uniform draw in ``[0, 1)``; entries
    whose draw is ``<= train_ratio`` go to the train matrix, the rest to the
    test matrix.

    Parameters
    ----------
    rating_matrix : scipy.sparse matrix
        Rating matrix; converted to COO format if it is not already.
    seed : int, default 0
        Seed for NumPy's global random generator. Note that this function
        reseeds ``np.random`` as a side effect.
    train_ratio : float, default 0.8
        Threshold on the uniform draw; roughly this fraction of entries ends
        up in the train matrix.

    Returns
    -------
    train_matrix, test_matrix : scipy.sparse.coo_matrix
        Same shape as ``rating_matrix``. Zero products are not stored, so a
        rating of 0 appears in neither output.
    """
    rating_matrix = rating_matrix.tocoo(copy=False)
    # One draw per *stored* entry. count_nonzero() skips explicitly stored
    # zeros, which would make the flag array shorter than row/col.
    num_rate = rating_matrix.nnz
    num_user, num_item = rating_matrix.shape
    np.random.seed(seed=seed)

    # Complementary 0/1 masks laid out on the same coordinates as the ratings.
    random_array = np.random.rand(num_rate)
    train_flag = np.where(random_array > train_ratio, 0, 1)
    test_flag = 1 - train_flag
    coords = (rating_matrix.row, rating_matrix.col)
    train_mask = coo_matrix((train_flag, coords), shape=(num_user, num_item))
    test_mask = coo_matrix((test_flag, coords), shape=(num_user, num_item))

    # Element-wise product keeps only the ratings selected by each mask.
    ratings_csr = rating_matrix.tocsr()
    train_matrix = ratings_csr.multiply(train_mask.tocsr()).tocoo()
    test_matrix = ratings_csr.multiply(test_mask.tocsr()).tocoo()

    return train_matrix, test_matrix

In [None]:
# Load the reviews stored in data/Appliances_5.json.gz, print their summary
# counts, and show the first rows as the cell output.
df1 = get_df('data/Appliances_5.json.gz')
describe(df1)
df1.head()

In [None]:
# Build the sparse user x item matrix for df1, then split its entries into
# train and test matrices with a fixed seed.
rating_matrix, row_idx, col_idx = get_rating_matrix(df1)
train_matrix, test_matrix = split_rating_matrix(rating_matrix, seed=1)

In [None]:
# Non-zero entry counts, one per line: full matrix, train split, test split.
print(
    rating_matrix.count_nonzero(),
    train_matrix.count_nonzero(),
    test_matrix.count_nonzero(),
    sep='\n',
)

In [None]:
# Load the reviews stored in data/magazine_Subscriptions_5.json.gz, print
# their summary counts, and show the first rows as the cell output.
# NOTE(review): this file name starts with a lowercase 'm', unlike
# 'Appliances_5' — confirm it matches the file on disk (matters on
# case-sensitive filesystems).
df2 = get_df('data/magazine_Subscriptions_5.json.gz')
describe(df2)
df2.head()

In [None]:
# Build and split the matrix for df2. This rebinds rating_matrix, row_idx,
# col_idx, train_matrix and test_matrix, replacing the values computed from
# df1 in the earlier cell.
rating_matrix, row_idx, col_idx = get_rating_matrix(df2)
train_matrix, test_matrix = split_rating_matrix(rating_matrix, seed=1)

In [None]:
# Non-zero entry counts, one per line: full matrix, train split, test split.
print(
    rating_matrix.count_nonzero(),
    train_matrix.count_nonzero(),
    test_matrix.count_nonzero(),
    sep='\n',
)