# In [90]:
import math
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# The integer values 1 through 5.
# NOTE(review): presumably the allowed values of the `overall` rating field;
# not referenced in the visible code — confirm against later cells.
RATES = list(range(1, 6))

def loadData(nbChunks=0, path='../Dataset/Books.json', targetBooks=1000):
    """Load book reviews into a reviewer-by-book rating matrix.

    Reads a JSON-lines file in chunks of 100000 records and uses the
    ``asin``, ``reviewerName`` and ``overall`` fields of every record.
    When the same (reviewerName, asin) pair occurs more than once, the last
    record read wins.

    After the matrix is built, columns (books) with too many missing
    ratings are dropped: the cut-off is the ``targetBooks / number_of_books``
    quantile of the per-book NaN counts. Rows (reviewers) that are left with
    no rating at all are then removed.

    Args:
        nbChunks: Number of chunks to read. Zero or a negative value reads
            every chunk; a positive integer stops after that many chunks.
        path: Location of the JSON-lines file passed to ``pd.read_json``.
        targetBooks: Numerator of the quantile used as NaN-count cut-off.
            When it is greater than or equal to the number of distinct
            books, every book is kept.

    Returns:
        A float ``DataFrame`` indexed by reviewer name, with one column per
        kept ``asin``; cells hold the rating or NaN when there is none.
        An empty ``DataFrame`` is returned when no record was read.
    """
    row_of = {}    # reviewerName -> row position, in order of first appearance
    col_of = {}    # asin -> column position, in order of first appearance
    ratings = {}   # (row, col) -> rate; later records overwrite earlier ones

    reader = pd.read_json(
        path, lines=True, chunksize=100000,
        typ="frame", orient="records",
        dtype={"asin": str, "reviewerName": str, "overall": int},
    )
    # The chunked reader owns an open file handle; the context manager makes
    # sure it is released even when we stop before the end of the file.
    with reader:
        for chunk_no, chunk in enumerate(reader, start=1):
            records = zip(chunk["asin"], chunk["reviewerName"], chunk["overall"])
            for asin, reviewerName, rate in records:
                row = row_of.setdefault(reviewerName, len(row_of))
                col = col_of.setdefault(asin, len(col_of))
                ratings[row, col] = rate
            # Never true for nbChunks <= 0, so all chunks are read in that case.
            if chunk_no == nbChunks:
                break

    if not col_of:
        # Nothing was read: there is no quantile to compute.
        return pd.DataFrame(dtype=float)

    # Start from an all-NaN matrix and fill in only the observed ratings,
    # instead of probing every (reviewer, book) combination.
    data = np.full((len(row_of), len(col_of)), np.nan)
    rows, cols = zip(*ratings)
    data[list(rows), list(cols)] = list(ratings.values())

    df = pd.DataFrame(data=data, index=list(row_of), columns=list(col_of))

    nan_per_book = np.isnan(data).sum(axis=0)
    # np.quantile only accepts q in [0, 1]; clamp so that small datasets
    # (fewer books than targetBooks) keep every column instead of raising.
    q = min(1.0, max(0.0, targetBooks / len(col_of)))
    threshold = np.quantile(nan_per_book, q)
    # Remove all columns that contain more NaNs than the threshold
    df = df.iloc[:, nan_per_book <= threshold]
    # Remove all rows that contain only NaNs
    df = df.iloc[~np.isnan(df.to_numpy()).all(axis=1), :]
    return df


# Build the rating frame from a single chunk of the dataset and show it.
# NOTE(review): `display` is neither defined nor imported in the visible code —
# presumably the IPython/Jupyter builtin; confirm before running as a script.
fr1 = loadData(1)
display(fr1)