In [4]:
import json
import math
import base64
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.colors import ListedColormap
from celluloid import Camera
from IPython.display import Image
from IPython.display import HTML

BAYES_QUARTILE_OPT = 0.25
RATES = [1,2,3,4,5]
index = 0
asinToIndex = {}
scoreMatrix = []
booksAsin = []
chunks = pd.read_json(
    '../Dataset/Books.json', lines=True, chunksize=500000, 
    typ="frame", orient="records", dtype={"asin": str, "overall": int, "unixReviewTime": int}
)

nbChunks = 1 # <= 0 to load all chunks, positive integer to load specific number of chunks
for chunk in chunks:
    for _, item in chunk.iterrows():
        asin = item["asin"]
        rate = item["overall"]

        if (asin not in asinToIndex):
            scoreMatrix.append([0,0,0,0,0])
            booksAsin.append(asin)
            asinToIndex[asin] = index
            index += 1

        row = asinToIndex[asin]
        scoreMatrix[row][rate-1] += 1
    nbChunks -= 1
    if (nbChunks == 0):
        break


def getBookStats(scoreMatrix):
    nbRating = np.sum(scoreMatrix, axis=1)
    avgRatings = (np.average(scoreMatrix, axis=1, weights=RATES) * np.sum(RATES)) / nbRating
    stdRatings = np.std(scoreMatrix, axis=1)
    confidenceNumber = np.quantile(nbRating, BAYES_QUARTILE_OPT)
    arithAverage = np.mean(avgRatings)
    bayesAvgRatings = (nbRating * avgRatings + arithAverage * confidenceNumber) / (nbRating + confidenceNumber)
    nbNonAppreciate = scoreMatrix[:,0]+scoreMatrix[:,1]
    nbNeutral = scoreMatrix[:,2]
    nbAppreciate = scoreMatrix[:,3]+scoreMatrix[:,4]
    return pd.DataFrame(
        data=np.column_stack((nbRating, avgRatings, bayesAvgRatings, stdRatings, nbNonAppreciate, nbNeutral, nbAppreciate)),
        index=booksAsin, 
        columns=["Nb rating", "Average Rating", "Bayes Rating", "STD", "Depreciation", "Neutral", "Appreciations"]
    )


scoreMatrix = np.array(scoreMatrix)
pdScoreMat = pd.DataFrame(
    data=scoreMatrix,
    index=booksAsin,
    columns=[f"{i} ⭐" for i in RATES]
)
display(pdScoreMat)

booksFeatures = getBookStats(scoreMatrix)
display(booksFeatures)

Unnamed: 0,1 ⭐,2 ⭐,3 ⭐,4 ⭐,5 ⭐
000100039X,6,4,8,15,173
0001055178,0,4,2,10,2
0001473123,1,0,0,2,13
0001473727,0,0,0,0,7
0001473905,0,0,1,0,5
...,...,...,...,...,...
0062700847,0,0,1,2,5
006270107X,0,0,0,0,5
0062701363,0,0,0,3,3
0062701398,0,0,0,2,4


Unnamed: 0,Nb rating,Average Rating,Bayes Rating,STD,Depreciation,Neutral,Appreciations
000100039X,206.0,4.674757,4.655516,66.004242,10.0,8.0,188.0
0001055178,18.0,3.555556,3.741554,3.440930,4.0,2.0,12.0
0001473123,16.0,4.625000,4.470017,4.955805,1.0,0.0,15.0
0001473727,7.0,5.000000,4.552027,2.800000,0.0,0.0,7.0
0001473905,6.0,4.666667,4.377171,1.939072,0.0,1.0,5.0
...,...,...,...,...,...,...,...
0062700847,8.0,4.500000,4.330025,1.854724,0.0,1.0,7.0
006270107X,5.0,5.000000,4.483108,2.000000,0.0,0.0,5.0
0062701363,6.0,4.500000,4.305743,1.469694,0.0,0.0,6.0
0062701398,6.0,4.666667,4.377171,1.600000,0.0,0.0,6.0
