In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("all sephora.csv", encoding="utf-8", index_col=0)

In [3]:
print(df.columns)

Index(['AuthorId', 'Helpfulness', 'Id', 'LastModificationTime', 'ProductId',
       'Rating', 'RatingRange', 'ReviewText', 'UserNickname', 'age',
       'eyeColor', 'hairColor', 'skinTone', 'skinType'],
      dtype='object')


In [4]:
columns = ["eyeColor", "hairColor", "skinTone", "skinType"]

In [5]:
skus = set(df["ProductId"])

In [6]:
# skus

In [7]:
label_dict = {}

for col in columns:
    label_dict[col] = set(df[col])

import pprint
pprint.pprint(label_dict)

{'eyeColor': {nan, 'blue', 'brown', 'gray', 'green', 'hazel'},
 'hairColor': {nan, 'blonde', 'red', 'brunette', 'gray', 'black', 'auburn'},
 'skinTone': {nan,
              'dark',
              'deep',
              'ebony',
              'fair',
              'light',
              'medium',
              'olive',
              'porcelain',
              'tan'},
 'skinType': {nan, 'dry', 'oily', 'combination', 'normal'}}


In [8]:
for col in label_dict["eyeColor"]:
    if pd.isnull(col):
        print("NAN", type(col))
    else:
        print(col)

NAN <class 'float'>
blue
brown
gray
green
hazel


In [9]:
df[pd.isnull(df["skinType"]) | (df["skinType"] == "dry")].head()

Unnamed: 0,AuthorId,Helpfulness,Id,LastModificationTime,ProductId,Rating,RatingRange,ReviewText,UserNickname,age,eyeColor,hairColor,skinTone,skinType
8,1502678847,1.0,124469579,2019-04-08T05:42:12.000+00:00,2144608,5,5,"RiRi did it again!\r\nI LOVE this set, from th...",purplemoon,13to17,brown,brunette,fair,dry
9,11745190902,,124433325,2019-02-12T17:00:07.000+00:00,2144608,5,5,Love love love!! They definitely hit it out of...,thewhitejag,,blue,brunette,porcelain,dry
24,1650932125,,123165749,2019-01-09T13:45:04.000+00:00,2144608,5,5,I bought this for the hot pink and it is even ...,KRReed,,brown,brunette,medium,dry
25,1650932125,,123165712,2019-01-09T13:45:04.000+00:00,2144616,5,5,Love the colors! The blue isn't too overpoweri...,KRReed,,brown,brunette,medium,dry
27,5213721427,,123138536,2019-01-08T05:45:03.000+00:00,2144616,5,5,This is so out of my usual color range but I a...,MidnightDreamer,,brown,brunette,fair,dry


In [10]:
df["Rating"].dtype

dtype('int64')

In [11]:
len(skus)

3576

In [12]:
import time
import math

total = len(skus)

result = {}

t0 = time.time()

counter = 0

max_score = 0

for i, sku in enumerate(skus):
    buf = df[df["ProductId"] == sku]
    scores = {}
    for j, col in enumerate(columns):
        if pd.isnull(col):
            continue
        labels = label_dict[col]
        scores[col] = {}
        for k, label in enumerate(labels):
            filtered = buf[pd.isnull(buf[col]) | (buf[col] == label)]
            counter += len(filtered)
            mean = filtered["Rating"].mean()
            cnt = len(filtered)
            scores[col][label] = mean + math.log(cnt+1,10)
            max_score = scores[col][label] if scores[col][label] > max_score else max_score
    result[sku] = scores
    if i % 100 == 0:
        dt = time.time() - t0
        print("{:>4d}: counter = {:>6d}, used = {:.1f}, eta = {:.1f}".format(i, counter, dt, dt * (total - i - 1) / (i + 1)))

   0: counter =    242, used = 0.0, eta = 132.3
 100: counter =  63886, used = 3.1, eta = 107.8
 200: counter = 451856, used = 6.2, eta = 104.6
 300: counter = 575769, used = 9.3, eta = 101.4
 400: counter = 639144, used = 12.4, eta = 98.0
 500: counter = 707585, used = 15.4, eta = 94.8
 600: counter = 750114, used = 18.5, eta = 91.7
 700: counter = 768783, used = 21.7, eta = 88.9
 800: counter = 781166, used = 24.8, eta = 85.8
 900: counter = 798851, used = 27.8, eta = 82.7
1000: counter = 831205, used = 30.9, eta = 79.5
1100: counter = 842426, used = 34.0, eta = 76.3
1200: counter = 850016, used = 37.0, eta = 73.2
1300: counter = 864924, used = 40.1, eta = 70.1
1400: counter = 1057949, used = 43.2, eta = 67.1
1500: counter = 1075247, used = 46.3, eta = 64.0
1600: counter = 1165329, used = 49.4, eta = 60.9
1700: counter = 1318457, used = 52.5, eta = 57.8
1800: counter = 1334793, used = 55.5, eta = 54.7
1900: counter = 1415476, used = 58.6, eta = 51.7
2000: counter = 1470031, used = 61

In [13]:
counter, len(df)

(2290924, 157456)

In [14]:
import copy

raw_result = copy.deepcopy(result)

In [15]:
# raw_result

In [16]:
len(result)

3576

In [17]:
result = copy.deepcopy(raw_result)

total = len(result)

t0 = time.time()

for i, (sku, sku_scores) in enumerate(result.items()):
    for j, (col, col_scores) in enumerate(sku_scores.items()):
        for k, (label, score) in enumerate(col_scores.items()):
            if np.isnan(score):
                result[sku][col][label] = df[df["ProductId"] == sku]["Rating"].mean() / max_score
            else:
                result[sku][col][label] = result[sku][col][label] / max_score
        result[sku][col]["nan"] = df[df["ProductId"] == sku]["Rating"].mean() / max_score
    if i % 100 == 0:
        dt = time.time() - t0
        print("{:>4d}: used = {:.1f}, eta = {:.1f}".format(i, dt, dt * (total - i - 1) / (i + 1)))

   0: used = 0.0, eta = 135.9
 100: used = 14.8, eta = 510.5
 200: used = 30.7, eta = 516.2
 300: used = 48.7, eta = 529.8
 400: used = 67.5, eta = 534.5
 500: used = 84.6, eta = 519.2
 600: used = 101.1, eta = 500.7
 700: used = 120.6, eta = 494.4
 800: used = 138.9, eta = 481.1
 900: used = 154.6, eta = 459.0
1000: used = 172.0, eta = 442.5
1100: used = 190.5, eta = 428.2
1200: used = 209.8, eta = 414.9
1300: used = 225.5, eta = 394.4
1400: used = 241.2, eta = 374.4
1500: used = 258.9, eta = 357.9
1600: used = 276.7, eta = 341.4
1700: used = 293.7, eta = 323.7
1800: used = 310.3, eta = 305.8
1900: used = 329.5, eta = 290.3
2000: used = 346.9, eta = 273.0
2100: used = 365.2, eta = 256.4
2200: used = 384.4, eta = 240.1
2300: used = 403.3, eta = 223.5
2400: used = 419.2, eta = 205.2
2500: used = 436.2, eta = 187.5
2600: used = 452.7, eta = 169.7
2700: used = 470.6, eta = 152.5
2800: used = 486.3, eta = 134.6
2900: used = 504.0, eta = 117.3
3000: used = 520.9, eta = 99.8
3100: used = 537

In [18]:
# result

In [19]:
import json

with open("label rating with nan normalized.json", "w", encoding="utf-8") as fout:
    json.dump(result, fout)