In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("all sephora.csv", encoding="utf-8", index_col=0)

In [3]:
print(df.columns)

Index(['AuthorId', 'Helpfulness', 'Id', 'LastModificationTime', 'ProductId',
       'Rating', 'RatingRange', 'ReviewText', 'UserNickname', 'age',
       'eyeColor', 'hairColor', 'skinTone', 'skinType'],
      dtype='object')


In [4]:
columns = ["eyeColor", "hairColor", "skinTone", "skinType"]

In [5]:
skus = set(df["ProductId"])

In [6]:
# skus

In [7]:
label_dict = {}

for col in columns:
    label_dict[col] = set(df[col])

import pprint
pprint.pprint(label_dict)

{'eyeColor': {nan, 'green', 'brown', 'blue', 'hazel', 'gray'},
 'hairColor': {nan, 'brunette', 'blonde', 'black', 'gray', 'auburn', 'red'},
 'skinTone': {nan,
              'dark',
              'deep',
              'ebony',
              'fair',
              'light',
              'medium',
              'olive',
              'porcelain',
              'tan'},
 'skinType': {nan, 'dry', 'oily', 'combination', 'normal'}}


In [8]:
for col in label_dict["eyeColor"]:
    if pd.isnull(col):
        print("NAN", type(col))
    else:
        print(col)

NAN <class 'float'>
green
brown
blue
hazel
gray


In [9]:
df[pd.isnull(df["skinType"]) | (df["skinType"] == "dry")].head(50)

Unnamed: 0,AuthorId,Helpfulness,Id,LastModificationTime,ProductId,Rating,RatingRange,ReviewText,UserNickname,age,eyeColor,hairColor,skinTone,skinType
8,1502678847,1.0,124469579,2019-04-08T05:42:12.000+00:00,2144608,5,5,"RiRi did it again!\r\nI LOVE this set, from th...",purplemoon,13to17,brown,brunette,fair,dry
9,11745190902,,124433325,2019-02-12T17:00:07.000+00:00,2144608,5,5,Love love love!! They definitely hit it out of...,thewhitejag,,blue,brunette,porcelain,dry
24,1650932125,,123165749,2019-01-09T13:45:04.000+00:00,2144608,5,5,I bought this for the hot pink and it is even ...,KRReed,,brown,brunette,medium,dry
25,1650932125,,123165712,2019-01-09T13:45:04.000+00:00,2144616,5,5,Love the colors! The blue isn't too overpoweri...,KRReed,,brown,brunette,medium,dry
27,5213721427,,123138536,2019-01-08T05:45:03.000+00:00,2144616,5,5,This is so out of my usual color range but I a...,MidnightDreamer,,brown,brunette,fair,dry
28,2090226998,,123053679,2019-01-05T02:30:04.000+00:00,2144616,4,5,"Pretty colours, love the packaging. Glides on...",2manyperfumes,,hazel,brunette,fair,dry
44,2170902292,,121364514,2018-11-14T05:15:02.000+00:00,2144616,5,5,Love the colors. Comfortable to wear & gorgeou...,lecaptain,,brown,blonde,porcelain,dry
47,6854553786,1.0,121255643,2019-03-28T02:08:55.000+00:00,2144608,5,5,"Fenty's ""Snow Daze"" is absolute perfection! F...",GianettaB,,green,brunette,light,dry
54,2077014259,1.0,120829387,2019-03-29T21:43:21.000+00:00,2144608,5,5,"More moisturizing than mattemoisel, very beaut...",AndiSilva,,brown,brunette,tan,dry
62,22881223329,,126274976,2019-04-07T23:15:04.000+00:00,2063766,1,5,"Don’t do it!! I love Nars orgasm everything, b...",Fancy001,,blue,black,olive,


In [10]:
df["Rating"].dtype

dtype('int64')

In [11]:
len(skus)

3576

In [12]:
import time
import math

total = len(skus)

result = {}

t0 = time.time()

counter = 0

for i, sku in enumerate(skus):
    buf = df[df["ProductId"] == sku]
    scores = {}
    for j, col in enumerate(columns):
        if pd.isnull(col):
            continue
        labels = label_dict[col]
        scores[col] = {}
        for k, label in enumerate(labels):
            filtered = buf[pd.isnull(buf[col]) | (buf[col] == label)]
            counter += len(filtered)
            mean = filtered["Rating"].mean()
            cnt = len(filtered)
            scores[col][label] = mean + math.log(cnt+1,10)
    result[sku] = scores
    if i % 100 == 0:
        dt = time.time() - t0
        print("{:>4d}: counter = {:>6d}, used = {:.1f}, eta = {:.1f}".format(i, counter, dt, dt * (total - i - 1) / (i + 1)))

   0: counter =     12, used = 0.0, eta = 139.6
 100: counter =  55238, used = 3.3, eta = 112.0
 200: counter =  75230, used = 6.5, eta = 109.4
 300: counter = 120774, used = 9.7, eta = 105.9
 400: counter = 141813, used = 13.1, eta = 103.5
 500: counter = 264693, used = 16.5, eta = 101.5
 600: counter = 405977, used = 20.3, eta = 100.7
 700: counter = 428745, used = 24.3, eta = 99.6
 800: counter = 522576, used = 27.8, eta = 96.3
 900: counter = 532340, used = 32.0, eta = 95.0
1000: counter = 551826, used = 35.6, eta = 91.5
1100: counter = 604784, used = 39.4, eta = 88.5
1200: counter = 614506, used = 42.9, eta = 84.9
1300: counter = 630095, used = 46.3, eta = 81.0
1400: counter = 698180, used = 49.7, eta = 77.2
1500: counter = 812143, used = 53.2, eta = 73.5
1600: counter = 857303, used = 56.5, eta = 69.8
1700: counter = 912391, used = 60.0, eta = 66.2
1800: counter = 1337924, used = 63.5, eta = 62.6
1900: counter = 1386634, used = 66.9, eta = 58.9
2000: counter = 1404650, used = 70.

In [13]:
counter, len(df)

(2290924, 157456)

In [14]:
import copy

raw_result = copy.deepcopy(result)

In [15]:
# raw_result

In [16]:
len(result)

3576

In [21]:
result = copy.deepcopy(raw_result)

total = len(result)

t0 = time.time()

for i, (sku, sku_scores) in enumerate(result.items()):
    for j, (col, col_scores) in enumerate(sku_scores.items()):
        for k, (label, score) in enumerate(col_scores.items()):
            if np.isnan(score):
                result[sku][col][label] = df[df["ProductId"] == sku]["Rating"].mean()
        result[sku][col]["nan"] = df[df["ProductId"] == sku]["Rating"].mean()
    if i % 100 == 0:
        dt = time.time() - t0
        print("{:>4d}: used = {:.1f}, eta = {:.1f}".format(i, dt, dt * (total - i - 1) / (i + 1)))

   0: used = 0.3, eta = 918.7
 100: used = 18.0, eta = 620.1
 200: used = 36.1, eta = 605.7
 300: used = 56.3, eta = 612.2
 400: used = 72.6, eta = 574.8
 500: used = 91.1, eta = 559.1
 600: used = 108.7, eta = 538.2
 700: used = 124.8, eta = 511.8
 800: used = 141.1, eta = 488.9
 900: used = 162.1, eta = 481.3
1000: used = 181.7, eta = 467.4
1100: used = 202.0, eta = 454.2
1200: used = 225.2, eta = 445.4
1300: used = 247.8, eta = 433.4
1400: used = 264.5, eta = 410.7
1500: used = 279.9, eta = 386.9
1600: used = 298.1, eta = 367.8
1700: used = 315.7, eta = 348.0
1800: used = 332.4, eta = 327.6
1900: used = 348.1, eta = 306.7
2000: used = 363.8, eta = 286.4
2100: used = 381.1, eta = 267.5
2200: used = 398.9, eta = 249.2
2300: used = 415.2, eta = 230.1
2400: used = 433.1, eta = 212.0
2500: used = 450.4, eta = 193.6
2600: used = 467.3, eta = 175.2
2700: used = 483.5, eta = 156.6
2800: used = 502.3, eta = 139.0
2900: used = 521.4, eta = 121.3
3000: used = 538.3, eta = 103.1
3100: used = 55

In [22]:
# result

In [23]:
import json

with open("label rating with nan.json", "w", encoding="utf-8") as fout:
    json.dump(result, fout)