In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.api.types import CategoricalDtype
import scipy.stats as stats
import statsmodels.formula.api as sm

print(pd.__version__)

%matplotlib inline

0.22.0


In [3]:
# Read table

mercari_data = pd.read_table("./train.tsv", index_col = 'train_id', dtype = {'item_condition_id':CategoricalDtype(categories = [str(i) for i in range(1,6)], ordered = True), 'category_name':'category', 'brand_name': 'category', 'shipping':'category'})

  mask |= (ar1 == a)


In [28]:
cat1, cat2, cat3, a = mercari_data.category_name.str.split("/", 3).str
mercari_data['category_1'] = cat1
mercari_data['category_2'] = cat2
mercari_data['category_3'] = cat3
for col in ['category_1', 'category_2', 'category_3']:
    mercari_data[col] = mercari_data[col].astype('category')

In [29]:
# The categories of the first category level

for i in mercari_data.category_1.cat.categories.values:
    print(i)

Beauty
Electronics
Handmade
Home
Kids
Men
Other
Sports & Outdoors
Vintage & Collectibles
Women


In [30]:
# The path of each category in the second level

for i in mercari_data.category_2.cat.categories.values:
    print(i)

Accessories
Antique
Apparel
Art
Artwork
Athletic Apparel
Automotive
Bags and Purses
Bath
Bath & Body
Bathing & Skin Care
Bedding
Blazers & Sport Coats
Book
Books
Books and Zines
Boys (4+)
Boys 0-24 Mos
Boys 2T-5T
Cameras & Photography
Candles
Car Audio, Video & GPS
Car Seats & Accessories
Cell Phones & Accessories
Ceramics and Pottery
Children
Cleaning Supplies
Clothing
Coats & Jackets
Collectibles
Computers & Tablets
Crochet
Daily & Travel items
Diapering
Dolls and Miniatures
Dresses
Electronics
Exercise
Fan Shop
Feeding
Footwear
Fragrance
Furniture
Gear
Geekery
Girls (4+)
Girls 0-24 Mos
Girls 2T-5T
Glass
Golf
Hair Care
Health & Baby Care
Holidays
Home Appliances
Home Decor
Home Décor
Housewares
Jeans
Jewelry
Kids' Home Store
Kitchen & Dining
Knitting
Magazines
Makeup
Maternity
Media
Men's Accessories
Music
Musical instruments
Needlecraft
Nursery
Office supplies
Other
Others
Outdoors
Pants
Paper Ephemera
Paper Goods
Patterns
Pet Supplies
Pets
Potty Training
Pregnancy & Maternity
Quilt

In [34]:
KLdivergence_data = pd.Series()

mercari_mean = mercari_data.price.mean()
#Sample variance
mercari_sv = mercari_data.price.std()**2

for category in mercari_data.category_1.cat.categories.values:
    category_price_data = mercari_data[mercari_data.category_1 == category].price
    category_mean = category_price_data.mean()
    category_sv = category_price_data.std()**2
    # STD approximates variance for large N
    #Assuming Gaussian distribution, calculate KL-divergence based on mean and variance of fit curve
    KLdivergence = (np.log(mercari_sv/category_sv))+(category_sv+(category_mean-mercari_mean)**2/(2*mercari_sv))-(1/2)
    KLdivergence_data[category] = KLdivergence
    
KLdivergence_data

Beauty                     456.954874
Electronics               4278.216207
Handmade                   770.706022
Home                       596.609722
Kids                       517.263533
Men                       1700.130564
Other                      890.914022
Sports & Outdoors          858.671832
Vintage & Collectibles    2861.898261
Women                     1555.275101
dtype: float64

In [37]:
type(mercari_data.category_1.cat.categories)

pandas.core.indexes.base.Index

In [None]:
# Trying to generalize getting KLdivergence data
data_by_category = mercari_data.groupby('category_1')
# SV is sample variance
def KLdivergence(new_data, old_data):
    new_data_mean = new_data.mean()
    old_data_mean = old_data.mean()
    new_data_sv = new_data.std()**2
    old_data_sv = old_data.std()**2
    # STD approximates variance for large N
    #Assuming Gaussian distribution, calculate KL-divergence based on mean and variance of fit curve
    KLdivergence = (np.log(mercari_sv/category_sv))+(category_sv+(category_mean-mercari_mean)**2/(2*mercari_sv))-(1/2)
data_by_category.apply()

In [None]:
# For troubleshooting
category_data = pd.DataFrame(columns = pd.Index(['mean', 'std']))
for for category in mercari_data.category_1.cat.categories.values:

In [33]:
# Basic statistics about price, the only numerical category in the data

mercari_data.describe()

Unnamed: 0,price
count,1482535.0
mean,26.73752
std,38.58607
min,0.0
25%,10.0
50%,17.0
75%,29.0
max,2009.0


In [33]:
# The next goal is to figure out, within each category, and then as a whole, the proportion of the error that can be eliminated by specifying the category.
# Based on intuition from visual inspection of the names of the categories, it seems that specifying the second level category plausibly eliminates much of the error.

# Need to consolidate logs so this function goes faster

'''
def get_group_stats(group_like): #Takes group or Series - but it needs to be made into two functions for each input type, because as it is you have to many if's
    
        log_group_like = group_like.transform(lambda y: np.log(y + 1))
    if isinstance(group_like, pd.core.groupby.SeriesGroupBy):
       # log_group_stats = log_group_st
    
    # Log ones need to be counted groupwise
    mean_raw = group_like.mean()
    mean_log = log_group_like.mean()
    std_log = log_group_like.std()
    sem_log = std_log/log_group_like.count()
    
    group_stats_dict = {'mean' : mean_raw, 'mean log': mean_log, 'std log' : std_log, 'sem log' : sem_log}
    
    if isinstance(group_like, pd.core.groupby.SeriesGroupBy):
        group_stats = pd.DataFrame(group_stats_dict)
    elif isinstance(group_like, pd.core.series.Series):
        group_stats = pd.Series(group_stats_dict)
    else:
        raise Exception('Must be Series or Grouped Series')
    
    return group_stats
''' 

new_df = train_df.drop('category_twoLevel', axis = 1).assign(category_twoLevel = train_df.category_twoLevel.cat.add_categories('NaN').fillna('NaN'))
new_df = new_df.assign(log_price = np.log(new_df.price + 1))

# Log price stats for whole dataset
price_stats = pd.Series({'n' : new_df.price.count(), 'mean' : new_df.price.mean(), 'mean log': new_df.log_price.mean(), 'std log' : new_df.log_price.std(), 'sem log' : new_df.log_price.std()/np.sqrt(new_df.log_price.count())})
#price_stats = get_group_stats(train_df.price)

# Log price stats for each category - must add transform
grouped_df = new_df.groupby('category_twoLevel')
stats_dict = {'n' : grouped_df.price.count(), 'mean' : grouped_df.price.mean(), 'mean log': grouped_df.log_price.mean(), 'std log' : grouped_df.log_price.std(), 'sem log' : grouped_df.log_price.std()/np.sqrt(grouped_df.log_price.count())}
category2_price_stats = pd.DataFrame(stats_dict)
#category2_price_stats = get_group_stats(train_df.drop('category_twoLevel', axis = 1).assign(category_twoLevel = train_df.category_twoLevel.cat.add_categories('NaN').fillna('NaN')).groupby('category_twoLevel').price)

In [34]:
price_stats

mean        2.673752e+01
mean log    2.979059e+00
n           1.482535e+06
sem log     6.153196e-04
std log     7.492094e-01
dtype: float64

In [35]:
category2_price_stats

Unnamed: 0_level_0,mean,mean log,n,sem log,std log
category_twoLevel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Beauty/Bath & Body,19.084687,2.767214,7758,0.007405,0.652202
Beauty/Fragrance,23.801062,2.989748,24294,0.004101,0.639170
Beauty/Hair Care,19.374646,2.815232,7770,0.006784,0.598025
Beauty/Makeup,18.686176,2.767961,124624,0.001735,0.612341
Beauty/Other,22.952965,2.812244,489,0.033802,0.747466
Beauty/Skin Care,20.497721,2.798968,29838,0.003890,0.671997
Beauty/Tools & Accessories,19.907468,2.770611,13055,0.005916,0.675898
Electronics/Cameras & Photography,71.343813,3.742712,3976,0.015691,0.989426
"Electronics/Car Audio, Video & GPS",37.874031,3.324236,516,0.034974,0.794456
Electronics/Cell Phones & Accessories,30.142278,2.771278,53290,0.004040,0.932555


In [36]:
# 

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_oneLevel,category_threeLevel,clothing_type,category_twoLevel,log_price
train_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10,Smashbox primer,2,Beauty/Makeup/Face,Smashbox,8.0,1,0.25 oz Full size is 1oz for [rm] in Sephora,Beauty,Beauty/Makeup/Face,,Beauty/Makeup,2.197225
15,Sephora tarte birthday gift,1,Beauty/Makeup/Makeup Sets,Tarte,11.0,1,Brand new. Deluxe travel size products. Contai...,Beauty,Beauty/Makeup/Makeup Sets,,Beauty/Makeup,2.484907
16,Glitter Eyeshadow,1,Beauty/Makeup/Eyes,Wet n Wild,6.0,1,2 glitter eyeshadows; one in Brass and one in ...,Beauty,Beauty/Makeup/Eyes,,Beauty/Makeup,1.94591
18,"Too Faced Limited ""Merry Macaroons""",1,Beauty/Makeup/Makeup Palettes,Too Faced,25.0,1,This AUTHENTIC pallete by Too Faced is brand n...,Beauty,Beauty/Makeup/Makeup Palettes,,Beauty/Makeup,3.258097
30,Too Faced Better Than Sex Mascara QTY3,1,Beauty/Makeup/Eyes,Too Faced,32.0,1,BNIB 3 for [rm] Better Than Sex Waterproof Mas...,Beauty,Beauty/Makeup/Eyes,,Beauty/Makeup,3.496508


In [39]:
# Planning to regress log price on the category names
# This needs to be changed to log price as is done with the rest of this notebook
cat_one_results = sm.ols('log_price ~ category_oneLevel', data = new_df).fit()
cat_one_results.summary()

0,1,2,3
Dep. Variable:,log_price,R-squared:,0.038
Model:,OLS,Adj. R-squared:,0.038
Method:,Least Squares,F-statistic:,6525.0
Date:,"Mon, 22 Jan 2018",Prob (F-statistic):,0.0
Time:,14:43:39,Log-Likelihood:,-1639600.0
No. Observations:,1476208,AIC:,3279000.0
Df Residuals:,1476198,BIC:,3279000.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.8003,0.002,1737.523,0.000,2.797,2.804
category_oneLevel[T.Electronics],0.1859,0.003,70.277,0.000,0.181,0.191
category_oneLevel[T.Handmade],-0.2212,0.004,-49.347,0.000,-0.230,-0.212
category_oneLevel[T.Home],0.1904,0.003,58.607,0.000,0.184,0.197
category_oneLevel[T.Kids],0.0116,0.002,4.830,0.000,0.007,0.016
category_oneLevel[T.Men],0.4283,0.003,148.139,0.000,0.423,0.434
category_oneLevel[T.Other],-0.0469,0.004,-12.321,0.000,-0.054,-0.039
category_oneLevel[T.Sports & Outdoors],0.1718,0.005,35.138,0.000,0.162,0.181
category_oneLevel[T.Vintage & Collectibles],0.0974,0.004,25.840,0.000,0.090,0.105

0,1,2,3
Omnibus:,125389.911,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,190207.843
Skew:,0.666,Prob(JB):,0.0
Kurtosis:,4.149,Cond. No.,11.2


In [49]:
cat_one_results.params

Intercept                                      2.800349
category_oneLevel[T.Electronics]               0.185905
category_oneLevel[T.Handmade]                 -0.221245
category_oneLevel[T.Home]                      0.190374
category_oneLevel[T.Kids]                      0.011573
category_oneLevel[T.Men]                       0.428328
category_oneLevel[T.Other]                    -0.046918
category_oneLevel[T.Sports & Outdoors]         0.171780
category_oneLevel[T.Vintage & Collectibles]    0.097373
category_oneLevel[T.Women]                     0.280360
dtype: float64

In [50]:
cat_one_results.bse

Intercept                                      0.001612
category_oneLevel[T.Electronics]               0.002645
category_oneLevel[T.Handmade]                  0.004483
category_oneLevel[T.Home]                      0.003248
category_oneLevel[T.Kids]                      0.002396
category_oneLevel[T.Men]                       0.002891
category_oneLevel[T.Other]                     0.003808
category_oneLevel[T.Sports & Outdoors]         0.004889
category_oneLevel[T.Vintage & Collectibles]    0.003768
category_oneLevel[T.Women]                     0.001847
dtype: float64

In [58]:
# Assuming Gaussian distrubution (not completely sound, as shown by prior probability plot),
# Find the KL divergences for each category
cat_one_means = pd.Series(cat_one_results.params[0] + cat_one_results.params[1:])
#{'mean' : cat_one_results.params)
cat_one_sems = pd.Series(cat_one_results.bse[1:])
cat_one_stds = cat_one_sems

category_oneLevel[T.Electronics]               0.002645
category_oneLevel[T.Handmade]                  0.004483
category_oneLevel[T.Home]                      0.003248
category_oneLevel[T.Kids]                      0.002396
category_oneLevel[T.Men]                       0.002891
category_oneLevel[T.Other]                     0.003808
category_oneLevel[T.Sports & Outdoors]         0.004889
category_oneLevel[T.Vintage & Collectibles]    0.003768
category_oneLevel[T.Women]                     0.001847
dtype: float64

In [56]:
# Divergences of distributions of means


#You want stds not sems

pandas.core.series.Series