In [24]:
!pip install eli5



In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [6]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix/dw_matrix"

/content/drive/My Drive/Colab Notebooks/dw_matrix/dw_matrix


In [0]:
df = pd.read_csv('data/women_shoes.csv', low_memory=False)

In [0]:
def run_model(feats, model = DecisionTreeRegressor(max_depth=5)):
  x = df[ feats ].values
  y = df['prices_amountmin'].values

  scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error')
  return np.mean(scores), np.std(scores)

In [9]:
#df[ 'brand_cat'] = df[ 'brand'].factorize()[0]
df[ 'brand_cat'] = df[ 'brand'].map(lambda x: str(x).lower()).factorize()[0]
run_model(['brand_cat'])

(-51.57118869808353, 0.6390743725376244)

In [10]:
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(['brand_cat'], model)

(-51.356432361171095, 0.7207640114801125)

In [11]:
df.features.head().values

array(['[{"key":"Season","value":["All-Season"]},{"key":"Material","value":["Synthetic"]},{"key":"Gender","value":["Women"]},{"key":"Shoe Size","value":["9"]},{"key":"Size","value":["9"]},{"key":"Color","value":["Blue"]},{"key":"Model","value":["Z130103201090"]},{"key":"Manufacturer Part Number","value":["Z130103201090"]},{"key":"Brand","value":["Zoot"]},{"key":"Age Group","value":["Adult"]}]',
       '[{"key":"Season","value":["All-Season"]},{"key":"Material","value":["Synthetic"]},{"key":"Gender","value":["Women"]},{"key":"Shoe Size","value":["9"]},{"key":"Size","value":["9"]},{"key":"Color","value":["Blue"]},{"key":"Model","value":["Z130103201090"]},{"key":"Manufacturer Part Number","value":["Z130103201090"]},{"key":"Brand","value":["Zoot"]},{"key":"Age Group","value":["Adult"]}]',
       '[{"key":"Season","value":["All-Season"]},{"key":"Material","value":["Synthetic"]},{"key":"Gender","value":["Women"]},{"key":"Shoe Size","value":["9"]},{"key":"Size","value":["9"]},{"key":"Color","

In [0]:
#test = {'key': 'value'}
#test['key']
#str(test)

"{'key': 'value'}"

In [12]:
str_dict ='[{"key":"Season","value":["All-Season"]},{"key":"Material","value":["Synthetic"]},{"key":"Gender","value":["Women"]},{"key":"Shoe Size","value":["9"]},{"key":"Size","value":["9"]},{"key":"Color","value":["Blue"]},{"key":"Model","value":["Z130103201090"]},{"key":"Manufacturer Part Number","value":["Z130103201090"]},{"key":"Brand","value":["Zoot"]},{"key":"Age Group","value":["Adult"]}]'


literal_eval(str_dict)[0]['key']

'Season'

In [0]:
def parse_features(x):
  output_dict = {}
  if str(x) == 'nan': return output_dict

  features = literal_eval(x.replace('\\"','"'))
  for item in features:
    key = item['key'].lower().strip()
    value = item['value'][0].lower().strip()

    output_dict[key] = value
  
  return output_dict


df['features_parsed'] = df['features'].map(parse_features)

In [14]:
 keys = set()
#df['features_parsed'].head().values
df['features_parsed'].map(lambda x: keys.update(x.keys()))

len(keys)

342

In [15]:
def get_name_feat(key):
  return 'feat_'+key

for key in tqdm_notebook(keys):
  df[get_name_feat(key)] = df.features_parsed.map(lambda feats: feats[key] if key in feats else np.nan)

HBox(children=(IntProgress(value=0, max=342), HTML(value='')))




In [16]:
df.shape[0]

18001

In [0]:
keys_stat = {}
for key in keys:
  keys_stat[key] = df[ False == df[get_name_feat(key)].isnull() ].shape[0] / df.shape[0] * 100

In [18]:
{k:v for k,v in keys_stat.items() if v > 40 }

{'age group': 42.54208099550025,
 'brand': 57.22459863340925,
 'color': 53.1414921393256,
 'gender': 58.835620243319816,
 'manufacturer part number': 44.30309427254041,
 'material': 48.458418976723515}

In [0]:
df['feat_brand_cat'] = df['feat_brand'].factorize()[0]
df['feat_color_cat'] = df['feat_color'].factorize()[0]
df['feat_material_cat'] = df['feat_material'].factorize()[0]
df['feat_gender_cat'] = df['feat_gender'].factorize()[0]
df['feat_manufacturer part number_cat'] = df['feat_manufacturer part number'].factorize()[0]

df['feat_sport_cat'] = df['feat_sport'].factorize()[0]
df['feat_style_cat'] = df['feat_style'].factorize()[0]
df['feat_condition_cat'] = df['feat_condition'].factorize()[0]
df['feat_mpn_cat'] = df['feat_mpn'].factorize()[0]
df['feat_heel_height_cat'] = df['feat_heel height'].factorize()[0]

for key in keys:
  df[get_name_feat(key)+ '_cat'] = df[get_name_feat(key)].factorize()[0]


In [20]:
df['brand'] = df['brand'].map(lambda x: str(x).lower())
#df[df.brand == df.feat_brand][['brand','feat_brand']].head()
df[df.brand == df.feat_brand].shape

(10245, 396)

In [73]:
df['weight'].unique()

array([nan, '1.0 lbs', '2 pounds', '2.0 lbs', '907 g', '905 grams',
       '3.0 lbs', '0.65 lbs', '1.1 lbs', '0.5 lbs', '399 g', '454 g',
       '2.4 ounces', '0.56 lbs', '1.3267 lbs', '2.13 lbs', '0.375 lbs',
       '1 pounds', '1.7 lbs', '0.2 ounces', '3.35 lbs', '386 g',
       '0.6 lbs', '14 lbs', '0.17 lbs', '680 g', '0.4202 lbs', '0.52 lbs',
       '1.6 ounces', '4.0 lbs', '2.1 pounds', '0.85 lbs', '5.0 lbs',
       '249 g', '14 Kg', '0.4788 lbs', '3 pounds', '1.67 lbs', '1.19 lbs',
       '1.66 lbs', '4.8 ounces'], dtype=object)

In [0]:
feats = ['brand_cat','feat_mpn_cat','feat_condition_cat','feat_sport_cat','feat_style_cat','feat_brand_cat', 'feat_material_cat','feat_gender_cat' ]

In [52]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
run_model(['brand_cat'],model)

(-51.35622251579232, 0.7198217504771702)

In [0]:
model = RandomForestRegressor(max_depth=5, n_estimators=100)
result = run_model(feats, model)

In [75]:
x = df[ feats ].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
m.fit(x,y)
print(result)
perm = PermutationImportance(m, random_state=1).fit(x,y);
eli5.show_weights(perm, feature_names=feats)

(-50.24388278073944, 1.513254627082612)


Weight,Feature
0.6114  ± 0.0105,brand_cat
0.3223  ± 0.0242,feat_brand_cat
0.0633  ± 0.0089,feat_gender_cat
0.0337  ± 0.0052,feat_condition_cat
0.0257  ± 0.0012,feat_material_cat
0.0073  ± 0.0019,feat_style_cat
0.0069  ± 0.0010,feat_mpn_cat
0.0001  ± 0.0000,feat_sport_cat


In [35]:
df['brand'].value_counts(normalize=True)

ralph lauren    0.026332
nike            0.020443
toms            0.018221
muk luks        0.013221
easy spirit     0.012888
                  ...   
ted baker       0.000056
cejon           0.000056
aqua            0.000056
callaway        0.000056
efco            0.000056
Name: brand, Length: 1954, dtype: float64

In [45]:
df[ df['brand'] == 'nike'].features_parsed.sample(5).values

array([{'manufacturer part number': '807145 001', 'season': 'all-season', 'shoe size': '6', 'gender': 'women', 'material': 'canvas', 'brand': 'nike', 'age group': 'adult', 'color': 'black'},
       {},
       {'sport': 'running shoes', 'occasion': 'running shoes', 'material': 'synthetic-and-mesh', 'gender': 'women', 'shoe size': '10 us women', 'shoe category': 'running shoes;training shoes;lifestyle;casual shoes', 'assembled product dimensions (l x w x h)': '12.00 x 8.00 x 6.00 inches', 'fabric content': 'synthetic-and-mesh', 'color': 'black/black/vivid purple/copa ,�� black', 'model': '724477 007', 'shoe closure': 'lace-up', 'casual & dress shoe style': 'running shoes;training shoes;lifestyle;casual shoes', 'manufacturer part number': '724477 007', 'variant group id': '224#mp#724477 007', 'brand': 'nike', 'age group': 'women', 'shoe width': 'b(m)'},
       {'sport': 'softball, lacrosse, soccer, rugby', 'type': 'cleats', 'condition': 'new without box'},
       {'sport': 'running shoes'

In [46]:
df['feat_age group'].value_counts()

adult             7375
women              156
adult ,�� teen      54
child               49
womens              12
women's              5
unisex               3
toddler              2
men                  2
Name: feat_age group, dtype: int64

In [77]:
ls matrix_one/


day03.ipynb  day4.ipynb  day5.ipynb


In [0]:
!git add matri