In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import json
import pandas as pd
import itertools
import sys
import re
import string
import nltk
from nltk import word_tokenize
from collections import defaultdict
from sklearn.metrics import classification_report, accuracy_score, f1_score

from shared_lib import utils

In [2]:
productFeatures_df = pd.read_csv("product_features.csv")
productFeatures_df.head()

Unnamed: 0,brand,description,gender,modelId,price,productName,rating,specs
0,Adidas,mi adizero adios 3 Shoes feature an energy-ret...,W,4002178_W,160,mi adizero adios 3 Shoes,1.8,Runner type: neut boost™ is our most responsiv...
1,Adidas,,W,4002179_W,100,mi Energy Cloud Shoes,3.3,
2,Adidas,"Designed specifically for women, these shoes a...",W,4002255_W,150,mi PureBoost X Shoes,5.0,Select single-color mesh or graphic-print mesh...
3,Adidas,Create your own adidas Supernova with boost™ t...,W,4002435_W,150,mi Supernova Shoes,3.9,Choose your outsole color for a custom footpri...
4,Adidas,The boost™ midsole in these running shoes rele...,M,4002556_M,160,mi PureBoost Shoes,4.5,boost™ is our most responsive cushioning ever:...


In [42]:
productFeatures_df.loc[0, 'specs']

"Runner type: neut boost\xe2\x84\xa2 is our most responsive cushioning ever: The more energy you give, the more you get Open mesh upper for maximum lightweight breathability; Textile and synthetic overlays for added support Designed for high speed, Microfit locks the foot down for a direct fit and fast run Show where you're from with a country flag lace clip. TORSION\xc2\xae SYSTEM between the heel and forefoot for a stable ride Imported"

### Explore specs words

In [23]:
specs_string = ''.join(productFeatures_df['specs'].apply(str)).decode('utf-8')
specs_tokens = utils.preprocess_stop_stem(utils.clean_str(specs_string), punct=True, stem=False, stop=True, sent=False)

In [31]:
word_count = defaultdict(lambda: 0)
for word in specs_tokens:
    word_count[word] += 1

#### Most freq words

In [38]:
for key, value in sorted(word_count.iteritems(), key=lambda (k,v): (v,k), reverse=True):
    if value > 15:
        print key, value

upper 267
outsole 265
mm 252
heel 248
rubber 229
fit 219
textile 213
forefoot 151
support 149
cushioning 138
lining 134
ride 122
comfort 119
midsole 114
provides 112
type 107
natural 102
importedrunner 102
synthetic 98
size 92
ounces 90
weight 88
drop 88
9 86
energized 85
foot 83
neutral 79
10 79
responsive 75
energy 75
boost 74
mesh 72
5 69
stretchweb 66
flexes 65
ever 65
give 64
get 64
counter 64
7 60
adaptive 59
molded 57
achilles 57
flexibility 56
optimal 55
movement 55
allows 54
designed 53
stable 52
underfoot 51
continental 51
system 50
stretch 50
seamless 50
wraps 49
fitcounter 49
torsion 48
supportive 46
comfortable 46
traction 44
8 44
premium 42
construction 41
lightweight 40
every 40
engineered 40
conditions 39
like 38
adidas 38
adapts 38
move 37
wet 35
breathable 35
extraordinary 34
midfoot 33
ensure 33
dry 33
custom 33
tongue 32
sock 32
bounce 32
primeknit 31
feel 31
22 31
ultralight 30
superior 30
stability 30
shoes 30
knit 30
help 29
areas 29
offer 27
high 27
helps 27
for

### Explorer description words

In [39]:
desc_string = ''.join(productFeatures_df['description'].apply(str)).decode('utf-8')
decs_tokens = utils.preprocess_stop_stem(utils.clean_str(desc_string), punct=True, stem=False, stop=True, sent=False)

In [40]:
word_count = defaultdict(lambda: 0)
for word in specs_tokens:
    word_count[word] += 1

#### Most freq words

In [41]:
for key, value in sorted(word_count.iteritems(), key=lambda (k,v): (v,k), reverse=True):
    if value > 15:
        print key, value

upper 267
outsole 265
mm 252
heel 248
rubber 229
fit 219
textile 213
forefoot 151
support 149
cushioning 138
lining 134
ride 122
comfort 119
midsole 114
provides 112
type 107
natural 102
importedrunner 102
synthetic 98
size 92
ounces 90
weight 88
drop 88
9 86
energized 85
foot 83
neutral 79
10 79
responsive 75
energy 75
boost 74
mesh 72
5 69
stretchweb 66
flexes 65
ever 65
give 64
get 64
counter 64
7 60
adaptive 59
molded 57
achilles 57
flexibility 56
optimal 55
movement 55
allows 54
designed 53
stable 52
underfoot 51
continental 51
system 50
stretch 50
seamless 50
wraps 49
fitcounter 49
torsion 48
supportive 46
comfortable 46
traction 44
8 44
premium 42
construction 41
lightweight 40
every 40
engineered 40
conditions 39
like 38
adidas 38
adapts 38
move 37
wet 35
breathable 35
extraordinary 34
midfoot 33
ensure 33
dry 33
custom 33
tongue 32
sock 32
bounce 32
primeknit 31
feel 31
22 31
ultralight 30
superior 30
stability 30
shoes 30
knit 30
help 29
areas 29
offer 27
high 27
helps 27
for

In [5]:
new_df = pd.DataFrame(
    {"specs": productFeatures_df['specs'].apply(str),
     "desc": productFeatures_df['description'].apply(str)})

In [6]:
new_df.head(5)

Unnamed: 0,desc,specs
0,mi adizero adios 3 Shoes feature an energy-ret...,Runner type: neut boost™ is our most responsiv...
1,,
2,"Designed specifically for women, these shoes a...",Select single-color mesh or graphic-print mesh...
3,Create your own adidas Supernova with boost™ t...,Choose your outsole color for a custom footpri...
4,The boost™ midsole in these running shoes rele...,boost™ is our most responsive cushioning ever:...


In [8]:
new_df['all_specs'] = new_df['desc'] + " " + new_df['specs']
new_df.drop(columns = ['desc', 'specs'], inplace=True)

In [21]:
cond1 = new_df['all_specs'].str.contains('running shoes')
cond2 = new_df['all_specs'].str.contains('training')
cond3 = new_df['all_specs'].str.contains('basketball')
cond4 = new_df['all_specs'].str.contains('tennis')

In [41]:
x = new_df[cond2]

In [40]:
x.loc[0, 'all_specs']

"mi adizero adios 3 Shoes feature an energy-returning boost midsole and an open mesh upper with overlays for lightweight, breathable support. The TORSION\xc2\xae SYSTEM gives you targeted stability, and a high-wear rubber outsole grips on dry and slick surfaces. Runner type: neut boost\xe2\x84\xa2 is our most responsive cushioning ever: The more energy you give, the more you get Open mesh upper for maximum lightweight breathability; Textile and synthetic overlays for added support Designed for high speed, Microfit locks the foot down for a direct fit and fast run Show where you're from with a country flag lace clip. TORSION\xc2\xae SYSTEM between the heel and forefoot for a stable ride Imported"