In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


In [2]:
#Verify path to data
import os
print(os.listdir('./input'))


['sample_submission.csv', 'test.tsv', 'train.tsv', 'zipped']


In [3]:
# load training data to dataframe
train_df = pd.read_table("./input/train.tsv")

In [4]:
#review training data
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


Condition ID's
1 - New
2 - Like New
3 - Good
4 - Fair
5 - Poor

Clothing
1 - New: NWT (New With Tags), unworn, unaltered and includes the original tags.
2 - Like New: NWOT (New without tags) or very lightly used with no flaws or damage.
3 - Good: Gently used but still may have minor flaws such as pilling, stretching, or loose threads.
4 - Fair: Multiple minor flaws, defects, or damage such as rips, light stains, pilling.
5 - Poor: Heavily used and has major cosmetic flaws or damage such as holes, stains, fading, or missing buttons/beads.

Shoes 
1 - New: New, unworn and still in the original box.
2 - Like New: Unworn or only tried on. No creases, but may not be in the original box.
3 - Good: Used, but there are only a few minor flaws such as slight wear on the sole, light creases, or small scuffs.
4 - Fair: Show signs of wear and has multiple major flaws, such as holes, rips, creases, or stains, but the sole is intact.
5 - Poor: Heavily worn and have multiple major flaws, such as large holes, stains, scuffs, or the soles are not intact.

Electronics
1 - New: Unused in the original packaging.
2 - Like New: lightly used and fully functional, but does not include the original packaging.
3 - Good: Gently used and may have minor cosmetic flaws, but is fully functional.
4 - Fair: Used and has multiple flaws but is overall functional.
5 - Poor: Heavily used, has major cosmetic flaws or damage, non-functional or sold as parts.

Condition comments may appear in name or description
1 - New
    BNWT: Brand New With Tags
    BNIP: Brand New In Packet 
    MIB: Mint In Box
    NWT: New With Tags
    NIB: New In Box
    BNIB: Brand New In Box

2 - Like New
    BNWOT: Brand New Without Tags
    MWOB: Mint With Out Box
    NWOT: New without tags or very lightly used with no flaws or damage
    BNWOB: Brand New Without Box


In [5]:
# Summary stats for numeric fields. Not much useful here other than price.
train_df.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1482535.0,1482535.0,1482535.0,1482535.0
mean,741267.0,1.90738,26.73752,0.4472744
std,427971.1,0.9031586,38.58607,0.4972124
min,0.0,1.0,0.0,0.0
25%,370633.5,1.0,10.0,0.0
50%,741267.0,2.0,17.0,0.0
75%,1111900.0,3.0,29.0,1.0
max,1482534.0,5.0,2009.0,1.0


In [6]:
#Describe the values of the category name feature
train_df['category_name'].describe()

count                                            1476208
unique                                              1287
top       Women/Athletic Apparel/Pants, Tights, Leggings
freq                                               60177
Name: category_name, dtype: object

In [7]:
#Grouping by category name, get summary stats for price and load the results into their own dataframe.
category_price_df = train_df.groupby('category_name').describe()['price'].reset_index()
print(category_price_df)

                                     category_name    count        mean  \
0                          Beauty/Bath & Body/Bath   5049.0   18.481878   
1           Beauty/Bath & Body/Bathing Accessories    332.0   21.882530   
2                     Beauty/Bath & Body/Cleansers    723.0   13.564315   
3                         Beauty/Bath & Body/Other    147.0   20.469388   
4      Beauty/Bath & Body/Scrubs & Body Treatments    812.0   24.640394   
5                          Beauty/Bath & Body/Sets    695.0   21.086331   
6           Beauty/Fragrance/Candles & Home Scents   2965.0   21.093423   
7                            Beauty/Fragrance/Kids     87.0    8.379310   
8                             Beauty/Fragrance/Men   2114.0   28.555818   
9                           Beauty/Fragrance/Other     39.0   24.794872   
10                           Beauty/Fragrance/Sets    461.0   25.603037   
11                          Beauty/Fragrance/Women  18628.0   23.717790   
12                   Beau

In [8]:
# Show price stats for Men/Tops/T-shirts
category_price_df.loc[category_price_df['category_name'] == 'Men/Tops/T-shirts']

Unnamed: 0,category_name,count,mean,std,min,25%,50%,75%,max
808,Men/Tops/T-shirts,15108.0,18.352495,16.307762,0.0,10.0,14.0,20.0,309.0


In [9]:
# Feature extraction from text
# Method: bag of words 
# https://pythonprogramminglanguage.com
 
from sklearn.feature_extraction.text import CountVectorizer

# create a dataframe for rows with category of "Men/Tops/T-shirts"
MensTees_df = train_df.loc[train_df['category_name'] == 'Men/Tops/T-shirts']

# Summary stats. Verify they match cell above.
MensTees_df.describe() 

Unnamed: 0,train_id,item_condition_id,price,shipping
count,15108.0,15108.0,15108.0,15108.0
mean,737412.8,2.142441,18.352495,0.394559
std,427335.7,0.871937,16.307762,0.488772
min,0.0,1.0,0.0,0.0
25%,373212.0,1.0,10.0,0.0
50%,729965.0,2.0,14.0,0.0
75%,1113179.0,3.0,20.0,1.0
max,1482464.0,5.0,309.0,1.0


In [10]:
#Create a corpus of words in the item description for this category
corpus = MensTees_df['item_description'][pd.notnull(MensTees_df['item_description'])]

In [11]:
#Describe the values in the corpus
corpus.describe()

count                  15108
unique                 12860
top       No description yet
freq                    1257
Name: item_description, dtype: object

In [12]:
vectorizer = CountVectorizer(stop_words="english")

In [13]:
#print sparse array
print(vectorizer.fit_transform(corpus).toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [14]:
#Print length of vocabulary and dictionary entries. For 15K rows, there are almost 9400 distinct words.
print(len(vectorizer.vocabulary_)), print( vectorizer.vocabulary_ )

9393


(None, None)

In [15]:
#Print vectorizer stop words
vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [16]:
#Transform to a sorted list to illustrate the frequency of words in the corpus
transformed = vectorizer.fit_transform(corpus)
sparse_sums = transformed.sum(axis = 0)
feature_names = vectorizer.get_feature_names()
sorted_word_counts = sorted(zip(feature_names, sparse_sums.tolist()[0]), key = lambda x: x[1], reverse = True)
print(sorted_word_counts)



In [30]:
top_n_words = [i[0] for i in sorted_word_counts[:100]]
print(len(top_n_words)), top_n_words

100


(None,
 ['size',
  'shirt',
  'new',
  'condition',
  'men',
  'worn',
  'large',
  'medium',
  'brand',
  'small',
  'xl',
  'free',
  'black',
  'shipping',
  'tee',
  'great',
  'tags',
  'good',
  'description',
  'rm',
  'like',
  'shirts',
  'sleeve',
  'color',
  'blue',
  'white',
  'bundle',
  'price',
  'cotton',
  'stains',
  'nike',
  'long',
  '100',
  'fit',
  'items',
  'tshirt',
  'mens',
  'used',
  'vintage',
  'home',
  'fits',
  'smoke',
  'tag',
  'logo',
  'polo',
  'red',
  'ship',
  'firm',
  'authentic',
  'times',
  'ask',
  'just',
  'excellent',
  'wear',
  'check',
  'short',
  'flaws',
  'neck',
  'gray',
  'xxl',
  'navy',
  'holes',
  'sizes',
  'grey',
  'american',
  '2xl',
  'rips',
  'unisex',
  'adidas',
  'questions',
  'supreme',
  'perfect',
  'green',
  'day',
  'soft',
  'band',
  'graphic',
  'save',
  'item',
  '10',
  'super',
  'jordan',
  'washed',
  'bought',
  'available',
  'bape',
  'sleeves',
  'nwt',
  'offers',
  'comes',
  'light',

In [17]:
# Find words that appear more than the specified number of times
# Set to 250 to get a list of 100 words for thsi example
def filter_value( wordList, frequency ):
    for x, y in wordList:
        if y > frequency :
            yield x

frequent_words= list( filter_value( sorted_word_counts, 250 ) )
print(len(frequent_words)), print(frequent_words)


100
['size', 'shirt', 'new', 'condition', 'men', 'worn', 'large', 'medium', 'brand', 'small', 'xl', 'free', 'black', 'shipping', 'tee', 'great', 'tags', 'good', 'description', 'rm', 'like', 'shirts', 'sleeve', 'color', 'blue', 'white', 'bundle', 'price', 'cotton', 'stains', 'nike', 'long', '100', 'fit', 'items', 'tshirt', 'mens', 'used', 'vintage', 'home', 'fits', 'smoke', 'tag', 'logo', 'polo', 'red', 'ship', 'firm', 'authentic', 'times', 'ask', 'just', 'excellent', 'wear', 'check', 'short', 'flaws', 'neck', 'gray', 'xxl', 'navy', 'holes', 'sizes', 'grey', 'american', '2xl', 'rips', 'unisex', 'adidas', 'questions', 'supreme', 'perfect', 'green', 'day', 'soft', 'band', 'graphic', 'save', 'item', '10', 'super', 'jordan', 'washed', 'bought', 'available', 'bape', 'sleeves', 'nwt', 'offers', 'comes', 'light', 'listings', 'nice', 'extra', 'tears', 'ralph', 'lauren', 'pink', 'women', 'don']


(None, None)

In [39]:
mens_Ts_cat_desc_price_df = category_price_df.loc[category_price_df['category_name'] == 'Men/Tops/T-shirts'].reset_index()
mens_Ts_cat_desc_price_df

Unnamed: 0,index,category_name,count,mean,std,min,25%,50%,75%,max
0,808,Men/Tops/T-shirts,15108.0,18.352495,16.307762,0.0,10.0,14.0,20.0,309.0


In [40]:
mens_Ts_cat_desc_price_df = category_price_df.loc[category_price_df['category_name'] == 'Men/Tops/T-shirts'].reset_index()
mens_Ts_cat_desc_price_df = mens_Ts_cat_desc_price_df.drop(['index'], axis=1)
mens_Ts_cat_desc_price_df

Unnamed: 0,category_name,count,mean,std,min,25%,50%,75%,max
0,Men/Tops/T-shirts,15108.0,18.352495,16.307762,0.0,10.0,14.0,20.0,309.0


In [41]:
mens_Ts_cat_desc_price_df['desc_word'] = pd.Series(['all'])
mens_Ts_cat_desc_price_df

Unnamed: 0,category_name,count,mean,std,min,25%,50%,75%,max,desc_word
0,Men/Tops/T-shirts,15108.0,18.352495,16.307762,0.0,10.0,14.0,20.0,309.0,all


In [20]:
MensTees_df[MensTees_df['item_description'].str.contains('size')].describe()['price']

count    3164.000000
mean       18.541719
std        15.119332
min         0.000000
25%        10.000000
50%        14.000000
75%        20.000000
max       215.000000
Name: price, dtype: float64

In [42]:
mens_Ts_cat_desc_price_df.loc[mens_Ts_cat_desc_price_df.index.max() + 1] = MensTees_df[MensTees_df['item_description'].str.contains('size')].describe()['price']
mens_Ts_cat_desc_price_df.loc[mens_Ts_cat_desc_price_df.index.max(), 'category_name'] = 'Men/Tops/T-shirts'
mens_Ts_cat_desc_price_df.loc[mens_Ts_cat_desc_price_df.index.max(), 'desc_word'] = 'size'
mens_Ts_cat_desc_price_df

Unnamed: 0,category_name,count,mean,std,min,25%,50%,75%,max,desc_word
0,Men/Tops/T-shirts,15108.0,18.352495,16.307762,0.0,10.0,14.0,20.0,309.0,all
1,Men/Tops/T-shirts,3164.0,18.541719,15.119332,0.0,10.0,14.0,20.0,215.0,size


In [80]:
##############################
##############################
##############################
import sys
import time

category_name = 'Men/Tops/T-shirts'
num_words = 100

cat_desc_price_df = pd.DataFrame(columns=["category_name", "count", "mean", "std", "min", "25%", "50%", "75%", "max", "desc"])
cat_row = category_price_df.loc[category_price_df['category_name'] == 'Men/Tops/T-shirts']
cat_desc_price_df = cat_desc_price_df.append({
        "category_name": category_name, 
        "count": cat_row.get(0), 
        "mean": cat_row.get(1), 
        "std": cat_row.get(2), 
        "min": cat_row.get(3), 
        "25%": cat_row.get(4), 
        "50%": cat_row.get(5), 
        "75%": cat_row.get(6), 
        "max": cat_row.get(7), 
        "desc_word": "all"
    }, ignore_index=True)

corpus = train_df.loc[train_df['category_name'] == category_name]['item_description'][pd.notnull(train_df['item_description'])]

#Transform to a sorted list to illustrate the frequency of words in the corpus
transformed = vectorizer.fit_transform(corpus)
sparse_sums = transformed.sum(axis = 0)
feature_names = vectorizer.get_feature_names()
sorted_word_counts = sorted(zip(feature_names, sparse_sums.tolist()[0]), key = lambda x: x[1], reverse = True)

# Find top n words in sorted list
top_n_words = [i[0] for i in sorted_word_counts[:num_words]]

for word in top_n_words:
    row = train_df.loc[(train_df['category_name'] == category_name) & (train_df['item_description'].str.contains(word))].describe()['price']    
    cat_desc_price_df = cat_desc_price_df.append({
        "category_name": category_name, 
        "count": row.get(0), 
        "mean": row.get(1), 
        "std": row.get(2), 
        "min": row.get(3), 
        "25%": row.get(4), 
        "50%": row.get(5), 
        "75%": row.get(6), 
        "max": row.get(7), 
        "desc_word": word
    }, ignore_index=True)

cat_desc_price_df


Unnamed: 0,category_name,count,mean,std,min,25%,50%,75%,max,desc,desc_word
0,Men/Tops/T-shirts,,,,,,,,,,all
1,Men/Tops/T-shirts,3164.0,18.541719,15.119332,0.0,10.00,14.0,20.00,215.0,,size
2,Men/Tops/T-shirts,4999.0,17.402180,14.007666,0.0,10.00,14.0,20.00,219.0,,shirt
3,Men/Tops/T-shirts,2314.0,21.748487,18.280724,0.0,11.00,16.0,25.00,230.0,,new
4,Men/Tops/T-shirts,3076.0,15.951073,11.948901,0.0,10.00,13.0,18.00,129.0,,condition
5,Men/Tops/T-shirts,1990.0,18.253266,14.863893,0.0,10.00,14.0,21.00,230.0,,men
6,Men/Tops/T-shirts,1999.0,17.870435,14.748805,0.0,10.00,14.0,20.00,176.0,,worn
7,Men/Tops/T-shirts,1497.0,17.609552,14.727925,0.0,10.00,14.0,20.00,200.0,,large
8,Men/Tops/T-shirts,1380.0,19.302174,18.128119,0.0,10.00,14.0,20.25,230.0,,medium
9,Men/Tops/T-shirts,780.0,19.247436,15.765012,0.0,11.00,15.0,20.00,200.0,,brand


In [87]:
pd.set_option('display.max_colwidth', -1)
train_df.loc[(train_df['category_name'] == 'Men/Tops/T-shirts') & (train_df['item_description'].str.contains('lauren'))]

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
43519,43519,Vintage goldberg wcw 90z t shirt,3,Men/Tops/T-shirts,Vintage,16.0,0,Size xl. Fits tts. Tags: wcw wwf polo sport ralph lauren hat bear tommy hilfiger big logo champion basketball jersey
176406,176406,Ralph lauren!,3,Men/Tops/T-shirts,Polo Ralph Lauren,21.0,0,Ralph lauren mens pullover!
186074,186074,NIKE ORANGE SHORT SLEEVE SHIRT SIZE M,3,Men/Tops/T-shirts,Nike,12.0,0,"supreme, bape, Louis Vuitton, neywork, streetwear, Boxer, men's, new, Gucci balmain Adidas ultra boost NMD yeezy air Jordan retro 123456789 10 11 12 13 14 vlone off the white ferragamo stussy kobe lebron kd kyrie curry iphone Michael Kors, cheap, pink, Victoria's Secret, Air Force One, high, low, puma, nike, adidas, tommy hilfiger, banana republic, north face, black, white, bath and body works, kylie, lip kit, juicy couture, fidget spinner, hoodie, shoes, ralph lauren, comfort colors, tyler's, guy harvey, vans, reebok"
223085,223085,white ralph lauren polo t shirt,2,Men/Tops/T-shirts,Polo Ralph Lauren,27.0,0,white ralph lauren polo t shirt
247257,247257,Stussy baseball jersey,3,Men/Tops/T-shirts,Stussy,25.0,1,Size small Great condition Worn literally 1x paid [rm] for it Key words: Nike jordan adidas primeknit flyknit air jordan retro ogs original vintage bred true blue yeezy air max tubular limited tommy hilfiger polo ralph lauren kanye west travis scott asap rocky bape supreme thrasher palace gucci louis vuitton a bathing ape baby milo guess dkny prestos soccer cleats michael jordan huarache jordan 1 2 3 ultraboost nmd boost y3 timberland michael kors vineyard vines patagonia calvin klein off white ronaldo messi neymar yeezus dad hat
266625,266625,Super rare vintage tupac tshirt rap,1,Men/Tops/T-shirts,Vintage,15.0,1,Not really vintage lol But still dope as hell Pay homage to one of the greatest rappers to live Key words: Nike jordan adidas primeknit flyknit air jordan retro ogs original vintage bred true blue yeezy air max tubular limited tommy hilfiger polo ralph lauren kanye west travis scott asap rocky bape supreme thrasher palace gucci louis vuitton a bathing ape baby milo guess dkny roshe run free run prestos soccer cleats michael jordan huarache jordan 1 2 3 concert tee rap tee
273624,273624,Vintage Ralph Lauren Polo Jeans T Shirt,2,Men/Tops/T-shirts,Polo Ralph Lauren,15.0,1,Like new red ralph lauren polo jeans t shirt for mens Large
315541,315541,vintage polo winter sport long sleeve,3,Men/Tops/T-shirts,Polo Ralph Lauren,20.0,0,"very good condition for age has some cracking on graphic size small but could pass as a medium tags: bape, polo sport, vintage, polo bear, nautica, Tommy Hilfiger, tommy jeans, supreme, thrasher, stussy, jordan, nike, adidas, ralph lauren,"
386624,386624,Ralph Lauren Embroidered Logo Shirt Med,3,Men/Tops/T-shirts,Ralph Lauren,16.0,1,Vtg Ralph Lauren Embroidered logo shirt. Thick. In good used condition. Has one tiny hole near left shoulder. Otherwise great condition. Any questions please ask. • [rm] shipped • #ralphlauren #vintage #embroidered #sizemedium
442010,442010,Polo ralph lauren shirt,1,Men/Tops/T-shirts,Polo Ralph Lauren,18.0,0,Size large Authentic ralph lauren Brand new with tag One shirt
