In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline


In [2]:
#Verify path to data
import os
print(os.listdir('./input'))


['sample_submission.csv', 'test.tsv', 'train.tsv', 'zipped']


In [3]:
# load training data to dataframe
train_df = pd.read_table("./input/train.tsv")

In [4]:
#review training data
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


Condition ID's
1 - New
2 - Like New
3 - Good
4 - Fair
5 - Poor

Clothing
1 - New: NWT (New With Tags), unworn, unaltered and includes the original tags.
2 - Like New: NWOT (New without tags) or very lightly used with no flaws or damage.
3 - Good: Gently used but still may have minor flaws such as pilling, stretching, or loose threads.
4 - Fair: Multiple minor flaws, defects, or damage such as rips, light stains, pilling.
5 - Poor: Heavily used and has major cosmetic flaws or damage such as holes, stains, fading, or missing buttons/beads.

Shoes 
1 - New: New, unworn and still in the original box.
2 - Like New: Unworn or only tried on. No creases, but may not be in the original box.
3 - Good: Used, but there are only a few minor flaws such as slight wear on the sole, light creases, or small scuffs.
4 - Fair: Show signs of wear and has multiple major flaws, such as holes, rips, creases, or stains, but the sole is intact.
5 - Poor: Heavily worn and have multiple major flaws, such as large holes, stains, scuffs, or the soles are not intact.

Electronics
1 - New: Unused in the original packaging.
2 - Like New: lightly used and fully functional, but does not include the original packaging.
3 - Good: Gently used and may have minor cosmetic flaws, but is fully functional.
4 - Fair: Used and has multiple flaws but is overall functional.
5 - Poor: Heavily used, has major cosmetic flaws or damage, non-functional or sold as parts.

Condition comments may appear in name or description
1 - New
    BNWT: Brand New With Tags
    BNIP: Brand New In Packet 
    MIB: Mint In Box
    NWT: New With Tags
    NIB: New In Box
    BNIB: Brand New In Box

2 - Like New
    BNWOT: Brand New Without Tags
    MWOB: Mint With Out Box
    NWOT: New without tags or very lightly used with no flaws or damage
    BNWOB: Brand New Without Box


In [5]:
# Summary stats
train_df.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1482535.0,1482535.0,1482535.0,1482535.0
mean,741267.0,1.90738,26.73752,0.4472744
std,427971.1,0.9031586,38.58607,0.4972124
min,0.0,1.0,0.0,0.0
25%,370633.5,1.0,10.0,0.0
50%,741267.0,2.0,17.0,0.0
75%,1111900.0,3.0,29.0,1.0
max,1482534.0,5.0,2009.0,1.0


In [6]:
# For category name, parse out the first three levels and append them to the data frame as their own fields
a, b, c, d = train_df['category_name'].str.split("/", 3).str
category_oneLevel = a
category_twoLevel = a + "/" + b
category_threeLevel = a + "/" + b + "/" + c
train_df['category_oneLevel'] = category_oneLevel
train_df['category_twoLevel'] = category_twoLevel
train_df['category_threeLevel'] = category_threeLevel

In [7]:
#review training data and verify appended columns
train_df.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_oneLevel,category_twoLevel,category_threeLevel
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet,Men,Men/Tops,Men/Tops/T-shirts
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,Electronics,Electronics/Computers & Tablets,Electronics/Computers & Tablets/Components & P...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,Women,Women/Tops & Blouses,Women/Tops & Blouses/Blouse
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...,Home,Home/Home Décor,Home/Home Décor/Home Décor Accents
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity,Women,Women/Jewelry,Women/Jewelry/Necklaces


In [16]:
#pd.Series([y for x in train_df.values['item_description'].flatten() for y in x.split()]).value_counts()

# Feature extraction from text
# Method: bag of words 
# https://pythonprogramminglanguage.com
 
from sklearn.feature_extraction.text import CountVectorizer

# create a dataframe for rows with a top level category of "Women"
women_df = train_df.loc[train_df['category_oneLevel'] == 'Women']

#randomly split it into smaller dataframes to avoid memory error
from sklearn.model_selection import train_test_split

train_women_df, test_women_df = train_test_split(women_df, test_size=0.8)

train_women_df.describe()

#breaking these out to step through and isolate memory error
#corpus = women_df['item_description'][pd.notnull(women_df['item_description'])]

#vectorizer = CountVectorizer()
#print( vectorizer.fit_transform(corpus).todense() )
#print( vectorizer.vocabulary_ )
 

Unnamed: 0,train_id,item_condition_id,price,shipping
count,132877.0,132877.0,132877.0,132877.0
mean,740603.1,2.023383,28.947542,0.389134
std,428271.7,0.869717,39.078059,0.487556
min,12.0,1.0,0.0,0.0
25%,369488.0,1.0,12.0,0.0
50%,741912.0,2.0,19.0,0.0
75%,1111095.0,3.0,34.0,1.0
max,1482526.0,5.0,2000.0,1.0


In [17]:
corpus = train_women_df['item_description'][pd.notnull(train_women_df['item_description'])]

In [18]:
corpus.describe()

count                 132877
unique                119901
top       No description yet
freq                    7525
Name: item_description, dtype: object

In [19]:
vectorizer = CountVectorizer()

In [20]:
#This is where I get the memory error if I'm running with the entire training set
print(vectorizer.fit_transform(corpus).todense())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [21]:
print( vectorizer.vocabulary_ )

