In [None]:
# default_exp data

In [None]:
# export
from snkrfinder.imports import *
from snkrfinder.core import *
from snkrfinder.data import *



# snkrfinder helper

### version 2.0 May 2021 ()

## create and save some useful dataframes

Script to generate and save the shortcut Pandas dataframes summarizing the database of images.   

NOTE: a symbolic link to the package is needed in the `nbs` directory so that the module imports work in these notebooks, i.e. `ln -s ../snkrfinder/ snkrfinder`




In [None]:
#hide
# Show the current working directory.
print(Path.cwd())

/home/ergonyc/Projects/Project2.0/snkrfinder/nbs


In [None]:
#hide
# When the working directory is `nbs`, step up one level so that relative
# paths such as "data/..." resolve against the parent directory instead.
if Path.cwd().name == 'nbs':
    os.chdir('..')
    print('moved out of `nbs` directory')

print(Path.cwd())

moved out of `nbs` directory
/home/ergonyc/Projects/Project2.0/snkrfinder


In [None]:
# Display D_ROOT, the base path that the Zappos image-path check is built from.
D_ROOT


Path('/home/ergonyc/Projects/Project2.0/snkrfinder/data/raw')


### Part 1: import the UT-Zappos50k database

Using fastai `untar_data`.

First we need to get a database of sneaker images.   The sneaker images are only 136 pixels wide, and organized by 'Category', 'SubCategory', and 'Brand'.   Additional semantic information from Mechanical Turk labeling is also available, packed into MATLAB files.  

In [None]:
# Get the metadata and image locations of the UT-Zappos50k database.
meta_path, im_path = get_zappos_db()

# Both returned locations must agree with the package-level path constants.
assert meta_path == ZAPPOS_META_DIR
assert im_path == D_ROOT / DBS["zappos"]


In [None]:
# Read the Zappos metadata into a dataframe.
df = read_zappos_meta(meta_path)

# Each labelled column must match, row for row, the column that sits next to
# it in the check below (Category/Category1, SubCategory/Category2).
for labelled, derived in (('Category', 'Category1'), ('SubCategory', 'Category2')):
    assert (df[labelled] == df[derived]).all()

### data set cleaning

The dataset is very diverse and was not constructed with the aesthetic relationships between popular footwear in mind.  I'm simplifying the database to include 3 main categories which seem to split aesthetics for general footwear:

    1. BOOTS - weatherized and/or protected footwear for work or outdoor activity
    2. SHOES - generic non "boot" footwear.  includes heels and formalwear
    3. SNEAKERS - athletic / comfort inspired 

I will also make a fourth category which might be useful:

    4. SLIPPERS - characterized by a sole and straps



In [None]:
# Preview the first rows of the metadata frame before any simplification.
df.head()

Unnamed: 0,CID,Category,SubCategory,HeelHeight,Insole,Closure,Gender,Material,ToeStyle,path,...,ToeStyle.Apron Toe,ToeStyle.Wide Toe Box,ToeStyle.Snip Toe,ToeStyle.Peep Toe,ToeStyle.Medallion,path_and_file,Category1,Category2,Brand,Filename
0,100627-72,Shoes,Oxfords,,Leather,Lace up,Men,Leather,Capped Toe;Round Toe,ut-zap50k-images/Shoes/Oxfords/Bostonian/100627.72.jpg,...,0,0,0,0,0,"[Shoes, Oxfords, Bostonian, 100627.72.jpg]",Shoes,Oxfords,Bostonian,100627.72.jpg
1,100627-255,Shoes,Oxfords,,Leather,Lace up,Men,Leather,Capped Toe;Round Toe,ut-zap50k-images/Shoes/Oxfords/Bostonian/100627.255.jpg,...,0,0,0,0,0,"[Shoes, Oxfords, Bostonian, 100627.255.jpg]",Shoes,Oxfords,Bostonian,100627.255.jpg
2,100657-72,Shoes,Oxfords,,Leather;Padded;Removable,Lace up,Men,Leather,Capped Toe;Round Toe,ut-zap50k-images/Shoes/Oxfords/Bostonian/100657.72.jpg,...,0,0,0,0,0,"[Shoes, Oxfords, Bostonian, 100657.72.jpg]",Shoes,Oxfords,Bostonian,100657.72.jpg
3,100657-216,Shoes,Oxfords,,Leather;Padded;Removable,Lace up,Men,Leather,Capped Toe;Round Toe,ut-zap50k-images/Shoes/Oxfords/Bostonian/100657.216.jpg,...,0,0,0,0,0,"[Shoes, Oxfords, Bostonian, 100657.216.jpg]",Shoes,Oxfords,Bostonian,100657.216.jpg
4,101026-3,Boots,Mid-Calf,1in - 1 3/4in,Leather;Padded,Pull-on,Men,Leather;Rubber,Square Toe;Closed Toe,ut-zap50k-images/Boots/Mid-Calf/Durango/101026.3.jpg,...,0,0,0,0,0,"[Boots, Mid-Calf, Durango, 101026.3.jpg]",Boots,Mid-Calf,Durango,101026.3.jpg


I've taken several steps to simplify the "aesthetic" space the modeling will need to capture. The main one is to exclude particular sub-categories which represent extremes, e.g. high-cut boots, high heels, and the kids sub-categories. Indeed, I will only include "adult" shoes at this time, excluding "kids" and some edge cases, because the difference in overall size scale distorts the "aesthetics" I'd like to get at. 


In [None]:
# Replace `df` with the result of simplify_zappos_db.
# NOTE(review): `df` is overwritten with a function of itself, so re-running
# this cell feeds the already-simplified frame back in -- confirm that
# simplify_zappos_db tolerates a second pass.
df = simplify_zappos_db(df)

In [None]:
#hide
# Preview the first rows of the frame returned by simplify_zappos_db.
df.head()

Unnamed: 0,CID,Category,path,path_and_file,Category1,Category2,Filename,Sneakers,Boots,Shoes,Slippers,Adult,Gender
0,100627-72,Shoes,ut-zap50k-images/Shoes/Oxfords/Bostonian/100627.72.jpg,"[Shoes, Oxfords, Bostonian, 100627.72.jpg]",Shoes,Oxfords,100627.72.jpg,False,False,True,False,True,Men
1,100627-255,Shoes,ut-zap50k-images/Shoes/Oxfords/Bostonian/100627.255.jpg,"[Shoes, Oxfords, Bostonian, 100627.255.jpg]",Shoes,Oxfords,100627.255.jpg,False,False,True,False,True,Men
2,100657-72,Shoes,ut-zap50k-images/Shoes/Oxfords/Bostonian/100657.72.jpg,"[Shoes, Oxfords, Bostonian, 100657.72.jpg]",Shoes,Oxfords,100657.72.jpg,False,False,True,False,True,Men
3,100657-216,Shoes,ut-zap50k-images/Shoes/Oxfords/Bostonian/100657.216.jpg,"[Shoes, Oxfords, Bostonian, 100657.216.jpg]",Shoes,Oxfords,100657.216.jpg,False,False,True,False,True,Men
4,101026-3,Boots,ut-zap50k-images/Boots/Mid-Calf/Durango/101026.3.jpg,"[Boots, Mid-Calf, Durango, 101026.3.jpg]",Boots,Mid-Calf,101026.3.jpg,False,True,False,False,True,Men


------------

## Prepare the simplified dataframe for further analysis


Also use sklearn `train_test_split` to create category stratified train/test/validate groups.  We'll keep 15 percent of our data to truly test.

In [None]:
#hide
# Pass the frame and its Category column to skl_tt_split, then preview the
# result (the recorded output shows added train/test/validate/t_t_v columns).
df = skl_tt_split(df, df['Category'])
df.head()

Unnamed: 0,CID,Category,path,path_and_file,Category1,Category2,Filename,Sneakers,Boots,Shoes,Slippers,Adult,Gender,train,test,validate,t_t_v
0,100627-72,Shoes,ut-zap50k-images/Shoes/Oxfords/Bostonian/100627.72.jpg,"[Shoes, Oxfords, Bostonian, 100627.72.jpg]",Shoes,Oxfords,100627.72.jpg,False,False,True,False,True,Men,True,False,False,train
1,100627-255,Shoes,ut-zap50k-images/Shoes/Oxfords/Bostonian/100627.255.jpg,"[Shoes, Oxfords, Bostonian, 100627.255.jpg]",Shoes,Oxfords,100627.255.jpg,False,False,True,False,True,Men,False,True,False,test
2,100657-72,Shoes,ut-zap50k-images/Shoes/Oxfords/Bostonian/100657.72.jpg,"[Shoes, Oxfords, Bostonian, 100657.72.jpg]",Shoes,Oxfords,100657.72.jpg,False,False,True,False,True,Men,True,False,False,train
3,100657-216,Shoes,ut-zap50k-images/Shoes/Oxfords/Bostonian/100657.216.jpg,"[Shoes, Oxfords, Bostonian, 100657.216.jpg]",Shoes,Oxfords,100657.216.jpg,False,False,True,False,True,Men,True,False,False,train
4,101026-3,Boots,ut-zap50k-images/Boots/Mid-Calf/Durango/101026.3.jpg,"[Boots, Mid-Calf, Durango, 101026.3.jpg]",Boots,Mid-Calf,101026.3.jpg,False,True,False,False,True,Men,True,False,False,train


------------

## pickle / unpickle the database dataframes

In [None]:
#hide
# Persist the frame in its current row order.
filename = "zappos-50k-simplified"
df.to_pickle(f"data/{filename}.pkl")

# Persist a second copy ordered by `path`, with the old index discarded in
# favour of a fresh one.
df2 = df.sort_values('path', ascending=True).reset_index(drop=True)

filename = "zappos-50k-simplified_sort"
df2.to_pickle(f"data/{filename}.pkl")

# To reload the sorted copy later:
#   df = pd.read_pickle("data/zappos-50k-simplified_sort.pkl")
# (a JSON export via df2.to_json was considered but is not written)


------------

## Create a unified database of "sneakers"

- images (need to access) path to fit beta-VAE 
- descriptions for fitting text autoencoder
- skip "meta" data (will come back to this later for further analysis).

In [None]:
# Path of the pickled Zappos dataframe, relative to the working directory.
# presumably "zappos-50k-simplified_sort" (per the original inline note) -- confirm
filename = f"data/{ZAPPOS_DF_SIMPLIFIED}.pkl"

# Fail loudly when the pickle is missing. Skipping the load silently would
# leave `df_zappos` undefined (or stale from an earlier run) and the problem
# would only surface later as an unrelated-looking error.
if not os.path.exists(filename):
    raise FileNotFoundError(f"expected pickled dataframe at {filename!r}")

# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
df_zappos = pd.read_pickle(filename)

In [None]:
# Display the joined path SCRAPED_META_DIR/SCRAPED_DF.
SCRAPED_META_DIR/SCRAPED_DF


Path('/home/ergonyc/Projects/Project2.0/snkrfinder/data/raw/scraped/full_data')

In [None]:
# Path of the pickled scraped dataframe, relative to the working directory.
# (An alternative location, f"{SCRAPED_META_DIR/SCRAPED_DF}.pkl", used to be
# assigned first and was immediately overwritten; the dead store is dropped.)
filename = f"data/{SCRAPED_DF}.pkl"

# Fail loudly when the pickle is missing instead of silently leaving
# `df_scraped` undefined (or stale from an earlier run).
if not os.path.exists(filename):
    raise FileNotFoundError(f"expected pickled dataframe at {filename!r}")

# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
df_scraped = pd.read_pickle(filename)

In [None]:
#hide
# Peek at the `url` and `hero_fullpath` columns of the first two scraped rows.
# Not used yet:
# attributes = df_scraped.attributes.values
# description = df_scraped.description.values
df_scraped[['url','hero_fullpath']].head(2)

Unnamed: 0,url,hero_fullpath
0,https://www.goat.com/sneakers/air-jordan-11-retro-bred-2012-378037-010,/Users/ergonyc/Projects/DATABASE/SnkrScrpr/data/goat/img/99543f4630.jpg
1,https://www.goat.com/sneakers/yeezy-boost-700-carbon-blue-yzy-700-what-the,/Users/ergonyc/Projects/DATABASE/SnkrScrpr/data/goat/img/3fbd48e729.jpg


I need to put the scraped databases from SNS (sneakersnstuff.com) and GOAT (goat.com) into a dataframe with columns suitable for merging with the UT-Zappos database I originally started with.   
I'll spend some time inferring the `Category`, `Brand`, `Gender` (and `model`?) so I can pay attention to class balance.  From the transfer learning work, the `Slippers` category is pretty poor, but I should be able to use that net to label the scraped data... and validate based on the `attributes` and `see_also` fields.

First things first:  fix the "hero_fullpath" so it's agnostic of OS (Mac or Linux).


## Create a unified database of "sneakers"

- images (need to access) path to fit beta-VAE 
- descriptions for fitting text autoencoder
- skip "meta" data (will come back to this later for further analysis).

Now we just need to wrap the above procedure into a little function for exporting...

In [None]:
# Load the scraped database via get_scraped_db.
# NOTE(review): this rebinds `df`, which the earlier cells used for the Zappos
# frame -- anything further down that reads `df` now sees the scraped data.
df = get_scraped_db()

In [None]:
# Split the scraped frame, stratifying on its `db_name` column.
# (skl_tt_split is defined in data.zappos)
df_scraped = skl_tt_split(df, df['db_name'])

In [None]:
# Build df_zap from the full Zappos frame and show its (rows, columns).
# presumably keeps only the sneaker rows (recorded output: 9434 rows versus
# 27614 in df_zappos) -- confirm in extract_zap_sneakers.
df_zap = extract_zap_sneakers(df_zappos)
df_zap.shape

(9434, 17)

In [None]:
# Merge df_zap with the scraped frame.
df_test = merge_dbs(df_zap,df_scraped)

# TODO:  check for duplicate paths before saving...

# Only the last expression of a cell is displayed, so keep a single summary:
# the (rows, columns) of the scraped frame, the full Zappos frame and the
# merged result.
df_scraped.shape,df_zappos.shape,df_test.shape

((2337, 17), (27614, 17), (11771, 29))

In [None]:

# save the combined db for easy access later.
# df_test (the df_zap + df_scraped merge) goes to data/<COMBINED_SNEAKERS_DF>.pkl
df_test.to_pickle(os.path.join("data", f"{COMBINED_SNEAKERS_DF}.pkl"))


# Repeat the merge starting from the full df_zappos frame and write that
# result to data/<COMBINED_DF>.pkl
df_test2 = merge_dbs(df_zappos,df_scraped)
df_test2.to_pickle(os.path.join("data", f"{COMBINED_DF}.pkl"))


------------