In [1]:
import json
import os
import shutil
import urllib.request
from time import sleep

import pandas as pd
from tqdm import tqdm

# Load

In [2]:
# Load the gzip-compressed pickle holding the wine table and report its size.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
wine_df = pd.read_pickle(
    'data/best_wine_df_ever.pickle.gzip',
    compression='gzip',
)
print(wine_df.shape)

(123516, 9)


In [3]:
# Peek at the last two rows of the wine table.
wine_df.iloc[-2:]

Unnamed: 0,name,description,price,category_1,category_2,origin,wine_size_value,wine_size_units,wine_abv
0,Zuccardi Zeta (1.5 Liter Magnum) 2012,"Blend: 87% Malbec, 13% Cabernet Sauvignon",84.97,Red Wine,Bordeaux Red Blends,"from Mendoza, Argentina",1500,ML,15
0,Zuccardi Zeta 2012,"Blend: 87% Malbec, 13% Cabernet Sauvignon",43.99,Red Wine,Bordeaux Red Blends,"from Mendoza, Argentina",750,ML,15


In [4]:
# Load the gzip-compressed pickle holding the scraped rows and report its size.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
raw_df = pd.read_pickle(
    'data/scraped/scraped_with_decs.pickle.gzip',
    compression='gzip',
)
print(raw_df.shape)

(125787, 7)


In [5]:
# Peek at the last two rows of the scraped table.
raw_df.iloc[-2:]

Unnamed: 0,name,image_path,price,url,raw_html,URL_name,description
125785,Piper-Heidsieck Cuvee Brut (3.0 Liter Bottle -...,/product/images/fl_progressive/fwu9dzrdyyfpxvb...,399.97,/product/Piper-Heidsieck-Cuvee-Brut-30-Liter-B...,"<div class=""prodItem_wrap"">\n<div class=""prodI...",/product/Piper-Heidsieck-Cuvee-Brut-30-Liter-B...,#95 Wine Spectator Top 100 of 2019The Brut NV ...
125786,Turley Estate Cabernet Sauvignon 2012,/product/images/fl_progressive/155880.jpg,64.99,/product/Turley-Estate-Cabernet-Sauvignon-2012...,"<div class=""prodItem_wrap"">\n<div class=""prodI...",/product/Turley-Estate-Cabernet-Sauvignon-2012...,"Certified organic, Napa Valley single vineyard..."


# Merge

In [6]:
# Attach the wine_df columns to every scraped row, matching on the wine name.
#
# wine_df is reduced to one row per name (first occurrence wins) BEFORE the
# join. De-duplicating the merged result on 'name' instead would also discard
# distinct scraped rows that merely share a name, silently losing their
# file_index and breaking the row-count invariant asserted below.
merged_df = (
    raw_df
    .reset_index()
    # The former row index is kept as 'file_index'; later cells use it to
    # build the label image file names.
    .rename(columns={'index': 'file_index'})
    .merge(
        right=wine_df.drop_duplicates(subset='name'),
        how='left',
        on='name',
        validate='many_to_one',  # guard: the right side must be unique per name
    )
)

# A left join against unique keys must keep exactly one row per scraped row.
assert merged_df.shape[0] == raw_df.shape[0], (
    f"merge changed the row count: {merged_df.shape[0]} != {raw_df.shape[0]}"
)

In [7]:
# Restrict the data to the 15 most frequent category_2 values and the 10 most
# frequent origins, where the origin is shortened to the last
# whitespace-separated token of the 'origin' string.
# Alternative left by the author (fixed list instead of the top-15 lookup):
#wine_types = ['Red Wine', 'White Wine', 'Pink and Rosé', 'Sparkling & Champagne']
wine_types = merged_df['category_2'].value_counts(normalize=True).head(15).index

merged_df['origin_short'] = merged_df['origin'].str.split().str.get(-1)
origins = merged_df['origin_short'].value_counts().head(10).index

# Both conditions in a single mask; selects the same rows as filtering twice.
keep_mask = (
    merged_df['category_2'].isin(wine_types)
    & merged_df['origin_short'].isin(origins)
)
clean_df = merged_df[keep_mask]
print(clean_df.shape)

(89452, 17)


# Copy label images of the kept wines

In [9]:
# file_index values of the rows that survived filtering, as a NumPy array.
labels_to_keep = clean_df['file_index'].to_numpy()
print(labels_to_keep.shape)

(89452,)


In [100]:
# Copy the label image of every kept wine from source_dir to dest_dir.
# Images missing from source_dir are collected in `not_found`.
source_dir = 'D:/data/sorted/label/'
dest_dir = 'D:/data/sorted/labels_clean/'

# Create the target folder up front; without it every copy would fail and be
# misreported as a missing source image.
os.makedirs(dest_dir, exist_ok=True)

not_found = []
for label in tqdm(labels_to_keep):
  filename = f"label_{label}.jpg"
  dst_path = os.path.join(dest_dir, filename)
  # Skip images copied by an earlier run. This replaces the former `5/0`
  # crash guard: the cell is now safe and cheap to re-run under Run All.
  # (Delete a destination file by hand to force it to be copied again.)
  if os.path.exists(dst_path):
    continue
  try:
    shutil.copy(
      src=os.path.join(source_dir, filename),
      dst=dst_path
    )
  except FileNotFoundError:
    # Only a genuinely missing source image lands here; any other error
    # (permissions, disk full, Ctrl-C) propagates instead of being hidden.
    not_found.append(filename)

100%|███████████████████████| 89452/89452 [17:57<00:00, 82.99it/s]


In [102]:
# Number of label images that could not be copied.
len(not_found)

14798

# Create metadata

In [74]:
# Peek at the first two rows of the filtered table.
clean_df.iloc[:2]

Unnamed: 0,file_index,name,image_path,price_x,url,raw_html,URL_name,description_x,description_y,price_y,category_1,category_2,origin,wine_size_value,wine_size_units,wine_abv,origin_short,cat_2_int
4,4,Clarendon Hills Astralis Syrah 2011,/product/images/fl_progressive/qajrqr4d6ttn2pf...,149.99,/product/Clarendon-Hills-Astralis-Syrah-2011/5...,"<div class=""prodItem_wrap"">\n<div class=""prodI...",/product/Clarendon-Hills-Astralis-Syrah-2011/5...,"The pride of our portfolio, Clarendon Hills As...",The grapes for Astralis were selected from low...,129.99,Red Wine,Syrah/Shiraz,"from McLaren Vale, South Australia, Australia",750,ML,14.0,Australia,14
5,5,Yalumba Patchwork Shiraz 2014,/product/images/fl_progressive/167055.jpg,16.99,/product/Yalumba-Patchwork-Shiraz-2014/167055,"<div class=""prodItem_wrap"">\n<div class=""prodI...",/product/Yalumba-Patchwork-Shiraz-2014/167055,This wine shows all the hallmarks of Barossa S...,Pair with barbecued pork spare ribs with a da...,16.99,Red Wine,Syrah/Shiraz,"from Barossa, South Australia, Australia",750,ML,13.5,Australia,14


In [78]:
# Encode every distinct category_2 value as a 1-based integer id, numbered in
# sorted order of the category names.
category_mapping = {
    category: class_id
    for class_id, category in enumerate(sorted(set(clean_df['category_2'])), start=1)
}

# clean_df is a filtered selection of merged_df, so writing a column into it
# directly triggers pandas' SettingWithCopyWarning. `.assign` builds a new
# frame instead, which also keeps this cell idempotent on re-run.
clean_df = clean_df.assign(
    cat_2_int=clean_df['category_2'].map(category_mapping)
)

In [79]:
# Build {'labels': [(filename, class_id), ...]} for every .jpg in `directory`
# and write it to dataset.json inside that same directory.
directory = 'D:/data/sorted/labels_clean/'

# Build the file_index -> class id lookup once. Scanning the whole frame with a
# boolean mask for every file costs O(files x rows); a dict lookup is O(1).
assert clean_df['file_index'].is_unique, "file_index must identify exactly one row"
class_by_file_index = dict(zip(
  clean_df['file_index'].tolist(),
  clean_df['cat_2_int'].tolist()
))

label_metadata = {}
label_metadata['labels'] = []
for filename in tqdm(os.listdir(directory)):
  # splitext copes with entries that contain no dot at all (e.g. sub-folders),
  # where indexing the result of split('.') would raise IndexError.
  stem, extension = os.path.splitext(filename)
  if extension != '.jpg':
    continue
  # File names look like "label_<file_index>.jpg".
  file_index = stem.split('_')[-1]
  assert file_index.isdigit(), f"unexpected image name: {filename}"
  assert int(file_index) in class_by_file_index, f"no metadata row for {filename}"
  label_metadata['labels'].append((
    filename,
    # Plain int so the value is JSON serialisable.
    int(class_by_file_index[int(file_index)])
  ))

# Dump to JSON
with open(os.path.join(directory, 'dataset.json'), 'w') as outfile:
  json.dump(label_metadata, outfile)

100%|█████████| 74655/74655 [00:35<00:00, 2129.99it/s]
