In [8]:
import torch
from transformers import AutoModel
from torch.utils.data import DataLoader
from torchvision import transforms
from transformers import AutoModelForSequenceClassification, AutoImageProcessor, ViTForImageClassification
from datasets import load_dataset

import random
import random
from PIL import ImageDraw, ImageFont, Image
import pathlib
import sklearn
import datasets
import pandas as pd
import sklearn.preprocessing
import sklearn.model_selection
import glob
import functools


In [9]:
# !wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar

In [10]:
dataset_path_raw = "/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata"

In [68]:
glob.glob(dataset_path_raw + '*/*.json.gz')

['/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_b.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_2.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_c.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_3.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_a.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_1.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_8.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_9.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_0.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_5.json.gz',
 '/Users/tylerklimas/Desktop/BERTModel/abo-listings/listings/metadata/listings_e.json.gz',

In [71]:
data_list = map(functools.partial(pd.read_json, lines=True), glob.glob(dataset_path_raw + '*/*.json.gz'))

In [72]:
df_raw = pd.concat(data_list)

In [73]:
df_raw.iloc[0]

brand                       [{'language_tag': 'en_CA', 'value': 'CGTL'}]
color                  [{'language_tag': 'en_CA', 'value': 'Multicolo...
item_id                                                       B08DV9LXW3
item_name              [{'language_tag': 'en_CA', 'value': 'CGTL Ear ...
product_type                                    [{'value': 'ACCESSORY'}]
main_image_id                                                51GM3Rpdm8L
country                                                               CA
marketplace                                                       Amazon
domain_name                                                    amazon.ca
node                   [{'node_id': 10287485011, 'node_name': '/Categ...
item_dimensions                                                      NaN
bullet_point                                                         NaN
item_weight                                                          NaN
other_image_id                                     

In [53]:
def parse_property(dataset_dict, name):  # takes each element of the data and places the value into dictionary spot
    try:
        r = dataset_dict[name][0]
        if name == 'node':
            return r['node_name']
        else:
            return r['value']
    
    except Exception as e:
        return None
    
def cleanup_record(raw_record:dict):
    record = {
        'brand': parse_property(raw_record,'brand')
        ,'item_id': raw_record['item_id']
        ,'item_name': parse_property(raw_record,'item_name')
        ,'product_type': parse_property(raw_record,'product_type')
        ,'node': parse_property(raw_record, 'node')
        , 'main_image_id': raw_record['main_image_id']
        ,'product_description': raw_record['product_description']

        
    }

    return pd.Series(record)

In [74]:
cleanup_record(df_raw.iloc[0])

brand                                                               CGTL
item_id                                                       B08DV9LXW3
item_name              CGTL Ear Loops Face Bandana Neck Gaiter Stylis...
product_type                                                   ACCESSORY
node                          /Categories/Women/Accessories/Neck Gaiters
main_image_id                                                51GM3Rpdm8L
product_description                                                  NaN
dtype: object

In [48]:
r = df_raw.iloc[0].to_dict()
parse_property(r, 'brand')  
r['brand'][0]['value']

'CGTL'

In [55]:
df = df_raw.apply(cleanup_record,axis=1)

Index(['brand', 'item_id', 'item_name', 'product_type', 'node',
       'main_image_id', 'product_description'],
      dtype='object')

In [57]:
df['product_type'].value_counts()
# we want to limit the amount of classes we have to predict

CELLULAR_PHONE_CASE    64853
SHOES                  12965
GROCERY                 6546
HOME                    5264
HOME_BED_AND_BATH       3082
                       ...  
TREADMILL                  1
GOLF_CLUB                  1
SCULPTURE                  1
AIR_PURIFIER               1
STROLLER                   1
Name: product_type, Length: 576, dtype: int64

In [99]:
min_product_count = 500

top_products =  df['product_type'].value_counts().loc[lambda x: x>min_product_count].index.tolist()
top_products

['CELLULAR_PHONE_CASE',
 'SHOES',
 'GROCERY',
 'HOME',
 'HOME_BED_AND_BATH',
 'HOME_FURNITURE_AND_DECOR',
 'CHAIR',
 'BOOT',
 'SANDAL',
 'FINERING',
 'HEALTH_PERSONAL_CARE',
 'FINENECKLACEBRACELETANKLET',
 'ACCESSORY',
 'SOFA',
 'OFFICE_PRODUCTS',
 'FINEEARRING',
 'PET_SUPPLIES',
 'SPORTING_GOODS',
 'TABLE',
 'HARDWARE_HANDLE',
 'RUG',
 'HANDBAG',
 'LIGHT_BULB',
 'KITCHEN',
 'HAT',
 'EARRING',
 'OUTDOOR_LIVING',
 'WALL_ART',
 'JANITORIAL_SUPPLY',
 'LAMP',
 'LIGHT_FIXTURE']

In [100]:
df_all = df [ df['product_type'].isin(top_products) ].copy()

In [101]:
df_all['label_name'] = df_all['product_type']
df_all['text'] = df_all['item_name']

In [102]:
label_encoder = sklearn.preprocessing.LabelEncoder()

In [103]:
label_encoder.fit(df_all['label_name'])

In [104]:
df_all['label'] = label_encoder.transform(df_all['label_name'])

In [114]:
df_train, df_test = sklearn.model_selection.train_test_split(df_all, train_size=.6, stratify= df_all['label'] )


df_test, df_val = sklearn.model_selection.train_test_split(df_test, test_size=.5, stratify= df_test['label'] )


print  ( 
{
    'train': len(df_train)
    ,'test': len(df_test)
    ,'val': len(df_val)
}

)

{'train': 72743, 'test': 24248, 'val': 24248}


In [116]:
dataset_features = datasets.Features({
    'text': datasets.Value('string'),
    'item_name': datasets.Value('string'),
    'label': datasets.ClassLabel(names=list(label_encoder.classes_)),
    'brand': datasets.Value('string'),
    'item_id': datasets.Value('string'),
    'main_image_id': datasets.Value('string'),
    'node': datasets.Value('string')
})

In [127]:
interested_columns = dataset_features.keys()

dataset_train = datasets.Dataset.from_pandas(df_train[interested_columns],features=dataset_features, preserve_index=False)
dataset_test = datasets.Dataset.from_pandas(df_test[interested_columns],features=dataset_features, preserve_index=False)
dataset_validation = datasets.Dataset.from_pandas(df_test[interested_columns],features=dataset_features, preserve_index=False)

dataset_all = datasets.DatasetDict({
    'train': dataset_train,
    'test': dataset_test,
    'valid': dataset_validation }
)
dataset_all

DatasetDict({
    train: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 72743
    })
    test: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 24248
    })
    valid: Dataset({
        features: ['text', 'item_name', 'label', 'brand', 'item_id', 'main_image_id', 'node'],
        num_rows: 24248
    })
})

In [132]:
all_classes = dataset_all['train'].features['label'].names_file

In [None]:
dataset_path = '../artifacts/dataset_processed/'