In [2]:
# imports

import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
from items import Item
from loaders import ItemLoader
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
from tqdm import tqdm
import pickle
import json

In [3]:
# Load in dataset
#
# Reads the pickled train and test collections from the parent directory.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this cell against pickle files you produced or otherwise trust.
# Presumably the pickles contain `Item` objects (hence the `items` import
# above must be importable for unpickling) - TODO confirm.

with open('../train.pkl', 'rb') as file:
    train = pickle.load(file)

with open('../test.pkl', 'rb') as file:
    test = pickle.load(file)

In [4]:
# Inspection only: list the attributes available on the first training item
# (e.g. `details`, `price`, `title`) before deriving new features from them.
dir(train[0])

['PREFIX',
 'QUESTION',
 'REMOVALS',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'category',
 'details',
 'include',
 'make_prompt',
 'parse',
 'price',
 'prompt',
 'scrub',
 'scrub_details',
 'test_prompt',
 'title',
 'token_count',
 'tokenizer']

In [5]:
# Combine both splits so every feature can be attached in a single pass.
# The concatenation builds a new container holding the SAME objects, so any
# attribute set through `items` is also visible through `train` and `test`
# (the later filtering cells rely on this).
items = train + test

In [6]:
len(items)

402000

In [7]:
# Decode each item's `details` JSON text once and keep the result on the item
# as `features`; the following cells read individual fields from it via .get().
for item in tqdm(items):
    item.features = json.loads(item.details)

100%|███████████████████████████████| 402000/402000 [00:02<00:00, 152499.00it/s]


In [8]:
# Split the "Item Weight" feature (e.g. "<number> <unit>") into a numeric
# weight and a unit label. Items without the feature get None for both.
for item in items:
    raw_weight = item.features.get('Item Weight')
    if not raw_weight:
        item.weight = None
        item.units = None
        continue
    # Split on the FIRST space only, so multi-word units such as
    # 'Hundredths Pounds' stay intact. A value without any space raises
    # ValueError here, as does a non-numeric leading part.
    amount, unit = raw_weight.split(' ', 1)
    item.weight = float(amount)
    item.units = unit

In [9]:
items[0].units

'Pounds'

In [10]:
# Distinct unit labels present in the data (items without a weight excluded).
{item.units for item in items if item.units is not None}

{'Grams',
 'Hundredths Pounds',
 'Kilograms',
 'Milligrams',
 'Ounces',
 'Pounds',
 'ounces',
 'pounds'}

In [11]:
# Factor that converts one unit of each label seen in the data into ounces.
# Both capitalisations of 'Ounces' / 'Pounds' occur, so both are listed.
multipliers = {
    # imperial
    'Ounces': 1,
    'ounces': 1,
    'Pounds': 16,
    'pounds': 16,
    'Hundredths Pounds': 0.16,  # 0.01 lb * 16 oz/lb
    # metric
    'Milligrams': 0.000035274,
    'Grams': 0.035274,
    'Kilograms': 35.27396,
}

In [12]:
# Convert every known weight to ounces and relabel its unit.
# Items whose weight is None (or 0) are skipped and keep their current units.
# Re-running the cell is harmless: 'ounces' maps to a multiplier of 1.
for item in items:
    if not item.weight:
        continue
    item.weight *= multipliers[item.units]
    item.units = "ounces"

In [13]:
# Keep only the smallest value found under "Best Sellers Rank" for each item;
# items without that feature get rank None.
for item in items:
    seller_ranks = item.features.get("Best Sellers Rank")
    item.rank = min(seller_ranks.values()) if seller_ranks else None

In [14]:
len(items)

402000

In [15]:
from datetime import datetime

# Maps month codes found in some date strings ("01M" ... "12M") to the full
# English month name expected by the "%B" directive below.
replacer = {
    "{:02d}M".format(number): name
    for number, name in enumerate(
        [
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ],
        start=1,
    )
}

# Turn "Date First Available" into a numeric timestamp; None when absent.
# NOTE: the parsed datetime is naive, so .timestamp() interprets it in the
# local timezone of the machine running the notebook.
for item in items:
    avail = item.features.get("Date First Available")
    if not avail:
        item.timestamp = None
        continue
    for code, month_name in replacer.items():
        avail = avail.replace(code, month_name)
    item.timestamp = datetime.strptime(avail, "%B %d, %Y").timestamp()

In [16]:
# Flag items whose manufacturer is one of a hand-picked set of brands.
# Matching is an exact, case-insensitive comparison of the whole
# "Manufacturer" value; a missing manufacturer yields False for both flags.
top_tech = ['samsung', 'sony', 'garmin', 'intel', 'dell computers','hp','lg','asus', 'nikon']
top_toys = ['mattel', 'hasbro', 'lego']
for item in items:
    maker = item.features.get("Manufacturer") or ""
    maker = maker.lower()
    item.is_top_tech = maker in top_tech
    item.is_top_toys = maker in top_toys


In [18]:
# Averages of each numeric feature over the items that actually have a value
# (None and 0 are both excluded by the truthiness filter).
ranks = [i.rank for i in items if i.rank]
average_rank = sum(ranks) / len(ranks)

weights = [i.weight for i in items if i.weight]
average_weight = sum(weights) / len(weights)

# The cell previously ended with the bare, never-defined name `timestamps`,
# which raises NameError on a fresh Restart & Run All. Complete the same
# pattern used for ranks and weights instead.
timestamps = [i.timestamp for i in items if i.timestamp]
average_timestamp = sum(timestamps) / len(timestamps)

# Displayed value: average weight in ounces.
# NOTE(review): assumed to be the quantity shown in the recorded output - confirm.
average_weight

217.19522700550888

In [20]:
# Keep only training items for which rank, weight and timestamp are all set
# (truthy). `train` holds the same objects that were annotated via `items`.
train_features = [t for t in train if all((t.rank, t.weight, t.timestamp))]

In [21]:
len(train_features)

277220

In [22]:
# Keep only test items for which rank, weight and timestamp are all set
# (truthy). `test` holds the same objects that were annotated via `items`.
test_features = [t for t in test if all((t.rank, t.weight, t.timestamp))]

In [23]:
len(test_features)

1416

In [24]:

# Persist the filtered, feature-enriched splits next to the input pickles.
for path, dataset in (
    ('../training_data.pkl', train_features),
    ('../test_data.pkl', test_features),
):
    with open(path, 'wb') as file:
        pickle.dump(dataset, file)

In [1]:
# NOTE(review): this cell carries execution count In [1] and raised NameError,
# i.e. it was run on a fresh kernel before `items` was created. It only works
# after the cells above have been executed; consider removing it.
len(items)

NameError: name 'items' is not defined