# Description of algorithm

This code groups brands according to the value they add to an item.

# Settings

In [1]:
settings = {
    # When True, large intermediate variables are deleted as soon as the
    # following cells no longer need them, to save memory.
    'del': False,
    # Fixed seeds so that the results are repeatable.
    'random_state': {
        'shuffle': 42,  # seed passed to sklearn's shuffle
    },
}

# Use helper functions

In [2]:
%run ../src/helper_functions.py

Following functions has been loaded:

print_memory_usage
rmse



# Load data set and shuffle it

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Relative path of the directory that holds the data files.
PATH = "../../data/"
# The training set is a tab-separated file.
data_full = pd.read_csv(PATH + 'train.tsv', sep='\t')

In [5]:
# Preview the first rows of the freshly loaded training data.
data_full.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm]...
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


## Shuffle

In [6]:
from sklearn.utils import shuffle
# Shuffle the rows reproducibly; the seed is read from the settings dict.
data_shuffled = shuffle(
    data_full,
    random_state=settings['random_state']['shuffle'],
)

# Optionally drop the reference to the unshuffled frame.
if settings['del']:
    del data_full

In [7]:
# Number of rows in the shuffled data set.
len(data_shuffled)

1482535

In [8]:
# Report how many distinct categories and brands exist, and how many rows
# are missing each of the two fields.
print("Number of unique fields:\n")

print("category_name: \t%d" % data_shuffled.category_name.nunique())
print("brand_name: \t%d" % data_shuffled.brand_name.nunique())
print()
print("%d items have no brand" % data_shuffled.brand_name.isnull().sum())
print("%d items have no category_name" % data_shuffled.category_name.isnull().sum())

Number of unique fields:

category_name: 	1287
brand_name: 	4809

632682 items have no brand
6327 items have no category_name


# Filter out samples that do not have a category_name

In [16]:
# Keep only the rows whose category_name is present.
data_reduced = data_shuffled.dropna(subset=['category_name'])

# Optionally drop the reference to the unfiltered frame.
if settings['del']:
    del data_shuffled

In [17]:
# Sanity check: count the rows that still lack a category_name after filtering.
print("%d items have no category_name" % data_reduced.category_name.isnull().sum())

0 items have no category_name


# Represent data numerically and perform linear regression

In [24]:
# Array of the distinct category names found in the filtered data.
unique_cns = data_reduced['category_name'].unique()

In [25]:
# Take the first category as a worked example and select its rows.
cat_name = unique_cns[0]
print(cat_name)
data_filtered_cn = data_reduced[data_reduced['category_name'] == cat_name]

Handmade/Paper Goods/Stationery


In [26]:
# Display the rows that belong to the selected example category.
data_filtered_cn

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
777341,777341,F/ship 4 Totoro Washi + 1 pen,1,Handmade/Paper Goods/Stationery,,12.0,1,This listing is for all 4 Totoro washi tape fo...
943434,943434,Large Memo Pads for JocyFisher,1,Handmade/Paper Goods/Stationery,,20.0,1,JocyFisher ^_^
253027,253027,FLASH SALE Memo paper lot! #2,1,Handmade/Paper Goods/Stationery,,3.0,1,❤️Welcome to PastelQuinn❤️ ~ All items BOGO [r...
1067145,1067145,Stationary accessories,3,Handmade/Paper Goods/Stationery,,25.0,1,Comes with everything
891522,891522,40pcs Crayon Shin-Chan Scrapbook Sticker,1,Handmade/Paper Goods/Stationery,,7.0,1,Super Cute Scrapbooking Stickers！ Quantity: 1 ...
649124,649124,BUNDLE,3,Handmade/Paper Goods/Stationery,,10.0,0,Originally 100 sheets and 50 envelopes but the...
172715,172715,Trolls stationary set,1,Handmade/Paper Goods/Stationery,,5.0,1,5 piece stationary set
411325,411325,40 Rate Great,1,Handmade/Paper Goods/Stationery,,4.0,1,⭐️Don't forget to read my Bio before you purch...
1073706,1073706,Bundle: Assortment on stationary items!,3,Handmade/Paper Goods/Stationery,,18.0,0,"These items haven't been used, just sitting on..."
103741,103741,Juicy couture notebook,2,Handmade/Paper Goods/Stationery,,14.0,0,Super cute stationary set from juicy couture


In [11]:
import sys
def sizeof_fmt(num, suffix='B'):
    """Format a size as a string with a binary (1024-based) unit prefix.

    The value is divided by 1024 until its magnitude drops below 1024 and
    is then printed with one decimal place, e.g. 1536 -> '1.5KiB'. Values
    too large for the 'Zi' prefix are reported in 'Yi'.

    By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254

    Parameters:
        num: size to format (number).
        suffix: unit symbol appended after the prefix.

    Returns:
        The formatted string.
    """
    scaled = num
    for prefix in ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi'):
        if not abs(scaled) < 1024.0:
            # Still too large for this prefix: move on to the next one.
            scaled = scaled / 1024.0
            continue
        return "%3.1f%s%s" % (scaled, prefix, suffix)
    return "%.1f%s%s" % (scaled, 'Yi', suffix)

# Print the ten largest objects in the notebook namespace, biggest first.
# The namespace is snapshotted with list() so the lazily evaluated generator
# never walks a dict that changes size while the sizes are being measured
# (that would raise "RuntimeError: dictionary changed size during iteration").
# IPython's output cache (`_`, `_N`, ...) lives in the same namespace, so
# values that were displayed earlier and are still cached are listed too.
# NOTE(review): helper_functions.py reports loading `print_memory_usage`;
# presumably it covers the same need -- confirm and drop this cell if so.
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in list(locals().items())),
                         key=lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                 data_shuffled: 690.5MiB
                             _: 688.1MiB
                  data_reduced: 688.1MiB
                           _10: 688.1MiB
                     data_full: 679.2MiB
                           ___:   2.1KiB
                            _5:   2.1KiB
                          _i11:   578.0B
                          _iii:   492.0B
                           _i8:   492.0B
