In [1]:
import gzip
import gc
import math
import json
from collections import Counter, defaultdict
import random
from tqdm.notebook import tqdm
import numpy as np

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

def jl_to_list(fname):
    """Read a gzip-compressed JSON Lines file into a list.

    Args:
        fname: Path of the gzip-compressed file to read.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with gzip.open(fname, "rb") as f:
        # Whitespace-only lines (for example a trailing blank line) carry no
        # record, and json.loads would raise on them, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

### Load user data

In [5]:
# Optional cap on how many rows to keep; a falsy value (None, 0) keeps them all.
samples = None
# Fraction of rows held out as the test set by the train/test split.
test_size = .2
rows = jl_to_list("data/train_dataset.jl.gz")
rows = rows[:samples] if samples else rows

### Train-test split

In [6]:
# NOTE(review): val_size is not referenced by any cell visible here.
val_size = .15
# Fixed random_state so the split is the same on every run.
rows_train, rows_test = train_test_split(
    rows,
    test_size = test_size,
    random_state = 42,
)
print(f"Train data: {len(rows_train)}.")
print(f"Test data: {len(rows_test)}.")

Train data: 330530.
Test data: 82633.


### Load items

In [2]:
item_data = jl_to_list("data/item_data.jl.gz")
metadata = {x["item_id"] : x for x in item_data}

### Load domain vectors

In [3]:
import csv

In [4]:
# Read the domain vectors: one tab-separated vector per line.
# The context manager closes the file even if parsing raises, and newline=""
# is how the csv module expects its input files to be opened.
with open("dom2vec/vectors_dom.tsv", newline="") as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    vectors_dom = list(read_tsv)

# Parse the string fields into a float matrix with 32 columns per row.
# A row with fewer than 32 fields keeps zeros in its remaining columns.
v_dom = np.zeros((len(vectors_dom), 32), dtype = np.float64)
# total= lets tqdm render real progress; enumerate() alone has no length.
for i, v in tqdm(enumerate(vectors_dom), total = len(vectors_dom)):
    for j, vi in enumerate(v):
        v_dom[i,j] = float(vi)

# Read the keys file, keeping the first field of every line.
with open("dom2vec/meta_dom.tsv", newline="") as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    keys_dom = [key[0] for key in read_tsv]

# Every vector row must have exactly one key.
assert len(keys_dom) == v_dom.shape[0]

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




### Load category vectors

In [5]:
# Read the category vectors: one tab-separated vector per line.
# The context manager closes the file even if parsing raises, and newline=""
# is how the csv module expects its input files to be opened.
with open("cat2vec/vectors_cat.tsv", newline="") as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    vectors_cat = list(read_tsv)

# Parse the string fields into a float matrix with 32 columns per row.
# A row with fewer than 32 fields keeps zeros in its remaining columns.
v_cat = np.zeros((len(vectors_cat), 32), dtype = np.float64)
# total= lets tqdm render real progress; enumerate() alone has no length.
for i, v in tqdm(enumerate(vectors_cat), total = len(vectors_cat)):
    for j, vi in enumerate(v):
        v_cat[i,j] = float(vi)

# Read the keys file, keeping the first field of every line.
with open("cat2vec/meta_cat.tsv", newline="") as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    keys_cat = [key[0] for key in read_tsv]

# Every vector row must have exactly one key.
assert len(keys_cat) == v_cat.shape[0]

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




### Load word vectors

In [None]:
# Read the word vectors: one tab-separated vector per line.
# The context manager closes the file even if parsing raises, and newline=""
# is how the csv module expects its input files to be opened.
with open("word2vec/vectors_q.tsv", newline="") as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    vectors_wrd = list(read_tsv)

# Parse the string fields into a float matrix with 32 columns per row.
# A row with fewer than 32 fields keeps zeros in its remaining columns.
v_wrd = np.zeros((len(vectors_wrd), 32), dtype = np.float64)
# total= lets tqdm render real progress; enumerate() alone has no length.
for i, v in tqdm(enumerate(vectors_wrd), total = len(vectors_wrd)):
    for j, vi in enumerate(v):
        v_wrd[i,j] = float(vi)

# Read the keys file, keeping the first field of every line.
with open("word2vec/meta_q.tsv", newline="") as tsv_file:
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    keys_wrd = [key[0] for key in read_tsv]

# Every vector row must have exactly one key.
assert len(keys_wrd) == v_wrd.shape[0]

### Build dicts

In [7]:
# Key -> vector lookups: each key is paired with the matrix row at the same
# position. If a key repeats, the last occurrence wins.
doms = dict(zip(keys_dom, v_dom))
cats = dict(zip(keys_cat, v_cat))
words = dict(zip(keys_wrd, v_wrd))

In [8]:
# Map every word to its position in `words`. Position 0 is left out, so that
# word reads back as 0, just like any word that is absent from the counter.
wrd_counter = Counter({wrd: i for i, wrd in enumerate(words) if i != 0})

### Process prices

In [9]:
# Gather every known price as a float; items whose price is None are left out.
prices = [
    float(item["price"])
    for item in item_data
    if item["price"] is not None
]

# Statistics over the known prices only.
mean = np.mean(prices)
std = np.std(prices)

print("mean: ", mean)
print("std: ", std)

mean:  134906.66263479702
std:  22995129.77463674


### Print dimensions

In [10]:
# Vector width of each family, read from the second axis of its matrix.
dom_dim, cat_dim, wrd_dim = (mat.shape[1] for mat in (v_dom, v_cat, v_wrd))

print("domain dim: ", dom_dim)
print("category dim: ", cat_dim)
print("word dim: ", wrd_dim)

domain dim:  32
category dim:  32
word dim:  32


### Build item array

In [12]:
n_words = 4
# Row layout: [domain | category | n_words word slots | price].
title_start = dom_dim + cat_dim
item_arr = np.zeros((len(item_data), title_start + n_words*wrd_dim + 1),
                    dtype = np.float64)
# Fill value for missing prices. It is computed from the raw `prices` list
# rather than read from the global `mean`, because the standardization cell
# rebinds `mean`; this keeps the cell correct when re-run after that cell.
price_fill = np.mean(prices)
# total= lets tqdm render real progress; enumerate() alone has no length.
for i, item in tqdm(enumerate(item_data), total = len(item_data)):
    dom = item["domain_id"]
    cat = item["category_id"]
    price = item["price"]
    title = item["title"]
    # fill array
    # domain: ids without a vector fall back to the "None" entry
    item_arr[i, :dom_dim] = doms[dom if dom in doms else "None"]
    # category: same fallback as for domains
    item_arr[i, dom_dim:title_start] = cats[cat if cat in cats else "None"]
    # title: look at the first 12 lower-cased tokens, drop repeats while
    # keeping title order (dict.fromkeys), then keep the n_words tokens with
    # the highest wrd_counter value. sorted() is stable, so equally ranked
    # tokens stay in title order and the result is the same on every run.
    # A missing title is treated as empty and leaves the word slots at zero.
    wrds = list(dict.fromkeys((title or "").lower().split()[:12]))
    top_words = sorted(wrds, key = lambda w: wrd_counter[w], reverse = True)[:n_words]
    for j, w in enumerate(top_words):
        slot = title_start + wrd_dim*j
        # tokens without a vector use the '[UNK]' vector
        item_arr[i, slot:slot + wrd_dim] = words[w if w in words else '[UNK]']

    # price: missing values get the mean of the known raw prices
    item_arr[i, -1] = price if price is not None else price_fill

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




### Standardize price

In [13]:
# Standardize the price column (last column of item_arr) in place.
# Note: this rebinds the module-level `mean` and `std` to the statistics of
# that column as it stands when this cell runs.
mean, std = item_arr[:, -1].mean(), item_arr[:, -1].std()

item_arr[:, -1] = (item_arr[:, -1] - mean) / std

In [14]:
# Sanity check: after standardization the mean should be ~0 and the std 1.
print("mean: ", item_arr[:, -1].mean())
print("std: ", item_arr[:, -1].std())

mean:  3.031745264666882e-18
std:  1.0


### Save it!

In [None]:
# Write item_arr to "items.npy" (path relative to the working directory).
np.save("items.npy", item_arr)

In [144]:
!ls -l --block-size=M

total 3096M
drwxrwxr-x. 3 guillermo.etchebarne guillermo.etchebarne    1M Nov 24 14:15 cat2vec
drwxrwxr-x. 2 guillermo.etchebarne guillermo.etchebarne    1M Nov 23 01:48 data
drwxrwxr-x. 3 guillermo.etchebarne guillermo.etchebarne    1M Nov 24 14:19 dom2vec
-rw-rw-r--. 1 guillermo.etchebarne guillermo.etchebarne    1M Nov 23 20:07 extract_features_sage.ipynb
-rw-rw-r--. 1 guillermo.etchebarne guillermo.etchebarne    1M Nov 24 15:53 get_item.ipynb
-rw-r--r--. 1 guillermo.etchebarne guillermo.etchebarne    1M Nov 22 17:29 graphsage_src.ipynb
-rw-rw-r--. 1 guillermo.etchebarne guillermo.etchebarne    1M Nov 23 18:08 item_features.ipynb
-rw-rw-r--. 1 guillermo.etchebarne guillermo.etchebarne 3096M Nov 24 15:53 items.npy
drwxrwxr-x. 3 guillermo.etchebarne guillermo.etchebarne    1M Nov 24 14:58 word2vec
