In [1]:
# Imports and small generic helpers.
import gzip
import json
import gc
import math
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import random
from sklearn.model_selection import train_test_split
from datetime import datetime
import pandas as pd
import numpy as np
from itertools import product
from itertools import chain
from itertools import compress
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process

def jl_to_list(fname):
    """Read a gzip-compressed JSON-lines file into a list.

    Args:
        fname: path of the ``.gz`` file; every line must hold one JSON document.

    Returns:
        A list with one decoded object per line, in file order.
    """
    with gzip.open(fname, 'rb') as handle:
        return [json.loads(raw_line) for raw_line in handle]

def cut_str(a):
    """Drop the first four elements of ``a`` when it can be sliced.

    Args:
        a: usually a string (or any other sliceable sequence).

    Returns:
        ``a[4:]`` when slicing works, otherwise ``a`` itself unchanged
        (for example for ``None`` or a number).
    """
    try:
        return a[4:]
    except Exception:
        # Best effort: values that cannot be sliced are passed through.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are never swallowed here.
        return a

In [8]:
# Helper functions (counting, ranking and feature builders).
def dominios_visitados(visits):
    """Count visits per domain.

    Args:
        visits: iterable of item ids; each id must be a key of the global
            ``metadata`` lookup.

    Returns:
        Counter mapping ``domain_id`` -> number of entries of ``visits``
        whose item belongs to that domain.
    """
    return Counter(metadata[item_id]['domain_id'] for item_id in visits)

def productos_visitados(visits):
    """Count visits per product, ignoring items with a falsy ``product_id``.

    Args:
        visits: iterable of item ids; each id must be a key of the global
            ``metadata`` lookup.

    Returns:
        Counter mapping ``product_id`` -> number of visits. Items whose
        ``product_id`` evaluates to False (0, '', None) are not counted.
    """
    product_ids = (metadata[item_id]['product_id'] for item_id in visits)
    return Counter(pid for pid in product_ids if pid)

def categorias_visitadas(visits):
    """Count visits per category, ignoring items with a falsy ``category_id``.

    Args:
        visits: iterable of item ids; each id must be a key of the global
            ``metadata`` lookup.

    Returns:
        Counter mapping ``category_id`` -> number of visits. Items whose
        ``category_id`` evaluates to False are not counted.
    """
    category_ids = (metadata[item_id]['category_id'] for item_id in visits)
    return Counter(cid for cid in category_ids if cid)

def cprecios_visitados(visits):
    """Count visits per price cluster.

    Args:
        visits: iterable of item ids; each id must be a key of the global
            ``metadata`` lookup.

    Returns:
        Counter mapping ``int(price_cluster)`` -> number of visits.
        A cluster that converts to 0 is falsy and therefore never counted.
    """
    clusters = (int(metadata[item_id]['price_cluster']) for item_id in visits)
    return Counter(cluster for cluster in clusters if cluster)

def top_items(feature, m, k=10):
    """Return the best-selling items of one group.

    Args:
        feature: the group value to look up (a domain, product, category or
            price-cluster id, depending on ``m``).
        m: which global sales table to use: ``'d'`` -> ``ventas_x_dominio``,
            ``'p'`` -> ``ventas_x_producto``, ``'c'`` -> ``ventas_x_categoria``,
            ``'r'`` -> ``ventas_x_cluster_precio``.
        k: maximum number of items to return.

    Returns:
        Up to ``k`` item ids ordered by descending sales count; an empty list
        when the group is unknown.

    Raises:
        KeyError: if ``m`` is not one of the four supported codes.
    """
    sales_by_group = {'d': ventas_x_dominio,
                      'p': ventas_x_producto,
                      'c': ventas_x_categoria,
                      'r': ventas_x_cluster_precio}[m]
    # The tables are defaultdicts: indexing with [] would insert an empty
    # entry for every unknown group and make them grow on pure lookups.
    # .get() reads without mutating.
    counts = sales_by_group.get(feature)
    if not counts:
        return []
    return [item for item, _ in Counter(counts).most_common(k)]

def top_items2(feature, m, k=10):
    """Return the most-visited items of one group.

    Args:
        feature: the group value to look up (a domain, product, category or
            price-cluster id, depending on ``m``).
        m: which global visit table to use: ``'d'`` -> ``visitas_x_dominio``,
            ``'p'`` -> ``visitas_x_producto``, ``'c'`` -> ``visitas_x_categoria``,
            ``'r'`` -> ``visitas_x_cluster_precio``.
        k: maximum number of items to return.

    Returns:
        Up to ``k`` item ids ordered by descending visit count; an empty list
        when the group is unknown.

    Raises:
        KeyError: if ``m`` is not one of the four supported codes.
    """
    visits_by_group = {'d': visitas_x_dominio,
                       'p': visitas_x_producto,
                       'c': visitas_x_categoria,
                       'r': visitas_x_cluster_precio}[m]
    # The tables are defaultdicts: indexing with [] would insert an empty
    # entry for every unknown group and make them grow on pure lookups.
    # .get() reads without mutating.
    counts = visits_by_group.get(feature)
    if not counts:
        return []
    return [item for item, _ in Counter(counts).most_common(k)]

def last_viewed(row, viewed, k=10, h=1):
    """Return the ``k`` most recently viewed distinct items of a session.

    Args:
        row: session dict; ``row['user_history']`` is a list of events with
            ``event_type``, ``event_info`` and ``event_timestamp`` keys.
        viewed: unused; present so all feature builders share one signature.
        k: length of the returned list.
        h: unused; present so all feature builders share one signature.

    Returns:
        Distinct ``event_info`` values of the ``'view'`` events, newest
        first, right-padded with ``-1`` up to ``k`` entries.
    """
    view_events = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
    view_events.sort(key=lambda ev: ev['event_timestamp'], reverse=True)

    recent = []
    for event in view_events:
        item = event['event_info']
        if item not in recent:
            recent.append(item)
        if len(recent) == k:
            break

    recent.extend([-1] * (k - len(recent)))
    return recent

def most_viewed(row, viewed, k=10, h=1):
    """Return the ``k`` most frequently viewed items of a session.

    Args:
        row: unused; present so all feature builders share one signature.
        viewed: sequence of viewed item ids (repetitions count as extra views).
        k: length of the returned list.
        h: unused; present so all feature builders share one signature.

    Returns:
        Item ids ordered by descending view count, right-padded with ``-1``
        up to ``k`` entries; ``k`` times ``-1`` when ``viewed`` is empty.
    """
    if len(viewed) == 0:
        return [-1] * k

    tally = Counter(item for item in viewed)
    wanted = min(len(tally), k)
    ranked = [item for item, _ in tally.most_common(wanted)]
    ranked.extend([-1] * (k - wanted))
    return ranked

def _top_for_dominant_group(viewed, count_groups, rank_items, kind, k, h):
    """Shared body of every ``top_by_*`` feature builder.

    Args:
        viewed: sequence of viewed item ids.
        count_groups: callable ``viewed -> Counter`` that tallies the viewed
            items per group (domain, product, category or price cluster).
        rank_items: callable ``(group, k=..., m=...) -> list`` returning the
            top items of a group (``top_items`` or ``top_items2``).
        kind: table code forwarded to ``rank_items`` as ``m``.
        k: length of the returned list.
        h: minimum number of distinct groups the session must contain;
            with fewer groups only padding is returned.

    Returns:
        The top items of the most visited group, right-padded with ``-1`` up
        to ``k`` entries, or ``k`` times ``-1`` when there is nothing to rank.
    """
    padding = [-1] * k
    if len(viewed) == 0:
        return padding

    groups = count_groups(viewed)
    # The emptiness check matters when h <= 0: the original indexed
    # most_common(h)[0] and raised IndexError in that case.
    if len(groups) == 0 or len(groups) < h:
        return padding

    # most_common(h)[0] is always the single most frequent group, so ask for
    # exactly that one.
    best_group = groups.most_common(1)[0][0]
    t_items = rank_items(best_group, k=k, m=kind)
    if len(t_items) < k:
        t_items.extend([-1] * (k - len(t_items)))
    return t_items


def top_by_best_domain(row, viewed, k=10, h=1):
    """Best-selling items of the session's most visited domain (see helper)."""
    return _top_for_dominant_group(viewed, dominios_visitados, top_items, 'd', k, h)


def top_by_best_product(row, viewed, k=10, h=1):
    """Best-selling items of the session's most visited product (see helper)."""
    return _top_for_dominant_group(viewed, productos_visitados, top_items, 'p', k, h)


def top_by_best_category(row, viewed, k=10, h=1):
    """Best-selling items of the session's most visited category (see helper)."""
    return _top_for_dominant_group(viewed, categorias_visitadas, top_items, 'c', k, h)


def top_by_cprice(row, viewed, k=10, h=1):
    """Best-selling items of the session's most visited price cluster (see helper)."""
    return _top_for_dominant_group(viewed, cprecios_visitados, top_items, 'r', k, h)


def top_by_best_domain2(row, viewed, k=10, h=1):
    """Most-visited items of the session's most visited domain (see helper)."""
    return _top_for_dominant_group(viewed, dominios_visitados, top_items2, 'd', k, h)


def top_by_best_product2(row, viewed, k=10, h=1):
    """Most-visited items of the session's most visited product (see helper)."""
    return _top_for_dominant_group(viewed, productos_visitados, top_items2, 'p', k, h)


def top_by_best_category2(row, viewed, k=10, h=1):
    """Most-visited items of the session's most visited category (see helper)."""
    return _top_for_dominant_group(viewed, categorias_visitadas, top_items2, 'c', k, h)


def top_by_cprice2(row, viewed, k=10, h=1):
    """Most-visited items of the session's most visited price cluster (see helper)."""
    return _top_for_dominant_group(viewed, cprecios_visitados, top_items2, 'r', k, h)

def top_site(row, viewed, k=10, h=1):
    """Return the ``k`` best-selling items of the whole site.

    ``row``, ``viewed`` and ``h`` are unused; they exist so the function has
    the same signature as the other feature builders. The ranking is read
    from the global ``ventas`` counter and is not padded.
    """
    best_sellers = ventas.most_common(k)
    return [item for item, _ in best_sellers]
    
def get_item_scores(row, viewed, k=10):
    """Rank candidate items by the accumulated weight of the viewed items.

    For every ``'view'`` event in ``row['user_history']`` the mapping
    ``vistas_compras[int(event_info)]`` (a global defined elsewhere) is read
    and its values are summed per key.

    Args:
        row: session dict with a ``user_history`` list of events.
        viewed: unused.
        k: maximum number of items to return.

    Returns:
        Up to ``k`` keys ordered by descending accumulated value (no padding).
    """
    scores = Counter()
    viewed_ids = (int(ev['event_info'])
                  for ev in row['user_history']
                  if ev['event_type'] == 'view')
    for viewed_id in viewed_ids:
        for candidate, weight in vistas_compras[viewed_id].items():
            scores[candidate] += weight
    return [candidate for candidate, _ in scores.most_common(k)]

def get_item_bought(row):
    """Return ``row['item_bought']``, or ``0`` when the key is absent."""
    return row['item_bought'] if 'item_bought' in row else 0

def ib_same_as_liv(ib, liv):
    """Return the result of comparing ``ib`` with ``liv`` using ``==``."""
    is_same = ib == liv
    return is_same

def ib_in_viewed(row, viewed):
    """Tell whether the item bought in ``row`` is contained in ``viewed``.

    The bought item is obtained through ``get_item_bought(row)``.
    """
    bought = get_item_bought(row)
    was_viewed = bought in viewed
    return was_viewed

def history_length(row):
    """Return the number of events stored in ``row['user_history']``."""
    history = row['user_history']
    return len(history)

def views(row):
    """Return the ``event_info`` of every ``'view'`` event, in history order."""
    viewed_items = []
    for event in row['user_history']:
        if event['event_type'] == 'view':
            viewed_items.append(event['event_info'])
    return viewed_items

def searchs(row):
    """Return the ``event_info`` of every ``'search'`` event, in history order."""
    queries = []
    for event in row['user_history']:
        if event['event_type'] == 'search':
            queries.append(event['event_info'])
    return queries

def ib_same_as_miv(ib, miv):
    """Return the result of comparing ``ib`` with ``miv`` using ``==``."""
    is_same = ib == miv
    return is_same

def make_target_list(rows_train, extract_function, k=10):
    """Build one target list per row.

    Args:
        rows_train: iterable of rows (wrapped in a ``tqdm`` progress bar).
        extract_function: callable applied once to each row.
        k: how many times the extracted value is repeated.

    Returns:
        A list with, for every row, the extracted value repeated ``k`` times.
    """
    return [[extract_function(row)] * k for row in tqdm(rows_train)]

def make_features_dict_list(rows_train, features_def, aux_function, k=10):
    """Compute every configured feature for every row.

    Args:
        rows_train: iterable of rows (wrapped in a ``tqdm`` progress bar).
        features_def: dict ``feature name -> builder``; each builder is
            called as ``builder(row, aux_data, k=k)``.
        aux_function: callable applied once per row; its result is handed to
            every builder as the second argument.
        k: forwarded to every builder.

    Returns:
        A list with one ``{feature name: builder result}`` dict per row.
    """
    all_rows = []
    for row in tqdm(rows_train):
        shared = aux_function(row)
        all_rows.append({name: builder(row, shared, k=k)
                         for name, builder in features_def.items()})
    return all_rows
    
def list_to_np_array(collection):
    """Convert the rows of ``collection`` to numpy arrays, in place.

    The kind of conversion is decided from the first element only:

    * first element is a ``list``: every element is replaced by
      ``np.array(element)``;
    * first element is a ``dict``: every value of every row dict is replaced
      by ``np.array(value)``;
    * anything else, or an empty collection: nothing is changed.

    Args:
        collection: mutable sequence of lists or of dicts.

    Returns:
        The same ``collection`` object (mutated in place).
    """
    # An empty collection has no first element to inspect.
    if len(collection) == 0:
        return collection

    first = collection[0]
    if isinstance(first, list):
        for idx, values in tqdm(enumerate(collection), total=len(collection)):
            collection[idx] = np.array(values)
    elif isinstance(first, dict):
        for row in tqdm(collection, total=len(collection)):
            for key in row:
                row[key] = np.array(row[key])
    return collection

def searchs_to_np(searchs):
    """Map each search through ``busquedas_values`` and return a numpy array.

    ``busquedas_values`` is a global lookup defined elsewhere; a search that
    is missing from it makes the lookup fail.
    """
    values = []
    for query in searchs:
        values.append(busquedas_values[query])
    return np.array(values)

def add_features_to_list(new_features, new_feature_data, old_feature_data):
    """Copy selected features from one list of row dicts into another.

    Args:
        new_features: iterable of keys to copy.
        new_feature_data: list of row dicts holding the new values.
        old_feature_data: list of row dicts that receives them (mutated in
            place); rows are paired by position.

    Returns:
        ``old_feature_data``. When both lists differ in length nothing is
        copied and a ``RuntimeWarning`` is emitted, because a silent no-op
        is indistinguishable from a successful merge for the caller.
    """
    if len(new_feature_data) != len(old_feature_data):
        import warnings
        warnings.warn(
            'add_features_to_list: length mismatch (%d new rows vs %d old rows); '
            'no features were added' % (len(new_feature_data), len(old_feature_data)),
            RuntimeWarning,
            stacklevel=2,
        )
        return old_feature_data

    for old_row, new_row in tqdm(zip(old_feature_data, new_feature_data)):
        for key in new_features:
            old_row[key] = new_row[key]
    return old_feature_data

def load_item_data(all_itms = False):
    """Load ``item_data.csv`` (``;``-separated) into a per-item lookup.

    Args:
        all_itms: when true, also return the list of all item ids.

    Returns:
        ``(metadata, all_items)`` where ``metadata`` maps each ``item_id`` to
        a dict with the keys ``item_id``, ``domain_id``, ``product_id``,
        ``category_id``, ``price``, ``price_cluster``, ``condition`` and
        ``mexico``, and ``all_items`` is ``list(metadata)`` or an empty list.
    """
    fields = ['item_id', 'domain_id', 'product_id', 'category_id', 'price', 'price_cluster', 'condition', 'mexico']
    frame = pd.read_csv('item_data.csv', sep=';')

    # Pull every needed column out as a plain list first, then assemble the
    # per-item dicts row by row.
    columns = {name: list(frame[name]) for name in tqdm(fields)}

    metadata = {}
    for idx, item_id in tqdm(enumerate(columns['item_id'])):
        metadata[item_id] = {name: columns[name][idx] for name in fields}

    all_items = list(metadata) if all_itms else []
    return metadata, all_items

def str_to_int(a):
    """Convert ``a`` with the built-in ``int`` constructor and return it."""
    converted = int(a)
    return converted

In [3]:
# Load the raw data.
# rows_train / rows_test hold one decoded JSON object per line of each
# gzip-compressed JSON-lines file.
rows_train = jl_to_list('train_dataset.jl.gz')
rows_test = jl_to_list('test_dataset.jl.gz')
# load_item_data() is called with its default all_itms=False, so the second
# return value is an empty list and is discarded.
metadata, _ = load_item_data()

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [4]:
# Site-wide sales: number of training rows in which each item was bought.
ventas = Counter()
for row in tqdm(rows_train):
    # Every training row is expected to carry 'item_bought'; a row without it
    # raises KeyError here.
    ventas[row['item_bought']] += 1

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))




In [5]:
# Sales broken down by group. Each table maps
#   group value -> {item id -> number of sales of that item}
# and is the data source of top_items().
ventas_x_dominio = defaultdict(lambda: defaultdict(int))
ventas_x_producto = defaultdict(lambda: defaultdict(int))
ventas_x_categoria = defaultdict(lambda: defaultdict(int))
ventas_x_cluster_precio = defaultdict(lambda: defaultdict(int))

# Iterating a Counter yields its keys, i.e. every item sold at least once.
for item in tqdm(ventas):
    domain = metadata[item]['domain_id']
    producto = metadata[item]['product_id']
    categoria = metadata[item]['category_id']
    cprecio = int(metadata[item]['price_cluster'])
    ventas_x_dominio[domain][item] += ventas[item]
    # Items with a falsy product / category id are left out of those tables.
    # NOTE(review): a NaN produced by pandas for a missing value is truthy and
    # would pass this check - confirm how missing ids are encoded in the CSV.
    if producto:
        ventas_x_producto[producto][item] += ventas[item]
    if categoria:
        ventas_x_categoria[categoria][item] += ventas[item]
    # Unlike cprecios_visitados(), price cluster 0 is recorded here.
    ventas_x_cluster_precio[cprecio][item] += ventas[item]

HBox(children=(FloatProgress(value=0.0, max=64928.0), HTML(value='')))




In [6]:
# Visits broken down by group. Each table maps
#   group value -> {item id -> number of 'view' events for that item}
# over all training rows, and is the data source of top_items2().
visitas_x_dominio = defaultdict(lambda: defaultdict(int))
visitas_x_producto = defaultdict(lambda: defaultdict(int))
visitas_x_categoria = defaultdict(lambda: defaultdict(int))
visitas_x_cluster_precio = defaultdict(lambda: defaultdict(int))

for row in tqdm(rows_train):
    # views() returns the event_info of every 'view' event, repetitions included.
    viewed = views(row)
    for item in viewed:
        domain = metadata[item]['domain_id']
        producto = metadata[item]['product_id']
        categoria = metadata[item]['category_id']
        cprecio = int(metadata[item]['price_cluster'])
        visitas_x_dominio[domain][item] += 1
        # Items with a falsy product / category id are left out of those tables.
        if producto:
            visitas_x_producto[producto][item] += 1
        if categoria:
            visitas_x_categoria[categoria][item] += 1
        # Unlike cprecios_visitados(), price cluster 0 is recorded here.
        visitas_x_cluster_precio[cprecio][item] += 1

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))




In [12]:
# Feature collections ("v4" set): last/most viewed items plus the top sold
# (tbd/tbp/tbc) and top visited (tbd2/tbp2/tbc2) items of the session's
# dominant domain, product and category.
#ib = make_target_list(rows_train, get_item_bought, k=10)

# feature name -> builder; every builder is called as builder(row, views(row), k=10).
features_def = {'liv': last_viewed,
            'miv': most_viewed,
            'tbd': top_by_best_domain,
            'tbd2': top_by_best_domain2,
            'tbp': top_by_best_product,
            'tbp2': top_by_best_product2,
            'tbc': top_by_best_category,
            'tbc2': top_by_best_category2}

# dim: features of the training rows; dim_pred: features of the test rows.
dim = make_features_dict_list(rows_train, features_def, views)
dim_pred = make_features_dict_list(rows_test, features_def, views)

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [9]:
#Colecciones.
#ib = make_target_list(rows_train, get_item_bought, k=10)

features_def = {'tcp': top_by_cprice,
                'tcp2': top_by_cprice2}

dim = make_features_dict_list(rows_train, features_def, views)
dim_pred = make_features_dict_list(rows_test, features_def, views)

HBox(children=(FloatProgress(value=0.0, max=413163.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [13]:
#with open('train_dataset_ib.json', 'w') as fname:
#    json.dump(ib, fname, indent=4)
with open('train_dataset_dim_v4.json', 'w') as fname:
    json.dump(dim, fname, indent=4)
with open('train_dataset_dim_pred_v4.json', 'w') as fname:
    json.dump(dim_pred, fname, indent=4)

In [10]:
with open('train_dataset_dim_tcp.json', 'w') as fname:
    json.dump(dim, fname, indent=4)
with open('train_dataset_dim_pred_tcp.json', 'w') as fname:
    json.dump(dim_pred, fname, indent=4)