In [1]:
#Import.
import gzip
import json
import gc
import math
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import random
from sklearn.model_selection import train_test_split
from datetime import datetime
import pandas as pd
import numpy as np
from itertools import product
from itertools import chain
from itertools import compress
from numba import jit, njit, vectorize
import numpy_indexed as npi
from ast import literal_eval

def jl_to_list(fname):
    """Read a gzip-compressed JSON-lines file into a list.

    Args:
        fname: Path of the gzip file; each line must hold one JSON document.

    Returns:
        A list with the decoded object of every line, in file order.
    """
    with gzip.open(fname, 'rb') as handle:
        return [json.loads(raw_line) for raw_line in handle]

def cut_str(a):
    """Drop the first four elements of ``a`` when it can be sliced.

    Args:
        a: Any value; typically a string.

    Returns:
        ``a[4:]`` when slicing works, otherwise ``a`` unchanged (best effort).
    """
    try:
        return a[4:]
    except Exception:
        # Values that cannot be sliced (e.g. numbers, None) pass through as-is.
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit are never swallowed.
        return a

#Functions.
def dominios_visitados(visits):
    """Count how many visited items fall in each domain.

    Args:
        visits: Iterable of item ids used as keys of the module-level
            ``metadata`` mapping.

    Returns:
        A Counter keyed by ``metadata[item]['domain_code']``.
    """
    return Counter(metadata[item]['domain_code'] for item in visits)

def productos_visitados(visits):
    """Count how many visited items map to each product id.

    Items whose ``product_id`` is falsy (e.g. None or empty) are ignored.

    Args:
        visits: Iterable of item ids used as keys of the module-level
            ``metadata`` mapping.

    Returns:
        A Counter keyed by ``metadata[item]['product_id']``.
    """
    product_ids = (metadata[item]['product_id'] for item in visits)
    return Counter(pid for pid in product_ids if pid)

def categorias_visitadas(visits):
    """Count how many visited items map to each category code.

    Items whose ``category_code`` is falsy (e.g. None or empty) are ignored.

    Args:
        visits: Iterable of item ids used as keys of the module-level
            ``metadata`` mapping.

    Returns:
        A Counter keyed by ``metadata[item]['category_code']``.
    """
    category_codes = (metadata[item]['category_code'] for item in visits)
    return Counter(code for code in category_codes if code)

def top_items(feature, k=10, m='d'):
    """Return the ``k`` highest-count entries recorded for ``feature``.

    Args:
        feature: Key looked up in the selected module-level table.
        k: Maximum number of entries to return.
        m: Table selector: 'd' -> ``ventas_x_dominio``,
            'p' -> ``ventas_x_producto``, 'c' -> ``ventas_x_categoria``.

    Returns:
        A list with up to ``k`` keys, ordered by descending count.
    """
    sources = {
        'd': ventas_x_dominio,
        'p': ventas_x_producto,
        'c': ventas_x_categoria,
    }
    ranked = Counter(sources[m][feature]).most_common(k)
    return [entry for entry, _ in ranked]

def last_viewed(row, k=10):
    """Return the ``k`` most recently viewed distinct items, padded with 0.

    Args:
        row: Iterable of event dicts carrying 'event_type', 'event_info'
            and 'event_timestamp'.
        k: Length of the returned list.

    Returns:
        The 'event_info' values of 'view' events, newest first and without
        repeats; zeros fill the tail when fewer than ``k`` were found.
    """
    view_events = [ev for ev in row if ev['event_type'] == 'view']
    view_events.sort(key=lambda ev: ev['event_timestamp'], reverse=True)

    recent = []
    for event in view_events:
        item = event['event_info']
        if item not in recent:
            recent.append(item)
        if len(recent) == k:
            break

    recent.extend([0] * (k - len(recent)))
    return recent

def most_viewed(row, k=10):
    """Return the ``k`` most frequently viewed items, padded with 0.

    Args:
        row: Event list handed to ``views`` to obtain the viewed items.
        k: Length of the returned list.

    Returns:
        Item ids ordered by descending view count; zeros fill the tail
        when fewer than ``k`` distinct items were viewed.
    """
    viewed = views(row)
    if not viewed:
        return [0] * k

    counts = Counter(viewed)
    keep = min(len(counts), k)
    ranked = [item for item, _ in counts.most_common(keep)]
    ranked.extend([0] * (k - keep))
    return ranked

def top_by_best_domain(row, k=10):
    """Return the top items of the domain the user viewed most.

    Args:
        row: Event list handed to ``views`` to obtain the viewed items.
        k: Length of the returned list.

    Returns:
        Up to ``k`` items from ``top_items(..., m='d')`` for the most
        viewed domain, padded with 0; all zeros when nothing was viewed.
    """
    viewed = views(row)
    if not viewed:
        return [0] * k

    best_domain, _ = dominios_visitados(viewed).most_common(1)[0]
    ranked = top_items(best_domain, k=k, m='d')
    missing = k - len(ranked)
    if missing > 0:
        ranked.extend([0] * missing)
    return ranked

def top_by_best_product(row, k=10):
    """Return the top items of the product the user viewed most.

    Args:
        row: Event list handed to ``views`` to obtain the viewed items.
        k: Length of the returned list.

    Returns:
        Up to ``k`` items from ``top_items(..., m='p')`` for the most viewed
        product, padded with 0. All zeros when nothing was viewed, when no
        viewed item has a product id, or when the lookup fails.
    """
    viewed = views(row)
    if len(viewed) == 0:
        return [0] * k

    productos = productos_visitados(viewed)
    if not productos:
        # None of the viewed items carries a product id.
        return [0] * k

    try:
        best_product = productos.most_common(1)[0][0]
        t_items = top_items(best_product, k=k, m='p')
    except Exception:
        # Best effort, as before: a failed lookup (e.g. a product missing
        # from the sales table) yields the all-zero fallback.
        # ``Exception`` instead of a bare ``except`` keeps KeyboardInterrupt
        # and SystemExit propagating.
        return [0] * k

    if len(t_items) < k:
        t_items.extend([0] * (k - len(t_items)))
    return t_items

def top_by_best_category(row, k=10):
    """Return the top items of the category the user viewed most.

    Args:
        row: Event list handed to ``views`` to obtain the viewed items.
        k: Length of the returned list.

    Returns:
        Up to ``k`` items from ``top_items(..., m='c')`` for the most viewed
        category, padded with 0. All zeros when nothing was viewed or when
        no viewed item has a category.
    """
    viewed = views(row)
    if len(viewed) == 0:
        return [0] * k

    categorias = categorias_visitadas(viewed)
    if not categorias:
        # Explicit check instead of catching the IndexError raised by
        # ``most_common(1)[0]`` on an empty Counter with a bare ``except``.
        return [0] * k

    best_category = categorias.most_common(1)[0][0]
    t_items = top_items(best_category, k=k, m='c')
    if len(t_items) < k:
        t_items.extend([0] * (k - len(t_items)))
    return t_items

def top_site(row, k=10):
    """Return the ``k`` most common keys of the module-level ``ventas`` counter.

    Args:
        row: Unused; kept so the signature matches the other feature functions.
        k: Maximum number of entries to return.

    Returns:
        A list with up to ``k`` keys ordered by descending count.
    """
    best_sellers = ventas.most_common(k)
    return [item for item, _ in best_sellers]

def get_item_bought(row):
    """Return ``row['item_bought']``, or 0 when the key is absent.

    Args:
        row: Mapping-like session record.

    Returns:
        The stored purchased item, or 0 as the "unknown" marker.
    """
    return row['item_bought'] if 'item_bought' in row else 0

def ib_same_as_liv(ib, liv):
    """Return the result of comparing ``ib`` with ``liv`` using ``==``."""
    matches = (ib == liv)
    return matches

def ib_in_viewed(row, viewed):
    """Tell whether the item bought in ``row`` appears in ``viewed``.

    Args:
        row: Session record passed to ``get_item_bought``.
        viewed: Container tested with ``in``.

    Returns:
        The result of the membership test.
    """
    return get_item_bought(row) in viewed

def history_length(row):
    """Return the number of entries in ``row['user_history']``."""
    history = row['user_history']
    return len(history)

def views0(row):
    """Collect the 'event_info' of every 'view' event in ``row['user_history']``.

    Args:
        row: Session record whose 'user_history' is an iterable of event dicts.

    Returns:
        A list of 'event_info' values in history order.
    """
    viewed = []
    for event in row['user_history']:
        if event['event_type'] == 'view':
            viewed.append(event['event_info'])
    return viewed

def views(row):
    """Collect the 'event_info' of every 'view' event in ``row``.

    Args:
        row: Iterable of event dicts (the history itself, not the record).

    Returns:
        A list of 'event_info' values in input order.
    """
    viewed = []
    for event in row:
        if event['event_type'] == 'view':
            viewed.append(event['event_info'])
    return viewed

def searchs(row):
    """Collect the 'event_info' of every 'search' event in ``row['user_history']``.

    Args:
        row: Session record whose 'user_history' is an iterable of event dicts.

    Returns:
        A list of 'event_info' values in history order.
    """
    queries = []
    for event in row['user_history']:
        if event['event_type'] == 'search':
            queries.append(event['event_info'])
    return queries

def ib_same_as_miv(ib, miv):
    """Return the result of comparing ``ib`` with ``miv`` using ``==``."""
    matches = (ib == miv)
    return matches

def make_target_list(rows_train, extract_function, k=10):
    """Build, for every row, a list repeating its target ``k`` times.

    Args:
        rows_train: Iterable of rows (wrapped in a tqdm progress bar).
        extract_function: Callable returning the target of a single row.
        k: Number of repetitions per row.

    Returns:
        A list with one ``[target] * k`` list per row.
    """
    return [[extract_function(row)] * k for row in tqdm(rows_train)]

def make_features_dict_list(rows_train, features_def, aux_function, k=10):
    """Compute one feature dict per row.

    Args:
        rows_train: Iterable of rows (wrapped in a tqdm progress bar).
        features_def: Mapping from feature name to a callable invoked as
            ``fn(row, aux_data, k=k)``.
        aux_function: Callable producing the per-row auxiliary data shared
            by all feature callables.
        k: Forwarded to every feature callable.

    Returns:
        A list of ``{feature name: value}`` dicts, one per row.
    """
    features = []
    for row in tqdm(rows_train):
        shared = aux_function(row)
        features.append({
            name: extractor(row, shared, k=k)
            for name, extractor in features_def.items()
        })
    return features

def make_features_pd(rows_train, features_def, k=10):
    """Build a DataFrame from the rows and add one column per feature.

    Args:
        rows_train: Data accepted by ``pd.DataFrame``; must yield a
            'user_history' column.
        features_def: Mapping from column name to a callable applied to
            each 'user_history' value.
        k: Unused; kept for signature compatibility.

    Returns:
        The DataFrame with the feature columns added.
    """
    frame = pd.DataFrame(rows_train)
    for name in tqdm(features_def):
        extractor = features_def[name]
        frame[name] = frame['user_history'].apply(extractor)
    return frame
    
def list_to_np_array(collection):
    """Convert, in place, the nested sequences of ``collection`` to arrays.

    The kind of conversion is decided from the first element:

    * list -> every element of ``collection`` is replaced by ``np.array(elem)``;
    * dict -> every value of every dict is replaced by ``np.array(value)``;
    * anything else -> ``collection`` is left untouched.

    Args:
        collection: Mutable sequence of lists or of dicts.

    Returns:
        The same ``collection`` object (mutated), so calls can be chained.
    """
    if len(collection) == 0:
        # Nothing to convert; previously this raised IndexError.
        return collection

    first = collection[0]
    # isinstance replaces comparing the text of str(type(...)).
    if isinstance(first, list):
        for i, values in enumerate(tqdm(collection)):
            collection[i] = np.array(values)
    elif isinstance(first, dict):
        for record in tqdm(collection):
            for key in record:
                record[key] = np.array(record[key])
    return collection

def searchs_to_np(searchs):
    """Map each search through ``busquedas_values`` and return an array.

    Args:
        searchs: Iterable of keys of the module-level ``busquedas_values``.

    Returns:
        A NumPy array with the looked-up values, in input order.
    """
    encoded = []
    for query in searchs:
        encoded.append(busquedas_values[query])
    return np.array(encoded)

def add_features_to_list(new_features, new_feature_data, old_feature_data):
    """Copy the selected features from the new rows into the old rows.

    Args:
        new_features: Iterable of keys to copy.
        new_feature_data: Sequence of dicts holding the new values.
        old_feature_data: Sequence of dicts updated in place; must have the
            same length as ``new_feature_data``.

    Returns:
        ``old_feature_data``. When the lengths differ nothing is copied and
        a warning is emitted (this case used to be a silent no-op).
    """
    if len(new_feature_data) != len(old_feature_data):
        import warnings
        warnings.warn(
            'add_features_to_list: length mismatch (%d new vs %d old rows); '
            'no features were added.'
            % (len(new_feature_data), len(old_feature_data))
        )
        return old_feature_data

    paired = zip(old_feature_data, new_feature_data)
    # ``total`` lets the progress bar show a percentage; zip has no length.
    for old_row, new_row in tqdm(paired, total=len(old_feature_data)):
        for key in new_features:
            old_row[key] = new_row[key]
    return old_feature_data

def str_to_int(a):
    """Convert ``a`` with the built-in ``int`` and return the result."""
    converted = int(a)
    return converted

def extract_views(a):
    """Extract the viewed items of every history in ``a``.

    Args:
        a: Array of histories; ``a[i]`` must be an iterable of event dicts.
            Presumably an object-dtype array, since arrays are stored in
            the ``np.empty_like(a)`` result (verify against callers).

    Returns:
        An array shaped like ``a`` whose i-th element is an array with the
        'event_info' of the 'view' events of ``a[i]``.
    """
    result = np.empty_like(a)
    for idx in range(len(result)):
        viewed = [
            event['event_info']
            for event in a[idx]
            if event['event_type'] == 'view'
        ]
        result[idx] = np.array(viewed)
    return result

def extract_searchs(a):
    """Extract the search terms of every history in ``a``.

    Args:
        a: Array of histories; ``a[i]`` must be an iterable of event dicts.
            Presumably an object-dtype array, since arrays are stored in
            the ``np.empty_like(a)`` result (verify against callers).

    Returns:
        An array shaped like ``a`` whose i-th element is an array with the
        'event_info' of the 'search' events of ``a[i]``.
    """
    result = np.empty_like(a)
    for idx in range(len(result)):
        queries = [
            event['event_info']
            for event in a[idx]
            if event['event_type'] == 'search'
        ]
        result[idx] = np.array(queries)
    return result

def vistas_compras(a, b, c):
    """Pair every distinct viewed item with the purchase of its session.

    Args:
        a: Viewed items; de-duplicated (and sorted) with ``np.unique``.
        b: Purchased item; views equal to it are skipped.
        c: Session identifier stored in each output row.

    Returns:
        A list of ``[viewed item, b, c, 1]`` rows.
    """
    return [[item, b, c, 1] for item in np.unique(a) if item != b]

def gen_vistas_compras(views, ib):
    """Build the view/purchase rows of all sessions.

    Args:
        views: Indexable of sessions; ``views[v][0]`` holds the viewed items.
        ib: Indexable of sessions; ``ib[v][0]`` holds the purchased item.

    Returns:
        The concatenation of ``vistas_compras`` rows for every session,
        using the session position as its identifier.
    """
    session_ids = np.array(range(len(views)))
    pairs = []
    for pos in tqdm(range(len(views))):
        pairs.extend(vistas_compras(views[pos][0], ib[pos][0], session_ids[pos]))
    return pairs

def busquedas_compras(a, b, c):
    """Pair every distinct search with the purchase of its session.

    Args:
        a: Searches; de-duplicated (and sorted) with ``np.unique``.
        b: Purchased item stored in each output row.
        c: Session identifier stored in each output row.

    Returns:
        A list of ``[search, b, c, 1]`` rows.
    """
    return [[query, b, c, 1] for query in np.unique(a)]

def gen_busquedas_compras(searchs, ib):
    """Build the search/purchase rows of all sessions.

    Args:
        searchs: Indexable of sessions; ``searchs[v][0]`` holds the searches.
        ib: Indexable of sessions; ``ib[v][0]`` holds the purchased item.

    Returns:
        The concatenation of ``busquedas_compras`` rows for every session,
        using the session position as its identifier.
    """
    session_ids = np.array(range(len(searchs)))
    rows = []
    for v in tqdm(range(len(searchs))):
        # Use the search helper: ``vistas_compras`` (called here before)
        # drops entries equal to the purchased item, which is only meant
        # for views and would discard a search whose value happens to
        # coincide with the purchased item id.
        rows.extend(busquedas_compras(searchs[v][0], ib[v][0], session_ids[v]))
    return rows

#Filtros nb.
def filter_not(x, z):
    """Return the result of ``x != z``."""
    differs = (x != z)
    return differs

# Element-wise variant of filter_not built with np.vectorize.
filter_not_v = np.vectorize(filter_not)

def filter_in(x, z):
    """Test which elements of ``x`` are present in ``z``.

    Args:
        x: Scalar or array of candidates.
        z: Scalar or array of accepted values.

    Returns:
        Boolean array shaped like ``x`` (see ``np.isin``).
    """
    # ``assume_unique=True`` was removed: it is only valid when both inputs
    # are free of duplicates, and NumPy can report false positives for
    # repeated values in ``x`` otherwise.
    return np.isin(x, z)

# Element-wise variant of filter_in built with np.vectorize.
filter_in_v = np.vectorize(filter_in)

#Generate data.
def _unwrap_event(event):
    """Return the items of one event row as a flat array.

    Rows of a 2-D object array (such as ``np.array(DataFrame)`` with a single
    column of arrays) arrive as a 1-element object array wrapping the real
    item array; that wrapper is removed. Flat inputs pass through unchanged.
    """
    items = np.asarray(event)
    if items.dtype == object and items.size == 1:
        items = np.asarray(items.ravel()[0])
    return items


def gen_data(window, events, k=10, checkpoint_every=50000, checkpoint_prefix='dim_vc'):
    """Rank, for every event row, the purchases co-occurring with its items.

    For the event at position ``j`` the rows of ``window`` whose 'busqueda'
    is among the event's items and whose 'id_sesion' differs from ``j`` are
    grouped by 'compra'; the ``k`` purchases with the largest summed
    'constante' are returned.

    Args:
        window: Rows of ``[busqueda, compra, id_sesion, constante]``.
        events: Iterable with the items of each session. Each entry may be a
            flat array or a 1-element object array wrapping one.
        k: Number of predictions per event.
        checkpoint_every: Write a partial CSV every this many events;
            0 or None disables checkpoints.
        checkpoint_prefix: Checkpoint files are named
            ``<prefix><events done>.csv``.

    Returns:
        A list with one list of ``k`` purchases per event, padded with -1.
    """
    WINDOW = pd.DataFrame(window, columns=['busqueda', 'compra', 'id_sesion', 'constante'])
    # Loop-invariant conversions, hoisted out of the per-event loop.
    busqueda = np.array(WINDOW['busqueda'])
    id_sesion = np.array(WINDOW['id_sesion'])
    # Column names follow ``k`` (they used to be hard-coded to 10).
    columns = ['pred' + str(i) for i in range(k)]

    pred = []
    for j, b in enumerate(tqdm(events)):
        items = _unwrap_event(b)
        # No ``assume_unique``: 'busqueda' repeats across sessions, and
        # np.isin is only guaranteed correct with that flag for unique input.
        mask = (id_sesion != j) & np.isin(busqueda, items)
        h = list(
            WINDOW.loc[mask]
            .groupby('compra')
            .sum()
            .sort_values('constante', ascending=False)
            .index[:k]
        )
        if len(h) < k:
            h.extend([-1] * (k - len(h)))
        pred.append(h)

        done = j + 1
        if checkpoint_every and done % checkpoint_every == 0:
            PRED = pd.DataFrame(pred, columns=columns)
            PRED.to_csv(checkpoint_prefix + str(done) + '.csv', index=False, sep=';')
    return pred

In [2]:
def load_ib():
    """Load 'ib.csv' (';'-separated) from the working directory.

    Returns:
        The file contents as a 2-D NumPy array (rows x columns).
    """
    stem = 'ib'
    table = pd.read_csv(stem + '.csv', sep=';')
    return np.array(table)

def load_views(uq, test):
    """Load a cached views table and decode its list column.

    The file name is ``views`` + ``_test``/``_train`` + optional ``_uq``
    + ``.csv``; the file must contain a column with that same name (without
    the extension) holding Python list literals.

    Args:
        uq: True to load the ``_uq`` variant.
        test: True for the test split, False for the train split.

    Returns:
        The table as a NumPy array; the decoded column holds one NumPy
        array per row.
    """
    split_suffix = {True: '_test', False: '_train'}
    unique_suffix = {True: '_uq', False: ''}
    name = 'views' + split_suffix[test] + unique_suffix[uq]

    table = pd.read_csv(name + '.csv', sep=';')
    # literal_eval parses the stored text safely (literals only, no code).
    table[name] = table[name].apply(lambda text: np.array(literal_eval(text)))
    return np.array(table)

def load_searchs(uq, test):
    """Load a cached searches table and decode its list column.

    The file name is ``searchs`` + ``_test``/``_train`` + optional ``_uq``
    + ``.csv``; the file must contain a column with that same name (without
    the extension) holding Python list literals.

    Args:
        uq: True to load the ``_uq`` variant.
        test: True for the test split, False for the train split.

    Returns:
        The table as a NumPy array; the decoded column holds one NumPy
        array per row.
    """
    split_suffix = {True: '_test', False: '_train'}
    unique_suffix = {True: '_uq', False: ''}
    name = 'searchs' + split_suffix[test] + unique_suffix[uq]

    table = pd.read_csv(name + '.csv', sep=';')
    # literal_eval parses the stored text safely (literals only, no code).
    table[name] = table[name].apply(lambda text: np.array(literal_eval(text)))
    return np.array(table)

In [7]:
# Load the purchased items and the de-duplicated view histories.
ib = load_ib()
# views_train must be loaded here: the cell that builds VC reads it, and with
# this line commented out a fresh "Restart & Run All" fails with NameError.
views_train = load_views(uq=True, test=False)
views_test = load_views(uq=True, test=True)

In [4]:
# Build the [viewed item, purchased item, session id, 1] rows from the training sessions.
VC = gen_vistas_compras(views=views_train, ib=ib)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=413163.0), HTML(value='')))




In [8]:
# dim_vc is written to 'dim_vc.csv' by a later cell, so it has to be computed
# here; with this line commented out that cell fails on a fresh kernel.
# NOTE: both calls loop over every session and are slow; each also writes
# periodic 'dim_vc<N>.csv' checkpoints, so the second run overwrites the
# checkpoints of the first for equal N.
dim_vc = gen_data(VC, views_train)
dim_pred_vc = gen_data(VC, views_test)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=177070.0), HTML(value='')))




In [6]:
# Persist the training predictions, one 'pred<rank>' column per position.
DIM_VC = pd.DataFrame(
    dim_vc,
    columns=['pred' + str(rank) for rank in range(10)],
)
DIM_VC.to_csv('dim_vc.csv', sep=';', index=False)

In [9]:
# Persist the test predictions, one 'pred<rank>' column per position.
DIM_PRED_VC = pd.DataFrame(
    dim_pred_vc,
    columns=['pred' + str(rank) for rank in range(10)],
)
DIM_PRED_VC.to_csv('dim_pred_vc.csv', sep=';', index=False)