In [1]:
import gzip
import json
import gc
import math
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import random
from sklearn.model_selection import train_test_split

In [2]:
def jl_to_list(filename):
    output = []
    with gzip.open(filename, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

In [3]:
samples = 50000
rows = jl_to_list('data/train_dataset.jl.gz')
if samples:
    rows = rows[:samples]

In [4]:
rows_train, rows_test = train_test_split(rows, test_size=0.2, random_state=42)

In [5]:
rows_train[0]

{'user_history': [{'event_info': 'CAMERA INTELBRAS SEGURANCA CASA',
   'event_timestamp': '2019-10-18T17:34:51.147-0400',
   'event_type': 'search'},
  {'event_info': 1455128,
   'event_timestamp': '2019-10-19T15:14:33.352-0400',
   'event_type': 'view'},
  {'event_info': 1269708,
   'event_timestamp': '2019-10-19T15:15:45.913-0400',
   'event_type': 'view'}],
 'item_bought': 342538}

In [6]:
item_data = jl_to_list('data/item_data.jl.gz')

In [7]:
item_data[0]

{'item_id': 111260,
 'title': 'Casa Sola En Venta Con Gran Patio Solo Pago De Contado.',
 'domain_id': 'MLM-INDIVIDUAL_HOUSES_FOR_SALE',
 'product_id': None,
 'price': '1150000.00',
 'category_id': 'MLM170527',
 'condition': 'new'}

In [8]:
metadata = {x['item_id']:x for x in item_data}

In [9]:
all_items = list(metadata.keys())

# Items mas vendidos del dominio mas visitado
La idea es ver que dominio miro mas veces el usuario y recomendar los items mas vendidos del dominio

In [10]:
ventas_x_dominio = defaultdict(lambda: defaultdict(int))

for row in tqdm(rows_train):
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    for item in viewed:
        domain = metadata[item]['domain_id']
        ventas_x_dominio[domain][item] += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40000.0), HTML(value='')))




In [11]:
def dominios_visitados(row, max_views=15):
    domains = Counter()
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    if len(viewed) > max_views:
        viewed = viewed[:max_views]
    for item in viewed:
        domain = metadata[item]['domain_id']
        domains[domain] += 1
    return domains

In [12]:
dominios_visitados(rows_train[0])

Counter({'MLB-KITCHEN_SUPPLIES': 1, 'MLB-LEGGINGS': 1})

In [13]:
def top_items(domain, k=10):
    top = ventas_x_dominio[domain]
    top = Counter(top)
    top = top.most_common(k)
    return [x[0] for x in top]

In [14]:
top_items('MLB-KITCHEN_SUPPLIES')

[1958493,
 1189880,
 556723,
 1908820,
 1110319,
 791116,
 572086,
 299240,
 1228153,
 930009]

In [15]:
def top_by_best_domain(row, k=10):
    viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
    if len(viewed) == 0:
        return random.choices(all_items, k=k)
    domain = dominios_visitados(row, 20)
    domain = domain.most_common(1)[0][0]
    return top_items(domain, k=k)

In [16]:
top_by_best_domain(rows_train[0])

[1958493,
 1189880,
 556723,
 1908820,
 1110319,
 791116,
 572086,
 299240,
 1228153,
 930009]

In [17]:
y_pred = []
for row in tqdm(rows_test):
    recomendacion = top_by_best_domain(row)
    y_pred.append(recomendacion)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [18]:
y_true = [row['item_bought'] for row in rows_test]

In [19]:
y_pred[0]

[457123,
 1584461,
 1649403,
 1103659,
 1389864,
 277255,
 1602308,
 1605313,
 2102121,
 341510]

In [20]:
y_true[0]

1902250

# Ultimos items vistos
Recomendar los ultimos items vistos

In [21]:
def last_viewed(row, k=10):
    viewed = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
    viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
    viewed = [ev['event_info'] for ev in viewed]
    recomendacion = []
    
    for item in viewed:
        if item not in recomendacion:
            recomendacion.append(item)
    
    if len(recomendacion) > k:
        recomendacion = recomendacion[:k]
    
    missing_items = k - len(recomendacion)
    fill_items = random.choices(all_items, k=missing_items)
    
    return recomendacion + fill_items

In [22]:
last_viewed(rows[0])

[1615991,
 1786148,
 1034192,
 1372431,
 137147,
 1146467,
 814752,
 1095730,
 674391,
 876722]

In [23]:
y_pred = []
for row in tqdm(rows_test):
    recomendacion = last_viewed(row)
    y_pred.append(recomendacion)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




# Vistas y Compras

In [24]:
vistas_compras = defaultdict(lambda: defaultdict(int))
for row in tqdm(rows_train):
    for ev in row['user_history']:
        if ev['event_type'] == 'view':
            vistas_compras[int(ev['event_info'])][int(row['item_bought'])] += 1

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40000.0), HTML(value='')))




In [25]:
def get_item_scores(row):
    item_scores = defaultdict(int)
    for ev in row['user_history']:
        if ev['event_type'] == 'view':
            for k, v in vistas_compras[int(ev['event_info'])].items():
                item_scores[k] += v
    
    return Counter(item_scores)

In [26]:
get_item_scores(rows_train[2])

Counter({702406: 59,
         379741: 8,
         525875: 2,
         1343601: 150,
         1587422: 116,
         740586: 7,
         1533042: 3,
         2023019: 4,
         786745: 6,
         1127740: 189,
         928548: 6,
         910588: 8,
         1926672: 8,
         1299465: 1,
         1975106: 7,
         1420390: 1,
         1567385: 2,
         1035836: 1,
         231634: 1,
         725371: 181,
         536540: 32,
         151458: 2,
         1292323: 13,
         106472: 1,
         588326: 6,
         2009504: 2,
         1856280: 2,
         1813877: 9,
         2013706: 3,
         927857: 4,
         86082: 55,
         1215956: 1,
         832022: 1,
         129029: 26,
         1795702: 87,
         2086498: 1,
         1416723: 1,
         1847644: 35,
         1715614: 1,
         1028985: 1,
         431985: 6,
         31377: 3,
         804820: 15,
         1033830: 1,
         455802: 1,
         104388: 1,
         517255: 1,
         1172027: 12,


In [27]:
def vc_recomendacion(row):
    recomendacion = []
    scores = get_item_scores(row)
    most_common = scores.most_common()
    for item, score in most_common:
        recomendacion.append(item)
        if len(recomendacion) == 10:
            return recomendacion
        
    missing_items = 10 - len(recomendacion)
    fill_items = random.choices(all_items, k=missing_items)
    
    return recomendacion + fill_items

In [28]:
vc_recomendacion(rows[2])

[1909110,
 35690,
 1424897,
 1842806,
 1554232,
 600580,
 227448,
 1394070,
 156432,
 528612]

In [29]:
y_pred = []
for row in tqdm(rows_test):
    recomendacion = vc_recomendacion(row)
    y_pred.append(recomendacion)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))


