In [1]:
%%writefile utils.py
import gzip
import json

def jl_to_list(filename):
    output = []
    with gzip.open(filename, 'rb') as f:
        for line in f:
            output.append(json.loads(line))
    return output

def drop_duplicates(items):
    seen = set()
    seen_add = seen.add
    return [x for x in items if not (x in seen or seen_add(x))]

Overwriting utils.py


In [2]:
%%writefile models.py
import math
import random
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from utils import drop_duplicates

class AbstractBaseline(object):
    def __init__(self, all_items, fill_model=None, k=10, verbose=True):
        self.all_items = all_items
        self.fill_model = fill_model
        self.k = k
        self.verbose = verbose
    
    def fit(self, X=None, y=None):
        self._fit(X, y)
        
        if self.fill_model is not None:
            self.fill_model.fit(X, y)
            
        return self
    
    def _fit(self, X, y):
        pass
    
    def predict(self, X=None):
        y_pred = []
        
        for row in (tqdm(X) if self.verbose else X):
            recommendation = self._predict_one(row)
            recommendation = self._fill_missing_values(row, recommendation)
            y_pred.append(recommendation)
        return y_pred
    
    def _predict_one(self, row):
        pass
    
    def _fill_missing_values(self, row, recommendation):
        recommendation = drop_duplicates(recommendation)[:10]
        
        missing_items = self.k - len(recommendation)
        fill_items = []
        if self.fill_model is None:
            fill_items = random.choices(self.all_items, k=missing_items)
        else:
            fill_items = self.fill_model.predict([row])[0][:missing_items]
        
        return recommendation + fill_items

Overwriting models.py


In [3]:
%%writefile -a models.py

class LastViewedBaseline(AbstractBaseline):
    def __init__(self, all_items, fill_model=None, k=10, verbose=True):
        super().__init__(all_items, fill_model=fill_model, k=k, verbose=verbose)
    
    def _predict_one(self, row):
        viewed = [ev for ev in row['user_history'] if ev['event_type'] == 'view']
        viewed = sorted(viewed, key=lambda x: x['event_timestamp'], reverse=True)
        viewed = [ev['event_info'] for ev in viewed]

        recommendation = []

        for item in viewed:
            if item not in recommendation:
                recommendation.append(item)
        
        return recommendation

Appending to models.py


In [4]:
%%writefile -a models.py

class TopViewedItemsByMostFrequentDomainBaseline(AbstractBaseline):
    def __init__(self, all_items, metadata, fill_model=None, k=10, max_views=30, verbose=True):
        super().__init__(all_items, fill_model=fill_model, k=k, verbose=verbose)
        self.max_views = max_views
        self.metadata = metadata
        self.viewed_items_by_domain = None
    
    def _fit(self, X=None, y=None):
        self.viewed_items_by_domain = defaultdict(lambda: defaultdict(int))

        for row in tqdm(X):
            viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
            for item in viewed:
                domain = self.metadata[item]['domain_id']
                self.viewed_items_by_domain[domain][item] += 1
                
        return self
    
    def _predict_one(self, row):
        viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
        if len(viewed) == 0:
            return random.choices(self.all_items, k=self.k)
        domain = self.__visited_domains(row)
        domain = domain.most_common(1)[0][0]
        return self.__top_items(domain)
    
    def __visited_domains(self, row):
        domains = Counter()
        viewed = [ev['event_info'] for ev in row['user_history'] if ev['event_type'] == 'view']
        if len(viewed) > self.max_views:
            viewed = viewed[:self.max_views]
        for item in viewed:
            domain = self.metadata[item]['domain_id']
            domains[domain] += 1
        return domains
    
    def __top_items(self, domain):
        top = self.viewed_items_by_domain[domain]
        top = Counter(top)
        top = top.most_common(self.k)
        recommendation = [x[0] for x in top]
        
        return recommendation

Appending to models.py


In [5]:
%%writefile metrics.py
import statistics
import numpy as np
from utils import drop_duplicates

def ndcg(y_pred, y_true, metadata):
    ndcg_values = []
    
    y_pred_cleaned = clean(y_pred)
    
    for i in range(len(y_pred_cleaned)):
        dcg_value = dcg(y_pred_cleaned[i], y_true[i], metadata)
        idcg_value = idcg(y_pred_cleaned[i], y_true[i])
        ndcg_values.append(dcg_value / idcg_value)
    
    return statistics.mean(ndcg_values)

def clean(y_pred):
    results = []
    
    for items in y_pred:
        items_clean = drop_duplicates(items)
        missing_values = 10 - len(items_clean)
        items_clean = items_clean + [0] * missing_values
        results.append(items_clean)
    
    return results
        
def dcg(y_pred, y_true, metadata):
    values = []
    for i in range(len(y_pred)):
        rel_value = rel(y_pred[i], y_true, metadata)
        dcg_value = rel_value / np.log(i + 2.0)
        values.append(dcg_value)
    return np.sum(values)
    
def idcg(y_pred, y_true):
    return 22.42461597

def rel(y_hat, y, metadata):
    if y_hat == y:
        return 12.0
    if domain(y_hat, metadata) == domain(y, metadata):
        return 1.0
    return 0.0

def domain(item, metadata):
    return metadata[item]['domain_id']

Overwriting metrics.py


In [6]:
from sklearn.model_selection import train_test_split
from utils import jl_to_list

samples = 50000
rows = jl_to_list('data/train_dataset.jl.gz')
if samples:
    rows = rows[:samples]
rows_train, rows_test = train_test_split(rows, test_size=0.2, random_state=42)

item_data = jl_to_list('data/item_data.jl.gz')

metadata = {x['item_id']:x for x in item_data}
all_items = list(metadata.keys())

y_true = [row['item_bought'] for row in rows_test]

test_dataset = jl_to_list('data/test_dataset.jl.gz')

In [7]:
from models import TopViewedItemsByMostFrequentDomainBaseline, LastViewedBaseline

fill_model = TopViewedItemsByMostFrequentDomainBaseline(all_items, metadata, verbose=False)
baseline = LastViewedBaseline(all_items, fill_model=fill_model)
baseline.fit(rows_train)
y_pred = baseline.predict(rows_test)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [8]:
from metrics import ndcg

ndcg(y_pred, y_true, metadata)

0.2468331335418059

In [None]:
y_pred = baseline.predict(test_dataset)
df = pd.DataFrame(data=y_pred)
df.to_csv("./results/fill_model.csv",sep=',',index=False,header=False)

In [None]:
df[df.isnull().any(axis=1)]

In [9]:
baseline = TopViewedItemsByMostFrequentDomainBaseline(all_items, metadata)
baseline.fit(rows_train)
y_pred = baseline.predict(rows_test)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [10]:
ndcg(y_pred, y_true, metadata)

0.1398981169433177

In [None]:
len(y_true)

In [None]:
len(y_pred)

In [None]:
df = pd.DataFrame(data=y_pred)
df[df.isnull().any(axis=1)]