In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from datetime import datetime
from collections import Counter
from joblib import Parallel, delayed, parallel_backend
import random
import gc
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.model_selection import train_test_split
from scipy.stats import mode

from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB, CategoricalNB, GaussianNB
import re

import nltk
stopwords = nltk.corpus.stopwords.words('portuguese')

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Display options. 'display.max_columns' used to be assigned three times in a
# row (100, None, 300); only the last assignment ever took effect, so it is
# set once here with that final value.
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.set_option('display.max_columns', 300)
from textblob import TextBlob
from sklearn.model_selection import StratifiedKFold, KFold
pd.options.display.float_format = "{:.4f}".format
from datetime import datetime
import dateutil.parser

In [2]:
import itertools
import lightgbm as lgb
import bottleneck
from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin
from sklearn.neural_network import MLPClassifier
import random

In [3]:
from ml_utils import *
%load_ext autoreload
%autoreload 2

In [None]:
### READ ORIGINAL FILES AND SAVE THEM AS PICKLE 

# i = []
# with open('item_data.jl', 'rb') as f:
#     for item in tqdm(json_lines.reader(f)):
#         i.append(item)

In [4]:
# Item catalogue and labelled sessions, both previously serialised with pandas.
items = pd.read_pickle('./items')
train = pd.read_pickle('./train')
indices = train.index.values

# item_id -> domain_id, and domain_id -> array of the item_ids listed under it.
# Both lookups are built before item_id is cast to object dtype below.
domain = items[['item_id', 'domain_id']].set_index('item_id').to_dict()['domain_id']
domain_inverse = items.groupby('domain_id').agg({'item_id': 'unique'}).to_dict()['item_id']
items['item_id'] = items['item_id'].astype('O')
items['price'] = items['price'].astype(np.float32)

# item_id -> (price, 1 when the listing condition is 'new', otherwise 0)
item_info = {
    row_item: (row_price, 1 if row_condition == 'new' else 0)
    for row_item, row_price, row_condition in tqdm(items[['item_id', 'price', 'condition']].values)
}



100%|██████████| 2102277/2102277 [00:02<00:00, 823023.48it/s]


In [6]:
# test = pd.read_pickle('./test')

# Hold out 20% of the labelled sessions as a local test set (fixed seed 15).
# NOTE(review): this rebinds `train` to the 80% split, so running the cell a
# second time splits the already-reduced frame again; reload `train` first.
train, test, indices_train, indices_test = train_test_split(train, indices, test_size=0.20, random_state=15)
# Purchased item of every held-out session and the domain that item maps to.
y_test = test.item_bought.values
y_test_domain = np.array([domain[i] for i in y_test])

In [7]:
# Purchased item of every training session and the domain that item maps to.
y_train = train['item_bought'].values
y_train_domain = np.array([domain[bought_item] for bought_item in y_train])

In [8]:
# Raw per-session event histories as plain Python lists (train first, then test).
# NOTE(review): `uh` is rebound by the loops in the following cells before it is
# ever read, and `uh_train` / `uh_test` are not referenced again in the visible
# notebook — this cell looks like dead code; confirm before deleting.
uh_train = list(train.user_history.values)
uh_test = list(test.user_history.values)
uh = uh_train + uh_test

In [9]:
def _viewed_item_ids(histories):
    """Return the set of integer ``event_info`` values found in ``histories``.

    Only events whose ``event_info`` is an ``int`` are kept; the other events
    (e.g. the search events, whose ``event_info`` is joined as text later on)
    are ignored.
    """
    return {
        event['event_info']
        for history in tqdm(histories)
        for event in history
        if isinstance(event['event_info'], int)
    }


# Items that show up in the held-out / training histories.
test_items = _viewed_item_ids(test.user_history.values)
train_items = _viewed_item_ids(train.user_history.values)

# Domains of those items.
domain_train_items = {domain[i] for i in train_items}
domain_test_items = {domain[i] for i in test_items}

# Items that were actually bought in the training sessions.
target_train_items = set(train.item_bought.values)

# domain_id -> item_ids bought at least once in training ...
bought_by_domain = items[items.item_id.isin(train.item_bought.values)].groupby('domain_id').agg({'item_id': 'unique'}).to_dict()['item_id']

# ... ranked by how often each item was bought (most bought first). The series
# is sorted once per domain and reused for both the ids and the counts, so the
# two lists are aligned by construction.
vc = train.item_bought.value_counts()
ranked_by_domain = {k: vc.loc[v].sort_values(ascending=False) for k, v in bought_by_domain.items()}
domain_inverse_sorted = {k: list(ranked.index) for k, ranked in ranked_by_domain.items()}
domain_inverse_sorted_count = {k: list(ranked.values) for k, ranked in ranked_by_domain.items()}

# Share of the domain's training purchases taken by each item; the domain total
# is computed once per domain instead of once per item.
domain_inverse_sorted_score = {}
for k, item_counts in domain_inverse_sorted_count.items():
    domain_total = sum(item_counts)
    domain_inverse_sorted_score[k] = [c / domain_total for c in item_counts]

100%|██████████| 82633/82633 [00:00<00:00, 93659.86it/s]
100%|██████████| 330530/330530 [00:03<00:00, 96742.59it/s] 


In [10]:
# Getting viewed history

# Viewed item ids per training session, walking each history back to front.
uh = train.user_history.values
views_train = [
    [event['event_info'] for event in session[::-1] if isinstance(event['event_info'], int)]
    for session in tqdm(uh)
]

# Same extraction for the held-out sessions.
uh = test.user_history.values
views_test = [
    [event['event_info'] for event in session[::-1] if isinstance(event['event_info'], int)]
    for session in tqdm(uh)
]

100%|██████████| 330530/330530 [00:05<00:00, 56798.10it/s]
100%|██████████| 82633/82633 [00:02<00:00, 40441.04it/s]


In [11]:
# Training sessions first, held-out sessions after them, so position
# len(train) + k belongs to held-out session k.
views = [*views_train, *views_test]

In [12]:
# from sklearn.preprocessing import LabelEncoder
# unique_domains = list(set(np.concatenate((y_train_domain, y_test_domain))))
# le = LabelEncoder()
# le.fit(unique_domains)

# Domains

In [12]:
def _session_domains(histories, allowed_domains):
    """Return, per session, the domains of its integer ``event_info`` items.

    A domain is kept only when it is not ``None`` and belongs to
    ``allowed_domains``; repeated views keep their repeated domains and the
    history order is preserved.
    """
    per_session = []
    for history in tqdm(histories):
        current = []
        for event in history:
            event_info = event['event_info']
            if isinstance(event_info, int):
                # Single lookup per event instead of three.
                event_domain = domain[event_info]
                if event_domain is not None and event_domain in allowed_domains:
                    current.append(event_domain)
        per_session.append(current)
    return per_session


# Training sessions keep only domains that also occur in the held-out
# histories, and vice versa, so both halves share one vocabulary of domains.
uh = train.user_history.values
domains = _session_domains(uh, domain_test_items)

uh = test.user_history.values
domains_test = _session_domains(uh, domain_train_items)

# Train rows first, held-out rows after them.
domains_total = domains + domains_test

100%|██████████| 330530/330530 [00:09<00:00, 33483.48it/s]
100%|██████████| 82633/82633 [00:01<00:00, 48322.30it/s]


In [13]:
# Feature matrix for the domain classifier, one row per session in
# `domains_total` order (train rows first), cast to float32.
# `get_csr_matrix` is not defined in this notebook — presumably it comes from
# `ml_utils` and counts domains per session; TODO confirm.
x = get_csr_matrix(domains_total).astype(np.float32)

In [15]:
# Domain classifier over the viewed-domain features. Only the first len(train)
# rows of `x` are used for fitting; the remaining rows are the held-out sessions.
# NOTE(review): alpha=102 is an unexplained constant — presumably hand-tuned.
clf_dm = ComplementNB(alpha=102)
clf_dm.fit(x[:len(train)], y_train_domain)

In [26]:
def parallelize_predict_proba(clf, x, top_k=10, n_jobs=60):
    """Score ``x`` in parallel and keep the ``top_k`` most probable classes per row.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``.
    x : feature matrix that supports row selection with an index array
        (``x[row_ids]``) and has a ``shape`` attribute.
    top_k : int, number of classes kept per row. Defaults to 10, the value that
        used to be hard-coded.
    n_jobs : positive int, number of loky workers and of row chunks. Defaults
        to 60, the value that used to be hard-coded.

    Returns
    -------
    proba : ndarray with one row per row of ``x`` holding the kept
        probabilities in descending order.
    preds : list with one list per row of ``x`` holding the class labels that
        match ``proba`` position by position.
    """

    def predict(clf, x_loc):
        p = clf.predict_proba(x_loc)
        # A single argsort drives both outputs: the probabilities are gathered
        # with the same indices that select the labels, so the matrix is not
        # sorted a second time.
        top_idx = p.argsort(axis=1)[:, ::-1][:, :top_k]
        proba = np.take_along_axis(p, top_idx, axis=1)
        preds_k = [list(labels) for labels in clf.classes_[top_idx]]
        return proba, preds_k

    n_rows = x.shape[0]
    # Never create more chunks than rows: an empty chunk would be handed to
    # predict_proba with zero samples.
    n_chunks = max(1, min(n_jobs, n_rows))
    chunks_locks = np.array_split(np.arange(n_rows, dtype=int), n_chunks)

    with parallel_backend('loky', n_jobs=n_jobs):
        pool_result = Parallel(verbose=5)(delayed(predict)(clf, x[loc]) for loc in chunks_locks)

    # Chunks come back in submission order, so concatenating restores row order.
    proba = np.concatenate([chunk_proba for chunk_proba, _ in pool_result], axis=0)
    preds = [labels for _, chunk_preds in pool_result for labels in chunk_preds]

    return proba, preds

In [27]:
# Top domain guesses for every row of `x`, i.e. training and held-out sessions
# together (train rows first), so later cells slice with len(train).
proba_domain, preds_domain = parallelize_predict_proba(clf_dm, x)

[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done   6 out of  60 | elapsed:   27.4s remaining:  4.1min
[Parallel(n_jobs=60)]: Done  19 out of  60 | elapsed:   29.9s remaining:  1.1min
[Parallel(n_jobs=60)]: Done  32 out of  60 | elapsed:   32.0s remaining:   28.0s
[Parallel(n_jobs=60)]: Done  45 out of  60 | elapsed:   37.2s remaining:   12.4s
[Parallel(n_jobs=60)]: Done  58 out of  60 | elapsed:   39.3s remaining:    1.4s
[Parallel(n_jobs=60)]: Done  60 out of  60 | elapsed:   39.6s finished
100%|██████████| 60/60 [00:00<00:00, 193285.90it/s]
100%|██████████| 60/60 [00:00<00:00, 2429.65it/s]


In [14]:
# with open('./proba_domain_finaltest.pickle', 'wb') as handle:
#     pickle.dump(proba_domain, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('./preds_domain_finaltest.pickle', 'wb') as handle:
#     pickle.dump(preds_domain, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('./proba_domain_finaltest.pickle', 'rb') as f:
#     proba_domain = pickle.load(f)

# with open('./preds_domain_finaltest.pickle', 'rb') as f:
#     preds_domain = pickle.load(f)

In [13]:
# with open('./proba_domain.pickle', 'wb') as handle:
#     pickle.dump(proba_domain, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('./preds_domain.pickle', 'wb') as handle:
#     pickle.dump(preds_domain, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('./proba_domain.pickle', 'rb') as f:
#     proba_domain = pickle.load(f)

# with open('./preds_domain.pickle', 'rb') as f:
#     preds_domain = pickle.load(f)

In [361]:
# Top-1 domain accuracy on the held-out sessions. `preds_domain` covers the
# training rows followed by the held-out rows, so the held-out predictions are
# the tail starting at len(train). (The previous `preds_domain_test` name is
# not defined anywhere in this notebook.)
accuracy_score(y_test_domain, [i[0] for i in preds_domain[len(train):]])

0.3794609901613157

# Text

 Pega só o primeiro palpite do domínio. Acurácias: 1º palpite: 23%, 2º palpite: 6%, 3º palpite: 3% (no conjunto de teste)

In [17]:
def get_queries(df):
    """Build one lower-cased search-query string per session.

    For every entry of ``df.user_history`` the ``event_info`` of the events
    whose ``event_type`` is ``'search'`` are de-duplicated, joined with single
    spaces and lower-cased. Sessions without searches yield ``''``.

    Duplicates are dropped with ``dict.fromkeys`` so the queries keep their
    first-seen order. The previous ``set`` based version produced a different
    word order from one kernel to the next (string hashing is randomised),
    which changed the bigrams extracted downstream.

    Returns
    -------
    list of str, aligned with the rows of ``df``.
    """
    queries = []
    for history in tqdm(df.user_history.values):
        searches = [event['event_info'] for event in history if event['event_type'] == 'search']
        queries.append(' '.join(dict.fromkeys(searches)).lower())
    return queries

# Search-query documents for the training and the held-out sessions, each in
# the row order of its frame.
queries, queries_test = get_queries(train), get_queries(test)

100%|██████████| 330530/330530 [00:05<00:00, 62571.01it/s] 
100%|██████████| 82633/82633 [00:00<00:00, 111400.01it/s]


In [19]:
def preprocess_text(text):
    """Return ``text`` with every run of digits removed; nothing else is touched."""
    return re.sub(r'\d+', '', text)

# Unigram + bigram counts of the search queries with Portuguese stop words removed.
# Terms present in more than 20% or fewer than 0.01% of the documents are dropped.
# NOTE(review): passing `preprocessor` replaces CountVectorizer's own
# lower-casing / accent-stripping step; lower-casing is done in get_queries instead.
cv = CountVectorizer(stop_words=stopwords, preprocessor=preprocess_text, max_df=0.2, min_df=0.0001, ngram_range=(1,2))

# The vocabulary is fitted on training and held-out queries together; the rows
# of the result follow that order (train rows first).
bag_of_words = cv.fit_transform(queries + queries_test)

In [33]:
# Domain classifier over the search-query bag of words, fitted on the training
# rows only (the first len(train) rows of `bag_of_words`).
# NOTE(review): alpha=100 is an unexplained constant — presumably hand-tuned.
clf = ComplementNB(alpha=100)
clf.fit(bag_of_words[:len(train)], y_train_domain)

ComplementNB(alpha=100)

In [34]:
def parallelize_predict_proba(clf, x, top_k=10, n_jobs=60):
    """Score ``x`` in parallel and keep the ``top_k`` most probable classes per row.

    NOTE(review): this cell re-defines a function that an earlier cell of the
    notebook already defines; keep a single definition when tidying up.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict_proba`` and ``classes_``.
    x : feature matrix that supports row selection with an index array
        (``x[row_ids]``) and has a ``shape`` attribute.
    top_k : int, number of classes kept per row. Defaults to 10, the value that
        used to be hard-coded.
    n_jobs : positive int, number of loky workers and of row chunks. Defaults
        to 60, the value that used to be hard-coded.

    Returns
    -------
    proba : ndarray with one row per row of ``x`` holding the kept
        probabilities in descending order.
    preds : list with one list per row of ``x`` holding the class labels that
        match ``proba`` position by position.
    """

    def predict(clf, x_loc):
        p = clf.predict_proba(x_loc)
        # A single argsort drives both outputs: the probabilities are gathered
        # with the same indices that select the labels, so the matrix is not
        # sorted a second time.
        top_idx = p.argsort(axis=1)[:, ::-1][:, :top_k]
        proba = np.take_along_axis(p, top_idx, axis=1)
        preds_k = [list(labels) for labels in clf.classes_[top_idx]]
        return proba, preds_k

    n_rows = x.shape[0]
    # Never create more chunks than rows: an empty chunk would be handed to
    # predict_proba with zero samples.
    n_chunks = max(1, min(n_jobs, n_rows))
    chunks_locks = np.array_split(np.arange(n_rows, dtype=int), n_chunks)

    with parallel_backend('loky', n_jobs=n_jobs):
        pool_result = Parallel(verbose=5)(delayed(predict)(clf, x[loc]) for loc in chunks_locks)

    # Chunks come back in submission order, so concatenating restores row order.
    proba = np.concatenate([chunk_proba for chunk_proba, _ in pool_result], axis=0)
    preds = [labels for _, chunk_preds in pool_result for labels in chunk_preds]

    return proba, preds

In [35]:
# Top domain guesses from the text model for every row of `bag_of_words`
# (training rows first, held-out rows after them).
proba_text, preds_text = parallelize_predict_proba(clf, bag_of_words)
# Alternative kept for reference: score only the held-out rows.
# proba_text, preds_text = parallelize_predict_proba(clf, bag_of_words[len(train):])

[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done   6 out of  60 | elapsed:   16.9s remaining:  2.5min
[Parallel(n_jobs=60)]: Done  19 out of  60 | elapsed:   19.9s remaining:   42.9s
[Parallel(n_jobs=60)]: Done  32 out of  60 | elapsed:   22.2s remaining:   19.5s
[Parallel(n_jobs=60)]: Done  45 out of  60 | elapsed:   24.6s remaining:    8.2s
[Parallel(n_jobs=60)]: Done  58 out of  60 | elapsed:   30.7s remaining:    1.1s
[Parallel(n_jobs=60)]: Done  60 out of  60 | elapsed:   31.1s finished
100%|██████████| 60/60 [00:00<00:00, 127615.74it/s]
100%|██████████| 60/60 [00:00<00:00, 1630.30it/s]


In [15]:
# with open('./proba_text_finaltest.pickle', 'wb') as handle:
#     pickle.dump(proba_text, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('./preds_text_finaltest.pickle', 'wb') as handle:
#     pickle.dump(preds_text, handle, protocol=pickle.HIGHEST_PROTOCOL)

# with open('./proba_text_finaltest.pickle', 'rb') as f:
#     proba_text = pickle.load(f)

# with open('./preds_text_finaltest.pickle', 'rb') as f:
#     preds_text = pickle.load(f)

In [14]:
# with open('./proba_text.pickle', 'wb') as handle:
#     pickle.dump(proba_text, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('./preds_text.pickle', 'wb') as handle:
#     pickle.dump(preds_text, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# with open('./proba_text.pickle', 'rb') as f:
#     proba_text = pickle.load(f)

# with open('./preds_text.pickle', 'rb') as f:
#     preds_text = pickle.load(f)

# Ensemble

In [15]:
def get_item_score(item, domain):
    """Return the popularity share stored for ``item`` within ``domain``.

    The item is located in ``domain_inverse_sorted[domain]`` and the value at
    the same position of ``domain_inverse_sorted_score[domain]`` is returned.

    Returns 0 when ``domain`` has no entry or ``item`` is not listed for it.
    Any other error is allowed to propagate instead of being silently turned
    into a score of 0.
    """
    try:
        index = domain_inverse_sorted[domain].index(item)
        return domain_inverse_sorted_score[domain][index]
    except (KeyError, ValueError):
        # KeyError: unknown domain; ValueError: item absent from the domain's list.
        return 0

def generate_datapoint(v, y, domains_10, text_10, proba_domains_10, proba_text_10):
    """Build the labelled candidate table for one session.

    Candidates are the distinct items of ``v`` plus the first 10 entries of
    ``domain_inverse_sorted`` for ``domains_10[0]`` and the first 2 entries for
    ``domains_10[1]``. One row is produced per candidate.

    Parameters
    ----------
    v : list of item ids viewed in the session (must be a ``list``: it is
        concatenated with other lists). ``v[0]`` feeds ``is_first`` and
        ``v[-1]`` feeds ``is_last``.
    y : item id bought in the session; ``target`` is 1 on the row whose
        candidate equals ``y``.
    domains_10, proba_domains_10 : ranked domain guesses of the view-based
        model and their probabilities (forwarded to ``get_domain_score``).
    text_10, proba_text_10 : the same pair for the text model.

    Returns
    -------
    pandas.DataFrame with session-level columns (repeated on every row),
    candidate-level columns and ``target``.

    ``counter``, ``get_recurrent_datapoint``, ``get_domain_score`` and
    ``get_unique_domains`` are not defined in this notebook — presumably they
    come from ``ml_utils``; TODO confirm.
    """
    candidates = np.unique(v + list(domain_inverse_sorted[domains_10[0]][:10]) + list(domain_inverse_sorted[domains_10[1]][:2]))

    has_views = len(v) > 0
    nunique = len(np.unique(v))
    lenn = len(v)
    counts = counter(v)
    rec_score = get_recurrent_datapoint(v)
    # A session without views gets a single price of 0 so max/min stay defined.
    prices = [item_info[i][0] for i in v] if has_views else [0]
    maxx = np.max(prices)
    minn = np.min(prices)
    amp = maxx - minn

    try:
        unique_domains = get_unique_domains(v)
    except Exception:
        # Best-effort fallback kept from the original code, but no longer
        # swallowing KeyboardInterrupt / SystemExit.
        # NOTE(review): the datapoints preview saved in this notebook shows
        # unique_domains == 1 on every visible row, so the helper may be
        # failing on every call — verify.
        unique_domains = 1

    datapoints = []
    for item in candidates:
        item_domain = domain[item]
        item_price, item_condition = item_info[item]
        datapoints.append({
            'is_first': 1 if has_views and item == v[0] else 0,
            'is_last': 1 if has_views and item == v[-1] else 0,
            'count': counts.get(item, 0),
            'nunique': nunique,
            'max_price_viewed': maxx,
            'min_price_viewed': minn,
            'amp_price': amp,
            'lenn': lenn,
            'rec_score': rec_score.get(item, 0),
            'prob_domain': get_domain_score(item_domain, domains_10, proba_domains_10),
            'prob_text': get_domain_score(item_domain, text_10, proba_text_10),
            'prob_item': get_item_score(item, item_domain),
            'unique_domains': unique_domains,
            'price': item_price,
            'condition': item_condition,
            'target': 1 if y == item else 0
        })

    return pd.DataFrame.from_records(datapoints)
    

In [21]:
# One candidate table per training session (first 100000 sessions only),
# stacked into a single frame with a fresh 0..n-1 index.
datapoints = pd.concat(
    [
        generate_datapoint(views[i], y_train[i], preds_domain[i], preds_text[i], proba_domain[i], proba_text[i])
        for i in tqdm(range(len(train[:100000])))
    ]
).reset_index(drop=True)

100%|██████████| 100000/100000 [03:10<00:00, 524.35it/s]


In [22]:
def dp_fe(df):
    """Add the three derived columns to ``df`` in place and return the same frame.

    * ``perc`` - ``count`` divided by ``lenn``
    * ``prob_item_domain`` - ``prob_domain`` times ``prob_item``
    * ``prob_item_text`` - ``prob_text`` times ``prob_item``
    """
    item_prior = df['prob_item']
    df['perc'] = df['count'].div(df['lenn'])
    df['prob_item_domain'] = df['prob_domain'].mul(item_prior)
    df['prob_item_text'] = df['prob_text'].mul(item_prior)
    return df

# Add the derived ratio / product columns to the candidate table.
datapoints = dp_fe(datapoints)

# `undersample` is not defined in this notebook — presumably it comes from
# `ml_utils` and drops part of the negative rows (the LightGBM log saved below
# reports 25% positives); TODO confirm, and confirm whether it is seeded.
# NOTE(review): `datapoints` is rebound twice here, so the cell is not idempotent.
datapoints = undersample(datapoints)

In [20]:
# datapoints.to_pickle('./datapoints.pickle')

# with open('./datapoints.pickle', 'rb') as f:
#     datapoints = pickle.load(f)

In [23]:
# Features / label of the candidate table.
dp_x = datapoints.drop('target', axis=1)
dp_y = datapoints['target']

# 75/25 split of candidate rows, stratified on the label, fixed seed 42.
# NOTE(review): the split is per row, not per session, so candidates generated
# from one session can land on both sides and the validation loss may be optimistic.
dp_x_train, dp_x_test, dp_y_train, dp_y_test = train_test_split(dp_x, dp_y, test_size=0.25, random_state=42, stratify=dp_y)

In [24]:
# Preview of the undersampled candidate table (pandas elides the middle rows).
datapoints

Unnamed: 0,is_first,is_last,count,nunique,max_price_viewed,min_price_viewed,amp_price,lenn,rec_score,prob_domain,prob_text,prob_item,unique_domains,price,condition,target,perc,prob_item_domain,prob_item_text
1887446,0,0,0,37,3999.8999,18.0000,3981.8999,51,0,1.0000,0.0018,0.0211,1,179.0000,1,0,0.0000,0.0211,0.0000
1668470,0,0,1,14,461.7900,62.6900,399.1000,25,0,0.9994,0.0007,0.0000,1,139.9900,1,0,0.0400,0.0000,0.0000
1540843,0,0,1,3,2099.0000,1499.0000,600.0000,3,0,0.0157,0.0000,0.0016,1,2099.0000,1,1,0.3333,0.0000,0.0000
1082758,0,0,0,1,87.3000,87.3000,0.0000,1,0,0.0004,0.0000,0.1220,1,774.0000,1,0,0.0000,0.0001,0.0000
1475166,0,0,1,22,220000.0000,15.0000,219985.0000,51,0,0.0000,0.0000,0.0000,1,17.9000,1,0,0.0196,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1173379,0,0,0,27,9199.0000,40.0000,9159.0000,34,0,0.0053,0.0000,0.0825,1,279.9900,1,0,0.0000,0.0004,0.0000
1517143,0,0,0,6,7889.8799,549.0000,7340.8799,9,0,0.8105,0.0014,0.0079,1,899.0000,1,0,0.0000,0.0064,0.0000
1665134,0,0,0,11,159.4000,19.7500,139.6500,23,0,0.8222,0.0028,0.0438,1,119.9000,1,0,0.0000,0.0360,0.0001
1087598,0,0,0,57,87000.0000,56.7000,86943.3000,86,0,0.2088,0.0005,0.0568,1,43.9900,1,0,0.0000,0.0119,0.0000


In [25]:
# Learning-rate schedule: start at `lr` and multiply by `lr_decay` every round
# (applied through the reset_parameter callback below).
lr = 0.12
lr_decay = 0.98


# Binary classifier over candidate rows, evaluated with log loss.
# NOTE(review): 'num_boost_round' is passed inside `params` rather than as an
# argument of lgb.train; the saved log shows 100 rounds, so it is honoured here.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 512,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'lambda_l2': 1.2, #0.65,
    'lambda_l1': 1.2, #0.65,
    'bagging_freq': 5,
    'num_boost_round': 100,
    'verbose': 1
}


# The validation set is built with reference=dtrain.
dtrain = lgb.Dataset(dp_x_train, dp_y_train)
dvalid = lgb.Dataset(dp_x_test, dp_y_test, reference=dtrain)

# verbose_eval=10 prints the validation metric every 10 rounds.
# NOTE(review): newer LightGBM releases drop the `verbose_eval` argument in
# favour of the `lgb.log_evaluation` callback — check the installed version.
# No early stopping is configured, so all rounds are kept.
bst = lgb.train(
    params, dtrain, valid_sets=dvalid, verbose_eval=10,
    callbacks=[lgb.reset_parameter(learning_rate=lambda current_round: lr*(lr_decay**current_round))],
)




[LightGBM] [Info] Number of positive: 26581, number of negative: 79742
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3022
[LightGBM] [Info] Number of data points in the train set: 106323, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.250002 -> initscore=-1.098600
[LightGBM] [Info] Start training from score -1.098600
[10]	valid_0's binary_logloss: 0.300173
[20]	valid_0's binary_logloss: 0.258582
[30]	valid_0's binary_logloss: 0.247211
[40]	valid_0's binary_logloss: 0.243633
[50]	valid_0's binary_logloss: 0.242283
[60]	valid_0's binary_logloss: 0.241764
[70]	valid_0's binary_logloss: 0.241587
[80]	valid_0's binary_logloss: 0.241564
[90]	valid_0's binary_logloss: 0.241495
[100]	valid_0's binary_logloss: 0.241563


In [26]:
def dp_fe(df):
    """Add the three derived columns to ``df`` in place and return the same frame.

    * ``perc`` - ``count`` divided by ``lenn``
    * ``prob_item_domain`` - ``prob_domain`` times ``prob_item``
    * ``prob_item_text`` - ``prob_text`` times ``prob_item``
    """
    item_prior = df['prob_item']
    df['perc'] = df['count'].div(df['lenn'])
    df['prob_item_domain'] = df['prob_domain'].mul(item_prior)
    df['prob_item_text'] = df['prob_text'].mul(item_prior)
    return df

def generate_datapoint_without_y(candidates, v, domains_10, text_10, proba_domains_10, proba_text_10):
    """Build the unlabelled feature table used to score ``candidates`` for one session.

    Parameters
    ----------
    candidates : iterable of item ids to score; one output row per candidate,
        in the same order.
    v : sequence of item ids viewed in the session. ``v[0]`` feeds ``is_first``
        and ``v[-1]`` feeds ``is_last``.
    domains_10, proba_domains_10 : ranked domain guesses of the view-based
        model and their probabilities (forwarded to ``get_domain_score``).
    text_10, proba_text_10 : the same pair for the text model.

    Returns
    -------
    pandas.DataFrame with the session-level and candidate-level columns plus
    the derived columns added by ``dp_fe``; there is no ``target`` column.

    ``counter``, ``get_recurrent_datapoint``, ``get_domain_score`` and
    ``get_unique_domains`` are not defined in this notebook — presumably they
    come from ``ml_utils``; TODO confirm.
    """
    has_views = len(v) > 0
    nunique = len(np.unique(v))
    lenn = len(v)
    counts = counter(v)
    rec_score = get_recurrent_datapoint(v)
    # A session without views gets a single price of 0 so max/min stay defined.
    prices = [item_info[i][0] for i in v] if has_views else [0]
    maxx = np.max(prices)
    minn = np.min(prices)
    amp = maxx - minn

    try:
        unique_domains = get_unique_domains(v)
    except Exception:
        # Best-effort fallback kept from the original code, but no longer
        # swallowing KeyboardInterrupt / SystemExit.
        # NOTE(review): confirm the helper is not failing on every call.
        unique_domains = 1

    datapoints = []
    for item in candidates:
        item_domain = domain[item]
        item_price, item_condition = item_info[item]
        datapoints.append({
            'is_first': 1 if has_views and item == v[0] else 0,
            'is_last': 1 if has_views and item == v[-1] else 0,
            'count': counts.get(item, 0),
            'nunique': nunique,
            'max_price_viewed': maxx,
            'min_price_viewed': minn,
            'amp_price': amp,
            'lenn': lenn,
            'rec_score': rec_score.get(item, 0),
            'prob_domain': get_domain_score(item_domain, domains_10, proba_domains_10),
            'prob_text': get_domain_score(item_domain, text_10, proba_text_10),
            'prob_item': get_item_score(item, item_domain),
            'unique_domains': unique_domains,
            'price': item_price,
            'condition': item_condition
        })

    datapoints = pd.DataFrame.from_records(datapoints)
    datapoints = dp_fe(datapoints)
    return datapoints
    

In [40]:
def p(lenn_test, preds_domain, preds_text, proba_domain, proba_text, views):
    """Return a re-ranked list of 10 item ids for each of the first ``lenn_test`` sessions.

    For every session the candidate list is filled in two passes over the
    ranked domains (the view-based ranking when the session has views, the text
    ranking otherwise):

    1. per domain, the entries of ``domain_inverse_sorted`` that the session
       viewed, in ``domain_inverse_sorted`` order;
    2. once 10 domains were visited without reaching 10 items, every not yet
       listed entry of ``domain_inverse_sorted``, domain by domain.

    The first 10 candidates are then scored with the global ``bst`` model and
    returned from highest to lowest score.

    All list arguments are indexed by session position and must be aligned.
    """
    outputs = []
    for idx in tqdm(range(lenn_test)):
        v = views[idx]
        domains_10 = preds_domain[idx]
        texts_10 = preds_text[idx]
        proba_domains_10 = proba_domain[idx]
        proba_text_10 = proba_text[idx]

        # The ranking used for this session never changes inside the loops.
        ranked_domains = domains_10 if len(v) > 0 else texts_10

        output = []
        j = 0
        while len(output) < 10:
            current_domain = ranked_domains[j]
            domain_items = domain_inverse_sorted[current_domain]
            views_of_domain = unique([i for i in v if domain[i] == current_domain])

            # Pass 1: viewed items of this domain, in domain_inverse_sorted order.
            output.extend(i for i in domain_items if i in views_of_domain)

            j += 1
            if j == 10:
                # Pass 2: top up with the remaining entries, domain by domain.
                k = 0
                while len(output) < 10:
                    for i in domain_inverse_sorted[ranked_domains[k]]:
                        if i not in output:
                            output.append(i)
                    k += 1

        output = output[:10]

        datapoints = generate_datapoint_without_y(output, v, domains_10, texts_10, proba_domains_10, proba_text_10)
        preds = bst.predict(datapoints, num_iteration=bst.best_iteration)
        # Highest model score first.
        best_first = preds.argsort()[::-1]
        output = list(np.array(output)[best_first][:10])

        outputs.append(output)

    return outputs

In [41]:
# Sanity check: the four prediction arrays and `views` must all cover
# train + held-out sessions; `uh` still holds the held-out histories only.
print(len(preds_domain), len(preds_text), len(proba_domain), len(proba_text), len(views), len(uh))

413163 413163 413163 413163 413163 82633


In [91]:
# Recommend for the first 1000 held-out sessions only. Every input is sliced
# from len(train) onwards because the arrays hold the training rows first.
outputs = p(len(test[:1000]), preds_domain[len(train):], preds_text[len(train):], proba_domain[len(train):], proba_text[len(train):], views[len(train):])

100%|██████████| 1000/1000 [00:27<00:00, 36.00it/s]


In [89]:
# Mean NDCG of `outputs` against the held-out purchases (`average_NDCG` is not
# defined in this notebook — presumably from `ml_utils`).
# NOTE(review): the saved progress bar shows 6000 iterations while the
# prediction cell above scores 1000 sessions, so this output comes from a
# different kernel state — re-run or delete.
average_NDCG(outputs, y_test, domain)

100%|██████████| 6000/6000 [00:00<00:00, 33312.54it/s]


0.2933451058874392

In [61]:
# Repeated NDCG evaluation (execution count 61); saved progress bar shows 6000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 6000/6000 [00:00<00:00, 33186.72it/s]


0.2933451058874392

In [31]:
# Repeated NDCG evaluation (execution count 31); saved progress bar shows 2000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 2000/2000 [00:00<00:00, 28499.24it/s]


0.2964340121237794

In [104]:
# Repeated NDCG evaluation (execution count 104); saved progress bar shows 1000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 1000/1000 [00:00<00:00, 28476.12it/s]


0.283689159463642

In [92]:
# Repeated NDCG evaluation (execution count 92); saved progress bar shows 1000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 1000/1000 [00:00<00:00, 24474.56it/s]


0.2920434383738991

In [33]:
# Repeated NDCG evaluation (execution count 33); saved progress bar shows 1000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 1000/1000 [00:00<00:00, 25805.54it/s]


0.2920434383738991

In [243]:
# Repeated NDCG evaluation (execution count 243); saved progress bar shows 1000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 1000/1000 [00:00<00:00, 27623.55it/s]


0.2910180853923794

In [181]:
# Repeated NDCG evaluation (execution count 181); saved progress bar shows 1000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 1000/1000 [00:00<00:00, 27173.63it/s]


0.28888895067853193

In [147]:
# Repeated NDCG evaluation (execution count 147); saved progress bar shows 1000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 1000/1000 [00:00<00:00, 26570.11it/s]


0.2850787020990637

In [96]:
# Repeated NDCG evaluation (execution count 96); saved progress bar shows 5000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 5000/5000 [00:00<00:00, 27979.15it/s]


0.2861882115066076

In [41]:
# Repeated NDCG evaluation (execution count 41); saved progress bar shows 1000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 1000/1000 [00:00<00:00, 31877.91it/s]


0.28614013452515735

In [184]:
# Repeated NDCG evaluation (execution count 184); saved progress bar shows 1000 iterations.
average_NDCG(outputs, y_test, domain)

100%|██████████| 1000/1000 [00:00<00:00, 27811.84it/s]


0.28601171348072796

### Format output

In [39]:
# Every recommendation list must hold exactly ten items, all of them distinct.
assert all(len(ranking) == 10 for ranking in outputs)
assert all(len(set(ranking)) == 10 for ranking in outputs)

In [40]:
# One line per session: its recommended item ids separated by commas.
with open("output_.csv", "w") as f:
    f.writelines("%s\n" % ','.join(map(str, row)) for row in outputs)