In [1]:
import pandas as pd
import sys, os, os.path
import numpy as np
import json
from sklearn.metrics import roc_auc_score
import traceback
import gezi
from gezi import tqdm

In [2]:
def dcg_score(y_true, y_score, k=10):
    """Discounted cumulative gain over the ``k`` highest-scored items.

    Args:
        y_true: relevance labels, one per candidate.
        y_score: predicted scores aligned with ``y_true``.
        k: number of top-ranked candidates to include.

    Returns:
        Sum of ``(2**label - 1) / log2(position + 1)`` over the top-``k``
        candidates, positions counted from 1.
    """
    # Descending order of score, truncated to the first k positions.
    ranking = np.argsort(y_score)[::-1][:k]
    top_labels = np.take(y_true, ranking)
    positions = np.arange(len(top_labels))
    return np.sum((2 ** top_labels - 1) / np.log2(positions + 2))
    

def ndcg_score(y_true, y_score, k=10):
    """Normalised DCG@k: achieved DCG divided by the ideal DCG.

    The ideal DCG is obtained by ranking the candidates by their own labels.
    The division is unguarded, so an ideal DCG of zero (no positive label)
    is not handled specially here.
    """
    ideal_dcg = dcg_score(y_true, y_true, k)
    achieved_dcg = dcg_score(y_true, y_score, k)
    return achieved_dcg / ideal_dcg


def mrr_score(y_true, y_score):
    """Mean reciprocal rank of the positive labels under the predicted order.

    Candidates are sorted by descending ``y_score``; each label is divided by
    its 1-based position and the total is normalised by the sum of labels.
    """
    ranking = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, ranking)
    reciprocal_ranks = ranked_labels / np.arange(1, len(ranked_labels) + 1)
    return np.sum(reciprocal_ranks) / np.sum(ranked_labels)

def parse_line(l, is_truth=True):
    """Split one ``"<impression_id> <json array>"`` line into its two parts.

    Args:
        l: raw line; a trailing newline is stripped.
        is_truth: accepted for call-site symmetry; not used by the parsing.

    Returns:
        ``(impression_id, values)`` where ``impression_id`` is a string and
        ``values`` is the decoded JSON payload.

    Raises:
        ValueError: if the line has no space separator or the payload is not
            valid JSON.
    """
    # Split only on the first space so the JSON payload may contain spaces.
    fields = l.strip('\n').split(' ', 1)
    impid, payload = fields
    return impid, json.loads(payload)

def scoring_file(sub_f, is_rank=False, truth_f='../input/dev/truth.txt'):
    """Score a submission file against the truth file, line by line.

    Both files hold one ``"<impression_id> <json array>"`` line per impression
    and are read in lock-step, so they must list impressions in the same order.

    Args:
        sub_f: path of the submission file.
        is_rank: if True the submission values are ranks and are converted to
            scores with ``1 / rank``; otherwise they are used as scores
            directly. Either way the resulting score must lie in ``[0, 1]``.
        truth_f: path of the truth file (generated from
            ``../input/dev/behaviors.tsv``).

    Returns:
        ``(auc, mrr, ndcg@5, ndcg@10)``, each averaged over the impressions
        that could be scored.

    Raises:
        ValueError: on a malformed submission line, an impression id that does
            not match the truth file, a zero rank, or a score outside
            ``[0, 1]``. The reported line number is the 1-based line of the
            truth file.
    """
    import warnings

    aucs = []
    mrrs = []
    ndcg5s = []
    ndcg10s = []
    skipped = 0

    # Count lines up front only to give tqdm a total; no need to hold them all.
    with open(truth_f) as counter:
        total = sum(1 for _ in counter)

    with open(truth_f) as truth_lines, open(sub_f) as sub_lines:
        for line_index, lt in enumerate(tqdm(truth_lines, total=total), start=1):
            # Always consume the matching submission line to keep the files aligned.
            ls = sub_lines.readline()
            impid, labels = parse_line(lt)

            # ignore masked impressions
            if labels == []:
                continue

            if ls == '':
                # Submission ran out of lines: give every candidate the same value.
                sub_impid = impid
                sub_ranks = [1] * len(labels)
            else:
                try:
                    sub_impid, sub_ranks = parse_line(ls, is_truth=False)
                except Exception as err:
                    raise ValueError(
                        "line-{}: Invalid Input Format!".format(line_index)
                    ) from err

            if sub_impid != impid:
                raise ValueError("line-{}: Inconsistent Impression Id {} and {}".format(
                    line_index,
                    sub_impid,
                    impid
                ))

            lt_len = float(len(labels))

            y_true = np.array(labels, dtype='float32')
            y_score = []
            for rank in sub_ranks:
                if is_rank and rank == 0:
                    # 1 / 0 would otherwise surface as an unrelated ZeroDivisionError.
                    raise ValueError("Line-{}: rank must be non-zero".format(line_index))
                score_rslt = 1. / rank if is_rank else float(rank)
                if score_rslt < 0 or score_rslt > 1:
                    raise ValueError("Line-{}: score_rslt should be int from 0 to {}".format(
                        line_index,
                        lt_len
                    ))
                y_score.append(score_rslt)

            try:
                auc = roc_auc_score(y_true, y_score)
                mrr = mrr_score(y_true, y_score)
                ndcg5 = ndcg_score(y_true, y_score, 5)
                ndcg10 = ndcg_score(y_true, y_score, 10)
            except Exception:
                # Deliberate best-effort: an impression whose metrics cannot be
                # computed is left out of the averages, but counted so the
                # omission is reported instead of being silent.
                skipped += 1
                continue

            aucs.append(auc)
            mrrs.append(mrr)
            ndcg5s.append(ndcg5)
            ndcg10s.append(ndcg10)

    if skipped:
        warnings.warn(
            "scoring_file: skipped {} impression(s) whose metrics could not be computed".format(skipped)
        )

    return np.mean(aucs), np.mean(mrrs), np.mean(ndcg5s), np.mean(ndcg10s)
  

def get_scores(pred_file):
  """Load per-impression prediction scores from a text file.

  Each non-blank line must look like ``"<impression_id> <json array>"``.

  Args:
    pred_file: path of the prediction file.

  Returns:
    dict mapping the impression id (as a string) to a numpy array of scores.
  """
  # Count lines first only to give tqdm a total.
  with open(pred_file) as f:
    total = sum(1 for _ in f)

  res = {}
  with open(pred_file) as f:
    for line in tqdm(f, total=total):
      line = line.rstrip('\n')
      if not line.strip():
        continue
      # maxsplit=1 keeps JSON arrays written with spaces ("[0.1, 0.2]") intact,
      # matching what parse_line does.
      impression_id, scores = line.split(' ', 1)
      res[impression_id] = np.asarray(json.loads(scores))
  return res
  
# Ground-truth labels per impression id, used by scoring().
gold = {}
with open('../input/dev/truth.txt') as truth_file:
  for line in truth_file:
    # maxsplit=1 so a label array written with spaces would still parse.
    impression_id, labels = line.rstrip('\n').split(' ', 1)
    gold[impression_id] = json.loads(labels)
  
def scoring(res):
  """Score in-memory predictions against the module-level ``gold`` labels.

  Args:
    res: mapping from impression id to that impression's predicted scores;
      it must contain every key present in ``gold``.

  Returns:
    ``(auc, mrr, ndcg@5, ndcg@10)``, each averaged over all impressions.
  """
  auc_values, mrr_values, ndcg5_values, ndcg10_values = [], [], [], []
  for impression_id in tqdm(gold, total=len(gold)):
    labels = gold[impression_id]
    preds = res[impression_id]
    auc_values.append(roc_auc_score(labels, preds))
    mrr_values.append(mrr_score(labels, preds))
    ndcg5_values.append(ndcg_score(labels, preds, 5))
    ndcg10_values.append(ndcg_score(labels, preds, 10))
  return (
    np.mean(auc_values),
    np.mean(mrr_values),
    np.mean(ndcg5_values),
    np.mean(ndcg10_values),
  )

def prob2rank(res, ofile):
  """Write predictions as 1-based ranks, one impression per line.

  Impressions are looked up under the string keys ``'1'`` .. ``str(len(res))``,
  so ``res`` must use exactly those keys. Each output line is
  ``"<impression_id> [r1,r2,...]"`` where the highest score gets rank 1.

  Args:
    res: mapping from impression id (string) to its scores.
    ofile: path of the file to write.
  """
  with open(ofile, 'w') as out:
    for idx in tqdm(range(len(res))):
      impression_id = str(idx + 1)
      # argsort of the descending order yields each candidate's 0-based rank.
      descending = (-np.asarray(res[impression_id])).argsort()
      ranks = descending.argsort() + 1
      out.write('{} [{}]\n'.format(impression_id, ','.join(str(r) for r in ranks)))

In [None]:
# label file (dev)

In [None]:
# Build the dev truth file: one "<impression_id> [label,label,...]" line per row
# of behaviors.tsv (first column = id, last column = space-separated tokens).
behaviors_path = '../input/dev/behaviors.tsv'
with open(behaviors_path) as src:
  total = sum(1 for _ in src)
with open(behaviors_path) as src, open('../input/dev/truth.txt', 'w') as out:
  for line in tqdm(src, total=total):
    fields = line.strip('\n').split('\t')
    impression_id, impressions = fields[0], fields[-1]
    # Each token ends in "-<label>"; keep only the label part.
    labels = [item.split('-')[-1] for item in impressions.split()]
    print(impression_id, '[' + ','.join(labels) + ']', sep=' ', file=out)

In [None]:
label_file = '../input/dev/truth.txt'

In [None]:
# v8/din-title-pretrain.run2 l2 normalize word emb first
# test 0.698 

In [None]:
!ls ../working/v8/din-title-pretrain.run2/infos/dev

In [None]:
valid_file = '../working/v8/din-title-pretrain.run2/infos/dev/valid.csv'

In [None]:
x = open(valid_file).readline()

In [None]:
x

In [None]:
json.loads(json.loads(x.strip('\n').split(' ', 1)[1]))

In [None]:
# scoring() only accepts an in-memory {impression_id: scores} dict; scoring a file
# is done by scoring_file(), which reads the same truth file that label_file points to.
# NOTE(review): valid.csv appears to hold double-encoded JSON (see the
# json.loads(json.loads(...)) check above) — it may need fix_bad_input() first.
scoring_file(valid_file, is_rank=False)

In [None]:
len(json.loads(json.loads('"[0.14826497435569763, 0.011713951826095581, 0.12337428331375122, 0.006044149398803711, 0.024754375219345093, 0.09507068991661072, 0.016183137893676758, 0.03523629903793335, 0.018186092376708984, 0.014278829097747803, 0.004878878593444824, 0.20712491869926453, 0.021077275276184082, 0.017076998949050903, 0.013494908809661865, 0.02057015895843506, 0.004425853490829468, 0.019422262907028198, 0.06679847836494446, 0.05480983853340149, 0.032058119773864746, 0.009513318538665771, 0.044002264738082886, 0.02062860131263733, 0.01940673589706421, 0.014451682567596436]"')))

In [None]:
!more ../working/v8/din-title-pretrain.run2/infos/dev/valid.csv

In [None]:
!more ../working/v8/din-title-pretrain.ru

In [None]:
!more ../working/v8/din-title-pretrain.run2/submission.csv

In [2]:
# Load submission scores as {int impression id: list of scores}.
file = '../working/v8/din-title-pretrain.run2/submission.csv'
with open(file) as f:
  total = sum(1 for _ in f)
m1 = {}
with open(file) as f:
  for line in tqdm(f, total=total):
    imp_id, scores = line.rstrip('\n').split(' ', 1)
    m1[int(imp_id)] = json.loads(scores)

 49%|████▉     | 1160216/2370727 [00:41<00:39, 30687.03it/s]

1154912 [0.0508275032043457]


100%|██████████| 2370727/2370727 [01:27<00:00, 27140.47it/s]


In [None]:
# v10/din-title-pretrain day train add dev

In [3]:
# Load submission scores as {int impression id: list of scores}.
file = '../working/v10/din-title-pretrain/submission.csv'
with open(file) as f:
  total = sum(1 for _ in f)
m2 = {}
with open(file) as f:
  for line in tqdm(f, total=total):
    imp_id, scores = line.rstrip('\n').split(' ', 1)
    m2[int(imp_id)] = json.loads(scores)

 49%|████▉     | 1160581/2370727 [00:41<00:40, 30044.48it/s]

1154912 [0.0364307165145874]


100%|██████████| 2370727/2370727 [01:28<00:00, 26641.16it/s]


In [4]:
# Load submission scores as {int impression id: list of scores}.
file = '../working/v8/din-title-pretrain.negfilter/submission.csv'
with open(file) as f:
  total = sum(1 for _ in f)
m3 = {}
with open(file) as f:
  for line in tqdm(f, total=total):
    imp_id, scores = line.rstrip('\n').split(' ', 1)
    m3[int(imp_id)] = json.loads(scores)

 49%|████▉     | 1160594/2370727 [00:43<00:39, 30375.03it/s]

1154912 [0.053904205560684204]


100%|██████████| 2370727/2370727 [01:23<00:00, 28461.96it/s]


In [None]:
m1

In [5]:
# Weighted blend of the three loaded submissions (weights 0.3 / 0.5 / 0.2).
# Iterates m1's keys, so m2 and m3 must contain every impression id present in m1
# with score lists of the same length.
m = {}
for id in tqdm(m1.keys(), total=len(m1)):
  m[id] = np.asarray(m1[id]) * 0.3 + np.asarray(m2[id]) * 0.5 + np.asarray(m3[id]) * 0.2

100%|██████████| 2370727/2370727 [00:47<00:00, 49701.37it/s]


In [6]:
m

{1: array([0.08088148, 0.01868847, 0.01211649, 0.04430644, 0.03138803,
        0.04364353, 0.00847481, 0.03818206, 0.15528254, 0.08877633,
        0.040024  , 0.05353349, 0.11976711, 0.12968148, 0.03217955,
        0.10852921]),
 2: array([0.04801027, 0.06903843, 0.43627895, 0.09669978, 0.0644867 ,
        0.01917397, 0.08071567]),
 3: array([0.03796177, 0.02025431, 0.14627591, 0.08878711, 0.12558669,
        0.00633252, 0.05831364, 0.02819996, 0.05778228, 0.02249835,
        0.03631652, 0.02966544, 0.03564324, 0.10676052, 0.06979345,
        0.11111856, 0.00838509, 0.02111778, 0.07006317, 0.02217698,
        0.12228225, 0.02359671, 0.01019831, 0.12925933, 0.01141429,
        0.0259729 , 0.01061601, 0.05949727, 0.04729583, 0.08519006,
        0.0980736 , 0.03577048, 0.04741798, 0.05779521, 0.03473026,
        0.06245918, 0.0792029 , 0.09230341, 0.12526409, 0.099626  ,
        0.03652449, 0.02063065, 0.17929618, 0.0466097 , 0.01961064,
        0.03438022, 0.05726211, 0.03194829, 0.04948

In [7]:
res = {}

In [None]:
# Convert each impression's blended scores into 1-based ranks (highest score -> rank 1),
# serialised as "[r1,r2,...]". The double argsort has to run on the whole score
# vector; iterating over its individual elements ranks each scalar on its own.
for id in tqdm(m.keys(), total=len(m)):
  ranks = (-np.asarray(m[id])).argsort().argsort() + 1
  res[id] = '[' + ','.join(map(str, ranks)) + ']'

 70%|███████   | 1664296/2370727 [08:00<03:17, 3575.68it/s]

In [None]:
# res is keyed by the 1-based integer impression ids taken from m, so iterate
# 1..len(m) directly (assumes the ids are contiguous — the same assumption the
# original range-based loop made).
with open('../working/prediction.txt', 'w') as out:
  for id in range(1, len(m) + 1):
    print(id, res[id], sep=' ', file=out)

In [None]:
import os
odir = '../working'
os.system(f'cd {odir};zip prediction.zip prediction.txt')  

In [11]:
scoring_file('../working/v16/base/infos/dev/valid-0.csv')

100%|██████████| 376471/376471 [08:40<00:00, 723.68it/s]


(0.6996642001146163,
 0.3460991747858584,
 0.3869146134654522,
 0.4484090926147784)

In [21]:
# The score loader defined in this notebook is get_scores(); parse_scores is not defined.
res = get_scores('../working/v16/base/infos/dev/valid-0.csv')

100%|██████████| 376471/376471 [00:12<00:00, 29954.96it/s]


In [26]:
scoring(res)

100%|██████████| 376471/376471 [08:05<00:00, 775.35it/s]


(0.6996642001146163,
 0.3460991747858584,
 0.3869146134654522,
 0.4484090926147784)

In [8]:
def mean_ensemble(files, weights=None):
  """Blend several prediction files into one weighted-average prediction.

  Args:
    files: paths of prediction files readable by ``get_scores``. Every file
      must contain all impression ids of the first file.
    weights: one weight per file; ``None`` (or empty) means equal weights
      ``1 / len(files)``. Weights are used as given, not re-normalised.

  Returns:
    dict mapping impression id to the weighted sum of that impression's
    score arrays.

  Raises:
    ValueError: if ``files`` is empty or ``weights`` does not have one entry
      per file. Checked before any file is loaded, since loading is slow.
  """
  if len(files) == 0:
    raise ValueError('mean_ensemble needs at least one prediction file')
  if weights is None or len(weights) == 0:
    weights = [1 / len(files)] * len(files)
  elif len(weights) != len(files):
    raise ValueError(
      'got {} weights for {} files'.format(len(weights), len(files))
    )

  per_file = []
  for file in files:
    print(file)
    per_file.append(get_scores(file))

  res = {}
  for key in per_file[0]:
    blended = per_file[0][key] * weights[0]
    for scores, weight in zip(per_file[1:], weights[1:]):
      # Out-of-place add: never mutates loaded arrays and lets dtypes promote.
      blended = blended + scores[key] * weight
    res[key] = blended
  return res

In [9]:
files = [
#   '../working/v16/base/infos/dev/valid-0.csv',    #0.6997
  
  '../working/v16/body/infos/dev/valid.csv',      #0.7032
  '../working/v16/bert/infos/dev/valid.csv',      #0.7036
  '../working/v16/bert-body/infos/dev/valid.csv', #0.7019
]

In [10]:
res = mean_ensemble(files)

../working/v16/body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:20<00:00, 17982.96it/s]


../working/v16/bert/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:21<00:00, 17642.33it/s]


../working/v16/bert-body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:19<00:00, 19038.00it/s]


In [15]:
res

{'1': array([0.09204045, 0.2315596 , 0.11181816, 0.03214641, 0.08558753,
        0.14457659, 0.37601922, 0.08197393, 0.10221019, 0.16637239,
        0.08667238, 0.19169571, 0.11548899, 0.08294751, 0.1694784 ,
        0.0840946 ]),
 '2': array([0.14230788, 0.02457006, 0.41307425, 0.02516605, 0.09238218,
        0.03407094, 0.05418836, 0.15148267, 0.07282664, 0.09403811,
        0.07987885, 0.01437077, 0.08621   , 0.21152154, 0.1007905 ,
        0.06063387, 0.05250672, 0.35327411, 0.2408855 , 0.28090395,
        0.01855125, 0.03047545, 0.03244193, 0.05905674, 0.05988741,
        0.1716207 , 0.02964579, 0.05041198, 0.37010634, 0.01833101,
        0.09108555, 0.26751764, 0.04543915, 0.22917771, 0.02703737,
        0.56648815, 0.02108018, 0.03354366, 0.09126714, 0.03516169,
        0.17470196, 0.0271748 , 0.08105478, 0.05076475, 0.02408928,
        0.0580413 , 0.4702941 , 0.0329068 , 0.09676932, 0.12875476,
        0.08065614, 0.23816761, 0.01707423, 0.24910694, 0.20654298,
        0.023237

In [11]:
np.asarray([1,2.,3]) * 3

array([3., 6., 9.])

In [17]:
scoring(res)

100%|██████████| 376471/376471 [07:11<00:00, 872.12it/s]


(0.7037429259287659,
 0.34927063679819276,
 0.3908604904781282,
 0.4520881269614195)

In [18]:
scoring_file(files[1])

100%|██████████| 376471/376471 [09:37<00:00, 651.66it/s]


(0.7031702802781835, 0.3486213566609537, 0.390114675854853, 0.451380388216034)

In [26]:
scoring_file(files[2])

100%|██████████| 376471/376471 [08:29<00:00, 739.14it/s]


(0.7035922947159546,
 0.3494844533391006,
 0.3901419334081718,
 0.4514067530598507)

In [27]:
scoring_file(files[3])

100%|██████████| 376471/376471 [09:27<00:00, 662.93it/s]


(0.7018593210045291,
 0.3492488210895961,
 0.3897183195502203,
 0.45109102897487596)

In [23]:
scoring(mean_ensemble(files[1:3]))

../working/v16/body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:11<00:00, 32935.13it/s]


../working/v16/bert/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:11<00:00, 33449.66it/s]
100%|██████████| 376471/376471 [07:07<00:00, 880.93it/s]


(0.7058625317529923,
 0.35102640872004653,
 0.39259167666269296,
 0.4537627916810185)

In [24]:
scoring(mean_ensemble(files[1:4]))

../working/v16/body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:11<00:00, 33556.43it/s]


../working/v16/bert/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:11<00:00, 33230.90it/s]


../working/v16/bert-body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:11<00:00, 33180.46it/s]
100%|██████████| 376471/376471 [06:56<00:00, 904.15it/s]


(0.7059093478311262,
 0.35163698133738525,
 0.3931353913418697,
 0.45421754590796054)

In [25]:
scoring(mean_ensemble(files))

../working/v16/base/infos/dev/valid-0.csv


100%|██████████| 376471/376471 [00:11<00:00, 33883.70it/s]


../working/v16/body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:11<00:00, 34005.64it/s]


../working/v16/bert/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:11<00:00, 33615.93it/s]


../working/v16/bert-body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:11<00:00, 33228.73it/s]
100%|██████████| 376471/376471 [06:58<00:00, 899.81it/s]


(0.7057999806117466,
 0.351517094148398,
 0.39302212809983805,
 0.4542241906064121)

# fix bad input

In [29]:
!ls ../working/v16/body/submission.csv

../working/v16/body/submission.csv


In [30]:
files = [
  '../working/v16/body/submission.csv',
  '../working/v16/bert/submission.csv',
  '../working/v16/bert-body/submission.csv'
]

In [37]:
def fix_bad_input(file):
  """Rewrite a submission whose score arrays were JSON-encoded twice.

  Reads ``"<id> <payload>"`` lines from ``file`` and writes
  ``"<id> [s1,s2,...]"`` lines to a sibling file with ``_fix`` inserted before
  the extension (``submission.csv`` -> ``submission_fix.csv``).

  A payload that is already a plain JSON array is printed for inspection and
  still written out. Processing stops early, leaving the output truncated,
  when such a line holds more than one score or after more than 10 such lines.

  Args:
    file: path of the submission file to repair.
  """
  # splitext instead of str.replace: a path without ".csv" must not map onto
  # itself, which would truncate the input when the output is opened.
  root, ext = os.path.splitext(file)
  ofile = root + '_fix' + ext

  with open(file) as src:
    total = sum(1 for _ in src)

  bad_count = 0
  with open(file) as src, open(ofile, 'w') as out:
    for line in tqdm(src, total=total):
      imp_id, raw = line.rstrip('\n').split(' ', 1)
      try:
        # Expected (broken) format: a JSON string that itself contains a JSON array.
        scores = json.loads(json.loads(raw))
      except (TypeError, ValueError):
        # Already single-encoded: decode once and report the line.
        scores = json.loads(raw)
        print(line)
        print(imp_id, scores)
        bad_count += 1
        if len(scores) > 1 or bad_count > 10:
          break
      print(imp_id, '[' + ','.join(map(str, scores)) + ']', sep=' ', file=out)

In [42]:
fix_bad_input(files[1])

 49%|████▊     | 1155346/2370727 [04:10<06:34, 3083.33it/s]

1154912 [0.03233453631401062]

1154912 [0.03233453631401062]


100%|██████████| 2370727/2370727 [10:04<00:00, 3922.27it/s]


In [43]:
fix_bad_input(files[2])

 49%|████▊     | 1155380/2370727 [05:29<03:31, 5757.89it/s]

1154912 [0.051009178161621094]

1154912 [0.051009178161621094]


100%|██████████| 2370727/2370727 [07:54<00:00, 4999.77it/s] 


In [67]:
files = [
  '../working/v17/submit2/body.dev0/submission.csv',
  '../working/v17/submit2/body.dev1/submission.csv'
]

In [68]:
# res = mean_ensemble(files, [0.4,0.4,0.2])

../working/v16/body/submission_fix.csv


100%|██████████| 2370727/2370727 [01:19<00:00, 29638.90it/s]


../working/v16/bert/submission_fix.csv


100%|██████████| 2370727/2370727 [01:20<00:00, 29541.56it/s]


../working/v16/bert-body/submission_fix.csv


100%|██████████| 2370727/2370727 [01:20<00:00, 29563.02it/s]


In [None]:
res = mean_ensemble(files)

In [46]:
res

{'1': array([0.12070448, 0.05242237, 0.04787794, 0.1411528 , 0.10925671,
        0.08617344, 0.01710832, 0.17611112, 0.41407838, 0.40574256,
        0.13383442, 0.11809298, 0.32796857, 0.22857566, 0.15229298,
        0.22313296]),
 '2': array([0.10913187, 0.11212065, 0.59044532, 0.10126273, 0.1332846 ,
        0.03964369, 0.14687284]),
 '3': array([0.10081576, 0.08728915, 0.3154933 , 0.22124943, 0.29437386,
        0.03456776, 0.11892953, 0.11418432, 0.23505049, 0.07847967,
        0.18867595, 0.12663794, 0.09237354, 0.27780068, 0.17042695,
        0.17643086, 0.02275212, 0.09651431, 0.20595823, 0.03618126,
        0.27379182, 0.02881333, 0.02012199, 0.27998197, 0.06545304,
        0.12413106, 0.05606925, 0.16276833, 0.10994026, 0.16312851,
        0.2038922 , 0.0649024 , 0.08305675, 0.126166  , 0.12254   ,
        0.13687462, 0.11444991, 0.28014644, 0.30541597, 0.21151077,
        0.11846724, 0.05562629, 0.33424929, 0.15272502, 0.07049232,
        0.08552358, 0.1421723 , 0.1446428 , 0

In [49]:
prob2rank(res, '../working/v17/submit2/prediction.txt')

100%|██████████| 2370727/2370727 [01:59<00:00, 19820.80it/s]


In [None]:
# NOTE(review): the prob2rank call above wrote prediction.txt under
# ../working/v17/submit2/, but this zips ../working/v17/submit/ — confirm which
# directory is intended before uploading.
odir = '../working/v17/submit/'
os.system(f'cd {odir};zip prediction.zip prediction.txt')  

In [50]:
files = [
#   '../working/v16/base/infos/dev/valid-0.csv',    #0.6997
  '../working/v16/base.pairloss/infos/dev/valid.csv', #0.7005 
  '../working/v16/body/infos/dev/valid.csv',      #0.7032
  '../working/v16/bert/infos/dev/valid.csv',      #0.7036
  '../working/v16/bert-body/infos/dev/valid.csv', #0.7019
]

In [52]:
res = mean_ensemble(files)

../working/v16/base.pairloss/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:13<00:00, 28386.53it/s]


../working/v16/body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:13<00:00, 27870.55it/s]


../working/v16/bert/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:13<00:00, 28374.06it/s]


../working/v16/bert-body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:13<00:00, 28669.01it/s]


In [53]:
scoring(res)

100%|██████████| 376471/376471 [07:56<00:00, 789.85it/s]


(0.7058645483809668,
 0.35148798041400764,
 0.3929760593834304,
 0.4540967924048985)

In [73]:
# Dev prediction files to ensemble. The trailing numbers appear to be each
# model's dev AUC (they match the scoring_file outputs above) — TODO confirm.
files = [
  '../working/v16/body/infos/dev/valid.csv',      #0.7032
  '../working/v16/bert/infos/dev/valid.csv',      #0.7035
  '../working/v16/bert-body/infos/dev/valid.csv', #0.7019
  '../working/v16/body.pairloss/infos/dev/valid.csv', #0.7053 (originally written "7053"; presumably 0.7053)
]

In [None]:
scoring(mean_ensemble(files))

../working/v16/body/infos/dev/valid.csv


100%|██████████| 376471/376471 [00:21<00:00, 17258.89it/s]


../working/v16/bert/infos/dev/valid.csv


 89%|████████▉ | 335830/376471 [00:19<00:02, 19126.58it/s]

In [None]:
scoring(mean_ensemble(files, [0.4, 0.4, 0.2]))

In [61]:
from scipy.stats import rankdata

In [60]:
def blend_byrank(weights):
  """Replace each weight by its rank divided by the sum of all ranks.

  Ranks come from ``scipy.stats.rankdata`` (smallest value gets rank 1), so
  the result sums to 1 and preserves the ordering of the input weights.

  Note: ``weights`` is overwritten in place and the same object is returned.
  """
  ranks = rankdata(weights)
  rank_total = np.sum(ranks)
  for idx, rank in enumerate(ranks[:len(weights)]):
    weights[idx] = rank / rank_total
  return weights

In [62]:
rankdata([1.5, 2.8, 0.2])

array([2., 3., 1.])

In [None]:
# body.dev 91
# 