In [1]:
from gezi.common import *
sys.path.append('..')
from src.config import *
from src.preprocess import *
from src.eval import *
gezi.init_flags()

In [2]:
# https://www.kaggle.com/code/goldenlock/ai4code-base?scriptVersionId=101148262
# 9025

In [3]:
root = '../working/offline/6/0'
context_model_name = 'deberta-v3-small.flag-context2-aug.n_context-40.cls_loss_rate-0.1.eval.p13-4'
pairwise_model_name = 'all-mpnet-base-v2.flag-pairwise13-4.pooling_mask-attention_mask.grad_acc-4'

In [4]:
xc = gezi.load(f'{root}/{context_model_name}/valid.pkl')

In [5]:
xp = gezi.load(f'{root}/{pairwise_model_name}/valid.pkl')

In [6]:
gezi.sort_dict_byid_(xc, 'cid')
gezi.sort_dict_byid_(xp, 'cid')

In [7]:
ids = set(xc['id'])

In [8]:
df_gt = pd.read_csv(f'{FLAGS.root}/train_orders.csv')
df_gt = df_gt[df_gt.id.isin(ids)]
df_gt['cell_order'] = df_gt['cell_order'].apply(lambda x: x.split())
df_gt.head()

Unnamed: 0,id,cell_order
4,0002115f48f982,"[9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe..."
11,00062ab8487156,"[dcad687f, a2e1fc80, 7d977ee8, 45a82a59, cbbc3..."
28,000efd285fb982,"[74a30f80, ee2c8e08, 5523374e, ae8f8fe8, 2138e..."
39,0012865b766949,"[f9cb50e9, 25f7db90, d804e819, 6593a545, fc5bb..."
42,001308991e0c5e,"[21147235, 6c01d0d2, 5bd28595, b8fd3a8c, a2501..."


In [9]:
calc_metric(xc, 'reg_pred')

0.9006251692558017

In [10]:
calc_metric(xc, 'pred')

0.8992880214723898

In [11]:
calc_metric(xc, 'cls_pred')

0.896173129530509

In [12]:
calc_metric(xp, 'cls_pred')

0.8966581181771137

In [13]:
x = xc.copy()
x['pred'] = xc['reg_pred'] * 0.5 + xp['cls_pred'] * 0.5
calc_metric(x)

0.9044248355992179

In [14]:
x = xc.copy()
x['pred'] = xc['cls_pred'] * 0.5 + xp['cls_pred'] * 0.5
calc_metric(x)

0.9018247872358709

In [15]:
x = xc.copy()
x['pred'] = xc['reg_pred'] * 0.5 + xp['pred'] * 0.5
calc_metric(x)

0.9036697563235938

In [16]:
def merge(x, y, prob):
  # return y
  # return x
  if prob > 0.9:
    return x * (1 - 0.0001) + y * 0.0001
  elif abs(y - x) < 0.1:
    return x * (1 - 0.0001) + y * 0.0001
  elif abs(y - x) < 0.2:
    return x * 0.95 * prob + y * (1 - 0.95 * prob)
  elif abs(y - x) < 0.3:
    return x * 0.85 * prob + y * (1 - 0.85 * prob)
  elif abs(y - x) < 0.4:
    return x * 0.5 * prob + y * (1 - 0.5 * prob)
  else:
    return y

In [17]:
x['pred'] = [merge(x, y, prob) for x, y, prob in zip(xp['pred'], xc['pred'], xp['max_prob'])]
calc_metric(x)

0.908585923202479

In [18]:
df_p = pd.DataFrame(xp)

In [19]:
df_c = pd.DataFrame(gezi.batch2list(xc))

In [20]:
df = df_c.merge(df_p[['cid', 'pred', 'cls_pred', 'max_prob', 'max_sim', 'probs', 'sims']], on='cid', suffixes=['_c', '_p'])

In [21]:
df_train = pd.read_feather('../working/train.fea')

In [22]:
df_train = df_train[df_train.id.isin(ids)]

In [23]:
df = df.merge(df_train[['cid', 'ancestor_id', 'n_words', 'source']], on='cid')

In [24]:
gezi.set_fold(df, 5, 'ancestor_id')

In [25]:
df['pred_diff0'] = abs(df['pred_c'] - df['pred_p'])
df['pred_diff1'] = abs(df['reg_pred'] - df['pred_p'])
df['pred_diff2'] = abs(df['cls_pred_c'] - df['pred_p'])
df['pred_diff3'] = abs(df['cls2_pred'] - df['pred_p'])
df['markdown_frac'] = df['n_markdown_cell'] / df['n_cell']
df['span'] = 1 / (df['n_code_cell'] + 1)
top2, top3, top4, top5 = [], [], [], []
top2_prob, top3_prob, top4_prob, top5_prob = [], [], [], []
top2_sim, top3_sim, top4_sim, top5_sim = [], [], [], []
for i in tqdm(range(len(df)), desc='top'):
  # cls_preds = df['cls_pred_ori'].values[i]
  n_code = df['n_code_cell'].values[i]
  probs = df['probs'].values[i]
  sims = df['sims'].values[i]
  idxes = (-probs).argsort()
  if len(idxes) > 1:
    top2.append((idxes[1] + 0.5) / (n_code + 1))
    top2_prob.append(probs[idxes[1]])
    top2_sim.append(sims[idxes[1]])
  else:
    top2.append(-1)
    top2_prob.append(-1)
    top2_sim.append(-1)
  if len(idxes) > 2:
    top3.append((idxes[2] + 0.5) / (n_code + 1))
    top3_prob.append(probs[idxes[2]])
    top3_sim.append(sims[idxes[2]])
  else:
    top3.append(-1)
    top3_prob.append(-1)
    top3_sim.append(-1)
  if len(idxes) > 3:
    top4.append((idxes[3] + 0.5) / (n_code + 1))
    top4_prob.append(probs[idxes[3]])
    top4_sim.append(sims[idxes[3]])
  else:
    top4.append(-1)
    top4_prob.append(-1)
    top4_sim.append(-1)
  if len(idxes) > 4:
    top5.append((idxes[4] + 0.5) / (n_code + 1))
    top5_prob.append(probs[idxes[4]])
    top5_sim.append(sims[idxes[4]])
  else:
    top5.append(-1)
    top5_prob.append(-1)
    top5_sim.append(-1)
ctop_prob, ctop2, ctop3, ctop4, ctop2_prob, ctop3_prob, ctop4_prob = [], [], [], [], [], [], []
for i in tqdm(range(len(df)), desc='ctop'):
  preds = df['cls_pred_ori'].values[i]
  probs = gezi.softmax(preds)
  idxes = (-probs).argsort()
  ctop_prob.append(probs[idxes[0]])
  ctop2.append((idxes[1] + 0.5) / FLAGS.num_classes)
  ctop2_prob.append(probs[idxes[1]])
  ctop3.append((idxes[2] + 0.5) / FLAGS.num_classes)
  ctop3_prob.append(probs[idxes[2]])
  ctop4.append((idxes[3] + 0.5) / FLAGS.num_classes)
  ctop4_prob.append(probs[idxes[3]])
# for i in range(FLAGS.num_classes):
#   df[f'cls_pred{i}'] = df['cls_pred_ori'].apply(lambda x: x[i])
df['top2'] = top2
df['top2_prob'] = top2_prob
df['top2_sim'] = top2_sim
df['top3'] = top3
df['top3_prob'] = top3_prob
df['top3_sim'] = top3_sim
df['top4'] = top4
df['top4_prob'] = top4_prob
df['top4_sim'] = top4_sim
df['top5'] = top5
df['top5_prob'] = top5_prob
df['top5_sim'] = top5_sim
df['ctop_prob'] = ctop_prob
df['ctop2'] = ctop2
df['ctop2_prob'] = ctop2_prob
df['ctop3'] = ctop3
df['ctop3_prob'] = ctop3_prob
df['ctop4'] = ctop4
df['ctop4_prob'] = ctop4_prob
df['pred_diff4'] = abs(df['pred_c'] - df['top2'])
df['pred_diff5'] = abs(df['reg_pred'] - df['top2'])
df['pred_diff6'] = abs(df['cls_pred_c'] - df['top2'])
df['pred_diff7'] = abs(df['cls2_pred'] - df['top2'])
df['pred_diff8'] = abs(df['pred_p'] - df['top2'])
df['pred_diff9'] = abs(df['cls_pred_c'] - df['top3'])
df['pred_diff10'] = abs(df['pred_p'] - df['top3'])
df['rule_pred'] = [merge(x, y, prob) for x, y, prob in tqdm(zip(df.pred_p.values, df.pred_c.values, df.max_prob.values), total=len(df), desc='rule')]

top:   0%|          | 0/424943 [00:00<?, ?it/s]

ctop:   0%|          | 0/424943 [00:00<?, ?it/s]

rule:   0%|          | 0/424943 [00:00<?, ?it/s]

In [27]:
reg_cols =  [
          'n_code_cell',
          'n_markdown_cell',
          'n_cell',
          'cls_pred_c',
          'pred_c',
          'reg_pred',
          'cls2_pred',
          'pred_p',
          'cls_pred_p',
          'rule_pred',
          'pred_diff0',
          'pred_diff1',
          'pred_diff2',
          'pred_diff3',
          'pred_diff4',
          'pred_diff5',
          'pred_diff6',
          'pred_diff7',
          'pred_diff8',
          'pred_diff9',
          'pred_diff10',
          'max_sim',
          'max_prob',
          'markdown_frac',
          'span',
          'top2',
          'top2_prob',
          'top2_sim',
          'top3',
          'top3_prob',
          'top3_sim',
          'top4',
          'top4_prob',
          'top4_sim',
          'top5',
          'top5_prob',
          'top5_sim',
          'ctop_prob', 
          'ctop2', 
          'ctop2_prob', 
        ]

cat_cols = [
          
            ]
label_col = 'rel_rank'
cols = reg_cols + cat_cols

In [28]:
from catboost import CatBoostRegressor, Pool
xgb_params = {'learning_rate': 0.02,
              'reg_lambda': 7.960622217848342e-07, 
              'subsample': 0.7422597612762745,
              'max_depth': 10, 
              'early_stopping_rounds': 500,
              'n_estimators': 10000,
              'cat_features': [],
              'loss_function': 'MAE',
              }

xgb_params2 = {'learning_rate': 0.09827605967564293,'tree_method':'gpu_hist', 'gpu_id':0,
               'early_stopping_rounds': 50,
               'n_estimators': 10000, }

In [29]:
FOLDS = 5

In [31]:
merge_scores = []
scores = []
for fold in tqdm(range(FOLDS)):
  dvalid = df[df.fold==fold]
  dtrain = df[df.fold!=fold]
  X_train = dtrain[cols]
  y_train = dtrain[label_col]
  X_valid = dvalid[cols]
  y_valid = dvalid[label_col]
  model = CatBoostRegressor(**xgb_params)
  model.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
                verbose=500,
                )  
  dvalid['cb_pred'] = model.predict(dvalid[cols])
  x = {'id': dvalid.id.values, 'cell_id': dvalid.cell_id.values}
  x['pred'] = [merge(x, y, prob) for x, y, prob in zip(dvalid.pred_p.values, dvalid.pred_c.values, dvalid.max_prob.values)]
  merge_score = calc_metric(x, 'pred', df_gt)
  ic(abs(x['pred'] - dvalid['rel_rank']).mean())
  score = calc_metric({'id': dvalid.id.values, 'cell_id': dvalid.cell_id.values, 'pred': dvalid.cb_pred.values})
  ic(fold, merge_score, score, score - merge_score)
  merge_scores.append(merge_score)
  scores.append(score)
  ic(fold, np.asarray(merge_scores).mean(), np.asarray(scores).mean())
  model.save_model(f'../working/cbt/{fold}.cbt')

  0%|          | 0/5 [00:00<?, ?it/s]

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.2544365	test: 0.2544365	test1: 0.2543514	best: 0.2543514 (0)	total: 77.7ms	remaining: 12m 56s
500:	learn: 0.0632407	test: 0.0632407	test1: 0.0656280	best: 0.0656280 (500)	total: 32.6s	remaining: 10m 18s
1000:	learn: 0.0610853	test: 0.0610853	test1: 0.0647097	best: 0.0647097 (1000)	total: 1m 4s	remaining: 9m 38s
1500:	learn: 0.0595594	test: 0.0595594	test1: 0.0644209	best: 0.0644209 (1500)	total: 1m 37s	remaining: 9m 10s
2000:	learn: 0.0583866	test: 0.0583866	test1: 0.0643415	best: 0.0643407 (1993)	total: 2m 12s	remaining: 8m 48s
2500:	learn: 0.0573591	test: 0.0573591	test1: 0.0644437	best: 0.0643313 (2133)	total: 2m 46s	remaining: 8m 18s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.06433133434
bestIteration = 2133

Shrink model to first 2134 iterations.


[07/19/22 17:05:43] 4047567722.py:19 in <module>
                    abs(x['pred'] - dvalid['rel_rank']).mean(): 0.06579407253591144
[07/19/22 17:05:45] 4047567722.py:21 in <module>
                    fold: 0
                    merge_score: 0.9070703368237418
                    score: 0.9104315790384504
                    score - merge_score: 0.0033612422147085708
[07/19/22 17:05:45] 4047567722.py:24 in <module>
                    fold: 0
                    np.asarray(merge_scores).mean(): 0.9070703368237418
                    np.asarray(scores).mean(): 0.9104315790384504


0:	learn: 0.2544434	test: 0.2544434	test1: 0.2543537	best: 0.2543537 (0)	total: 64.3ms	remaining: 10m 43s
500:	learn: 0.0634916	test: 0.0634916	test1: 0.0644702	best: 0.0644702 (500)	total: 31.2s	remaining: 9m 51s
1000:	learn: 0.0613120	test: 0.0613120	test1: 0.0635332	best: 0.0635332 (1000)	total: 1m 2s	remaining: 9m 23s
1500:	learn: 0.0597885	test: 0.0597885	test1: 0.0632748	best: 0.0632740 (1497)	total: 1m 35s	remaining: 9m 1s
2000:	learn: 0.0586173	test: 0.0586173	test1: 0.0631869	best: 0.0631860 (1984)	total: 2m 14s	remaining: 8m 56s
2500:	learn: 0.0576180	test: 0.0576180	test1: 0.0631791	best: 0.0631777 (2403)	total: 2m 49s	remaining: 8m 26s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.0631777357
bestIteration = 2403

Shrink model to first 2404 iterations.


[07/19/22 17:09:09] 4047567722.py:19 in <module>
                    abs(x['pred'] - dvalid['rel_rank']).mean(): 0.06494859335498927
[07/19/22 17:09:10] 4047567722.py:21 in <module>
                    fold: 1
                    merge_score: 0.9082661676659034
                    score: 0.9113040501897806
                    score - merge_score: 0.0030378825238771556
[07/19/22 17:09:10] 4047567722.py:24 in <module>
                    fold: 1
                    np.asarray(merge_scores).mean(): 0.9076682522448226
                    np.asarray(scores).mean(): 0.9108678146141155


0:	learn: 0.2543568	test: 0.2543568	test1: 0.2547335	best: 0.2547335 (0)	total: 87.2ms	remaining: 14m 31s
500:	learn: 0.0633974	test: 0.0633974	test1: 0.0645867	best: 0.0645867 (500)	total: 35.2s	remaining: 11m 6s
1000:	learn: 0.0611939	test: 0.0611939	test1: 0.0636154	best: 0.0636149 (997)	total: 1m 10s	remaining: 10m 34s
1500:	learn: 0.0597349	test: 0.0597349	test1: 0.0633332	best: 0.0633332 (1500)	total: 1m 45s	remaining: 9m 57s
2000:	learn: 0.0585400	test: 0.0585400	test1: 0.0632418	best: 0.0632418 (2000)	total: 2m 21s	remaining: 9m 27s
2500:	learn: 0.0575181	test: 0.0575181	test1: 0.0632323	best: 0.0632187 (2350)	total: 2m 57s	remaining: 8m 51s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.06321870254
bestIteration = 2350

Shrink model to first 2351 iterations.


[07/19/22 17:12:35] 4047567722.py:19 in <module>
                    abs(x['pred'] - dvalid['rel_rank']).mean(): 0.06479006687657934
[07/19/22 17:12:37] 4047567722.py:21 in <module>
                    fold: 2
                    merge_score: 0.9108612410210817
                    score: 0.913384589045658
                    score - merge_score: 0.002523348024576322
[07/19/22 17:12:37] 4047567722.py:24 in <module>
                    fold: 2
                    np.asarray(merge_scores).mean(): 0.908732581836909
                    np.asarray(scores).mean(): 0.9117067394246297


0:	learn: 0.2544802	test: 0.2544802	test1: 0.2542154	best: 0.2542154 (0)	total: 69.5ms	remaining: 11m 34s
500:	learn: 0.0639086	test: 0.0639086	test1: 0.0630433	best: 0.0630433 (500)	total: 34.3s	remaining: 10m 49s
1000:	learn: 0.0617233	test: 0.0617233	test1: 0.0620711	best: 0.0620711 (1000)	total: 1m 9s	remaining: 10m 25s
1500:	learn: 0.0602526	test: 0.0602526	test1: 0.0617742	best: 0.0617736 (1499)	total: 1m 43s	remaining: 9m 43s
2000:	learn: 0.0590934	test: 0.0590934	test1: 0.0616766	best: 0.0616766 (2000)	total: 2m 17s	remaining: 9m 9s
2500:	learn: 0.0582094	test: 0.0582094	test1: 0.0616503	best: 0.0616483 (2491)	total: 2m 50s	remaining: 8m 32s
3000:	learn: 0.0574687	test: 0.0574687	test1: 0.0616585	best: 0.0616402 (2877)	total: 3m 26s	remaining: 8m 2s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.06164022083
bestIteration = 2877

Shrink model to first 2878 iterations.


[07/19/22 17:16:34] 4047567722.py:19 in <module>
                    abs(x['pred'] - dvalid['rel_rank']).mean(): 0.06352573785829235
[07/19/22 17:16:36] 4047567722.py:21 in <module>
                    fold: 3
                    merge_score: 0.912103082718156
                    score: 0.9153450754329285
                    score - merge_score: 0.003241992714772546
[07/19/22 17:16:36] 4047567722.py:24 in <module>
                    fold: 3
                    np.asarray(merge_scores).mean(): 0.9095752070572207
                    np.asarray(scores).mean(): 0.9126163234267044


0:	learn: 0.2544156	test: 0.2544156	test1: 0.2545067	best: 0.2545067 (0)	total: 75ms	remaining: 12m 29s
500:	learn: 0.0631691	test: 0.0631691	test1: 0.0656771	best: 0.0656771 (500)	total: 36.2s	remaining: 11m 25s
1000:	learn: 0.0610502	test: 0.0610502	test1: 0.0647680	best: 0.0647674 (999)	total: 1m 13s	remaining: 10m 57s
1500:	learn: 0.0595070	test: 0.0595070	test1: 0.0644590	best: 0.0644590 (1500)	total: 1m 50s	remaining: 10m 27s
2000:	learn: 0.0583593	test: 0.0583593	test1: 0.0643696	best: 0.0643680 (1964)	total: 2m 28s	remaining: 9m 55s
2500:	learn: 0.0573699	test: 0.0573699	test1: 0.0643467	best: 0.0643405 (2462)	total: 3m 6s	remaining: 9m 18s
3000:	learn: 0.0566059	test: 0.0566059	test1: 0.0643463	best: 0.0643402 (2567)	total: 3m 42s	remaining: 8m 40s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.06434017347
bestIteration = 2567

Shrink model to first 2568 iterations.


[07/19/22 17:20:27] 4047567722.py:19 in <module>
                    abs(x['pred'] - dvalid['rel_rank']).mean(): 0.06573988537486983
[07/19/22 17:20:29] 4047567722.py:21 in <module>
                    fold: 4
                    merge_score: 0.9039693073200067
                    score: 0.9063462862332429
                    score - merge_score: 0.002376978913236183
[07/19/22 17:20:29] 4047567722.py:24 in <module>
                    fold: 4
                    np.asarray(merge_scores).mean(): 0.908454027109778
                    np.asarray(scores).mean(): 0.911362315988012


In [32]:
gezi.plot.feature_importance(model, topn=20)

AttributeError: type object 'CatBoostRegressor' has no attribute 'startswith'