In [1]:
import pandas as pd, numpy as np
from catboost import CatBoostClassifier
import pickle
import sys

# Load Train Data and Labels

In [2]:
# Column -> dtype mapping handed to pd.read_csv for the event-log files.
# Narrow numeric types and 'category' strings keep the large frames small.
dtypes = dict(
    session_id='int64',
    index=np.int16,
    elapsed_time=np.int32,
    event_name='category',
    name='category',
    level=np.int8,
    page=np.float16,
    room_coor_x=np.float16,
    room_coor_y=np.float16,
    screen_coor_x=np.float16,
    screen_coor_y=np.float16,
    hover_duration=np.float32,
    text='category',
    fqid='category',
    room_fqid='category',
    text_fqid='category',
    fullscreen=np.int8,
    hq=np.int8,
    music=np.int8,
    level_group='category',
)

# Subset of event-log columns (no screen coordinates or player settings).
# NOTE(review): not passed to any read_csv call in the cells shown here.
use_col = [
    'session_id',
    'index',
    'elapsed_time',
    'event_name',
    'name',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'hover_duration',
    'text',
    'fqid',
    'room_fqid',
    'text_fqid',
    'level_group',
]

In [3]:
targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

# Each label id has the form "<session>_q<question>": split it once and derive
# the numeric session id (first piece) and question number (last piece minus
# its leading letter).
id_pieces = targets['session_id'].str.split('_')
targets['session'] = id_pieces.str[0].astype('int64')
targets['q'] = id_pieces.str[-1].str[1:].astype('int64')
del id_pieces  # keep the notebook namespace identical to before

print(targets.shape)
targets.head()

(424116, 4)


Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [4]:
# Feature-definition table used by feature_quest / feature_quest_otvet.
# Those functions filter it on 'quest' and read, per row, 'kol_col' (1 or 2
# conditions), 'col1'/'val1' and, for two-condition rows, 'col2'/'val2'.
feature_df = pd.read_csv('/kaggle/input/featur/feature_sort.csv')

# Feature Engineering

In [5]:
def delt_time_def(df):
    """Add per-event time-delta columns to an event log.

    The frame is sorted in place by ``session_id`` then ``elapsed_time`` and
    gains three columns:

    * ``d_time``         - elapsed_time minus the previous event's elapsed_time
                           within the same session (0 for a session's first event);
    * ``delt_time``      - ``d_time`` clipped to the range [0, 103000];
    * ``delt_time_next`` - ``delt_time`` of the following event in the same
                           session (NaN for a session's last event).

    Deltas are computed per session. A frame-wide ``diff``/``shift`` would give
    the first event of every session a delta against the last event of the
    previous session, which leaks into the per-session ``d_time`` quantiles and
    differs from what a single-session frame produces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``session_id`` and ``elapsed_time``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) ``df`` object.
    """
    df.sort_values(by=['session_id', 'elapsed_time'], inplace=True)

    # Plain assignment instead of `df['d_time'].fillna(0, inplace=True)`:
    # the chained in-place form does not write back under Copy-on-Write.
    df['d_time'] = df.groupby('session_id', sort=False)['elapsed_time'].diff(1).fillna(0)
    df['delt_time'] = df['d_time'].clip(0, 103000)
    df['delt_time_next'] = df.groupby('session_id', sort=False)['delt_time'].shift(-1)
    return df

In [6]:
def feature_engineer(train, kol_f):
    """Build the per-session base feature frame.

    Resets the module-level counters ``kol_col`` (to 9, the last base column)
    and ``kol_col_max`` (to ``11 + 2 * kol_f``) and returns a frame indexed by
    the unique ``session_id`` values with integer column labels
    ``0 .. kol_col_max - 1``:

    * 0-3  : 0.3 / 0.8 / 0.5 / 0.65 quantiles of ``d_time`` per session
    * 4-5  : mean and std of ``hover_duration`` per session
    * 6-8  : session_id digits [0:2], [2:4] + 1 and [4:6]
             (labelled year, month and day by the original author)
    * 9    : digits [6:8] plus digits [8:10] / 60
             (presumably hour + minute/60 - TODO confirm)
    * 10   : 0 (placeholder, overwritten by the first per-quest feature)
    * rest : left empty for the per-quest features

    Any remaining NaN is replaced by -1.
    """
    global kol_col, kol_col_max
    kol_col = 9
    kol_col_max = 11 + kol_f * 2

    new_train = pd.DataFrame(
        index=train['session_id'].unique(),
        columns=list(range(kol_col_max)),
        dtype=np.float16,
    )

    per_session = train.groupby(['session_id'])

    gaps = per_session['d_time']
    for slot, level in enumerate((0.3, 0.8, 0.5, 0.65)):
        new_train[slot] = gaps.quantile(q=level)

    hover = per_session['hover_duration']
    new_train[4] = hover.agg('mean')
    new_train[5] = hover.agg('std')

    # Column 10 temporarily carries the session id so its digits can be sliced.
    new_train[10] = new_train.index

    def id_digits(lo, hi, offset=0):
        """Integer value of session-id characters [lo:hi] (+offset), as uint8."""
        return new_train[10].apply(lambda sid: int(str(sid)[lo:hi]) + offset).astype(np.uint8)

    new_train[6] = id_digits(0, 2)              # "year"
    new_train[7] = id_digits(2, 4, offset=1)    # "month"
    new_train[8] = id_digits(4, 6)              # "day"
    new_train[9] = id_digits(6, 8) + id_digits(8, 10) / 60

    new_train[10] = 0
    return new_train.fillna(-1)

In [7]:
def feature_next_t(row_f, new_train, train, gran_1, gran_2, i):
    """Append the per-session features described by one feature-definition row.

    The events are filtered by ``train[col1] == val1`` (and additionally
    ``train[col2] == val2`` when ``row_f['kol_col'] == 2``), grouped by
    ``session_id`` and written to the next free integer column(s) of
    ``new_train``, tracked by the module-level counter ``kol_col``:

    * always        : sum of ``delt_time_next``
    * if ``gran_1`` : mean of ``delt_time``
    * if ``gran_2`` : count of ``index``

    When ``row_f['kol_col']`` is neither 1 nor 2, one column slot is still
    consumed but nothing is written. ``i`` is accepted but not used.
    Returns ``new_train``.
    """
    global kol_col
    kol_col += 1

    selector = train[row_f['col1']] == row_f['val1']
    n_conditions = row_f['kol_col']
    if n_conditions == 2:
        selector = selector & (train[row_f['col2']] == row_f['val2'])
    elif n_conditions != 1:
        return new_train

    by_session = train[selector].groupby(['session_id'])
    new_train[kol_col] = by_session['delt_time_next'].sum()
    if gran_1:
        kol_col += 1
        new_train[kol_col] = by_session['delt_time'].mean()
    if gran_2:
        kol_col += 1
        new_train[kol_col] = by_session['index'].count()
    return new_train

In [8]:
def feature_next_t_otvet(row_f, new_train, train, gran_1, gran_2, i):
    """Append the features of one feature-definition row without grouping.

    Same selection as ``feature_next_t`` (``train[col1] == val1``, plus
    ``train[col2] == val2`` when ``row_f['kol_col'] == 2``), but the
    aggregates are taken over all selected rows at once and assigned as
    scalars, so every row of ``new_train`` receives the same value.
    NOTE(review): this looks intended for a frame holding a single session
    (e.g. at inference time) - confirm against the caller.

    Columns are written at the module-level counter ``kol_col``:
    sum of ``delt_time_next``; then mean of ``delt_time`` if ``gran_1``;
    then count of ``index`` if ``gran_2``. For any other ``kol_col`` value in
    the row a slot is consumed and nothing is written. ``i`` is unused.
    Returns ``new_train``.
    """
    global kol_col
    kol_col += 1

    selector = train[row_f['col1']] == row_f['val1']
    n_conditions = row_f['kol_col']
    if n_conditions == 2:
        selector = selector & (train[row_f['col2']] == row_f['val2'])
    elif n_conditions != 1:
        return new_train

    new_train[kol_col] = train[selector]['delt_time_next'].sum()
    if gran_1:
        kol_col += 1
        new_train[kol_col] = train[selector]['delt_time'].mean()
    if gran_2:
        kol_col += 1
        new_train[kol_col] = train[selector]['index'].count()
    return new_train

In [9]:
def experiment_feature_next_t_otvet(row_f, new_train, train, gran_1, gran_2, i):
    """Experimental variant of ``feature_next_t_otvet``.

    Builds the event mask only inside the branch that needs it, then writes
    ungrouped aggregates of the selected rows as scalars into the next free
    integer column(s) of ``new_train`` (tracked by the module-level counter
    ``kol_col``):

    * always        : sum of ``delt_time_next``
    * if ``gran_1`` : mean of ``delt_time``
    * if ``gran_2`` : count of ``index``

    ``row_f['kol_col']`` selects one condition (``col1 == val1``) or two
    (``col1 == val1`` and ``col2 == val2``); for any other value one column
    slot is consumed and nothing is written. ``i`` is unused.

    The two-condition branch previously referenced ``col1``/``val1`` without
    defining them and raised ``NameError``; they are now read from ``row_f``.

    Returns ``new_train``.
    """
    global kol_col
    kol_col += 1

    n_conditions = row_f['kol_col']
    if n_conditions == 1:
        maska = train[row_f['col1']] == row_f['val1']
    elif n_conditions == 2:
        maska = (train[row_f['col1']] == row_f['val1']) & (train[row_f['col2']] == row_f['val2'])
    else:
        return new_train

    selected = train[maska]
    new_train[kol_col] = selected['delt_time_next'].sum()
    if gran_1:
        kol_col += 1
        new_train[kol_col] = selected['delt_time'].mean()
    if gran_2:
        kol_col += 1
        new_train[kol_col] = selected['index'].count()
    return new_train

In [10]:
def feature_quest_otvet(new_train, train, quest, kol_f):
    """Add the first ``kol_f`` features defined for ``quest`` (ungrouped variant).

    Resets the module-level counter ``kol_col`` to 9, takes the rows of the
    global ``feature_df`` whose ``quest`` equals ``quest`` and feeds the first
    ``kol_f`` of them to ``feature_next_t_otvet``. The first
    ``round(0.7 * kol_f)`` rows also get the mean column and the first
    ``round(0.3 * kol_f)`` rows also get the count column.

    Returns ``new_train`` restricted to columns ``0 .. kol_col``.
    """
    global kol_col
    kol_col = 9
    mean_share, count_share = 0.7, 0.3

    quest_rows = feature_df.loc[feature_df['quest'] == quest].reset_index(drop=True)

    mean_cutoff = round(kol_f * mean_share)
    count_cutoff = round(kol_f * count_share)
    for pos in range(kol_f):
        new_train = feature_next_t_otvet(
            quest_rows.loc[pos], new_train, train,
            pos < mean_cutoff, pos < count_cutoff, pos,
        )

    return new_train[list(range(kol_col + 1))]

In [11]:
def feature_engineer_new(new_train, train, feature_q, kol_f):
    """Apply the first ``kol_f`` rows of ``feature_q`` via ``feature_next_t``.

    ``feature_q`` must be labelled ``0 .. kol_f - 1``. The first
    ``round(0.7 * kol_f)`` rows are built with the mean column enabled and the
    first ``round(0.3 * kol_f)`` rows with the count column enabled.
    Returns the frame handed back by the last ``feature_next_t`` call
    (``new_train`` itself when ``kol_f`` is 0).
    """
    mean_cutoff = round(kol_f * 0.7)
    count_cutoff = round(kol_f * 0.3)

    switches = [(pos < mean_cutoff, pos < count_cutoff) for pos in range(kol_f)]
    for pos, (with_mean, with_count) in enumerate(switches):
        new_train = feature_next_t(feature_q.loc[pos], new_train, train, with_mean, with_count, pos)
    return new_train

In [12]:
def feature_quest(new_train, train, quest, kol_f):
    """Add the first ``kol_f`` per-session features defined for ``quest``.

    Resets the module-level counter ``kol_col`` to 9, selects the rows of the
    global ``feature_df`` whose ``quest`` equals ``quest`` (re-labelled from 0)
    and delegates to ``feature_engineer_new``. Returns the resulting frame
    restricted to columns ``0 .. kol_col``.
    """
    global kol_col
    kol_col = 9

    quest_rows = feature_df.loc[feature_df['quest'] == quest].reset_index(drop=True)
    new_train = feature_engineer_new(new_train, train, quest_rows, kol_f)

    return new_train[list(range(kol_col + 1))]

In [13]:
def create_model(old_train, quests, models, list_kol_f):
    """Fit one CatBoost classifier per question and store it in ``models``.

    For every question ``q`` the per-session feature matrix is rebuilt from
    ``old_train`` (``feature_engineer`` followed by ``feature_quest`` with
    ``list_kol_f[q]`` features), the labels are taken from the global
    ``targets`` frame (rows with ``targets.q == q``, aligned to the feature
    index through the ``session`` column) and a classifier is fitted on all
    sessions. No validation split or out-of-fold prediction is done here.

    Parameters
    ----------
    old_train : pandas.DataFrame
        Event log already processed by ``delt_time_def``.
    quests : iterable of int
        Question numbers to train; any iterable works, it is only looped over.
    models : dict
        Updated in place; each model is stored under the string key ``f'{q}'``.
    list_kol_f : mapping
        Question number -> number of feature-definition rows to use.

    Returns
    -------
    dict
        The same ``models`` object.
    """
    # ITERATE THRU QUESTIONS
    for q in quests:
        print('### quest ', q, end='')
        new_train = feature_engineer(old_train, list_kol_f[q])
        train_x = feature_quest(new_train, old_train, q, list_kol_f[q])
        print (' ---- ', 'train_q.shape = ', train_x.shape)

        # TRAIN DATA: labels for this question, in the row order of train_x
        train_users = train_x.index.values
        train_y = targets.loc[targets.q==q].set_index('session').loc[train_users]

        # TRAIN MODEL
        model = CatBoostClassifier(
            n_estimators = 300,
            learning_rate= 0.045,
            depth = 6
        )
        model.fit(train_x.astype('float32'), train_y['correct'], verbose=False)

        # KEEP THE FITTED MODEL (string key)
        models[f'{q}'] = model
    print('***')

    return models

In [14]:
# Registry of fitted classifiers; create_model stores each one under the
# question number formatted as a string (models[f'{q}']).
models = {}
# NOTE(review): not referenced in the cells shown here - presumably the
# probability cut-off used when turning predictions into 0/1; confirm at the use site.
best_threshold = 0.63

In [15]:
# Question number -> kol_f, the number of feature-definition rows (from
# feature_df) used for that question; create_model passes list_kol_f[q] to
# feature_engineer and feature_quest. Only the questions trained in this
# notebook section have an entry.
list_kol_f = {
    1:140,3:110,
    4:120, 5:220, 6:130, 7:110, 8:110, 9:100, 10:140, 11:120,
    14: 160, 15:160, 16:130, 17:140             
             }

In [16]:
# Level group 0-4: load the events, drop incomplete sessions, train questions 1 and 3.
df0_4 = pd.read_csv('/kaggle/input/featur/train_0_4t.csv', dtype=dtypes)

# Flag sessions that contain fewer than 5 distinct levels and remove them.
kol_lvl = df0_4.groupby(['session_id'])['level'].nunique().lt(5)
list_session = kol_lvl.index[kol_lvl.to_numpy()]
df0_4 = df0_4.loc[~df0_4['session_id'].isin(list_session)]
df0_4 = delt_time_def(df0_4)

# Feature counts per question come from the shared list_kol_f mapping.
quests_0_4 = [1, 3]
models = create_model(df0_4, quests_0_4, models, list_kol_f)

del df0_4  # free the event log once its models are trained

### quest  1 ----  train_q.shape =  (23562, 290)
### quest  3 ----  train_q.shape =  (23562, 230)
***


In [17]:
# Level group 5-12: load the events, drop incomplete sessions, train questions 4-11.
df5_12 = pd.read_csv('/kaggle/input/featur/train_5_12t.csv', dtype=dtypes)

# Flag sessions that contain fewer than 8 distinct levels and remove them.
kol_lvl = df5_12.groupby(['session_id'])['level'].nunique().lt(8)
list_session = kol_lvl.index[kol_lvl.to_numpy()]
df5_12 = df5_12.loc[~df5_12['session_id'].isin(list_session)]
df5_12 = delt_time_def(df5_12)

# Feature counts per question come from the shared list_kol_f mapping.
quests_5_12 = [4, 5, 6, 7, 8, 9, 10, 11]
models = create_model(df5_12, quests_5_12, models, list_kol_f)

del df5_12  # free the event log once its models are trained

### quest  4 ----  train_q.shape =  (23561, 250)
### quest  5 ----  train_q.shape =  (23561, 450)
### quest  6 ----  train_q.shape =  (23561, 270)
### quest  7 ----  train_q.shape =  (23561, 230)
### quest  8 ----  train_q.shape =  (23561, 230)
### quest  9 ----  train_q.shape =  (23561, 210)
### quest  10 ----  train_q.shape =  (23561, 290)
### quest  11 ----  train_q.shape =  (23561, 250)
***


In [18]:
# Level group 13-22: load the events, drop incomplete sessions, train questions 14-17.
df13_22 = pd.read_csv('/kaggle/input/featur/train_13_22t.csv', dtype=dtypes)

# Flag sessions that contain fewer than 10 distinct levels and remove them.
kol_lvl = df13_22.groupby(['session_id'])['level'].nunique().lt(10)
list_session = kol_lvl.index[kol_lvl.to_numpy()]
df13_22 = df13_22.loc[~df13_22['session_id'].isin(list_session)]
df13_22 = delt_time_def(df13_22)

# Feature counts per question come from the shared list_kol_f mapping.
# df13_22 is intentionally kept alive here, as in the original cell.
quests_13_22 = [14, 15, 16, 17]
models = create_model(df13_22, quests_13_22, models, list_kol_f)


### quest  14 ----  train_q.shape =  (22986, 330)
### quest  15 ----  train_q.shape =  (22986, 330)
### quest  16 ----  train_q.shape =  (22986, 270)
### quest  17 ----  train_q.shape =  (22986, 290)
***


In [19]:
# Saving the models (create_model stores them under string keys, hence str(q))
# for q in quests_0_4 + quests_5_12 + quests_13_22:
#     models[str(q)].save_model(f'cat_model_{q}.bin')

In [20]:
# Reading the models back (same string keys that create_model uses)
# model_dir = '/kaggle/input/catbust/'
# for q in quests_0_4 + quests_5_12 + quests_13_22:
#     models[str(q)] = CatBoostClassifier().load_model(model_dir+f'cat_model_{q}.bin')

# XGBoost

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import pickle
import polars as pl
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
from collections import defaultdict
import warnings
from itertools import combinations

# Silences every warning category for the rest of the kernel session; this
# also hides deprecation notices from pandas and the model libraries.
warnings.filterwarnings('ignore')

In [22]:
# Event-log columns that are read as 'category' in `dtypes`.
CATS = ['event_name', 'name', 'fqid', 'room_fqid', 'text_fqid']
# Numeric columns. 'elapsed_time_diff' is not a raw column in `dtypes`;
# presumably it is derived in a later cell - TODO confirm.
NUMS = ['page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
        'hover_duration', 'elapsed_time_diff']

# Word list (case-sensitive: both 'found'/'Found' and 'Wells'/'wells' appear).
# NOTE(review): presumably matched against the `text` column; usage is outside this view.
DIALOGS = ['that', 'this', 'it', 'you','find','found','Found','notebook','Wells','wells','help','need', 'Oh','Ooh','Jo', 'flag', 'can','and','is','the','to']

# Values of the `name` and `event_name` columns that get their own features
# (assumed from the variable names - confirm where they are consumed).
name_feature = ['basic', 'undefined', 'close', 'open', 'prev', 'next']
event_name_feature = ['cutscene_click', 'person_click', 'navigate_click',
       'observation_click', 'notification_click', 'object_click',
       'object_hover', 'map_hover', 'map_click', 'checkpoint',
       'notebook_click']

# Alphabetically sorted identifier strings; presumably the `fqid` values that
# get per-value features - TODO confirm at the use site (outside this view).
fqid_lists = ['archivist', 'archivist_glasses', 'block', 'block_0', 'block_1', 'block_badge', 'block_badge_2', 'block_magnify', 'block_nelson', 'block_tocollection', 'block_tomap1', 'block_tomap2', 'boss', 'businesscards', 'businesscards.card_0.next', 'businesscards.card_1.next', 'businesscards.card_bingo.bingo', 'businesscards.card_bingo.next', 'ch3start', 'chap1_finale', 'chap1_finale_c', 'chap2_finale', 'chap2_finale_c', 'chap4_finale_c', 'coffee', 'colorbook', 'confrontation', 'crane_ranger', 'cs', 'directory', 'directory.closeup.archivist', 'door_block_clean', 'door_block_talk', 'doorblock', 'expert', 'flag_girl', 'fox', 'glasses', 'gramps', 'groupconvo', 'groupconvo_flag', 'intro', 'janitor', 'journals', 'journals.hub.topics', 'journals.pic_0.next', 'journals.pic_1.next', 'journals.pic_2.bingo', 'journals.pic_2.next', 'journals_flag', 'journals_flag.hub.topics', 'journals_flag.hub.topics_old', 'journals_flag.pic_0.bingo', 'journals_flag.pic_0.next', 'journals_flag.pic_0_old.next', 'journals_flag.pic_1.bingo', 'journals_flag.pic_1.next', 'journals_flag.pic_1_old.next', 'journals_flag.pic_2.bingo', 'journals_flag.pic_2.next', 'journals_flag.pic_2_old.next', 'key', 'lockeddoor', 'logbook', 'logbook.page.bingo', 'magnify', 'need_glasses', 'notebook', 'outtolunch', 'photo', 'plaque', 'plaque.face.date', 'reader', 'reader.paper0.next', 'reader.paper0.prev', 'reader.paper1.next', 'reader.paper1.prev', 'reader.paper2.bingo', 'reader.paper2.next', 'reader.paper2.prev', 'reader_flag', 'reader_flag.paper0.next', 'reader_flag.paper0.prev', 'reader_flag.paper1.next', 'reader_flag.paper1.prev', 'reader_flag.paper2.bingo', 'reader_flag.paper2.next', 'reader_flag.paper2.prev', 'remove_cup', 'report', 'retirement_letter', 'savedteddy', 'seescratches', 'teddy', 'tobasement', 'tocage', 'tocloset', 'tocloset_dirty', 'tocollection', 'tocollectionflag', 'toentry', 'tofrontdesk', 'togrampa', 'tohallway', 'tomap', 'tomicrofiche', 'tostacks', 'tracks', 'tracks.hub.deer', 
'trigger_coffee', 'trigger_scarf', 'tunic', 'tunic.capitol_0', 'tunic.capitol_1', 'tunic.capitol_2', 'tunic.drycleaner', 'tunic.flaghouse', 'tunic.historicalsociety', 'tunic.hub.slip', 'tunic.humanecology', 'tunic.kohlcenter', 'tunic.library', 'tunic.wildlife', 'unlockdoor', 'wells', 'wellsbadge', 'what_happened', 'worker']
# Dotted identifier strings. NOTE(review): despite the name these look like
# `text_fqid` values rather than dialogue text - confirm at the use site.
text_lists = ['tunic.historicalsociety.cage.confrontation', 'tunic.wildlife.center.crane_ranger.crane', 'tunic.historicalsociety.frontdesk.archivist.newspaper', 'tunic.historicalsociety.entry.groupconvo', 'tunic.wildlife.center.wells.nodeer', 'tunic.historicalsociety.frontdesk.archivist.have_glass', 'tunic.drycleaner.frontdesk.worker.hub', 'tunic.historicalsociety.closet_dirty.gramps.news', 'tunic.humanecology.frontdesk.worker.intro', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation', 'tunic.historicalsociety.basement.seescratches', 'tunic.historicalsociety.collection.cs', 'tunic.flaghouse.entry.flag_girl.hello', 'tunic.historicalsociety.collection.gramps.found', 'tunic.historicalsociety.basement.ch3start', 'tunic.historicalsociety.entry.groupconvo_flag', 'tunic.library.frontdesk.worker.hello', 'tunic.library.frontdesk.worker.wells', 'tunic.historicalsociety.collection_flag.gramps.flag', 'tunic.historicalsociety.basement.savedteddy', 'tunic.library.frontdesk.worker.nelson', 'tunic.wildlife.center.expert.removed_cup', 'tunic.library.frontdesk.worker.flag', 'tunic.historicalsociety.frontdesk.archivist.hello', 'tunic.historicalsociety.closet.gramps.intro_0_cs_0', 'tunic.historicalsociety.entry.boss.flag', 'tunic.flaghouse.entry.flag_girl.symbol', 'tunic.historicalsociety.closet_dirty.trigger_scarf', 'tunic.drycleaner.frontdesk.worker.done', 'tunic.historicalsociety.closet_dirty.what_happened', 'tunic.wildlife.center.wells.animals', 'tunic.historicalsociety.closet.teddy.intro_0_cs_0', 'tunic.historicalsociety.cage.glasses.afterteddy', 'tunic.historicalsociety.cage.teddy.trapped', 'tunic.historicalsociety.cage.unlockdoor', 'tunic.historicalsociety.stacks.journals.pic_2.bingo', 'tunic.historicalsociety.entry.wells.flag', 'tunic.humanecology.frontdesk.worker.badger', 'tunic.historicalsociety.stacks.journals_flag.pic_0.bingo', 'tunic.historicalsociety.closet.intro', 'tunic.historicalsociety.closet.retirement_letter.hub', 
'tunic.historicalsociety.entry.directory.closeup.archivist', 'tunic.historicalsociety.collection.tunic.slip', 'tunic.kohlcenter.halloffame.plaque.face.date', 'tunic.historicalsociety.closet_dirty.trigger_coffee', 'tunic.drycleaner.frontdesk.logbook.page.bingo', 'tunic.library.microfiche.reader.paper2.bingo', 'tunic.kohlcenter.halloffame.togrampa', 'tunic.capitol_2.hall.boss.haveyougotit', 'tunic.wildlife.center.wells.nodeer_recap', 'tunic.historicalsociety.cage.glasses.beforeteddy', 'tunic.historicalsociety.closet_dirty.gramps.helpclean', 'tunic.wildlife.center.expert.recap', 'tunic.historicalsociety.frontdesk.archivist.have_glass_recap', 'tunic.historicalsociety.stacks.journals_flag.pic_1.bingo', 'tunic.historicalsociety.cage.lockeddoor', 'tunic.historicalsociety.stacks.journals_flag.pic_2.bingo', 'tunic.historicalsociety.collection.gramps.lost', 'tunic.historicalsociety.closet.notebook', 'tunic.historicalsociety.frontdesk.magnify', 'tunic.humanecology.frontdesk.businesscards.card_bingo.bingo', 'tunic.wildlife.center.remove_cup', 'tunic.library.frontdesk.wellsbadge.hub', 'tunic.wildlife.center.tracks.hub.deer', 'tunic.historicalsociety.frontdesk.key', 'tunic.library.microfiche.reader_flag.paper2.bingo', 'tunic.flaghouse.entry.colorbook', 'tunic.wildlife.center.coffee', 'tunic.capitol_1.hall.boss.haveyougotit', 'tunic.historicalsociety.basement.janitor', 'tunic.historicalsociety.collection_flag.gramps.recap', 'tunic.wildlife.center.wells.animals2', 'tunic.flaghouse.entry.flag_girl.symbol_recap', 'tunic.historicalsociety.closet_dirty.photo', 'tunic.historicalsociety.stacks.outtolunch', 'tunic.library.frontdesk.worker.wells_recap', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap', 'tunic.capitol_0.hall.boss.talktogramps', 'tunic.historicalsociety.closet.photo', 'tunic.historicalsociety.collection.tunic', 'tunic.historicalsociety.closet.teddy.intro_0_cs_5', 'tunic.historicalsociety.closet_dirty.gramps.archivist', 
'tunic.historicalsociety.closet_dirty.door_block_talk', 'tunic.historicalsociety.entry.boss.flag_recap', 'tunic.historicalsociety.frontdesk.archivist.need_glass_0', 'tunic.historicalsociety.entry.wells.talktogramps', 'tunic.historicalsociety.frontdesk.block_magnify', 'tunic.historicalsociety.frontdesk.archivist.foundtheodora', 'tunic.historicalsociety.closet_dirty.gramps.nothing', 'tunic.historicalsociety.closet_dirty.door_block_clean', 'tunic.capitol_1.hall.boss.writeitup', 'tunic.library.frontdesk.worker.nelson_recap', 'tunic.library.frontdesk.worker.hello_short', 'tunic.historicalsociety.stacks.block', 'tunic.historicalsociety.frontdesk.archivist.need_glass_1', 'tunic.historicalsociety.entry.boss.talktogramps', 'tunic.historicalsociety.frontdesk.archivist.newspaper_recap', 'tunic.historicalsociety.entry.wells.flag_recap', 'tunic.drycleaner.frontdesk.worker.done2', 'tunic.library.frontdesk.worker.flag_recap', 'tunic.humanecology.frontdesk.block_0', 'tunic.library.frontdesk.worker.preflag', 'tunic.historicalsociety.basement.gramps.seeyalater', 'tunic.flaghouse.entry.flag_girl.hello_recap', 'tunic.historicalsociety.closet.doorblock', 'tunic.drycleaner.frontdesk.worker.takealook', 'tunic.historicalsociety.basement.gramps.whatdo', 'tunic.library.frontdesk.worker.droppedbadge', 'tunic.historicalsociety.entry.block_tomap2', 'tunic.library.frontdesk.block_nelson', 'tunic.library.microfiche.block_0', 'tunic.historicalsociety.entry.block_tocollection', 'tunic.historicalsociety.entry.block_tomap1', 'tunic.historicalsociety.collection.gramps.look_0', 'tunic.library.frontdesk.block_badge', 'tunic.historicalsociety.cage.need_glasses', 'tunic.library.frontdesk.block_badge_2', 'tunic.kohlcenter.halloffame.block_0', 'tunic.capitol_0.hall.chap1_finale_c', 'tunic.capitol_1.hall.chap2_finale_c', 'tunic.capitol_2.hall.chap4_finale_c', 'tunic.wildlife.center.fox.concern', 'tunic.drycleaner.frontdesk.block_0', 'tunic.historicalsociety.entry.gramps.hub', 
'tunic.humanecology.frontdesk.block_1', 'tunic.drycleaner.frontdesk.block_1']
# Room identifiers; presumably the possible `room_fqid` values - TODO confirm.
room_lists = ['tunic.historicalsociety.entry', 'tunic.wildlife.center', 'tunic.historicalsociety.cage', 'tunic.library.frontdesk', 'tunic.historicalsociety.frontdesk', 'tunic.historicalsociety.stacks', 'tunic.historicalsociety.closet_dirty', 'tunic.humanecology.frontdesk', 'tunic.historicalsociety.basement', 'tunic.kohlcenter.halloffame', 'tunic.library.microfiche', 'tunic.drycleaner.frontdesk', 'tunic.historicalsociety.collection', 'tunic.historicalsociety.closet', 'tunic.flaghouse.entry', 'tunic.historicalsociety.collection_flag', 'tunic.capitol_1.hall', 'tunic.capitol_0.hall', 'tunic.capitol_2.hall']

# Level numbers 1..22 (level 0 is not listed) and the three level-group labels.
LEVELS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
level_groups = ["0-4", "5-12", "13-22"]

# best_F1_scores: 0.700135261005169, best_threshold_xgbs: 0.6250000000000002, best_feature_selection: ["Well, that's good enough for me.", 'So? History is boring!', "Why don't you head to the Basketball Center and rustle up some clues?", 'Yes! This old slip from 1916.', "The slip is from 1916 but the team didn't start until 1974!", 'We need to talk about that missing paperwork.', "I feel like I'm forgetting something.", 'Ooh, I like clues!', "It's already all done!", "Hey Jo, let's take a look at the shirt!", 'I should go talk to Gramps!', "Hmmm. Shouldn't you be doing your homework?", 'Our shirt is too old to be a basketball jersey!', 'Your teacher said you missed 7 assignments in a row!', "This can't be right!", 'Find anything?', 'See?', 'Gramps is in trouble for losing papers?', 'Then do it for me!', 'Will do, Boss.', 'Plus, my teacher said I could help you out for extra credit!', 'Yes! This cool old slip from 1916.', 'Found it!', 'Your teacher said you could help me for extra credit.', 'Teddy and I were gonna go climb that huge tree out back!', 'Sure thing, Jo. Grab your notebook and come upstairs!', 'Whatcha doing over there, Jo?', "Let's get started. The Wisconsin Wonders exhibit opens tomorrow!", "Meet me back in my office and we'll get started!", 'Wells, finish up your report.', 'Sure!', 'See you later, Teddy.', 'A boring old shirt.', 'Meetings are BORING!', 'Now where did I put my notebook?', 'Ugh. 
Meetings are so boring.', 'Did you do all of them?', "I'll record this in my notebook.", 'Gramps is the best historian ever!', "I get to go to Gramps's meeting!", "Why don't you go play with your grampa?", 'I love these photos of me and Teddy!', "Why don't you go catch up with your grampa?", 'I suppose historians are boring, too?', 'Head over to the Basketball Center.', 'Can I come, Gramps?', 'Better check back later.', "Leopold, why don't you help me set up in the Capitol?", 'I gotta run to my meeting!', 'Just talking to Teddy.', 'Just this old slip from 1916.', '\\u00f0\\u0178\\u02dc\\u00b4', 'Gramps said to look for clues. Better look around.', "I'll be at the Capitol. Let me know if you find anything!", 'I should see what Grampa is up to!', 'Hooray, a boring old shirt.', 'Grab your notebook and come upstairs!', 'I need to get to the Capitol and tell Gramps!', "It's a women's basketball jersey!", 'What a fascinating artifact!', "Not Leopold here. He's been losing papers lately.", "Wow, that's so cool, Gramps!", 'Hot Dog! I knew it!', 'Have a look at the artifact!', "Hmmm. Don't forget about your homework.", 'No... because history is boring!', 'Do I have to?', 'Hey!', "Well, I did SOME of those. I just couldn't find them!", 'Hang tight, Teddy.', 'Besides, I already figured out the shirt.', 'Wait, you mean Wells is wrong?!', 'I should see what Gramps is up to!', "That's it!", 'Can we hurry up, Gramps?', "Look at that! It's the bee's knees!", "No way, Gramps. You're the best!", 'Gramps is a great historian!', 'Who wants to investigate the shirt artifact?', "Hmm. Button's still not working.", "It's true, they do keep going missing lately.", 'Your grampa is waiting for you in the collection room.', 'Go ahead, take a peek at the shirt!', 'Could be. 
But we need evidence!', "I'll hurry back and then we can go exploring!", 'Well, Leopold here is always losing papers...', 'This button never works!', 'Come on, Jo!', 'I knew it!', 'This looks like a clue!', 'Hopefully you can rustle up some clues!', 'That settles it.', 'Hopefully you can find some clues!', 'Can I take a closer look?', "I'm not so sure that this is a basketball jersey.", 'Ha. Told you so!']
level1_text_lists = ["Well, that's good enough for me.", 'So? History is boring!', "Why don't you head to the Basketball Center and rustle up some clues?", 'Yes! This old slip from 1916.', "The slip is from 1916 but the team didn't start until 1974!", 'We need to talk about that missing paperwork.', "I feel like I'm forgetting something.", 'Ooh, I like clues!', "It's already all done!", "Hey Jo, let's take a look at the shirt!", 'I should go talk to Gramps!', "Hmmm. Shouldn't you be doing your homework?", 'Our shirt is too old to be a basketball jersey!', 'Your teacher said you missed 7 assignments in a row!', "This can't be right!", 'Find anything?', 'See?', 'Gramps is in trouble for losing papers?', 'Then do it for me!', 'Will do, Boss.', 'Plus, my teacher said I could help you out for extra credit!', 'Yes! This cool old slip from 1916.', 'Found it!', 'Your teacher said you could help me for extra credit.', 'Teddy and I were gonna go climb that huge tree out back!', 'Sure thing, Jo. Grab your notebook and come upstairs!', 'Whatcha doing over there, Jo?', "Let's get started. The Wisconsin Wonders exhibit opens tomorrow!", "Meet me back in my office and we'll get started!", 'Wells, finish up your report.', 'Sure!', 'See you later, Teddy.', 'A boring old shirt.', 'Meetings are BORING!', 'Now where did I put my notebook?', 'Ugh. Meetings are so boring.', 'Did you do all of them?', "I'll record this in my notebook.", 'Gramps is the best historian ever!', "I get to go to Gramps's meeting!", "Why don't you go play with your grampa?", 'I love these photos of me and Teddy!', "Why don't you go catch up with your grampa?", 'I suppose historians are boring, too?', 'Head over to the Basketball Center.', 'Can I come, Gramps?', 'Better check back later.', "Leopold, why don't you help me set up in the Capitol?", 'I gotta run to my meeting!', 'Just talking to Teddy.', 'Just this old slip from 1916.', '\\u00f0\\u0178\\u02dc\\u00b4', 'Gramps said to look for clues. 
Better look around.', "I'll be at the Capitol. Let me know if you find anything!", 'I should see what Grampa is up to!', 'Hooray, a boring old shirt.', 'Grab your notebook and come upstairs!', 'I need to get to the Capitol and tell Gramps!', "It's a women's basketball jersey!", 'What a fascinating artifact!', "Not Leopold here. He's been losing papers lately.", "Wow, that's so cool, Gramps!", 'Hot Dog! I knew it!', 'Have a look at the artifact!', "Hmmm. Don't forget about your homework.", 'No... because history is boring!', 'Do I have to?', 'Hey!', "Well, I did SOME of those. I just couldn't find them!", 'Hang tight, Teddy.', 'Besides, I already figured out the shirt.', 'Wait, you mean Wells is wrong?!', 'I should see what Gramps is up to!', "That's it!", 'Can we hurry up, Gramps?', "Look at that! It's the bee's knees!", "No way, Gramps. You're the best!", 'Gramps is a great historian!', 'Who wants to investigate the shirt artifact?', "Hmm. Button's still not working.", "It's true, they do keep going missing lately.", 'Your grampa is waiting for you in the collection room.', 'Go ahead, take a peek at the shirt!', 'Could be. But we need evidence!', "I'll hurry back and then we can go exploring!", 'Well, Leopold here is always losing papers...', 'This button never works!', 'Come on, Jo!', 'I knew it!', 'This looks like a clue!', 'Hopefully you can rustle up some clues!', 'That settles it.', 'Hopefully you can find some clues!', 'Can I take a closer look?', "I'm not so sure that this is a basketball jersey.", 'Ha. Told you so!']

# level2_text_lists = ['*COUGH COUGH COUGH*', '*cough cough*', '*grumble grumble*', 'A little horse!', 'AND I know who took Teddy!', 'AND he stole Teddy!', "Ah, that's better!", 'An old shirt? Try the university.', "And I'll figure out the shirt, too.", "And he messed up Gramps's office, too!", "And look! She's wearing the shirt!", "And where's your grampa?", 'And you are?', 'Are you okay?', 'BUT WELLS STOLE TEDDY!', 'Badgers? No.', 'Better check back later.', "But I hear the museum's got one on the loose!", 'But he never goes anywhere without his scarf!', 'But what if Wells kidnapped Teddy?', "Calm down, kid. I haven't seen him.", 'Can I give you the tour?', 'Can you help me find Wells?', 'Can you help me tidy up?', 'Can you help me-', 'Can you help me? I need to find Wells!', 'Can you help me? I need to find the owner of this slip.', 'Can you help-', "Can't believe I lost my reading glasses.", "Check out our microfiche. It's right through that door.", 'Could be. But we need evidence.', 'Did you drop something, Dear?', "Did you drop something, Dear? There's a card on the floor.", 'Did you figure out the shirt?', 'Did you have a question or not?', 'Did you have a question?', 'Do you have any info on Theodora Youmans?', 'Do you know anything about this slip?', 'Do you know what this slip is?', 'Do you know who Theodora Youmans is?', "Don't worry, Gramps. I'll find Teddy!", 'Easy, Jo.', "Fine. Let's investigate!", 'Go find your grampa and get to work!', 'Great! Thanks for the help!', "Guess it couldn't hurt to let you take a look.", 'Ha! Good one.', 'Ha! What do you call a pony with a sore throat?', "Ha! You're funny.", 'Have you seen a badger around here?', 'He needs our help!', 'He was looking for a taxidermist.', "He's always trying to get you in trouble, and he doesn't like animals!", "He's our expert record keeper.", "He's wrong about old shirts and his name rhymes with \\smells\\...", 'Head over to the university.', 'Head upstairs and talk to the archivist. 
He might be able to help!', 'Hello there!', 'Here I am!', "Here's a call number to find more info in the Stacks.", "Here's the log book.", 'Hey, this is Youmans!', 'Hi! *cough*', 'Hi! How can I help you?', 'Hi, Mrs. M.', "Hmm. Button's still not working.", "Hmmm... not sure. Why don't you try the library?", 'Hold your horses, Jo.', 'How can I help you?', 'Huh?', 'I bet the archivist could use this!', "I can't calm down. This is important!", "I don't have time for kids.", "I don't know!", "I don't need that right now.", 'I figured out that you kidnapped him!', 'I found it on an old shirt.', 'I found it!', 'I got here and the whole place was a mess!', 'I got here and the whole place was ransacked!', 'I got that one from my Gramps!', 'I had some cleaning up to do in my office.', 'I have an idea.', "I haven't quite figured it out just yet...", "I haven't seen him.", 'I hope you find your badger, kid.', 'I knew I could count on you, Jo!', 'I knew you could do it, Jo!', 'I love these photos of me and Teddy.', 'I need to find Wells right away! Do you know where he is?', 'I need to find Wells right away!! Do you know where he is?', 'I need to find Wells!!!', 'I need to find the owner of this slip.', 'I need your help!', 'I ran into Wells there this morning.', 'I should ask the librarian why Wells was here.', 'I should check that logbook to see who owned this slip...', 'I should find out if she can help me!', 'I should go to the Capitol and tell everyone!', 'I should help Gramps clean.', 'I should stay and look for clues!', "I think he's in trouble!", 'I used to have a magnifying glass around here\\u00e2\\u20ac\\u00a6', "I wonder if there's a clue in those business cards...", "I'm Leopold's grandkid!", "I'm afraid my papers have gone missing in this mess.", "I'm afraid not.", "I'm also looking for Theodora Youmans. 
Have you heard of her?", "I'm sure you'll find Theodora in there somewhere!", "I've got a stack of business cards from my favorite cleaners.", 'Is this your coffee, Gramps?', "It must've been Wells.", "It'll be okay, Jo. We'll find Teddy!", "It's a match!", "It's for Grampa Leo. He's a historian!", "It's our Norwegian Craft exhibit!", "It's such a nice fall day.", 'Jo, meet me back at my office.', "Jolie! I was hoping you'd stop by. Any news on the shirt artifact?", 'Knew what?', 'Leo... you mean Leopold?', 'Leopold, can you run back to the museum?', 'Looks like a dry cleaning receipt.', 'Maybe I can help!', 'Maybe he just got scared and ran off.', "Maybe there's a clue in this mess!", 'Mrs. M, I think Wells kidnapped Teddy.', 'Nice decorations.', 'Nice seeing you, Jolie!', 'Nice work on the shirt, Jolie!', "Nope, that's from Bean Town. I only drink Holdgers!", 'Nope. But Youmans and other suffragists worked hard to change that.', 'Not sure. Here, let me look it up.', 'Now I Just need to find all the cleaners from way back in 1916.', 'Now I just need to find all the cleaners from wayyyy back in 1916.', 'Now can I tell you what happened to Teddy?', 'Now if only I could read this thing.', 'Now if only I could read this thing. Blasted tiny letters...', 'Oh my!', 'Oh no!', 'Oh no... Teddy!', "Oh, I'm fine! Just a little hoarse.", 'Oh, hello there!', "Oh, that's from Bean Town.", "Okay. I'll find Teddy!", 'Okay. Thanks anyway.', 'One step at a time, Jo.', 'Ooh, nice decorations!', 'Ooh, thanks!', 'Please let me know if you do.', "Please let me know if you do. It's important!", 'Please?', 'Please? This is really important.', "Poor Gramps! 
I should make sure he's okay.", 'Right outside the door.', 'Run along to the university.', 'She helped get votes for women!', 'She led marches and helped women get the right to vote!', 'Slow down, Jo.', 'So much cleaning to do...', "Sorry I'm late.", 'Sorry for the delay, Boss.', "Sorry, I'm in a hurry.", "Sorry, I'm too busy for kids right now.", "Sorry, can't help you.", 'Sounds good, Boss.', 'Take a look!', "Teddy's scarf! Somebody must've taken him!", 'Thanks for the help!', 'Thanks to them, Wisconsin was the first state to approve votes for women!', 'Thanks.', 'Thanks. Did you figure out the shirt?', 'The archivist said I should look in the stacks.', 'The libarian said I could find some information on Youmans in here...', 'Then we need evidence.', 'Theodora Youmans must be the owner!', 'Theodora Youmans? Is that who owned the shirt?', 'Theodora Youmans? Of course!', 'Theodora wearing the shirt!', 'They study clothes and fabric.', 'This button never works!', 'This place was around in 1916! I can start there!', 'Try not to panic, Jo.', 'Two missions, actually!', 'Ugh. Fine.', 'Um, are you okay?', "Unless you're too busy horsing around.", 'Wait a minute!', "Wait a sec. Women couldn't vote?!", 'Welcome back, Jolie. Did you figure out the shirt?', "Well, I can't show our log books to just anybody.", "Well, get on it. I'm counting on you and your gramps to figure this out!", 'Well? What are you still doing here?', 'Wells sabotaged Gramps!', 'Wells! What was he doing here? I should ask the librarian.', "Wells! Where's Teddy? Is he okay?", 'Wells? I knew it!', "Weren't you going to check out our microfiche?", 'What are you still doing here,  Jolie?', 'What are you waiting for? 
The Stacks are right outside the door.', 'What happened here?!', 'What should I do first?', 'What the-', 'What was Wells doing here?', "What's a taxidermist?", "What's a textile expert?", 'Where are the Stacks?', 'Where did you get that coffee?', 'Where should I go again?', 'Who are you?', "Who could've done this?", 'Who is Teddy?', "Why didn't you say so?", "Why don't you go upstairs and see the archivist?", "Why don't you prove your case?", "Why don't you take a look?", 'Wow!', 'Wow! What is all this stuff?', 'Yeah. Thanks anyway.', 'Yep.', 'Yes!', 'Yes! I was wondering-', 'Yikes... this could take a while.', 'You better get to the capitol!', 'You can talk to a textile expert there.', 'You could ask the archivist. He knows everybody!', 'You could try the archivist. Maybe he can help you find Wells!', "You haven't seen any badgers around here, have you?", "You look like you're on a mission.", "You'll have to get started without me.", "You're right, Gramps. Let's investigate!", "You're still here? I'm trying to work!", 'Youmans was a suffragist here in Wisconsin.', 'Youmans was a suffragist!', 'Your gramps is awesome! Always full of stories.', "Yup, that's him!", '\\Taxidermy: the art of preparing, stuffing, and mounting the skins of animals.\\']
level2_text_lists = ['*COUGH COUGH COUGH*', '*cough cough*', '*grumble grumble*', 'A little horse!', 'AND I know who took Teddy!', 'AND he stole Teddy!', "Ah, that's better!", 'An old shirt? Try the university.', "And I'll figure out the shirt, too.", "And he messed up Gramps's office, too!", "And look! She's wearing the shirt!", "And where's your grampa?", 'And you are?', 'Are you okay?', 'BUT WELLS STOLE TEDDY!', 'Badgers? No.', 'Better check back later.', "But I hear the museum's got one on the loose!", 'But he never goes anywhere without his scarf!', 'But what if Wells kidnapped Teddy?', "Calm down, kid. I haven't seen him.", 'Can I give you the tour?', 'Can you help me find Wells?', 'Can you help me tidy up?', 'Can you help me-', 'Can you help me? I need to find Wells!', 'Can you help me? I need to find the owner of this slip.', 'Can you help-', "Can't believe I lost my reading glasses.", "Check out our microfiche. It's right through that door.", 'Could be. But we need evidence.', 'Did you drop something, Dear?', "Did you drop something, Dear? There's a card on the floor.", 'Did you figure out the shirt?', 'Did you have a question or not?', 'Did you have a question?', 'Do you have any info on Theodora Youmans?', 'Do you know anything about this slip?', 'Do you know what this slip is?', 'Do you know who Theodora Youmans is?', "Don't worry, Gramps. I'll find Teddy!", 'Easy, Jo.', "Fine. Let's investigate!", 'Go find your grampa and get to work!', 'Great! Thanks for the help!', "Guess it couldn't hurt to let you take a look.", 'Ha! Good one.', 'Ha! What do you call a pony with a sore throat?', "Ha! You're funny.", 'Have you seen a badger around here?', 'He needs our help!', 'He was looking for a taxidermist.', "He's always trying to get you in trouble, and he doesn't like animals!", "He's our expert record keeper.", "He's wrong about old shirts and his name rhymes with \\smells\\...", 'Head over to the university.', 'Head upstairs and talk to the archivist. 
He might be able to help!', 'Hello there!', 'Here I am!', "Here's a call number to find more info in the Stacks.", "Here's the log book.", 'Hey, this is Youmans!', 'Hi! *cough*', 'Hi! How can I help you?', 'Hi, Mrs. M.', "Hmm. Button's still not working.", "Hmmm... not sure. Why don't you try the library?", 'Hold your horses, Jo.', 'How can I help you?', 'Huh?', 'I bet the archivist could use this!', "I can't calm down. This is important!", "I don't have time for kids.", "I don't know!", "I don't need that right now.", 'I figured out that you kidnapped him!', 'I found it on an old shirt.', 'I found it!', 'I got here and the whole place was a mess!', 'I got here and the whole place was ransacked!', 'I got that one from my Gramps!', 'I had some cleaning up to do in my office.', 'I have an idea.', "I haven't quite figured it out just yet...", "I haven't seen him.", 'I hope you find your badger, kid.', 'I knew I could count on you, Jo!', 'I knew you could do it, Jo!', 'I love these photos of me and Teddy.', 'I need to find Wells right away! Do you know where he is?', 'I need to find Wells right away!! Do you know where he is?', 'I need to find Wells!!!', 'I need to find the owner of this slip.', 'I need your help!', 'I ran into Wells there this morning.', 'I should ask the librarian why Wells was here.', 'I should check that logbook to see who owned this slip...', 'I should find out if she can help me!', 'I should go to the Capitol and tell everyone!', 'I should help Gramps clean.', 'I should stay and look for clues!', "I think he's in trouble!", 'I used to have a magnifying glass around here\\u00e2\\u20ac\\u00a6', "I wonder if there's a clue in those business cards...", "I'm Leopold's grandkid!", "I'm afraid my papers have gone missing in this mess.", "I'm afraid not.", "I'm also looking for Theodora Youmans. 
Have you heard of her?", "I'm sure you'll find Theodora in there somewhere!", "I've got a stack of business cards from my favorite cleaners.", 'Is this your coffee, Gramps?', "It must've been Wells.", "It'll be okay, Jo. We'll find Teddy!", "It's a match!", "It's for Grampa Leo. He's a historian!", "It's our Norwegian Craft exhibit!", "It's such a nice fall day.", 'Jo, meet me back at my office.', "Jolie! I was hoping you'd stop by. Any news on the shirt artifact?", 'Knew what?', 'Leo... you mean Leopold?', 'Leopold, can you run back to the museum?', 'Looks like a dry cleaning receipt.', 'Maybe I can help!', 'Maybe he just got scared and ran off.', "Maybe there's a clue in this mess!", 'Mrs. M, I think Wells kidnapped Teddy.', 'Nice decorations.', 'Nice seeing you, Jolie!', 'Nice work on the shirt, Jolie!', "Nope, that's from Bean Town. I only drink Holdgers!", 'Nope. But Youmans and other suffragists worked hard to change that.', 'Not sure. Here, let me look it up.', 'Now I Just need to find all the cleaners from way back in 1916.', 'Now I just need to find all the cleaners from wayyyy back in 1916.', 'Now can I tell you what happened to Teddy?', 'Now if only I could read this thing.', 'Now if only I could read this thing. Blasted tiny letters...', 'Oh my!', 'Oh no!', 'Oh no... Teddy!', "Oh, I'm fine! Just a little hoarse.", 'Oh, hello there!', "Oh, that's from Bean Town.", "Okay. I'll find Teddy!", 'Okay. Thanks anyway.', 'One step at a time, Jo.', 'Ooh, nice decorations!', 'Ooh, thanks!', 'Please let me know if you do.', "Please let me know if you do. It's important!", 'Please?', 'Please? This is really important.', "Poor Gramps! 
I should make sure he's okay.", 'Right outside the door.', 'Run along to the university.', 'She helped get votes for women!', 'She led marches and helped women get the right to vote!', 'Slow down, Jo.', 'So much cleaning to do...', "Sorry I'm late.", 'Sorry for the delay, Boss.', "Sorry, I'm in a hurry.", "Sorry, I'm too busy for kids right now.", "Sorry, can't help you.", 'Sounds good, Boss.', 'Take a look!', "Teddy's scarf! Somebody must've taken him!", 'Thanks for the help!', 'Thanks to them, Wisconsin was the first state to approve votes for women!', 'Thanks.', 'Thanks. Did you figure out the shirt?', 'The archivist said I should look in the stacks.', 'The libarian said I could find some information on Youmans in here...', 'Then we need evidence.', 'Theodora Youmans must be the owner!', 'Theodora Youmans? Is that who owned the shirt?', 'Theodora Youmans? Of course!', 'Theodora wearing the shirt!', 'They study clothes and fabric.', 'This button never works!', 'This place was around in 1916! I can start there!', 'Try not to panic, Jo.', 'Two missions, actually!', 'Ugh. Fine.', 'Um, are you okay?', "Unless you're too busy horsing around.", 'Wait a minute!', "Wait a sec. Women couldn't vote?!", 'Welcome back, Jolie. Did you figure out the shirt?', "Well, I can't show our log books to just anybody.", "Well, get on it. I'm counting on you and your gramps to figure this out!", 'Well? What are you still doing here?', 'Wells sabotaged Gramps!', 'Wells! What was he doing here? I should ask the librarian.', "Wells! Where's Teddy? Is he okay?", 'Wells? I knew it!', "Weren't you going to check out our microfiche?", 'What are you still doing here,  Jolie?', 'What are you waiting for? 
The Stacks are right outside the door.', 'What happened here?!', 'What should I do first?', 'What the-', 'What was Wells doing here?', "What's a taxidermist?", "What's a textile expert?", 'Where are the Stacks?', 'Where did you get that coffee?', 'Where should I go again?', 'Who are you?', "Who could've done this?", 'Who is Teddy?', "Why didn't you say so?", "Why don't you go upstairs and see the archivist?", "Why don't you prove your case?", "Why don't you take a look?", 'Wow!', 'Wow! What is all this stuff?', 'Yeah. Thanks anyway.', 'Yep.', 'Yes!', 'Yes! I was wondering-', 'Yikes... this could take a while.', 'You better get to the capitol!', 'You can talk to a textile expert there.', 'You could ask the archivist. He knows everybody!', 'You could try the archivist. Maybe he can help you find Wells!', "You haven't seen any badgers around here, have you?", "You look like you're on a mission.", "You'll have to get started without me.", "You're right, Gramps. Let's investigate!", "You're still here? I'm trying to work!", 'Youmans was a suffragist here in Wisconsin.', 'Youmans was a suffragist!', 'Your gramps is awesome! Always full of stories.', "Yup, that's him!", '\\Taxidermy: the art of preparing, stuffing, and mounting the skins of animals.\\']


# Page values for which feature_engineer_xgboost builds per-page count and
# elapsed_time_diff aggregates (pages 0 through 6).
PAGES = list(range(7))

In [23]:
def feature_engineer_xgboost(x, grp, use_extra, feature_suffix):
    """Collapse event rows into one feature row per ``session_id``.

    Parameters
    ----------
    x
        Event frame exposing the polars ``groupby(...).agg(...)`` API. It is
        read through the columns ``session_id``, ``index``, ``elapsed_time``,
        ``elapsed_time_diff``, ``fqid``, ``text_fqid``, ``room_fqid``,
        ``event_name``, ``name``, ``level``, ``page``, ``level_group``,
        ``text`` and every column named in the module-level ``CATS`` / ``NUMS``.
    grp
        Level-group label. Only ``'5-12'`` and ``'13-22'`` add extra features.
    use_extra
        When truthy, the hand-written "span" features for ``grp`` are joined in.
    feature_suffix
        Text embedded in the generated column names.

    Returns
    -------
    The aggregated frame, sorted by ``session_id`` and converted with
    ``to_pandas()``.

    Notes
    -----
    Column names and their order are generated exactly as before, including
    the count columns that carry no underscore in front of the suffix.
    The '13-22' span features fall back to 0 when nothing is selected; the
    '5-12' ones have no such fallback. That difference is kept on purpose so
    the produced values do not change.
    """
    sfx = feature_suffix
    diff_col = "elapsed_time_diff"

    def per_value_block(column, values, count_tag, stat_tag):
        # For each value of `column`: how many rows carry it, followed by
        # std / mean / sum / median / max of elapsed_time_diff over those rows.
        block = [
            pl.col(column).filter(pl.col(column) == v).count().alias(f"{v}_{count_tag}{sfx}")
            for v in values
        ]
        for stat in ("std", "mean", "sum", "median", "max"):
            block.extend(
                getattr(pl.col(diff_col).filter(pl.col(column) == v), stat)()
                .alias(f"{v}_{stat_tag}_{stat}_{sfx}")
                for v in values
            )
        return block

    def plain_span(s):
        return s.max() - s.min()

    def guarded_span(s):
        return s.max() - s.min() if s.len() > 0 else 0

    def nav_click_or(target_fqid, bingo_fqid):
        # navigate_click on `target_fqid`, or any row whose fqid is `bingo_fqid`.
        clicked = (pl.col("event_name") == 'navigate_click') & (pl.col("fqid") == target_fqid)
        return clicked | (pl.col("fqid") == bingo_fqid)

    def span_pairs(named_masks, reducer):
        # For every (prefix, mask): elapsed_time span, then index span.
        pairs = []
        for prefix, mask in named_masks:
            pairs.append(pl.col("elapsed_time").filter(mask).apply(reducer).alias(f"{prefix}_duration"))
            pairs.append(pl.col("index").filter(mask).apply(reducer).alias(f"{prefix}_indexCount"))
        return pairs

    def attach(frame, extra_aggs):
        side = x.groupby(["session_id"], maintain_order=True).agg(extra_aggs).sort("session_id")
        return frame.join(side, on="session_id", how='left')

    # --- session-wide statistics -------------------------------------------
    aggs = [pl.col("index").count().alias(f"session_number_{sfx}")]
    aggs += [pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique_{sfx}") for c in CATS]

    for stat in ("mean", "std", "min", "max", "median"):
        aggs += [getattr(pl.col(c), stat)().alias(f"{c}_{stat}_{sfx}") for c in NUMS]

    quantile_levels = (
        (0.1, "quantile1"),
        (0.2, "quantile2"),
        (0.4, "quantile4"),
        (0.6, "quantile6"),
        (0.8, "quantile8"),
        (0.9, "quantile9"),
    )
    for fraction, tag in quantile_levels:
        aggs += [pl.col(c).quantile(fraction, "nearest").alias(f"{c}_{tag}_{sfx}") for c in NUMS]

    # --- per-value statistics: (column, values, count tag, stat tag) ---------
    per_value_specs = (
        ("fqid", fqid_lists, "fqid_counts", "ET"),
        ("text_fqid", text_lists, "text_fqid_counts", "ET"),
        ("room_fqid", room_lists, "room_fqid_counts", "ET"),
        ("event_name", event_name_feature, "event_name_counts", "ET"),
        ("name", name_feature, "name_counts", "ET"),
        ("level", LEVELS, "LEVEL_count", "ET"),
        ("page", PAGES, "page_count", "page"),
        ("level_group", level_groups, "LEVEL_group_count", "ET"),
        ("text", level1_text_lists, "level1_text_counts", "level1_text_ET"),
        ("text", level2_text_lists, "level2_text_counts", "level2_text_ET"),
    )
    for column, values, count_tag, stat_tag in per_value_specs:
        aggs += per_value_block(column, values, count_tag, stat_tag)

    df = x.groupby(['session_id'], maintain_order=True).agg(aggs).sort("session_id")

    if use_extra:
        if grp == '5-12':
            logbook_mask = (pl.col("text") == "Here's the log book.") | (pl.col("fqid") == 'logbook.page.bingo')
            extras = span_pairs(
                (
                    ("logbook_bingo", logbook_mask),
                    ("reader_bingo", nav_click_or('reader', "reader.paper2.bingo")),
                    ("journals_bingo", nav_click_or('journals', "journals.pic_2.bingo")),
                ),
                plain_span,
            )

            # Index gap between the first object_click on logbook.page.bingo and
            # the first logbook row (flagged "Not Working" by the original author).
            first_bingo_click = pl.col('index').filter(
                (pl.col('fqid') == 'logbook.page.bingo') & (pl.col('event_name') == 'object_click')
            ).first()
            first_logbook = pl.col('index').filter((pl.col('fqid') == 'logbook')).first()
            extras.append((first_bingo_click - first_logbook).alias('logbingo-logbook_first'))

            df = attach(df, extras)

        if grp == '13-22':
            extras = span_pairs(
                (
                    ("reader_flag",
                     nav_click_or('reader_flag', "tunic.library.microfiche.reader_flag.paper2.bingo")),
                    ("journalsFlag_bingo",
                     nav_click_or('journals_flag', "journals_flag.pic_0.bingo")),
                ),
                guarded_span,
            )
            df = attach(df, extras)

    return df.to_pandas()

In [24]:
import pickle

# Load the pickled `importance_dict` (indexed elsewhere in this notebook by the
# question number as a string, e.g. importance_dict["5"]).
# NOTE: pickle.load can execute arbitrary code; only load artifacts you trust.
# The `with` block closes the handle even if unpickling raises.
with open('/kaggle/input/cv0604-cv070030-thr062/importance_dict.pkl', 'rb') as f_read:
    importance_dict = pickle.load(f_read)

In [25]:
import pickle

# Load the pickled `importance_dict_origin` (indexed elsewhere in this notebook
# by the question number as a string, e.g. importance_dict_origin["1"]).
# NOTE: pickle.load can execute arbitrary code; only load artifacts you trust.
# The `with` block closes the handle even if unpickling raises.
with open('/kaggle/input/cv0604-cv070030-thr062/importance_dict_origin.pkl', 'rb') as f_read:
    importance_dict_origin = pickle.load(f_read)

In [26]:
# models_list[q - 1] holds the 5 fold models saved for question q (q = 1..18).
models_list = []

for q in range(1, 19):
    # Create the five empty classifiers first, then fill each from its file.
    fold_models = [XGBClassifier() for _ in range(5)]
    for fold, model in enumerate(fold_models):
        model.load_model(f"/kaggle/input/cv0604-cv070030-thr062/XGB_fold{fold}_q{q}.xgb")
    models_list.append(fold_models)

In [27]:
# FEATURES_list[t - 1] is a copy of importance_dict_origin[str(t)] for t = 1..18.
FEATURES_list = []

for t in range(1, 19):
    origin_features = importance_dict_origin[str(t)]
    FEATURES_list.append(origin_features.copy())

In [28]:
# Origin feature
# Entries 0, 4 and 17 of FEATURES_list, i.e. the lists stored for t = 1, 5 and 18.
FEATURES1, FEATURES2, FEATURES3 = (FEATURES_list[i] for i in (0, 4, 17))

In [29]:
# Report the size of the three selected feature lists.
print('We will train with', *(len(feats) for feats in (FEATURES1, FEATURES2, FEATURES3)), 'features')

We will train with 1134 2130 1150 features


In [30]:
# FEATURES_list_post[t - 1] is a copy of importance_dict[str(t)] for t = 1..18.
FEATURES_list_post = []

for t in range(1, 19):
    post_features = importance_dict[str(t)]
    FEATURES_list_post.append(post_features.copy())

In [31]:
# Entry 4 of FEATURES_list_post, i.e. the copy of importance_dict["5"].
joined_FEATURES2 = FEATURES_list_post[4]
# Cell output: number of entries in that list.
len(joined_FEATURES2)

3035

# LGBM

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import pickle
import polars as pl
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
from collections import defaultdict
import warnings
from itertools import combinations
import joblib
from lightgbm import LGBMClassifier

In [33]:
# Per-statistic column whitelists consumed by feature_engineer_LGBM: each list
# names the columns for which that one aggregate is computed per session.
# Columns whose count of distinct non-null values is taken.
CATS_nunique = ['fqid', 'text_fqid', 'name', 'room_fqid', 'event_name']
# Columns aggregated with mean / std / min / max respectively.
NUMS_mean = ['elapsed_time_diff', 'screen_coor_y', 'hover_duration', 'screen_coor_x', 'room_coor_y', 'page', 'room_coor_x']
NUMS_std = ['elapsed_time_diff', 'screen_coor_x', 'room_coor_x', 'screen_coor_y']
NUMS_min = ['elapsed_time_diff']
NUMS_max = ['room_coor_y']

# Columns aggregated with median, and with the 0.N quantile for NUMS_quantileN
# (feature_engineer_LGBM uses "nearest" interpolation for the quantiles).
NUMS_median = ['elapsed_time_diff', 'screen_coor_y']
NUMS_quantile1 = ['elapsed_time_diff', 'room_coor_y', 'page']
NUMS_quantile4 = ['screen_coor_x', 'hover_duration']
NUMS_quantile6 = ['elapsed_time_diff']
NUMS_quantile7 = ['elapsed_time_diff', 'room_coor_y', 'hover_duration']
NUMS_quantile8 = ['elapsed_time_diff']
NUMS_quantile9 = ['elapsed_time_diff', 'hover_duration', 'screen_coor_x']

# fqid values for which feature_engineer_LGBM emits a row count (fqid_count)
# or the mean elapsed_time_diff over matching rows (fqid_mean).
fqid_count = ['reader', 'journals.pic_1.next', 'archivist', 'flag_girl', 'journals_flag.pic_0_old.next']
fqid_mean = ['logbook.page.bingo', 'coffee', 'chap1_finale_c', 'cs', 'journals_flag.pic_0.next']

# text_fqid values grouped by the statistic named in the variable suffix.
# NOTE(review): presumably consumed by feature_engineer_LGBM the same way as
# fqid_count / fqid_mean; the consuming lines are not visible here, so confirm.
text_fqid_count = ['tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap']
text_fqid_mean = ['tunic.historicalsociety.cage.glasses.afterteddy', 'tunic.flaghouse.entry.colorbook']
text_fqid_sum = ['tunic.humanecology.frontdesk.worker.intro', 'tunic.library.microfiche.reader_flag.paper2.bingo', 'tunic.historicalsociety.frontdesk.archivist.newspaper', 'tunic.historicalsociety.closet_dirty.trigger_scarf', 'tunic.historicalsociety.stacks.journals.pic_2.bingo', 'tunic.historicalsociety.frontdesk.archivist.have_glass', 'tunic.historicalsociety.cage.unlockdoor']
text_fqid_median = ['tunic.wildlife.center.wells.nodeer', 'tunic.historicalsociety.cage.glasses.beforeteddy', 'tunic.wildlife.center.crane_ranger.crane', 'tunic.historicalsociety.frontdesk.key']

# Dialogue strings, again grouped by the statistic in the variable suffix.
# NOTE(review): the level1/level2/level3 prefix presumably refers to the level
# group the text belongs to; verify against the code that uses these lists.
level1_text_count = ["Hey Jo, let's take a look at the shirt!", 'Did you do all of them?', 'So? History is boring!']
level1_text_std = ['I gotta run to my meeting!']

level2_text_mean = ['Thanks for the help!', "He's wrong about old shirts and his name rhymes with \\smells\\...", "Well, I can't show our log books to just anybody.", 'Thanks to them, Wisconsin was the first state to approve votes for women!', 'Now I Just need to find all the cleaners from way back in 1916.', "It's a match!", 'Maybe he just got scared and ran off.', 'Nope. But Youmans and other suffragists worked hard to change that.', 'Theodora Youmans must be the owner!', "I don't need that right now.", 'Maybe I can help!', "I'm afraid not.", 'Okay. Thanks anyway.', "It's such a nice fall day."]


level3_text_std = ["Yes, he has. I've seen him eating homework and important papers, too.", "Let's go find Gramps!"]


In [34]:
def feature_engineer_LGBM(x, grp, use_extra, feature_suffix):
    """Aggregate an event log into one row of LGBM features per session.

    Args:
        x: polars DataFrame of events. It must contain ``session_id``,
            ``index``, ``elapsed_time_diff``, ``fqid``, ``text_fqid``,
            ``text`` and every column named in the module-level
            ``CATS_nunique`` / ``NUMS_*`` lists.
        grp: accepted for signature compatibility; not used in this function.
        use_extra: accepted for signature compatibility; not used in this
            function.
        feature_suffix: string appended to every generated column name.

    Returns:
        pandas DataFrame with one row per ``session_id``, sorted by
        ``session_id``, holding one column per aggregate.
    """
    sfx = feature_suffix
    dt = pl.col("elapsed_time_diff")

    def _equals(column, value):
        # Row mask: events whose `column` equals `value`.
        return pl.col(column) == value

    # Number of events in the session.
    aggs = [pl.col("index").count().alias(f"session_number{sfx}")]

    # Distinct non-null values per categorical column.
    aggs += [pl.col(name).drop_nulls().n_unique().alias(f"{name}_unique{sfx}") for name in CATS_nunique]

    # Plain summary statistics over numeric columns.
    aggs += [pl.col(name).mean().alias(f"{name}_mean{sfx}") for name in NUMS_mean]
    aggs += [pl.col(name).std().alias(f"{name}_std{sfx}") for name in NUMS_std]
    aggs += [pl.col(name).min().alias(f"{name}_min{sfx}") for name in NUMS_min]
    aggs += [pl.col(name).max().alias(f"{name}_max{sfx}") for name in NUMS_max]
    aggs += [pl.col(name).median().alias(f"{name}_median{sfx}") for name in NUMS_median]

    # Quantiles with "nearest" interpolation; tag N means quantile N/10.
    quantile_specs = (
        (1, 0.1, NUMS_quantile1),
        (4, 0.4, NUMS_quantile4),
        (6, 0.6, NUMS_quantile6),
        (7, 0.7, NUMS_quantile7),
        (8, 0.8, NUMS_quantile8),
        (9, 0.9, NUMS_quantile9),
    )
    for tag, level, names in quantile_specs:
        aggs += [pl.col(name).quantile(level, "nearest").alias(f"{name}_quantile{tag}{sfx}") for name in names]

    # Only the aggregates below are active. The original author left further
    # variants disabled (std/sum/median/max per fqid, max/std per text_fqid,
    # every room_fqid aggregate, extra per-text variants); the fqid std and
    # sum variants were annotated "Not improved", and the fqid count was
    # noted as having room for improvement.

    # Selected `fqid` values: event count and mean time delta.
    aggs += [pl.col("fqid").filter(_equals("fqid", v)).count().alias(f"{v}_(fqid)_count{sfx}") for v in fqid_count]
    aggs += [dt.filter(_equals("fqid", v)).mean().alias(f"{v}_(fqid)_mean{sfx}") for v in fqid_mean]

    # Selected `text_fqid` values: event count and mean / sum / median time delta.
    aggs += [pl.col("text_fqid").filter(_equals("text_fqid", v)).count().alias(f"{v}_(text_fqid)_count{sfx}") for v in text_fqid_count]
    aggs += [dt.filter(_equals("text_fqid", v)).mean().alias(f"{v}_(text_fqid)_mean{sfx}") for v in text_fqid_mean]
    aggs += [dt.filter(_equals("text_fqid", v)).sum().alias(f"{v}_(text_fqid)_sum{sfx}") for v in text_fqid_sum]
    aggs += [dt.filter(_equals("text_fqid", v)).median().alias(f"{v}_(text_fqid)_median{sfx}") for v in text_fqid_median]

    # Selected dialogue lines (`text`), tagged text1 / text2 / text3.
    aggs += [pl.col("text").filter(_equals("text", v)).count().alias(f"{v}_(text1)_count{sfx}") for v in level1_text_count]
    aggs += [dt.filter(_equals("text", v)).std().alias(f"{v}_(text1)_std{sfx}") for v in level1_text_std]
    # The text2 alias has an extra "_" before the suffix. It is kept as-is
    # because downstream code selects feature columns by name.
    aggs += [dt.filter(_equals("text", v)).mean().alias(f"{v}_(text2)_mean_{sfx}") for v in level2_text_mean]
    aggs += [dt.filter(_equals("text", v)).std().alias(f"{v}_(text3)_std{sfx}") for v in level3_text_std]

    per_session = x.groupby(['session_id'], maintain_order=True).agg(aggs).sort("session_id")
    return per_session.to_pandas()

In [35]:
import pickle

# Per-question feature-name lists for the LGBM models, keyed by the question
# number as a string (e.g. '15').
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from datasets you trust.
# The context manager guarantees the handle is closed even if loading fails.
with open('/kaggle/input/0622-cv069391-thr0625-no-join/importance_dict_origin.pkl', 'rb') as f_read:
    importance_dict_origin_LGBM = pickle.load(f_read)

In [36]:
# Sanity check: how many feature names are stored for question 15.
len(importance_dict_origin_LGBM['15'])

51

**Infer Test Data**

In [37]:
import jo_wilder

# Best-effort reset so this cell can be re-run in the same kernel: it clears
# the `__called__` flags and puts the env class back into its 'INIT' state
# (presumably the API otherwise refuses a second make_env() call -- confirm
# against the jo_wilder module).
try:
    jo_wilder.make_env.__called__ = False
    env.__called__ = False
    type(env)._state = type(type(env)._state).__dict__['INIT']
except Exception:
    # Expected on a fresh kernel, where `env` is not defined yet (NameError).
    # Catching Exception rather than using a bare `except:` lets
    # KeyboardInterrupt / SystemExit propagate.
    pass

env = jo_wilder.make_env()
iter_test = env.iter_test()

In [38]:
import time

In [39]:
# Question numbers handled for each level group, as a half-open interval
# consumed via range(a, b) in the inference loop.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}
# Accumulated wall-clock seconds spent in the inference loop on building the
# features for `models[q]` (g_end4) and on its predict_proba call (g_end5).
g_end4 = 0
g_end5 = 0
# Per-group question lists; only referenced from a commented-out loop header.
list_q = {'0-4':quests_0_4, '5-12':quests_5_12, '13-22':quests_13_22}
######################### xgboost ############
# Remember to change the threshold

# Questions [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17] are the ones that
# get an ensemble prediction; to tune question q, edit entry (q - 1) below.
best_threshold_lists = [0.625, 0.62, 0.620, 0.605, 0.620, 0.615, 0.620, 0.620, 0.595, 0.625, 0.625, 0.62, 0.62, 0.620, 0.620, 0.620, 0.615, 0.62]
# Index of the first fold model inside models_list[q - 1].
fold = 0

# session_id -> level group -> feature frame computed for that group.
session_features = defaultdict(dict)
# session_id -> list of averaged per-question predictions (position q - 1),
# fed back as `prev_pred_*` features for later questions.
historical_meta = defaultdict(list)
historical_meta_LGBM = defaultdict(list)

################################
# Questions whose answer comes from the three-model ensemble; every other
# question keeps the constant default assigned at the top of each iteration.
ENSEMBLE_QUESTIONS = (1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16, 17)
N_FOLDS = 5
LGBM_MODEL_DIR = '/kaggle/input/0622-cv069391-thr0625-no-join'
# question -> list of the N_FOLDS LGBM fold models. Filled lazily so each
# pickle is read from disk once, not once per question per test batch.
lgbm_models_by_q = {}

for (test, sam_sub) in iter_test:
    test0 = test.copy()
    test0 = test0.sort_values('index')  # order events by their `index` column

    sam_sub['question'] = [int(label.split('_')[1][1:]) for label in sam_sub['session_id']]
    grp = test.level_group.values[0]
    # Default answers; overwritten below only for ENSEMBLE_QUESTIONS.
    sam_sub['correct'] = 1
    sam_sub.loc[sam_sub.question.isin([5, 8, 10, 13, 15]), 'correct'] = 0
    old_train = delt_time_def(test[test.level_group == grp])

    ##################### XGBoost #######################################
    grp = test0.level_group.values[0]
    session_id = test0.session_id.values[0]
    columns = [
        pl.col("page").cast(pl.Float32),
        (
            # Time since the previous event within the same session/level.
            (pl.col("elapsed_time") - pl.col("elapsed_time").shift(1))
            .fill_null(0)
            .clip(0, 1e8)
            .over(["session_id", "level"])
            .alias("elapsed_time_diff")
        ),
        (
            # No alias: screen_coor_x / screen_coor_y are replaced in place by
            # the absolute change from the previous event.
            (pl.col("screen_coor_x") - pl.col("screen_coor_x").shift(1))
            .abs()
            .over(["session_id", "level"])
        ),
        (
            (pl.col("screen_coor_y") - pl.col("screen_coor_y").shift(1))
            .abs()
            .over(["session_id", "level"])
        ),
        pl.col("fqid").fill_null("fqid_None"),
        pl.col("text_fqid").fill_null("text_fqid_None"),
        pl.col('text').fill_null('text_None')

    ]

    test0 = (pl.from_pandas(test0)
          .drop(["fullscreen", "hq", "music"])
          .with_columns(columns))

    # FEATURE ENGINEER TEST DATA (XGBoost feature set)
    test0_featured = feature_engineer_xgboost(test0, grp, use_extra=True, feature_suffix='')

    # FEATURE ENGINEER TEST DATA (LGBM feature set)
    test0_LGBM = feature_engineer_LGBM(test0, grp, use_extra=True, feature_suffix='')

    # Save this group's features so later groups of the same session can reuse them.
    session_features[session_id][grp] = test0_featured.copy()
    # Later groups are joined with the features of the earlier groups.
    if grp == '0-4':
        pass
    elif grp == '5-12':
        df1 = session_features[session_id]['0-4'][FEATURES1].copy()
        df2 = test0_featured[FEATURES2].copy()
        df_joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2')
        test0_featured = df_joined
    else:
        df1 = session_features[session_id]['0-4'][FEATURES1].copy()
        df2 = session_features[session_id]['5-12'][FEATURES2].copy()
        df_joined_2 = df1.join(df2, lsuffix='_df1', rsuffix='_df2')

        df3 = test0_featured[FEATURES3].copy()

        df_joined_3 = df_joined_2[joined_FEATURES2].join(df3, lsuffix='_joined_df2', rsuffix='_df3')
        test0_featured = df_joined_3

    a,b = limits[grp]
    for q in range(a, b):
        if q in ENSEMBLE_QUESTIONS:
            # ---- first model (weight 0.09 in the ensemble): models[q] ----
            start4 = time.time()
            new_train = feature_engineer(old_train, list_kol_f[q])
            new_train = feature_quest_otvet(new_train, old_train, q, list_kol_f[q])

            end4 = time.time() - start4
            g_end4 += end4

            start5 = time.time()

            clf = models[f'{q}']
            p1 = clf.predict_proba(new_train.astype('float32'))[:,1]

            end5 = time.time() - start5
            g_end5 += end5

            mask = sam_sub.question == q
            x1 = p1[0]

        ################################## xgboost #########################
        # Runs for every question (not only ENSEMBLE_QUESTIONS) because its
        # prediction is a feature for all later questions.
        FEATURES = importance_dict[str(q)].copy()

        # Add the predictions for all earlier questions as features.
        if q > 1:
            for prev_t in range(1, q):
                test0_featured[f'prev_pred_{prev_t}'] = historical_meta[session_id][prev_t - 1]
            FEATURES += [f'prev_pred_{prev_t}' for prev_t in range(1, q)]

        print(len(FEATURES))

        # Build the feature matrix once and average the fold models.
        # sum() adds left to right, i.e. the same order as p0 + p1 + ... + p4.
        X_xgb = np.array(test0_featured[FEATURES]).astype('float32')
        xgb_fold_preds = [models_list[q-1][fold + k].predict_proba(X_xgb)[:,1] for k in range(N_FOLDS)]
        p2 = sum(xgb_fold_preds) / N_FOLDS

        # Store the prediction for future use
        historical_meta[session_id].append(p2)
        ########################### LGBM: LB 0.692 #############################
        FEATURES_LGBM = importance_dict_origin_LGBM[str(q)].copy()

        if q > 1:
            for prev_q in range(1, q):
                # Add the previous model predictions as feature
                test0_LGBM[f'prev_pred_{prev_q}'] = historical_meta_LGBM[session_id][prev_q - 1]
            FEATURES_LGBM += [f'prev_pred_{prev_q}' for prev_q in range(1, q)]

        # Load the fold models for this question on first use only.
        if q not in lgbm_models_by_q:
            lgbm_models_by_q[q] = [
                joblib.load(f'{LGBM_MODEL_DIR}/LGBM_fold{k}_q{q}.pkl') for k in range(N_FOLDS)
            ]

        X_lgbm = np.array(test0_LGBM[FEATURES_LGBM]).astype('float32')
        lgbm_fold_preds = [m.predict_proba(X_lgbm)[:,1] for m in lgbm_models_by_q[q]]
        p3 = sum(lgbm_fold_preds) / N_FOLDS

        historical_meta_LGBM[session_id].append(p3)

        # 14 questions in total get a tuned threshold; only these are ensembled.
        if q in ENSEMBLE_QUESTIONS:
            x2 = p2
            x3 = p3
        ################################# Ensemble ###################################
            # XGboost weight: 0.89, catboost weight: 0.09, LGBM: 0.02
            x = (x1 * 0.09) + (x2 * 0.89) + (x3 * 0.02)
            print(f'question {q} threshold: {best_threshold_lists[q-1]}')
            sam_sub.loc[mask,'correct'] = (x > best_threshold_lists[q-1]).astype('int')

    sam_sub = sam_sub[['session_id', 'correct']]
    env.predict(sam_sub)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
1134
question 1 threshold: 0.625
1135
1136
question 3 threshold: 0.62
3038
question 4 threshold: 0.605
3039
question 5 threshold: 0.62
3040
question 6 threshold: 0.615
3041
question 7 threshold: 0.62
3042
question 8 threshold: 0.62
3043
question 9 threshold: 0.595
3044
question 10 threshold: 0.625
3045
question 11 threshold: 0.625
3046
3047
4164
question 14 threshold: 0.62
4165
question 15 threshold: 0.62
4166
question 16 threshold: 0.62
4167
question 17 threshold: 0.615
4168
1134
question 1 threshold: 0.625
1135
1136
question 3 threshold: 0.62
3038
question 4 threshold: 0.605
3039
question 5 threshold: 0.62
3040
question 6 threshold: 0.615
3041
question 7 threshold: 0.62
3042
question 8 threshold: 0.62
3043
question 9 threshold: 0.595
3044
question 10 threshold: 0.625
3045
question 11 threshold: 0.625
3046
3047
4164
question 14 threshold: 0.62
4165
question 15 t

# EDA submission.csv

In [40]:
# Display the last batch passed to env.predict; the inference loop leaves
# `sam_sub` reduced to the session_id / correct columns.
sam_sub

Unnamed: 0,session_id,correct
0,20090312331414616_q14,1
1,20090312331414616_q15,1
2,20090312331414616_q16,1
3,20090312331414616_q17,1
4,20090312331414616_q18,1


In [41]:
# Read back submission.csv from the working directory and report its shape
# together with the mean of the `correct` column (share of rows predicted 1).
sub = pd.read_csv('./submission.csv')

print(sub.shape, sub['correct'].mean())
sub.head()

(54, 2) 0.6666666666666666


Unnamed: 0,session_id,correct
0,20090109393214576_q1,1
1,20090109393214576_q2,1
2,20090109393214576_q3,1
3,20090109393214576_q4,1
4,20090109393214576_q5,0


In [42]:
# df = pd.read_csv('submission.csv')
# print( df.shape )
# df.head(60)