In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


In [60]:
train = pd.read_csv('train.csv')

In [3]:
targets = pd.read_csv('train_labels.csv')
targets['session'] = targets.session_id.apply(lambda x: int(x.split('_')[0]))
targets['q'] = targets.session_id.apply(lambda x: int(x.split('_')[-1][1:]))


In [4]:
CATS = ['event_name', 'fqid', 'room_fqid', 'text']
NUMS = ['elapsed_time', 'level', 'page', 'room_coor_x', 'room_coor_y',
        'screen_coor_x', 'screen_coor_y', 'hover_duration']

EVENTS = ['navigate_click', 'person_click', 'cutscene_click', 'object_click',
          'map_hover', 'notification_click', 'map_click', 'observation_click',
          'checkpoint']


In [5]:
def feature_engineer(train):

    dfs = []
    for c in CATS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('mean')
        tmp.name = tmp.name + '_mean'
        dfs.append(tmp)
    for c in NUMS:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    for c in EVENTS:
        train[c] = (train.event_name == c).astype('int8')
    for c in EVENTS + ['elapsed_time']:
        tmp = train.groupby(['session_id', 'level_group'])[c].agg('sum')
        tmp.name = tmp.name + '_sum'
        dfs.append(tmp)
    train = train.drop(EVENTS, axis=1)

    df = pd.concat(dfs, axis=1)
    df = df.fillna(-1)
    df = df.reset_index()
    df = df.set_index('session_id')
    return df


In [6]:
df = feature_engineer(train)


In [25]:
len(df)

70686

In [4]:
def choose_grp(col):
    if col <= 3:
        rlst = '0-4'
    elif (col <= 13)&(col>3):
        rlst = '5-12'
    elif (col <= 22)&(col>13):
        rlst = '13-22'
    return rlst


In [5]:
targets['level_group'] = targets.q.apply(lambda x: choose_grp(x))


In [6]:
label_dat = targets[['session', 'q', 'level_group', 'correct']].rename(
    columns={'session': 'session_id'}).set_index(['session_id', 'level_group'])


In [7]:
train = train.reset_index()
label_dat = label_dat.reset_index()

In [16]:
train.loc[train.session_id ==20090312431273200]

Unnamed: 0,level_0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,...,level_group,navigate_click,person_click,cutscene_click,object_click,map_hover,notification_click,map_click,observation_click,checkpoint
0,0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,...,0-4,0,0,1,0,0,0,0,0,0
1,1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,...,0-4,0,1,0,0,0,0,0,0,0
2,2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,...,0-4,0,1,0,0,0,0,0,0,0
3,3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,...,0-4,0,1,0,0,0,0,0,0,0
4,4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,...,0-4,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876,876,20090312431273200,927,1267357,navigate_click,undefined,22,,927.307255,-10.355929,...,13-22,1,0,0,0,0,0,0,0,0
877,877,20090312431273200,928,1268292,map_hover,basic,22,,,,...,13-22,0,0,0,0,1,0,0,0,0
878,878,20090312431273200,929,1269474,map_click,undefined,22,,457.523005,22.141338,...,13-22,0,0,0,0,0,0,1,0,0
879,879,20090312431273200,930,1270708,navigate_click,undefined,22,,224.190321,-60.268671,...,13-22,1,0,0,0,0,0,0,0,0


In [14]:
train = pd.merge(train, label_dat, how = 'left', on = ['session_id', 'level_group'])


In [18]:
len(label_dat)

424116

In [17]:
len(train)

167743910

In [18]:
train.columns


Index(['session_id', 'index', 'elapsed_time', 'event_name', 'name', 'level',
       'page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
       'hover_duration', 'text', 'fqid', 'room_fqid', 'text_fqid',
       'fullscreen', 'hq', 'music', 'level_group'],
      dtype='object')

In [60]:
def feature_engineer(data):
    train_agg1 = pd.crosstab(index=[data['session_id'], data['level_group']], columns=data['event_name'])
    train_agg2 = pd.crosstab(index=[data['session_id'], data['level_group']], columns=data['room_fqid'])

    def click_fn(col):
        if pd.isnull(col):
            rlst = 0
        else:
            rlst = 1
        return rlst

    data['click_yn'] = data['room_coor_x'].apply(lambda x: click_fn(x))
    
    binary = ['fullscreen', 'hq', 'music', 'click_yn']
    cont = ['elapsed_time', 'page', 'hover_duration']

    train_agg3 = data.groupby(['session_id', 'level_group'])[binary].sum()
    train_agg4 = data.groupby(['session_id', 'level_group'])[cont].mean()
    train_agg31 = data.groupby(['session_id', 'level_group'])[binary].mean()
    train_agg31.columns = ['fullscreen_avg', 'hq_avg', 'music_avg', 'click_yn_avg']
    finl_dat = pd.concat([train_agg1, train_agg2, train_agg3,
                          train_agg31[['click_yn_avg']], train_agg4], axis=1)
    finl_dat = finl_dat.reset_index()
    return finl_dat


In [None]:
def training_models(data):
    

In [8]:
train_agg1 = pd.crosstab(index = [train.session_id, train.level_group], columns = train.event_name)
train_agg2 = pd.crosstab(index = [train.session_id, train.level_group], columns = train.room_fqid)

In [9]:
def click_fn(col):
    if pd.isnull(col):
        rlst = 0
    else:
        rlst = 1
    return rlst

In [10]:
train['click_yn'] = train.room_coor_x.apply(lambda x: click_fn(x))

In [11]:
binary = ['fullscreen', 'hq', 'music', 'click_yn']

cont = ['elapsed_time', 'page', 'hover_duration']


In [12]:
train_agg3 = train.groupby(['session_id', 'level_group'])[binary].sum()
train_agg4 = train.groupby(['session_id', 'level_group'])[cont].mean()


In [13]:
train_agg31 = train.groupby(['session_id', 'level_group'])[binary].mean()


In [20]:
train_agg31.columns = ['fullscreen_avg', 'hq_avg', 'music_avg', 'click_yn_avg']


In [24]:
finl_dat = pd.concat([train_agg1, train_agg2, train_agg3,
                     train_agg31[['click_yn_avg']], train_agg4], axis=1)


In [25]:
finl_dat = finl_dat.reset_index()

In [26]:
finl_dat = pd.merge(finl_dat, label_dat, how = 'inner', on = ['session_id', 'level_group'])

In [28]:
question = list(finl_dat.q.unique())

In [29]:
len(question)

18

In [30]:
finl_dat.columns


Index(['session_id', 'level_group', 'checkpoint', 'cutscene_click',
       'map_click', 'map_hover', 'navigate_click', 'notebook_click',
       'notification_click', 'object_click', 'object_hover',
       'observation_click', 'person_click', 'tunic.capitol_0.hall',
       'tunic.capitol_1.hall', 'tunic.capitol_2.hall',
       'tunic.drycleaner.frontdesk', 'tunic.flaghouse.entry',
       'tunic.historicalsociety.basement', 'tunic.historicalsociety.cage',
       'tunic.historicalsociety.closet',
       'tunic.historicalsociety.closet_dirty',
       'tunic.historicalsociety.collection',
       'tunic.historicalsociety.collection_flag',
       'tunic.historicalsociety.entry', 'tunic.historicalsociety.frontdesk',
       'tunic.historicalsociety.stacks', 'tunic.humanecology.frontdesk',
       'tunic.kohlcenter.halloffame', 'tunic.library.frontdesk',
       'tunic.library.microfiche', 'tunic.wildlife.center', 'fullscreen', 'hq',
       'music', 'click_yn', 'click_yn_avg', 'elapsed_time', 'pag

In [6]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import pandas as pd

In [4]:
def feature_engineer(data):
    train_agg1 = pd.crosstab(
        index=[data['session_id'], data['level_group']], columns=data['event_name'])
    train_agg2 = pd.crosstab(
        index=[data['session_id'], data['level_group']], columns=data['room_fqid'])

    def click_fn(col):
        if pd.isnull(col):
            rlst = 0
        else:
            rlst = 1
        return rlst

    data['click_yn'] = data['room_coor_x'].apply(lambda x: click_fn(x))

    binary = ['fullscreen', 'hq', 'music', 'click_yn']
    cont = ['elapsed_time', 'page', 'hover_duration']

    train_agg3 = data.groupby(['session_id', 'level_group'])[binary].sum()
    train_agg4 = data.groupby(['session_id', 'level_group'])[cont].mean()
    train_agg31 = data.groupby(['session_id', 'level_group'])[binary].mean()
    train_agg31.columns = ['fullscreen_avg',
                           'hq_avg', 'music_avg', 'click_yn_avg']
    finl_dat = pd.concat([train_agg1, train_agg2, train_agg3,
                          train_agg31[['click_yn_avg']], train_agg4], axis=1)
    finl_dat = finl_dat.reset_index()
    return finl_dat



def label_preprocess(data):
    def choose_grp(col):
        if col <= 3:
            rlst = '0-4'
        elif (col <= 13) & (col > 3):
            rlst = '5-12'
        elif (col <= 22) & (col > 13):
            rlst = '13-22'
        return rlst
    data['session'] = data['session_id'].apply(lambda x: int(x.split('_')[0]))
    data['q'] = data['session_id'].apply(lambda x: int(x.split('_')[-1][1:]))
    data['level_group'] = data['q'].apply(lambda x: choose_grp(x))
    label_dat = data[['session', 'q', 'level_group', 'correct']].rename(
        columns={'session': 'session_id'})
    return label_dat
    
def train_test(train_dat, test_dat, level_group):
    input_col_ls = ['checkpoint', 'cutscene_click',
                    'map_click', 'map_hover', 'navigate_click', 'notebook_click',
                    'notification_click', 'object_click', 'object_hover',
                    'observation_click', 'person_click', 'tunic.capitol_0.hall',
                    'tunic.capitol_1.hall', 'tunic.capitol_2.hall',
                    'tunic.drycleaner.frontdesk', 'tunic.flaghouse.entry',
                    'tunic.historicalsociety.basement', 'tunic.historicalsociety.cage',
                    'tunic.historicalsociety.closet',
                    'tunic.historicalsociety.closet_dirty',
                    'tunic.historicalsociety.collection',
                    'tunic.historicalsociety.collection_flag',
                    'tunic.historicalsociety.entry', 'tunic.historicalsociety.frontdesk',
                    'tunic.historicalsociety.stacks', 'tunic.humanecology.frontdesk',
                    'tunic.kohlcenter.halloffame', 'tunic.library.frontdesk',
                    'tunic.library.microfiche', 'tunic.wildlife.center', 'fullscreen', 'hq',
                    'music', 'click_yn', 'click_yn_avg', 'elapsed_time', 'page',
                    'hover_duration']
    if level_group == '0-4':
        lv = [1, 2, 3]
    elif level_group == '5-12':
        lv = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    elif level_group == '13-22':
        lv = [14, 15, 16, 17, 18]

    blk_dat = pd.DataFrame(columns = ['session_id', 'q', 'correct'])

    temp_test = test_dat.loc[test_dat['level_group'] == level_group, :]
    for q in tqdm(lv):
        temp_train = train_dat.loc[train_dat['q'] == q, :]
        X_train, X_test, y_train, y_test = train_test_split(temp_train[input_col_ls], temp_train['correct'],
                                                            test_size=0.25, stratify=temp_train['correct'], random_state=32)
        std = StandardScaler()
        std.fit(X_train)
        X_train_scaled = std.transform(X_train)
        std.fit(X_test)
        X_test_scaled = std.transform(X_test)
        model = XGBClassifier(n_estimators=500, learning_rate=0.2,
                            max_depth=4, random_state=32)

        model.fit(X_train_scaled, y_train)

        temp_test['q'] = q
        y_pred = model.predict(temp_test[input_col_ls])
        temp_test['correct'] = y_pred
        blk_dat = pd.concat([blk_dat, temp_test[['session_id', 'q', 'correct']]], axis = 0)
    
    return blk_dat

def total_processing():
    train = pd.read_csv('train.csv')
    targets = pd.read_csv('train_labels.csv')
    test = pd.read_csv('test.csv')

    pre_train = feature_engineer(train)
    pre_target = label_preprocess(targets)

    mrg_dat = pd.merge(pre_train, pre_target, how='inner',
                    on=['session_id', 'level_group'])
    pre_test = feature_engineer(test)

    blk_dat1 = pd.DataFrame()
    for lv in tqdm(list(pre_test.level_group.unique())):
        rlst = train_test(mrg_dat, pre_test, lv)
        blk_dat1 = pd.concat([blk_dat1, rlst], axis=0)

    blk_dat1['session_level'] = blk_dat1['q']-1
    blk_dat1['session_id'] = blk_dat1['session_id'].astype('str')
    blk_dat1['q'] = blk_dat1['q'].astype('str')
    blk_dat1['tmp_id'] = blk_dat1['session_id'] + '_q' + blk_dat1['q']
    blk_dat1 = blk_dat1[['tmp_id', 'correct', 'session_level']].rename(columns={'tmp_id': 'session_id'})

    blk_dat1.to_csv('submission.csv')
return 


In [61]:
train = pd.read_csv('train.csv')
targets = pd.read_csv('train_labels.csv')
test = pd.read_csv('test.csv')


In [62]:
pre_train = feature_engineer(train)
pre_target = label_preprocess(targets)

In [63]:
mrg_dat = pd.merge(pre_train, pre_target, how ='inner', on = ['session_id', 'level_group'])


In [64]:
pre_test = feature_engineer(test)


In [65]:
input_col_ls = ['checkpoint', 'cutscene_click',
       'map_click', 'map_hover', 'navigate_click', 'notebook_click',
       'notification_click', 'object_click', 'object_hover',
       'observation_click', 'person_click', 'tunic.capitol_0.hall',
       'tunic.capitol_1.hall', 'tunic.capitol_2.hall',
       'tunic.drycleaner.frontdesk', 'tunic.flaghouse.entry',
       'tunic.historicalsociety.basement', 'tunic.historicalsociety.cage',
       'tunic.historicalsociety.closet',
       'tunic.historicalsociety.closet_dirty',
       'tunic.historicalsociety.collection',
       'tunic.historicalsociety.collection_flag',
       'tunic.historicalsociety.entry', 'tunic.historicalsociety.frontdesk',
       'tunic.historicalsociety.stacks', 'tunic.humanecology.frontdesk',
       'tunic.kohlcenter.halloffame', 'tunic.library.frontdesk',
       'tunic.library.microfiche', 'tunic.wildlife.center', 'fullscreen', 'hq',
       'music', 'click_yn', 'click_yn_avg', 'elapsed_time', 'page',
       'hover_duration']

In [37]:
def train_test(train_dat, test_dat, level_group):
    input_col_ls = ['checkpoint', 'cutscene_click',
                    'map_click', 'map_hover', 'navigate_click', 'notebook_click',
                    'notification_click', 'object_click', 'object_hover',
                    'observation_click', 'person_click', 'tunic.capitol_0.hall',
                    'tunic.capitol_1.hall', 'tunic.capitol_2.hall',
                    'tunic.drycleaner.frontdesk', 'tunic.flaghouse.entry',
                    'tunic.historicalsociety.basement', 'tunic.historicalsociety.cage',
                    'tunic.historicalsociety.closet',
                    'tunic.historicalsociety.closet_dirty',
                    'tunic.historicalsociety.collection',
                    'tunic.historicalsociety.collection_flag',
                    'tunic.historicalsociety.entry', 'tunic.historicalsociety.frontdesk',
                    'tunic.historicalsociety.stacks', 'tunic.humanecology.frontdesk',
                    'tunic.kohlcenter.halloffame', 'tunic.library.frontdesk',
                    'tunic.library.microfiche', 'tunic.wildlife.center', 'fullscreen', 'hq',
                    'music', 'click_yn', 'click_yn_avg', 'elapsed_time', 'page',
                    'hover_duration']
    if level_group == '0-4':
        lv = [1, 2, 3]
    elif level_group == '5-12':
        lv = [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    elif level_group == '13-22':
        lv = [14, 15, 16, 17, 18]

    blk_dat = pd.DataFrame(columns = ['session_id', 'q', 'correct'])

    temp_test = test_dat.loc[test_dat['level_group'] == level_group, :]
    for q in tqdm(lv):
        temp_train = train_dat.loc[train_dat['q'] == q, :]
        X_train, X_test, y_train, y_test = train_test_split(temp_train[input_col_ls], temp_train['correct'],
                                                            test_size=0.25, stratify=temp_train['correct'], random_state=32)
        std = StandardScaler()
        std.fit(X_train)
        X_train_scaled = std.transform(X_train)
        std.fit(X_test)
        X_test_scaled = std.transform(X_test)
        model = XGBClassifier(n_estimators=500, learning_rate=0.2,
                            max_depth=4, random_state=32)

        model.fit(X_train_scaled, y_train)

        temp_test['q'] = q
        y_pred = model.predict(temp_test[input_col_ls])
        temp_test['correct'] = y_pred
        blk_dat = pd.concat([blk_dat, temp_test[['session_id', 'q', 'correct']]], axis = 0)
    
    return blk_dat


In [38]:
blk_dat1 = pd.DataFrame()

In [39]:
for lv in tqdm(list(pre_test.level_group.unique())):
    rlst = train_test(mrg_dat, pre_test, lv)
    blk_dat1 = pd.concat([blk_dat1, rlst], axis = 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_test['q'] = q
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_test['correct'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_test['q'] = q
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [66]:
a = [0.85, 0.42, 0.65, 0.12]

In [68]:
int(0.63)

0

In [67]:
b = [1 for i in a if i>0.63 else 0]

SyntaxError: invalid syntax (2468123017.py, line 1)

In [43]:
blk_dat1['session_level'] = blk_dat1['q']-1
blk_dat1['session_id'] = blk_dat1['session_id'].astype('str')
blk_dat1['q'] = blk_dat1['q'].astype('str')
blk_dat1['tmp_id'] = blk_dat1['session_id'] + '_q' + blk_dat1['q']


In [45]:
blk_dat2 = blk_dat1[['tmp_id', 'correct', 'session_level']].rename(columns={'tmp_id' : 'session_id'})

In [47]:
blk_dat2.to_csv('submission.csv')


In [59]:
f1_ls = []
for q in tqdm(question):
    print(f'문제 {q}번 모델 학습 시작')
    temp_dat = finl_dat.loc[finl_dat.q == q, :]
    X_train, X_test, y_train, y_test = train_test_split(temp_dat[input_col_ls], temp_dat['correct'],
                                                        test_size=0.25, stratify=temp_dat['correct'], random_state=32)
    std = StandardScaler()
    std.fit(X_train)
    X_train_scaled = std.transform(X_train)
    std.fit(X_test)
    X_test_scaled = std.transform(X_test)
    model = XGBClassifier(n_estimators=500, learning_rate=0.2,
                          max_depth=4, random_state=32)

    model.fit(X_train, y_train)
    
    # globals()['model'+str(q)] = model
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    f1_ls.append(f1)

np.mean(f1_ls)



NameError: name 'question' is not defined

In [54]:
test_dat = pd.read_csv('test.csv')

In [None]:
test_dat_pp = feature_engineer(test_dat)


In [73]:
data_for_test = pd.DataFrame(columns = finl_dat.columns)

In [74]:
data_for_test = data_for_test.append(test_dat_pp)


In [76]:
data_for_test

Unnamed: 0,session_id,level_group,checkpoint,cutscene_click,map_click,map_hover,navigate_click,notebook_click,notification_click,object_click,...,fullscreen,hq,music,click_yn,click_yn_avg,elapsed_time,page,hover_duration,q,correct
0,20090109393214576,0-4,1,27,3,2,62,4,5,9,...,0,0,140,134,0.957143,117119.8,0.5,2006.8,,
1,20090109393214576,13-22,1,78,7,20,240,32,10,40,...,0,0,586,548,0.935154,6165666.0,4.84375,973.864865,,
2,20090109393214576,5-12,1,11,6,17,130,18,9,175,...,0,0,536,478,0.891791,2092368.0,1.888889,2301.315789,,
3,20090312143683264,0-4,1,33,2,2,62,3,9,22,...,0,0,163,153,0.93865,142418.3,0.0,3066.555556,,
4,20090312143683264,13-22,1,55,12,31,414,30,8,37,...,0,0,727,674,0.927098,2195108.0,4.833333,864.096154,,
5,20090312143683264,5-12,1,16,13,22,276,23,9,92,...,0,0,611,539,0.88216,791150.4,1.565217,1379.492958,,
6,20090312331414616,0-4,1,30,2,4,41,9,6,10,...,0,0,130,118,0.907692,112832.8,0.0,2176.818182,,
7,20090312331414616,13-22,1,55,9,26,202,22,11,50,...,0,0,517,457,0.883946,1262481.0,5.545455,794.542373,,
8,20090312331414616,5-12,1,11,7,25,92,12,9,36,...,0,0,318,265,0.833333,558520.5,2.25,1561.096154,,


In [79]:
pred = model1.predict(tmp_dat[input_col_ls])


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:checkpoint: object, cutscene_click: object, map_click: object, map_hover: object, navigate_click: object, notebook_click: object, notification_click: object, object_click: object, object_hover: object, observation_click: object, person_click: object, tunic.capitol_0.hall: object, tunic.capitol_1.hall: object, tunic.capitol_2.hall: object, tunic.drycleaner.frontdesk: object, tunic.flaghouse.entry: object, tunic.historicalsociety.basement: object, tunic.historicalsociety.cage: object, tunic.historicalsociety.closet: object, tunic.historicalsociety.closet_dirty: object, tunic.historicalsociety.collection: object, tunic.historicalsociety.collection_flag: object, tunic.historicalsociety.entry: object, tunic.historicalsociety.frontdesk: object, tunic.historicalsociety.stacks: object, tunic.humanecology.frontdesk: object, tunic.kohlcenter.halloffame: object, tunic.library.frontdesk: object, tunic.library.microfiche: object, tunic.wildlife.center: object, fullscreen: object, hq: object, music: object, click_yn: object

In [77]:
group_ls = ['0-4', '13-22', '5-12']
model_dic = {'0-4' : ([model1, model2, model3], [1, 2, 3]), 
            '5-12' : ([model4, model5, model6, model7, model8, model9, model10, model11, model12, model13], [4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
            '13-22' : ([model14, model15, model16, model17, model18], [14, 15, 16, 17, 18])}
blk_dat = pd.DataFrame()
for grp in group_ls:
    tmp_dat = data_for_test.loc[data_for_test['level_group'] == grp]
    m_idx = model_dic[grp]
    for i in range(len(m_idx[1])):
        tmp_dat['q'] = m_idx[1][i]
        tmp_model = m_idx[0][i]
        pred = tmp_model.predict(tmp_dat[input_col_ls])
        tmp_dat['correct'] = pred
        blk_dat = pd.concat([blk_dat, tmp_dat[['session_id', 'q', 'correct']]], axis = 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_dat['q'] = m_idx[1][i]


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:checkpoint: object, cutscene_click: object, map_click: object, map_hover: object, navigate_click: object, notebook_click: object, notification_click: object, object_click: object, object_hover: object, observation_click: object, person_click: object, tunic.capitol_0.hall: object, tunic.capitol_1.hall: object, tunic.capitol_2.hall: object, tunic.drycleaner.frontdesk: object, tunic.flaghouse.entry: object, tunic.historicalsociety.basement: object, tunic.historicalsociety.cage: object, tunic.historicalsociety.closet: object, tunic.historicalsociety.closet_dirty: object, tunic.historicalsociety.collection: object, tunic.historicalsociety.collection_flag: object, tunic.historicalsociety.entry: object, tunic.historicalsociety.frontdesk: object, tunic.historicalsociety.stacks: object, tunic.humanecology.frontdesk: object, tunic.kohlcenter.halloffame: object, tunic.library.frontdesk: object, tunic.library.microfiche: object, tunic.wildlife.center: object, fullscreen: object, hq: object, music: object, click_yn: object

In [48]:
blk_dat1.columns

Index(['session_id', 'q', 'correct', 'session_level', 'tmp_id'], dtype='object')

In [52]:
blk_dat1['session_level'].dtype


dtype('O')

In [58]:
train.dtypes

session_id          int64
index               int64
elapsed_time        int64
event_name         object
name               object
level               int64
page              float64
room_coor_x       float64
room_coor_y       float64
screen_coor_x     float64
screen_coor_y     float64
hover_duration    float64
text               object
fqid               object
room_fqid          object
text_fqid          object
fullscreen          int64
hq                  int64
music               int64
level_group        object
dtype: object

In [57]:
train.page.dtype == 'float32'

False

In [69]:
targets[['level_group', 'q']].drop_duplicates()

Unnamed: 0,level_group,q
0,0-4,1
23562,0-4,2
47124,0-4,3
70686,5-12,4
94248,5-12,5
117810,5-12,6
141372,5-12,7
164934,5-12,8
188496,5-12,9
212058,5-12,10


In [None]:
def inferrence(data):
    group_ls = ['0-4', '13-22', '5-12']
    for grq in group_ls:
        tmp_dat = data.loc[data['level_group'] == grp]
        
        

In [63]:
test_dat_pp.level_group.unique()


array(['0-4', '13-22', '5-12'], dtype=object)

In [58]:
sb = pd.read_csv('sample_submission.csv')

In [59]:
sb


Unnamed: 0,session_id,correct,session_level
0,20090109393214576_q1,0,0
1,20090312143683264_q1,0,0
2,20090312331414616_q1,0,0
3,20090109393214576_q2,0,0
4,20090312143683264_q2,0,0
5,20090312331414616_q2,0,0
6,20090109393214576_q3,0,0
7,20090312143683264_q3,0,0
8,20090312331414616_q3,0,0
9,20090109393214576_q4,0,1


In [56]:
test_dat.columns

Index(['session_id', 'index', 'elapsed_time', 'event_name', 'name', 'level',
       'page', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y',
       'hover_duration', 'text', 'fqid', 'room_fqid', 'text_fqid',
       'fullscreen', 'hq', 'music', 'level_group', 'session_level'],
      dtype='object')

In [51]:
f1_ls

[0.8285564304461942,
 0.988929889298893,
 0.9645527311109157,
 0.8145903236280984,
 0.5988543655615345,
 0.8278373457414526,
 0.7856148988547577,
 0.9735993726583602,
 0.8855172413793103,
 0.6678165876096019,
 0.8696262150516083,
 0.8320033164058451,
 0.7138201569100785,
 0.8327075511055486,
 0.6165535079211122,
 0.7545591822511326,
 0.9221149425287356,
 0.2759226713532513]

In [52]:
model.coef_

AttributeError: Coefficients are not defined for Booster type None

In [33]:
X_train, X_test, y_train, y_test = train_test_split(finl_dat[input_col_ls], finl_dat['correct'],
                                                    test_size=0.25, stratify=finl_dat['correct'], random_state=32)


In [34]:
train_idx = X_train.index
test_idx = X_test.index


In [35]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()


In [36]:
X_train = finl_dat.iloc[train_idx][input_col_ls]
X_test = finl_dat.iloc[test_idx][input_col_ls]


In [37]:
std.fit(X_train)
X_train_scaled = std.transform(X_train)
std.fit(X_test)
X_test_scaled = std.transform(X_test)


In [38]:
model = XGBClassifier(n_estimators=500, learning_rate=0.2,
                      max_depth=4, random_state=32)


In [39]:
model.fit(X_train, y_train)


In [40]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)  # 예측 라벨(0과 1로 예측)

# 예측 라벨과 실제 라벨 사이의 정확도 측정
# accuracy_score(y_pred, y_test)  # 0.7847533632286996


In [41]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.66      0.36      0.47     31211
           1       0.78      0.92      0.84     74818

    accuracy                           0.76    106029
   macro avg       0.72      0.64      0.66    106029
weighted avg       0.74      0.76      0.73    106029



In [42]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test, y_pred)


In [43]:
f1

0.8430500940331681

In [44]:
feature_importance = pd.DataFrame(model.feature_importances_.reshape(
    (1, -1)), columns=X_train.columns, index=['feature_importance'])
feature_importance.transpose().sort_values(
    ['feature_importance'], ascending=False)


Unnamed: 0,feature_importance
q,0.171065
tunic.library.frontdesk,0.080695
tunic.historicalsociety.frontdesk,0.075839
tunic.historicalsociety.cage,0.070017
map_click,0.067527
object_click,0.066452
tunic.historicalsociety.closet_dirty,0.055924
music,0.046331
cutscene_click,0.026888
tunic.historicalsociety.stacks,0.021557
