# Config

In [2]:
data_dir = '/opt/ml/input/data/train_dataset'
file_name = 'train_data.csv'
test_file_name = 'test_data.csv'

In [3]:
y_id = 'cycle' # 'user' or 'cycle'
y_method = 'last' # 'last' or 'next'
n_folds = 5

# Feature Engineering

In [4]:
import os
import easydict
import time
from datetime import datetime

from fe.feature import FEPipeline
from fe.agg import *

In [5]:
def y_id2columns(y_id):
    if y_id == 'user':
        y_id = 'userID'
    elif y_id == 'cycle':
        y_id = ['userID', 'testId', 'Retest']
    
    return y_id

In [6]:
def convert_time(s):
    timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
    return int(timestamp)

In [7]:
def preprocess(df):
    preprocess_df_path = '/opt/ml/features/preprocess.csv'
    
    if not os.path.exists(preprocess_df_path):
        # YYYY-MM-DD HH:MM:SS -> sec format
        df['Timestamp'] = df['Timestamp'].apply(convert_time)
        
        # 한 문제 푸는데 걸린 시간
        df['time_diff'] = df['Timestamp'] - df['Timestamp'].shift(1)
        
        # userID 별 푼 문항의 누적 합
        df['UserCumtestnum'] = df.groupby(['userID'])['answerCode'].cumcount()

        # userID, KnowledgeTag 별 푼 문항의 누적 합
        df['UserTagCumtestnum'] = df.groupby(['userID', 'KnowledgeTag'])['answerCode'].cumcount()

        # userID, testId 별 푼 문항의 누적 합
        df['UserTestCumtestnum'] = df.groupby(['userID','testId'])['answerCode'].cumcount()

        testId2maxlen = df[['assessmentItemID', 'testId']].drop_duplicates().groupby('testId').size()
        # test의 문항 수
        df['TestSize'] = df.testId.map(testId2maxlen)
        # user가 같은 test를 여러 번 푼 것인지 나타낸 변수 (처음 품 : 0, 두번 품 : 1, 세번 품 : 2)
        df['Retest'] = df['UserTestCumtestnum'] // df['TestSize']

        # Cycle 별 푼 문항의 누적 합
        df['UserCycleCumtestnum'] = df['UserTestCumtestnum'] % df['TestSize']        

        df.to_csv(preprocess_df_path, index=False)
        
        print('* Success to save preprocessed df')

    else:
        df = pd.read_csv(preprocess_df_path)
    
        print('* Success to load preprocessed df')
    
    return df

In [8]:
def set_y(df, y_id, y_method):
    if y_method == 'last':
        # y_id 별 마지막 행을 y로
        y = df.groupby(y_id)['answerCode'].apply(lambda x: list(x)[-1])
        y.name = 'y'
        df = df.merge(y, how="inner", on=y_id)

        # y_id 별 마지막 행 제거
        last_idx = df.groupby(y_id).apply(lambda x: x.index[-1]).values
        df = df.drop(last_idx)
    
    elif y_method == 'next':
        # 다음 answerCode를 y로
        df['y'] = df.answerCode.shift(-1)

        # y_id 별 마지막 행 제거
        last_idx = df.groupby(y_id).apply(lambda x: x.index[-1]).values
        df = df.drop(last_idx)
    
    df = df.reset_index(drop=True)
    
    print(f"* Success to set y by method '{y_method}'")
    
    return df

In [9]:
def make_cumdata(df):
    preprocess_df_path = '/opt/ml/features/cumdata.csv'
    
    if not os.path.exists(preprocess_df_path):
        temp_df = pd.DataFrame(df.userID)
        
        # test의 문항 번호
        temp_df['testNumber'] = [int(assessment[-3:]) for assessment in df.assessmentItemID]

        # userID 별 맞춘 문항의 누적 합
        temp_df["UserCumcorrectnum"] = df.groupby(['userID'])['answerCode'].apply(lambda x: x.cumsum().shift(1))
        # userID 별 누적 정답률
        temp_df["UserCumcorrectper"] = temp_df['UserCumcorrectnum'] / df['UserCumtestnum']

        # userID, KnowledgeTag 별 맞춘 문항의 누적 합
        temp_df["UserTagCumcorrectnum"] = df.groupby(['userID', 'KnowledgeTag'])['answerCode'].apply(lambda x: x.cumsum().shift(1))
        temp_df["UserTagCumcorrectnum"] = temp_df["UserTagCumcorrectnum"].fillna(0)
        # userID, KnowledgeTag 별 누적 정답률
        temp_df["UserTagCumcorrectper"] = temp_df['UserTagCumcorrectnum'] / df['UserTagCumtestnum']
        temp_df["UserTagCumcorrectper"] = temp_df["UserTagCumcorrectper"].fillna(0)

        # userID, testId 별 맞춘 문항의 누적 합
        temp_df["UserTestCumcorrectnum"] = df.groupby(['userID','testId'])['answerCode'].apply(lambda x: x.cumsum().shift(1))
        # userID, testId 별 누적 정답률
        temp_df["UserTestCumcorrectper"] = temp_df['UserTestCumcorrectnum'] / df['UserTestCumtestnum']

        # Cycle 별 맞춘 문항의 누적 합
        temp_df['UserCycleCumcorrectnum'] = df.groupby(['userID','testId','Retest'])['answerCode'].apply(lambda x: x.cumsum().shift(1))
        # Cycle 별 누적 정답률
        temp_df['UserCycleCumcorrectper'] = temp_df['UserCycleCumcorrectnum']/df['UserCycleCumtestnum']
        
        temp_df = temp_df.drop('userID', axis=1)
        temp_df.to_csv(preprocess_df_path, index=False)
        
        print('* Success to save cumulative data')
    
    else:
        temp_df = pd.read_csv(preprocess_df_path)
    
        print('* Success to load cumulative data')

    df = pd.concat([df, temp_df], axis=1)
    
    return df

In [10]:
csv_file_path = os.path.join(data_dir, file_name)
train_df = pd.read_csv(csv_file_path)

test_csv_file_path = os.path.join(data_dir, test_file_name)
test_df = pd.read_csv(test_csv_file_path)

df = pd.concat([train_df, test_df])

In [11]:
y_id = y_id2columns(y_id)
df = df.sort_values(by=['userID','Timestamp']).reset_index(drop=True)
df = preprocess(df)
df = set_y(df, y_id, y_method)
df = make_cumdata(df)

* Success to load preprocessed df
* Success to set y by method 'last'
* Success to load cumulative data


In [12]:
args = easydict.EasyDict({'root_dir' : './'})

In [13]:
# 건모님의 FE class
FEpl = FEPipeline(args, [MakeQuestionCount, 
                         MakeCorrectCount, 
                         MakeCorrectPercent, 
                         MakeTopNCorrectPercent,
                         MakeTagAnswerData,
                         MakeUserTagAnswerData,
                         MakeTestAnswerData,
                         MakeUserTestAnswerData,
                         MakeAssessmentAnswerData])

In [14]:
FEpl.debug()

In [14]:
FEpl.description()

[Feature Descriptions]

feature name : base_feature
feature type : seq
 - userID               : 사용자의 고유 번호입니다. 총 7,442명의 학생이 있습니다
 - assessmentItemID     : 사용자가 푼 문항의 일련 번호입니다.
 - testID               : 사용자가 푼 문항이 포함된 시험지의 일련 번호입니다.
 - answerCode           : 사용자가 푼 문항의 정답 여부를 담고 있는 이진 (0/1) 데이터입니다.
 - Timestamp            : 사용자가 문항을 푼 시간 정보입니다.
 - KnowledgeTag         : 사용자가 푼 문항의 고유 태그가 담겨져 있습니다.

feature name : make_question_count
feature type : agg
 - quesCnt              : 사용자가 푼 문항수를 나타냅니다.

feature name : make_correct_count
feature type : agg
 - correctCnt           : 사용자가 맞춘 문항수를 나타냅니다.

feature name : make_correct_percent
feature type : agg
 - correctPer           : 사용자가 푼 전체 문항에 대한 정답률입니다.

feature name : make_topn_correct_percent
feature type : agg
 - top10CorrectPer      : 사용자가 최근 푼 TOP-10개에 대한 정답률입니다.
 - top30CorrectPer      : 사용자가 최근 푼 TOP-30개에 대한 정답률입니다.
 - top50CorrectPer      : 사용자가 최근 푼 TOP-50개에 대한 정답률입니다.
 - top100CorrectPer     : 사용자가 최근 푼 TOP-100개에 대한 정답률입니다.

feat

In [15]:
new_df = FEpl.transform(df, is_train=True)

Feature Engineering Start ... 


load features /opt/ml/features/train_make_question_count.pkl to dataframe ... 



Feature Engineering Name: make_question_count

quesCnt         : 사용자가 푼 문항수를 나타냅니다.
dtype: int64
[Examples]
INDEX 0000: 641
INDEX 1000: 641
INDEX 2000: 641
INDEX 3000: 641
INDEX 4000: 641
INDEX 5000: 641
INDEX 6000: 641
INDEX 7000: 770
INDEX 8000: 770
INDEX 9000: 770


load features /opt/ml/features/train_make_correct_count.pkl to dataframe ... 



Feature Engineering Name: make_correct_count

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.
dtype: int64
[Examples]
INDEX 0000: 422
INDEX 1000: 422
INDEX 2000: 422
INDEX 3000: 422
INDEX 4000: 422
INDEX 5000: 422
INDEX 6000: 422
INDEX 7000: 673
INDEX 8000: 673
INDEX 9000: 673


load features /opt/ml/features/train_make_correct_percent.pkl to dataframe ... 



Feature Engineering Name: make_correct_percent

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 0.6583463338533542
INDEX 1000: 0.6583463338533542
INDEX 2000: 0.6583463338533542
INDEX 3000: 0.6583463338533542
INDEX 4000: 0.6583463338533542
INDEX 5000: 0.6583463338533542
INDEX 6000: 0.6583463338533542
INDEX 7000: 0.874025974025974
INDEX 8000: 0.874025974025974
INDEX 9000: 0.874025974025974


load features /opt/ml/features/train_make_topn_correct_percent.pkl to dataframe ... 



Feature Engineering Name: make_topn_correct_percent

top10CorrectPer : 사용자가 최근 푼 TOP-10개에 대한 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 0.7
INDEX 1000: 0.7
INDEX 2000: 0.7
INDEX 3000: 0.7
INDEX 4000: 0.7
INDEX 5000: 0.7
INDEX 6000: 0.7
INDEX 7000: 0.9
INDEX 8000: 0.9
INDEX 9000: 0.9

top30CorrectPer : 사용자가 최근 푼 TOP-30개에 대한 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 0.6666666666666666
INDEX 1000: 0.6666666666666666
INDEX 2000: 0.6666666666666666
INDEX 3000: 0.6666666666666666
INDEX 4000: 0.6666666666666666
INDEX 5000: 0.6666666666666666
INDEX 6000: 0.6666666666666666
INDEX 7000: 0.8666666666666667
INDEX 8000: 0.8666666666666667
INDEX 9000: 0.8666666666666667

top50CorrectPer : 사용자가 최근 푼 TOP-50개에 대한 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 0.56
INDEX 1000: 0.56
INDEX 2000: 0.56
INDEX 3000: 0.56
INDEX 4000: 0.56
INDEX 5000: 0.56
INDEX 6000: 0.56
INDEX 7000: 0.86
INDEX 8000: 0.86
INDEX 9000: 0.86

top100CorrectPer : 사용자가 최근 푼 TOP-100개에 대한 정답률입니다.
dtype: float64
[Examples]
INDEX

load features /opt/ml/features/train_make_tag_answer_data.pkl to dataframe ... 



Feature Engineering Name: make_tag_answer_data

TagCorrectPer   : KnowledgeTag 별 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 0.9586114819759679
INDEX 1000: 0.9586114819759679
INDEX 2000: 0.9586114819759679
INDEX 3000: 0.9586114819759679
INDEX 4000: 0.9586114819759679
INDEX 5000: 0.9586114819759679
INDEX 6000: 0.9586114819759679
INDEX 7000: 0.9586114819759679
INDEX 8000: 0.9166944351882705
INDEX 9000: 0.9166944351882705

TagCorrectSum   : KnowledgeTag 별 정답 문항 수의 합입니다.
dtype: int64
[Examples]
INDEX 0000: 718
INDEX 1000: 718
INDEX 2000: 718
INDEX 3000: 718
INDEX 4000: 718
INDEX 5000: 718
INDEX 6000: 718
INDEX 7000: 718
INDEX 8000: 2751
INDEX 9000: 2751


load features /opt/ml/features/train_make_user_tag_answer_data.pkl to dataframe ... 



Feature Engineering Name: make_user_tag_answer_data

UserTagCorrectPer : userID, KnowledgeTag 별 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 1.0
INDEX 1000: 1.0
INDEX 2000: 1.0
INDEX 3000: 1.0
INDEX 4000: 1.0
INDEX 5000: 1.0
INDEX 6000: 1.0
INDEX 7000: 1.0
INDEX 8000: 1.0
INDEX 9000: 1.0

UserTagCorrectSum : userID, KnowledgeTag 별 정답 문항 수의 합입니다.
dtype: int64
[Examples]
INDEX 0000: 1
INDEX 1000: 2
INDEX 2000: 2
INDEX 3000: 1
INDEX 4000: 2
INDEX 5000: 3
INDEX 6000: 1
INDEX 7000: 1
INDEX 8000: 12
INDEX 9000: 5


load features /opt/ml/features/train_make_test_answer_data.pkl to dataframe ... 



Feature Engineering Name: make_test_answer_data

TestCorrectPer  : testId 별 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 0.9576
INDEX 1000: 0.9576
INDEX 2000: 0.9576
INDEX 3000: 0.9576
INDEX 4000: 0.9576
INDEX 5000: 0.9576
INDEX 6000: 0.9576
INDEX 7000: 0.9576
INDEX 8000: 0.9576
INDEX 9000: 0.9576

TestCorrectSum  : testId 별 정답 문항 수의 합입니다.
dtype: int64
[Examples]
INDEX 0000: 1197
INDEX 1000: 1197
INDEX 2000: 1197
INDEX 3000: 1197
INDEX 4000: 1197
INDEX 5000: 1197
INDEX 6000: 1197
INDEX 7000: 1197
INDEX 8000: 1197
INDEX 9000: 1197


load features /opt/ml/features/train_make_user_test_answer_data.pkl to dataframe ... 



Feature Engineering Name: make_user_test_answer_data

UserTestCorrectPer : userID, testId 별 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 1.0
INDEX 1000: 1.0
INDEX 2000: 1.0
INDEX 3000: 0.0
INDEX 4000: 1.0
INDEX 5000: 1.0
INDEX 6000: 1.0
INDEX 7000: 1.0
INDEX 8000: 1.0
INDEX 9000: 1.0

UserTestCorrectSum : userID, testId 별 정답 문항 수의 합입니다.
dtype: int64
[Examples]
INDEX 0000: 5
INDEX 1000: 5
INDEX 2000: 5
INDEX 3000: 0
INDEX 4000: 5
INDEX 5000: 5
INDEX 6000: 5
INDEX 7000: 5
INDEX 8000: 5
INDEX 9000: 5


load features /opt/ml/features/train_make_assessment_answer_data.pkl to dataframe ... 



Feature Engineering Name: make_assessment_answer_data

AssessmentCorrectPer : assessmentItemID 별 정답률입니다.
dtype: float64
[Examples]
INDEX 0000: 0.984
INDEX 1000: 0.984
INDEX 2000: 0.984
INDEX 3000: 0.968
INDEX 4000: 0.968
INDEX 5000: 0.916
INDEX 6000: 0.916
INDEX 7000: 0.916
INDEX 8000: 0.972
INDEX 9000: 0.972

AssessmentCorrectSum : assessmentItemID 별 정답 문항 수의 합입니다.
dtype: int64
[Examples]
INDEX 0000: 246
INDEX 1000: 246
INDEX 2000: 246
INDEX 3000: 242
INDEX 4000: 242
INDEX 5000: 229
INDEX 6000: 229
INDEX 7000: 229
INDEX 8000: 243
INDEX 9000: 243
Feature Engineering End ... 
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'time_diff', 'UserCumtestnum', 'UserTagCumtestnum',
       'UserTestCumtestnum', 'TestSize', 'Retest', 'UserCycleCumtestnum', 'y',
       'testNumber', 'UserCumcorrectnum', 'UserCumcorrectper',
       'UserTagCumcorrectnum', 'UserTagCumcorrectper', 'UserTestCumcorrectnum',
       'UserTestC

In [16]:
processed_df = new_df.dropna(axis=0).reset_index(drop=True)

In [17]:
train_df = processed_df[processed_df.y != -1].reset_index(drop=True)

In [18]:
test_df = processed_df[processed_df.y == -1].reset_index(drop=True)

# Training

In [19]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [20]:
X = train_df[train_df.columns[6:]].drop('y', axis=1).values
y = train_df['y'].values

fold_acc = []
fold_auc = []
models = []

# split train, valid dataset
fold_X = train_df.userID.unique()
fold_y = list(train_df.groupby('userID')['y'].apply(lambda x: list(x)[-1]))

skf = StratifiedKFold(n_splits=n_folds)
for fold, (train_id, valid_id) in enumerate(skf.split(fold_X, fold_y)):
    train_idx = train_df[train_df.userID.isin(train_id)].index
    valid_idx = train_df[train_df.userID.isin(valid_id)].index

    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]
    
    print(f'* fold {fold+1}')
    print(f'  train X shape : {X_train.shape}, train y shape : {y_train.shape}')
    print(f'  valid X shape : {X_valid.shape}, valid y shape : {y_valid.shape}')

    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)

    # training
    model = lgb.train(
                        {'objective': 'binary'}, 
                        lgb_train,
                        valid_sets=[lgb_train, lgb_valid],
                        verbose_eval=100,
                        num_boost_round=500,
                        early_stopping_rounds=100
                    )
    models.append(model)

    preds = model.predict(X_valid)
    
    valid_df = pd.DataFrame(train_df[train_df.userID.isin(valid_id)])
    valid_df['preds'] = preds
    
    if y_method == 'last':
        cycle_y = valid_df.groupby(y_id)['y'].apply(lambda x: list(x)[0]).values
        cycle_preds = valid_df.groupby(y_id)['preds'].mean().values

        acc = accuracy_score(cycle_y, np.where(cycle_preds >= 0.5, 1, 0))
        auc = roc_auc_score(cycle_y, cycle_preds)
    
    elif y_method == 'next':
        acc = accuracy_score(valid_df.y, np.where(valid_df.preds >= 0.5, 1, 0))
        auc = roc_auc_score(valid_df.y, valid_df.preds)
        
    
    print(f'  acc : {acc}, auc : {auc} \n' )
    
    fold_acc.append(acc)
    fold_auc.append(auc)

* fold 1
  train X shape : (944058, 33), train y shape : (944058,)
  valid X shape : (748769, 33), valid y shape : (748769,)
  acc : 0.7427045648380929, auc : 0.803404901338532 

* fold 2
  train X shape : (1182465, 33), train y shape : (1182465,)
  valid X shape : (510362, 33), valid y shape : (510362,)
  acc : 0.7477228574158412, auc : 0.8150528336596716 

* fold 3
  train X shape : (1415144, 33), train y shape : (1415144,)
  valid X shape : (277683, 33), valid y shape : (277683,)
  acc : 0.7417849840163581, auc : 0.8134406176371771 

* fold 4
  train X shape : (1576111, 33), train y shape : (1576111,)
  valid X shape : (116716, 33), valid y shape : (116716,)
  acc : 0.7375960630982877, auc : 0.8160510248725966 

* fold 5
  train X shape : (1653530, 33), train y shape : (1653530,)
  valid X shape : (39297, 33), valid y shape : (39297,)
  acc : 0.7363340884467645, auc : 0.8136263977922062 



# Inference

In [21]:
X_test = test_df[test_df.columns[6:]].drop('y', axis=1).values

In [22]:
# prediction
test_preds = []
for k in range(len(models)):
    temp_df = test_df.copy()
    temp_df['preds'] = models[k].predict(X_test)
    
    if y_method == 'last':
        test_pred = temp_df.groupby('userID')['preds'].mean().values
    
    elif y_method == 'next':
        test_pred = temp_df['preds']
        
    test_preds.append(test_pred)

In [41]:
final_test_pred = sum(test_preds) / len(test_preds)

In [47]:
# write submission
output_dir = 'output/'
write_path = os.path.join(output_dir, f"lgbm_{y_id}_{y_method}_{n_folds}.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)    
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(final_test_pred):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/pycaret_lgbm3_['userID', 'testId', 'Retest']_last_5.csv


# Training using Pycaret

In [48]:
from pycaret.classification import *
from pycaret.utils import check_metric

In [None]:
settings = setup(data=train_df[train_df.columns[6:]], target='y', train_size=0.8, categorical_features=[], numeric_features=list(train_df.columns[6:].drop('y')))

lgbm = create_model('lightgbm', sort='AUC')
tuned_lgbm = tune_model(lgbm, optimize = 'AUC', fold = 5)
final_lgbm = finalize_model(tuned_lgbm)

In [None]:
prediction = predict_model(final_lgbm, data=test_df[test_df.columns[6:]].drop('y', axis=1), raw_score=True)
total_preds = prediction.Score_1.values

In [None]:
temp_df = test_df.copy()
temp_df['preds'] = total_preds
final_test_pred = temp_df.groupby('userID')['preds'].mean().values

In [49]:
# inference는 위 inference 과정과 동일