https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering

In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm.notebook import tqdm

In [2]:
# Notebook-level configuration.
# Input pickles for CV fold 1; read by the "read data" cell below.
# (main() builds its own per-fold paths from cv_idx instead of using these.)
train_pickle = '../cv_files/cv1_train.pickle'
valid_pickle = '../cv_files/cv1_valid.pickle'
# Question metadata CSV; the 'question_id' and 'part' columns are merged onto train/valid.
question_file = '../data/questions.csv'
# When True, train/valid are truncated to their first 1,000,000 / 10,000 rows.
debug = False
# NOTE(review): not referenced by any cell visible in this notebook -- confirm whether it is still needed.
validaten_flg = False

In [4]:
# funcs for user stats with loop
def add_user_feats(df, answered_correctly_sum_u_dict, count_u_dict):
    """Append running per-user statistics to ``df`` and update the state dicts.

    Rows are processed in order. For each row the values recorded are the
    user's totals *before* that row, so a row never sees its own answer.
    After recording, the row's ``answered_correctly`` is added to the user's
    running sum and the user's count is incremented.

    Rows are not filtered by ``content_type_id`` here; the callers in this
    notebook filter to question rows before calling.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``answered_correctly`` columns.
    answered_correctly_sum_u_dict : dict
        Maps user_id -> running sum of ``answered_correctly``. Mutated in
        place. May be a plain ``dict`` or a ``defaultdict``; missing users
        are treated as 0.
    count_u_dict : dict
        Maps user_id -> number of rows seen so far. Mutated in place.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the columns ``answered_correctly_sum_u``, ``count_u``
        and ``answered_correctly_avg_u`` (NaN when ``count_u`` is 0).
    """
    acsu = np.zeros(len(df), dtype=np.int32)
    cu = np.zeros(len(df), dtype=np.int32)
    for cnt, row in enumerate(tqdm(df[['user_id', 'answered_correctly']].values)):
        user_id = row[0]
        # .get() keeps this working for plain dicts (e.g. ones reloaded from
        # a pickle) as well as defaultdicts.
        prev_sum = answered_correctly_sum_u_dict.get(user_id, 0)
        prev_count = count_u_dict.get(user_id, 0)
        acsu[cnt] = prev_sum
        cu[cnt] = prev_count
        answered_correctly_sum_u_dict[user_id] = prev_sum + row[1]
        count_u_dict[user_id] = prev_count + 1
    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows whenever df's index is not the default.
    user_feats_df = pd.DataFrame(
        {'answered_correctly_sum_u': acsu, 'count_u': cu}, index=df.index
    )
    # 0 / 0 yields NaN for a user's first row.
    user_feats_df['answered_correctly_avg_u'] = (
        user_feats_df['answered_correctly_sum_u'] / user_feats_df['count_u']
    )
    df = pd.concat([df, user_feats_df], axis=1)
    return df

def add_user_feats_without_update(df, answered_correctly_sum_u_dict, count_u_dict):
    """Append per-user statistics to ``df`` from the current state, read-only.

    Unlike ``add_user_feats`` this does not read ``answered_correctly`` and
    does not modify either dict. Users absent from the dicts get 0 for both
    sum and count, so their average is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``user_id`` column.
    answered_correctly_sum_u_dict : dict
        Maps user_id -> running sum of ``answered_correctly``. Plain ``dict``
        or ``defaultdict``; left unchanged.
    count_u_dict : dict
        Maps user_id -> number of rows seen so far. Left unchanged.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the columns ``answered_correctly_sum_u``, ``count_u``
        and ``answered_correctly_avg_u``.
    """
    acsu = np.zeros(len(df), dtype=np.int32)
    cu = np.zeros(len(df), dtype=np.int32)
    for cnt, row in enumerate(df[['user_id']].values):
        # .get() avoids KeyError on plain dicts and avoids inserting new keys
        # into a defaultdict, which indexing with [] would do.
        acsu[cnt] = answered_correctly_sum_u_dict.get(row[0], 0)
        cu[cnt] = count_u_dict.get(row[0], 0)
    # Reuse df's index so concat(axis=1) pairs rows positionally even when
    # df does not carry the default 0..n-1 index.
    user_feats_df = pd.DataFrame(
        {'answered_correctly_sum_u': acsu, 'count_u': cu}, index=df.index
    )
    user_feats_df['answered_correctly_avg_u'] = (
        user_feats_df['answered_correctly_sum_u'] / user_feats_df['count_u']
    )
    df = pd.concat([df, user_feats_df], axis=1)
    return df

def update_user_feats(df, answered_correctly_sum_u_dict, count_u_dict):
    """Fold the answers in ``df`` into the per-user running state.

    Only rows with ``content_type_id == 0`` are counted; other rows are
    skipped. Both dicts are mutated in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id``, ``answered_correctly`` and
        ``content_type_id`` columns.
    answered_correctly_sum_u_dict : dict
        Maps user_id -> running sum of ``answered_correctly``. Plain ``dict``
        or ``defaultdict``; missing users start from 0.
    count_u_dict : dict
        Maps user_id -> number of counted rows. Missing users start from 0.
    """
    for row in df[['user_id', 'answered_correctly', 'content_type_id']].values:
        if row[2] == 0:
            user_id = row[0]
            # .get() instead of += so unseen users do not raise KeyError when
            # the state is a plain dict (e.g. reloaded from a pickle).
            answered_correctly_sum_u_dict[user_id] = (
                answered_correctly_sum_u_dict.get(user_id, 0) + row[1]
            )
            count_u_dict[user_id] = count_u_dict.get(user_id, 0) + 1

In [4]:
# Build the feature frames for CV fold 1 at notebook scope.
# The same sequence of steps is wrapped in main(cv_idx) further down.

# read data
# Only the columns listed here are kept from the pickled frames.
feld_needed = ['row_id', 'user_id', 'content_id', 'content_type_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation']
train = pd.read_pickle(train_pickle)[feld_needed]
valid = pd.read_pickle(valid_pickle)[feld_needed]
if debug:
    # Debug mode: work on a small prefix of each frame.
    train = train[:1000000]
    valid = valid[:10000]
# Keep only rows whose content_type_id is 0/False, and renumber the index from 0.
# add_user_feats below concatenates column-wise, which relies on this fresh index.
train = train.loc[train.content_type_id == False].reset_index(drop=True)
valid = valid.loc[valid.content_type_id == False].reset_index(drop=True)

# answered correctly average for each content
# Computed from train only, then left-joined onto both frames; content_ids
# that appear only in valid therefore get NaN.
content_df = train[['content_id','answered_correctly']].groupby(['content_id']).agg(['mean']).reset_index()
content_df.columns = ['content_id', 'answered_correctly_avg_c']
train = pd.merge(train, content_df, on=['content_id'], how="left")
valid = pd.merge(valid, content_df, on=['content_id'], how="left")

# user stats features with loops
# The two dicts hold running per-user state. train is processed first so that
# valid rows see each user's totals accumulated over train.
answered_correctly_sum_u_dict = defaultdict(int)
count_u_dict = defaultdict(int)
train = add_user_feats(train, answered_correctly_sum_u_dict, count_u_dict)
valid = add_user_feats(valid, answered_correctly_sum_u_dict, count_u_dict)

# fill with mean value for prior_question_elapsed_time
# note that `train.prior_question_elapsed_time.mean()` does not work!
# please refer to https://www.kaggle.com/its7171/can-we-trust-pandas-mean for details.
# The mean is taken from train and used to fill missing values in both frames.
prior_question_elapsed_time_mean = train.prior_question_elapsed_time.dropna().values.mean()
train['prior_question_elapsed_time_mean'] = train.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
valid['prior_question_elapsed_time_mean'] = valid.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)

# use only last 30M training data for limited memory on kaggle env.
#train = train[-30000000:]

# part
# Left-join the question's 'part' onto each row; the join also adds a
# 'question_id' column alongside 'content_id'.
questions_df = pd.read_csv(question_file)
train = pd.merge(train, questions_df[['question_id', 'part']], left_on = 'content_id', right_on = 'question_id', how = 'left')
valid = pd.merge(valid, questions_df[['question_id', 'part']], left_on = 'content_id', right_on = 'question_id', how = 'left')

# changing dtype to avoid lightgbm error
# Missing values are treated as False before casting to int8.
train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int8')
valid['prior_question_had_explanation'] = valid.prior_question_had_explanation.fillna(False).astype('int8')

HBox(children=(FloatProgress(value=0.0, max=96817540.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2453760.0), HTML(value='')))




In [10]:
# Preview the first rows of the engineered training frame.
train.head()

Unnamed: 0,row_id,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly_avg_c,answered_correctly_sum_u,count_u,answered_correctly_avg_u,prior_question_elapsed_time_mean,question_id,part
0,32933156,705741139,128,0,1,,0,0.966933,0,0,,25439.917439,128,1
1,32933157,705741139,7860,0,1,16000.0,0,0.955045,1,1,1.0,16000.0,7860,1
2,32933158,705741139,7922,0,1,19000.0,0,0.95434,2,2,1.0,19000.0,7922,1
3,32933159,705741139,156,0,1,17000.0,0,0.932421,3,3,1.0,17000.0,156,1
4,32933160,705741139,51,0,1,17000.0,0,0.929977,4,4,1.0,17000.0,51,1


In [11]:
# Preview the first rows of the engineered validation frame.
valid.head()

Unnamed: 0,row_id,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly_avg_c,answered_correctly_sum_u,count_u,answered_correctly_avg_u,prior_question_elapsed_time_mean,question_id,part
0,15969594,342268703,11581,0,0,48400.0,1,0.384876,187,289,0.647059,48400.0,11581,7
1,15969595,342268703,11584,0,1,48400.0,1,0.48667,187,290,0.644828,48400.0,11584,7
2,59061988,1252813376,1155,0,1,18000.0,1,0.620722,484,751,0.644474,18000.0,1155,2
3,91336018,1939074725,9975,0,1,21000.0,1,0.469522,884,1226,0.721044,21000.0,9975,5
4,30683822,658510304,8645,0,0,33000.0,1,0.369358,76,165,0.460606,33000.0,8645,5


In [12]:
# Row/column counts of both frames; they should have the same number of columns.
train.shape, valid.shape

((96817540, 14), (2453760, 14))

In [13]:
# Persist the fold-1 frames as raw NumPy arrays (np.save appends '.npy' to the name).
# Column names are not stored in these files; they are saved separately.
# NOTE(review): main(1) writes to the same two paths -- confirm both are needed.
np.save('../cv_files/cv{0}_train'.format(1), train.values)
np.save('../cv_files/cv{0}_valid'.format(1), valid.values)

In [5]:
# Save the column names so the value-only arrays can be relabelled after loading.
# NOTE(review): presumably an object array, so np.load would need allow_pickle=True -- confirm.
np.save('../cv_files/features_all_cv', train.columns.values, allow_pickle=True)

In [5]:
def main(cv_idx):
    """Build and persist the feature set for one cross-validation fold.

    Reads ``../cv_files/cv{cv_idx}_train.pickle`` and the matching valid
    pickle, keeps question rows, adds the per-content average, the running
    per-user statistics, the filled elapsed time and the question ``part``,
    then writes to ``../cv_files``:

    * ``cv{cv_idx}_train`` / ``cv{cv_idx}_valid`` -- frame values via
      ``np.save`` (NumPy appends ``.npy``),
    * ``cv{cv_idx}_content`` -- pickled per-content average frame,
    * ``cv{cv_idx}_answered_correctly_sum_u_dict`` and
      ``cv{cv_idx}_count_u_dict`` -- pickled plain-dict copies of the final
      per-user state.

    Uses the notebook-level ``debug`` and ``question_file`` settings.
    Returns nothing.
    """
    fold_dir = '../cv_files/'
    columns = ['row_id', 'user_id', 'content_id', 'content_type_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation']

    train = pd.read_pickle('../cv_files/cv{0}_train.pickle'.format(cv_idx))[columns]
    valid = pd.read_pickle('../cv_files/cv{0}_valid.pickle'.format(cv_idx))[columns]
    if debug:
        train, valid = train[:1000000], valid[:10000]

    # Drop non-question rows and renumber so later column-wise concat lines up.
    is_question_train = train.content_type_id == False
    is_question_valid = valid.content_type_id == False
    train = train.loc[is_question_train].reset_index(drop=True)
    valid = valid.loc[is_question_valid].reset_index(drop=True)

    # Per-content mean of answered_correctly, learned from train only.
    content_df = (
        train.groupby('content_id')['answered_correctly']
        .mean()
        .reset_index(name='answered_correctly_avg_c')
    )
    train = pd.merge(train, content_df, on=['content_id'], how="left")
    valid = pd.merge(valid, content_df, on=['content_id'], how="left")

    # Running per-user stats; train goes first so valid continues from its state.
    user_sum_state = defaultdict(int)
    user_count_state = defaultdict(int)
    train = add_user_feats(train, user_sum_state, user_count_state)
    valid = add_user_feats(valid, user_sum_state, user_count_state)

    # Mean elapsed time is computed on the NumPy values of the non-null train
    # entries (see https://www.kaggle.com/its7171/can-we-trust-pandas-mean).
    observed_elapsed = train.prior_question_elapsed_time.dropna().values
    elapsed_fill = observed_elapsed.mean()
    train['prior_question_elapsed_time_mean'] = train['prior_question_elapsed_time'].fillna(elapsed_fill)
    valid['prior_question_elapsed_time_mean'] = valid['prior_question_elapsed_time'].fillna(elapsed_fill)

    # Attach the question 'part' (the join also brings in 'question_id').
    part_lookup = pd.read_csv(question_file)[['question_id', 'part']]
    train = pd.merge(train, part_lookup, left_on='content_id', right_on='question_id', how='left')
    valid = pd.merge(valid, part_lookup, left_on='content_id', right_on='question_id', how='left')

    # Cast the explanation flag to int8, treating missing as False.
    train['prior_question_had_explanation'] = train['prior_question_had_explanation'].fillna(False).astype('int8')
    valid['prior_question_had_explanation'] = valid['prior_question_had_explanation'].fillna(False).astype('int8')

    np.save(fold_dir + 'cv{0}_train'.format(cv_idx), train.values)
    np.save(fold_dir + 'cv{0}_valid'.format(cv_idx), valid.values)

    pd.to_pickle(content_df, fold_dir + 'cv{0}_content'.format(cv_idx))
    # Store the user state as plain dicts rather than defaultdicts.
    pd.to_pickle(dict(user_sum_state), fold_dir + 'cv{0}_answered_correctly_sum_u_dict'.format(cv_idx))
    pd.to_pickle(dict(user_count_state), fold_dir + 'cv{0}_count_u_dict'.format(cv_idx))

In [6]:
# Build and persist features for CV folds 1 through 5.
# Each call shows two progress bars (train, then valid) from add_user_feats.
for i in range(1, 6):
    main(i)

HBox(children=(FloatProgress(value=0.0, max=96817540.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2453760.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=94366861.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2450679.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=91916930.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2449931.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=89466829.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2450101.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=87017064.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2449765.0), HTML(value='')))


