In [None]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('fivethirtyeight')
import seaborn as sns
import lightgbm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import gc
import sys
# pd.set_option('display.max_rows', None)
%cd /content

/content


In [None]:
# Mount Google Drive so the data files under /content/drive/MyDrive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Train

In [None]:
# Columns kept from the raw training dump.
# include timestamp?
cols = [
    'user_id',
    'answered_correctly',
    'content_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
train_ori = pd.read_pickle("/content/drive/MyDrive/riiid_train.pkl.gzip")[cols]

# Shorter working names for the two "prior question" columns.
train_ori = train_ori.rename(columns={
    'prior_question_elapsed_time': 'prior_time',
    'prior_question_had_explanation': 'prior_saw_explanation',
})

# Nullable boolean keeps missing values as <NA> until they are filled in the next cell.
train_ori['prior_saw_explanation'] = train_ori['prior_saw_explanation'].astype('boolean')

In [None]:
# prior_saw_explanation is none when prior_time is nan, so user hasn't seen prior explanation
# Assign the filled column back instead of calling fillna(..., inplace=True) on the
# attribute: the in-place form acts on a temporary under pandas Copy-on-Write and
# would leave the NA values in place, making the int8 cast below fail.
train_ori['prior_saw_explanation'] = (
    train_ori['prior_saw_explanation']
    .fillna(False)
    .astype('int8')
)


## Get recent & Split

In [None]:
# Hold out the last recorded row of every user as `test`; all remaining rows form `train`.
test = train_ori.groupby('user_id').tail(1)
train = train_ori.drop(index=test.index)

# Number of distinct content ids left in train.
train.content_id.nunique()

13782

In [None]:
# Sanity check: the row counts of train and test must add up to the original frame.
train_ori.shape[0] == train.shape[0] + test.shape[0]

True

In [None]:
# Original row labels (from train_ori) of the held-out test rows.
test.index

Int64Index([       45,        75,        95,       223,       335,       352,
                  623,      1873,      8337,      8387,
            ...
            101228606, 101228645, 101228662, 101228712, 101228988, 101229216,
            101229974, 101230024, 101230304, 101230331],
           dtype='int64', length=393656)

In [None]:
# Same row-count sanity check as earlier, with the operands swapped.
train_ori.shape[0] == test.shape[0] + train.shape[0]

True

# Feature Dataframes

## df_user

### df_lec

In [None]:
# Lecture metadata: drop the tag column, rename the remaining three columns
# positionally, then one-hot encode the part and type columns.
df_lec = (
    pd.read_csv('/content/drive/MyDrive/lectures.csv')
    .drop(columns='tag')
)
df_lec.columns = ['lec_id', 'user_part', 'user_type']
df_lec = pd.get_dummies(df_lec, columns=['user_part', 'user_type'])
df_lec.head()

Unnamed: 0,lec_id,user_part_1,user_part_2,user_part_3,user_part_4,user_part_5,user_part_6,user_part_7,user_type_concept,user_type_intention,user_type_solving question,user_type_starter
0,89,0,0,0,0,1,0,0,1,0,0,0
1,100,1,0,0,0,0,0,0,1,0,0,0
2,185,0,0,0,0,0,1,0,1,0,0,0
3,192,0,0,0,0,1,0,0,0,0,1,0
4,317,0,0,0,0,1,0,0,0,0,1,0


### df_user_lec

In [None]:
# Rows with answered_correctly == -1 are the lecture interactions; attach the
# one-hot lecture metadata to each of them.
is_lecture = train['answered_correctly'] == -1
tmp = pd.merge(
    train.loc[is_lecture, ['user_id', 'content_id']],
    df_lec,
    left_on='content_id',
    right_on='lec_id',
)

In [None]:
# One-hot column names produced by get_dummies for lecture part and lecture type.
parts = [name for name in df_lec.columns if name.startswith('user_part')]
types = [name for name in df_lec.columns if name.startswith('user_type')]

# Per user: how many lectures of each part / type were watched.
lecture_columns = parts + types
df_user_lec = (
    tmp.groupby('user_id')[lecture_columns]
    .sum()
    .reset_index()
)

In [None]:
# The lecture helpers are no longer needed; release them.
del tmp, df_lec

In [None]:
# From here on train holds question rows only (lecture rows have answered_correctly == -1),
# so content_id is renamed to question_id.
train = (
    train
    .rename(columns={'content_id': 'question_id'})
    .loc[lambda frame: frame.answered_correctly != -1]
)
train.shape

(98884590, 5)

### df_user_question

In [None]:
# Per user: number of answered questions and share answered correctly.
df_user_question = (
    train.groupby('user_id')['answered_correctly']
    .agg(user_times_answered='count', user_accuracy='mean')
    .reset_index()
)
df_user_question.shape

KeyboardInterrupt: ignored

### df_user_sees_explanation

In [None]:
# Per user: mean of the 0/1 prior_saw_explanation flag.
df_user_sees_explanation = (
    train.groupby('user_id')['prior_saw_explanation']
    .mean()
    .rename('user_sees_explanation')
    .reset_index()
)

### df_user_avg_time

In [None]:
# Per user: mean prior_time (NaN entries are ignored by mean).
df_user_avg_time = (
    train.groupby('user_id')['prior_time']
    .mean()
    .rename('user_avg_time')
    .reset_index()
)
df_user_avg_time.head()

### df_user (combined)

In [None]:
# Outer join so users appearing in only one of the two frames are kept.
df_user = pd.merge(df_user_lec, df_user_question, on='user_id', how='outer')
df_user.shape # we will see that df_user_question is inclusive of df_user_lec

# Spot-check a single user; only this last expression is displayed by the cell.
df_user.loc[df_user.user_id == 115]

In [None]:
# Since nan comes from df_user_lec being smaller, fill using df_user_lec
# Each lecture-count column is filled with its mean over df_user_lec. Passing a
# Series to DataFrame.fillna fills only the columns named in its index, and
# reassigning the result avoids `df_user[col].fillna(..., inplace=True)`, which
# acts on a temporary (and changes nothing) under pandas Copy-on-Write.
lecture_fill_values = df_user_lec[df_user_lec.columns[1:]].mean()
df_user = df_user.fillna(lecture_fill_values)
df_user[df_user.user_id==115]

In [None]:
# Attach the remaining per-user aggregates; left joins keep every row of df_user.
df_user = (
    df_user
    .merge(df_user_sees_explanation, on='user_id', how='left')
    .merge(df_user_avg_time, on='user_id', how='left')
)

### Lower dtype

In [None]:
# Column dtypes and memory footprint before downcasting.
df_user.info()

In [None]:
df_user['user_id'] = df_user['user_id'].astype('int32')

# Pick the float16 columns by name rather than by position (columns[1:-1]) so the
# cast does not depend on column order. user_avg_time keeps its original dtype,
# as before.
# user_times_answered is a count: float16 represents integers exactly only up to
# 2048 and overflows to inf above 65504, so it is stored as float32 instead
# (float rather than int because the outer merge can leave NaN in it).
cols_keep_wide = {'user_id', 'user_avg_time', 'user_times_answered'}
cols_float16 = [col for col in df_user.columns if col not in cols_keep_wide]
df_user[cols_float16] = df_user[cols_float16].astype('float16')
df_user['user_times_answered'] = df_user['user_times_answered'].astype('float32')

In [None]:
# Column dtypes and memory footprint after downcasting.
df_user.info()

In [None]:
# Final list of per-user feature columns.
df_user.columns

## df_question

In [None]:
# Per question: how often it was asked and the share of correct answers.
df_question = (
    train.groupby('question_id')['answered_correctly']
    .agg(question_times_asked='count', question_accuracy='mean')
    .reset_index()
)
df_question.head()

In [None]:
# Move the prior_* values up by one row so that each question row carries the
# values recorded on the same user's next row.
# The shift is done per user: a plain frame-wide shift(-1) would hand the last
# row of one user the first-row values of the following user. With the grouped
# shift those rows get NaN, which the later per-question means ignore.
rows_by_user = train.groupby('user_id', sort=False)
train['time'] = rows_by_user['prior_time'].shift(-1)
train['saw_explanation'] = rows_by_user['prior_saw_explanation'].shift(-1)
train.head()

In [None]:
# Per question: mean of the shifted `time` column, indexed by question_id.
tmp = train.groupby('question_id')['time'].mean().to_frame('question_time')

# The helper column is only needed for this aggregate.
train = train.drop(columns=['time'])

tmp.head()

In [None]:
# Add question_time to the per-question table (tmp is indexed by question_id).
df_question = pd.merge(df_question, tmp, on='question_id')
del tmp
df_question.head()

In [None]:
# Per question: mean of the shifted `saw_explanation` column, indexed by question_id.
tmp = (
    train.groupby('question_id')['saw_explanation']
    .mean()
    .to_frame('question_saw_explanation')
)

# The helper column is only needed for this aggregate.
train = train.drop(columns=['saw_explanation'])

tmp.head()

In [None]:
# Add question_saw_explanation to the per-question table (tmp is indexed by question_id).
df_question = pd.merge(df_question, tmp, on='question_id')
del tmp
df_question.head()

In [None]:
# Per question: number of distinct users who were asked it.
tmp = (
    train.groupby('question_id')['user_id']
    .nunique()
    .to_frame('question_times_asked_by_unique_users')
)
tmp.head()

In [None]:
df_question = pd.merge(df_question, tmp, on='question_id')
del tmp

# Average number of times a question was asked per distinct user who saw it.
asked_total = df_question['question_times_asked']
asked_by_unique = df_question['question_times_asked_by_unique_users']
df_question['question_times_asked_per_user'] = asked_total.div(asked_by_unique)

In [None]:
# Column dtypes and memory footprint before downcasting.
df_question.info()

In [None]:
# Shrink the per-question table; question_time is deliberately left at its original dtype.
df_question = df_question.astype({
    'question_id': 'int16',
    'question_times_asked': 'int32',
    'question_accuracy': 'float16',
    'question_saw_explanation': 'float16',
    'question_times_asked_by_unique_users': 'int32',
    'question_times_asked_per_user': 'float16',
})


In [None]:
# Column dtypes and memory footprint after downcasting.
df_question.info()

## df_tags

In [None]:
def _tags_to_row(tag_string, width):
    """Turn a whitespace-separated tag string into a zero-padded int array of length `width`."""
    tokens = tag_string.split()
    # A missing tags cell becomes the string 'nan' after the str cast; map it to 0.
    if tokens[0] == 'nan':
        tokens[0] = 0
    tag_ids = np.array(tokens, dtype=int)
    # Right-pad with zeros so every question ends up with the same number of slots.
    return np.pad(tag_ids, (0, width - len(tag_ids)))


df_tags = pd.read_csv('/content/drive/MyDrive/questions.csv')[['question_id', 'part', 'tags']]

maxi = 6  # number of tag slots per question
arr_tags = np.stack([_tags_to_row(text, maxi) for text in df_tags['tags'].astype(str)])

# One column per tag slot: tag0 ... tag5.
for slot in range(maxi):
    df_tags[f'tag{slot}'] = arr_tags[:, slot]

df_tags = df_tags.drop(columns=['tags'])
df_tags.head()

In [None]:
# Column dtypes and memory footprint before downcasting.
df_tags.info()

In [None]:
# Every column after question_id and part is a tag slot and is stored as uint8.
tag_dtypes = {col: 'uint8' for col in df_tags.columns[2:]}
df_tags = df_tags.astype({'question_id': 'int16', 'part': 'int8', **tag_dtypes})

In [None]:
# Column dtypes and memory footprint after downcasting.
df_tags.info()

## df_part

In [None]:
# Look up the part of every answered question, then aggregate per part.
tmp = pd.merge(
    train[['question_id', 'answered_correctly']],
    df_tags[['question_id', 'part']],
    on='question_id',
    how='left',
)
df_part = (
    tmp.groupby('part')['answered_correctly']
    .agg(part_times_asked='count', part_accuracy='mean')
    .reset_index()
)

In [None]:
# Release the merged helper frame used to build df_part.
del tmp

In [None]:
# Column dtypes and memory footprint before downcasting.
df_part.info()

In [None]:
# Shrink the per-part table.
df_part = df_part.astype({
    'part': 'int8',
    'part_times_asked': 'int32',
    'part_accuracy': 'float16',
})

In [None]:
# Column dtypes and memory footprint after downcasting.
df_part.info()

# Decrease train size

In [None]:
# train = train.sample(n=10000000, random_state = 1)
# Keep only the most recent rows of each user as training examples.
N_RECENT_PER_USER = 10
train = train.groupby('user_id').tail(N_RECENT_PER_USER)

# Merge on user_id

In [None]:
# Attach the per-user features; the left join keeps every training row.
train = pd.merge(train, df_user, on="user_id", how="left")

# Fillna

In [None]:
# Row count of the (already reduced) training frame and number of users with answers.
num_examples = len(train)
num_users = len(df_user_question)

In [None]:
# Default values used to fill features of users that have no history in train.
fillnas = {}

# Lecture-count features: mean over the training rows
# (cast to float64 first so the float16 columns are averaged at full precision).
for col in df_user_lec.columns[1:]:
  fillnas[col] = train[col].astype('float64').mean()

#df_user_question
# Average number of answers per user, taken from the per-user table itself.
# The previous num_examples / num_users used the row count of train *after* it
# had been cut to the last 10 rows per user, so the fill value could never
# exceed 10 and was on a different scale than the user_times_answered feature.
fillnas['user_times_answered'] = df_user_question['user_times_answered'].mean()
fillnas['user_accuracy'] = train.answered_correctly.astype('int64').mean()
fillnas['user_sees_explanation'] = train.user_sees_explanation.astype('float64').mean()

fillnas['prior_time'] = train['prior_time'].astype('float64').mean()

fillnas

In [None]:
# Assign the filled column back: fillna(..., inplace=True) on train['prior_time']
# acts on a temporary under pandas Copy-on-Write and would leave the NaNs in place.
train['prior_time'] = train['prior_time'].fillna(fillnas['prior_time'])
train.dtypes['prior_time'] # not converted to float64

# Merge on question_id, part

In [None]:
%%time
train = train.merge(df_question, on = "question_id", how = "left")
train = train.merge(df_tags, on = "question_id", how = "left")
train = train.merge(df_part, on = "part", how = "left")


## Drop question_id

In [None]:
# question_id has served as the join key and is not used as a feature.
train = train.drop(columns='question_id')

# Split to train, val

In [None]:
# `val` has to exist: the LightGBM cells further down build ds_val from it, and
# with both split variants commented out the notebook stops with a NameError.
# Hold out each user's most recent remaining row, the same way `test` was taken
# from the full data (last row per user).
# NOTE(review): the per-user / per-question aggregates were computed before this
# split, so they include the validation rows; validation AUC is likely optimistic.
val = train.groupby('user_id').tail(1)
train = train.drop(index=val.index)

# Alternative: random 10% hold-out.
# val = train.sample(frac=0.1)
# train = train[~train.index.isin(val.index)]


## Drop user_id

In [None]:
# val.drop(columns='user_id', inplace=True)
# train.drop(columns='user_id', inplace=True)

# Test set

In [None]:
test = test.rename(columns={'content_id': 'question_id'})

# The held-out last row of a user can be a lecture (answered_correctly == -1).
# Lectures were removed from train and have no question features, so they are
# removed here as well.
test = test[test.answered_correctly != -1]

# Same feature joins as for train.
test = (
    test
    .merge(df_user, on="user_id", how="left")
    .merge(df_question, on="question_id", how="left")
    .merge(df_tags, on="question_id", how="left")
    .merge(df_part, on="part", how="left")
)

# Apply every default collected in `fillnas`: prior_time, the user aggregates and
# the lecture-count columns (which were computed there but never applied before).
# Columns are reassigned rather than filled with inplace=True, which acts on a
# temporary under pandas Copy-on-Write.
for col, fill_value in fillnas.items():
  test[col] = test[col].fillna(fill_value)

In [None]:
# Give test the same column dtypes as train.
for col in train.columns:
  target_dtype = train[col].dtype
  # The left merges can leave NaN in test (e.g. a question that never occurs in
  # train has no df_question row). NaN cannot be cast to an integer dtype, so
  # such a column stays float instead of raising.
  if pd.api.types.is_integer_dtype(target_dtype) and test[col].isna().any():
    continue
  test[col] = test[col].astype(target_dtype)

# Features

In [None]:
# Casting as category is hurting accuracy
# train[['tag0', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'part']] = \
#   train[['tag0', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5','part']].astype('category')
# val[['tag0', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5', 'part']] = \
#   val[['tag0', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5','part']].astype('category')

In [None]:
# Feature columns, grouped by the table they come from. The concatenation order
# below is the column order handed to LightGBM.

# "given"
given_features = ['prior_time', 'prior_saw_explanation']

# "df_user"
user_features = [
  'user_part_1', 'user_part_2', 'user_part_3', 'user_part_4',
  'user_part_5', 'user_part_6', 'user_part_7',
  'user_type_concept',
  'user_type_intention',
  'user_type_solving question',
  'user_type_starter',
  'user_times_answered',
  'user_accuracy',
  'user_avg_time',
  'user_sees_explanation',
]

# "df_part"
part_features = ['part', 'part_times_asked', 'part_accuracy']

# "df_question"
#'question_id', #useless (confirmed twice)
question_features = [
  'question_times_asked',
  'question_accuracy',
  'question_times_asked_per_user',
  'question_times_asked_by_unique_users',
  'question_saw_explanation',
  'question_time',
]

# "df_tags"
tag_features = ['tag0', 'tag1', 'tag2', 'tag3', 'tag4', 'tag5']

features = (
  given_features
  + user_features
  + part_features
  + question_features
  + tag_features
)

# LightGBM datasets: feature matrix plus the answered_correctly label.
ds_train = lightgbm.Dataset(train[features], label=train['answered_correctly'])
ds_val = lightgbm.Dataset(val[features], label=val['answered_correctly'])
# del train, val

# Light Gradient Boosting

In [None]:
%%time
params = {
    'objective': 'binary',
    'metric': 'auc', # roc_auc
    'seed': 2020,
    
    # Setting params to default somehow lowering score (all confirmed)
    # 'learning_rate': 0.1, # default=0.1
    # 'boosting_type': "gbdt", # default=gbdt (goss, rf, dart)
    # 'max_bin': 255, # default=255
    # 'num_leaves': 31, # default=31
    # 'num_tree': 100, # default=100
    # 'tree_learner': 'serial', # default=serial (feature, data, voting)
    
    }
   
model = lightgbm.train(
    params=params, 
    train_set=ds_train,
    valid_sets=[ds_val],
    verbose_eval=50,
    num_boost_round=10000,
    # early_stopping_rounds=8
)


In [None]:
# Size of the validation frame (rows, columns).
val.shape

In [None]:
# Plot the feature importances of the trained booster.
importance_ax = lightgbm.plot_importance(model, figsize=(18, 18))
plt.show()

In [None]:
# params = {
#     "objective": "binary",
#     "boosting_type": "gbdt",
#     "learning_rate": 0.1,
#     "num_leaves": 15,
#     "tree_learner": 'voting',
#     "min_data_in_leaf"
#     "max_bin": 256,
#     "feature_fraction": 0.6,
#     "verbosity": 0,
#     "drop_rate": 0.1,
#     "is_unbalance": False,
#     "max_drop": 50,
#     "min_child_samples": 10,
#     "min_child_weight": 150,
#     "min_split_gain": 0,
#     "subsample": 0.9,
#     "metric": 'auc',
#     "seed": 2020
#           }



In [None]:
# #Scores using sklearn.metrics
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import roc_auc_score

# preds = model.predict(val)
# print(preds)
# for el in preds:
#   if el > 1 or el < -1:
#     throw
# score_roc_auc = roc_auc_score(y_val, preds)
# print(f'ROC AUC: {score_roc_auc}')

# preds[preds>0.5] = 1
# preds[preds<0.5] = 0
# score_acc = accuracy_score(y_val, preds)
# print(f'Accuracy: {score_acc}')