In [2]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
import lightgbm
import gc
import sys
import tensorflow as tf

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from tensorflow.keras import layers

%cd /content
%matplotlib inline

style.use('fivethirtyeight')
drive.mount('/content/drive')


/content
Mounted at /content/drive


# Load data

In [3]:
%%time

cols = ['row_id', 'user_id', 'answered_correctly', 'content_id', 'prior_question_elapsed_time']

# Load the pickled interactions, keep only the columns listed above, expose
# content_id under the name question_id, and discard every row whose
# answered_correctly is -1 (presumably non-question rows — confirm).
train_ori = (
  pd.read_pickle("/content/drive/MyDrive/riiid_train.pkl.gzip")[cols]
  .rename(columns={'content_id': 'question_id'})
  .loc[lambda frame: frame.answered_correctly != -1]
)

CPU times: user 9.94 s, sys: 5.08 s, total: 15 s
Wall time: 42.3 s


# Custom feature tables

Calculate the aggregate means on the full dataset, before splitting into train and validation sets.

## df_user: user features

In [4]:
%%time

# Per-user answer count and accuracy, indexed by user_id.
# NOTE(review): these statistics are computed on train_ori before the
# train/val split further down, so validation rows contribute to them.
df_user = train_ori.groupby('user_id')['answered_correctly'].agg(
  user_times_answered='count',
  user_accuracy='mean',
)
# Unweighted mean of the per-user accuracies (fallback for unmatched users).
mean_user_acc = df_user['user_accuracy'].mean()

# Per-question ask count and accuracy, indexed by question_id.
df_question = train_ori.groupby('question_id')['answered_correctly'].agg(
  question_times_asked='count',
  question_accuracy='mean',
)
# Unweighted mean of the per-question accuracies (fallback for unmatched questions).
mean_question_acc = df_question['question_accuracy'].mean()

# Cast to float64 before averaging the elapsed-time column.
mean_prior_time = train_ori['prior_question_elapsed_time'].astype('float64').mean()


CPU times: user 15 s, sys: 171 ms, total: 15.1 s
Wall time: 15.2 s


## df_question: question features

`mean_part_acc` wasn't calculated, since that information should already be captured by the `part` feature (part_id) alone.

In [5]:
%%time

# Question metadata: which part each question belongs to, plus its tags.
df_part = pd.read_csv('/content/drive/MyDrive/questions.csv')[['question_id', 'part', 'tags']]
df_question = df_question.merge(df_part, on='question_id', how='left')

# Mean correctness per part: attach the part to every answered row, then
# average answered_correctly within each part.
tmp = (
  train_ori[['question_id', 'answered_correctly']]
  .merge(df_part[['question_id', 'part']], on='question_id', how='left')
  .groupby('part')['answered_correctly']
  .mean()
  .rename('part_accuracy')
  .to_frame()
)
df_question = df_question.merge(tmp, on='part', how='left')

# Release the intermediates; only df_question is needed from here on.
del tmp, df_part

CPU times: user 23.5 s, sys: 964 ms, total: 24.4 s
Wall time: 24.8 s


# Split to train, val

In [6]:
%%time

# row_id values that make up the validation fold.
cv_val = pd.read_pickle("/content/drive/MyDrive/cv1_valid.pickle")['row_id']

# One membership mask drives both halves of the split; row_id is only needed
# for the split itself, so it is dropped from both results.
in_val = train_ori['row_id'].isin(cv_val)
train = train_ori.loc[~in_val].drop(columns="row_id")
val = train_ori.loc[in_val].drop(columns="row_id")

# Report shapes in the order: full data, train, val.
for frame in (train_ori, train, val):
  print(frame.shape)

# Free the full frame and the helpers created for the split.
del train_ori, cv_val, in_val, frame

(99271300, 5)
(96817414, 4)
(2453886, 4)
CPU times: user 17.8 s, sys: 596 ms, total: 18.4 s
Wall time: 20.4 s


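Subsample the training set for fast experimentation and reset the indices (`row_id` was already dropped during the split).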

In [7]:
# Sample a fixed-seed subset of train for fast experimenting, and give both
# frames a fresh 0..n-1 index.
train = train.sample(n=10000000, random_state=1).reset_index(drop=True)
val = val.reset_index(drop=True)

# Merge with custom features

Left-join the user and question feature tables onto train and val, then drop `user_id`.

In [8]:
%%time
def _attach_features(frame):
  """Left-join df_user and df_question onto ``frame``, then drop ``user_id``."""
  return (
    frame
    .merge(df_user, on="user_id", how="left")
    .merge(df_question, on="question_id", how="left")
    .drop(columns='user_id')
  )

train = _attach_features(train)
val = _attach_features(val)

CPU times: user 5.03 s, sys: 108 ms, total: 5.14 s
Wall time: 5.14 s


# Fill missing values

In [9]:
%%time
# Fill missing values with one frame-level fillna instead of
# `train[col].fillna(..., inplace=True)`: the latter mutates a temporary
# Series selected from the frame, which triggers a chained-assignment warning
# on pandas 2.x and leaves `train` untouched under Copy-on-Write.
#
# Counts fall back to 0, accuracies to the corresponding global mean, and a
# missing prior_question_elapsed_time to the global mean elapsed time.
train = train.fillna({
  'question_times_asked': 0,
  'question_accuracy': mean_question_acc,
  'user_times_answered': 0,
  'user_accuracy': mean_user_acc,
  'prior_question_elapsed_time': mean_prior_time,
})

# train[['question_times_asked', 'user_times_answered', 'part_times_asked']] = \
#   train[['question_times_asked', 'user_times_answered', 'part_times_asked']].astype(int)



CPU times: user 56.9 ms, sys: 0 ns, total: 56.9 ms
Wall time: 56.4 ms


In [10]:
%%time
# Fill missing values with one frame-level fillna instead of
# `val[col].fillna(..., inplace=True)`: the latter mutates a temporary
# Series selected from the frame, which triggers a chained-assignment warning
# on pandas 2.x and leaves `val` untouched under Copy-on-Write.
#
# Counts fall back to 0, accuracies to the corresponding global mean, and a
# missing prior_question_elapsed_time to the global mean elapsed time.
val = val.fillna({
  'question_times_asked': 0,
  'question_accuracy': mean_question_acc,
  'user_times_answered': 0,
  'user_accuracy': mean_user_acc,
  'prior_question_elapsed_time': mean_prior_time,
})

# val[['question_times_asked', 'user_times_answered', 'part_times_asked']] = \
#   val[['question_times_asked', 'user_times_answered', 'part_times_asked']].astype(int)


CPU times: user 15.2 ms, sys: 16 µs, total: 15.3 ms
Wall time: 14.7 ms


In [11]:
%%time
BATCH_SIZE = 32
def df_to_dataset(df, batch_size=None):
  """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

  Args:
    df: DataFrame holding the feature columns plus an ``answered_correctly``
      column, which is used as the label.
    batch_size: Number of rows per batch. ``None`` (the default) uses the
      module-level ``BATCH_SIZE`` as it stands at call time.

  Returns:
    A dataset whose elements are ``(dict of column name -> values, labels)``.

  Raises:
    KeyError: if ``df`` has no ``answered_correctly`` column.
  """
  label_col = 'answered_correctly'
  # Read the label instead of popping it: `df.pop` removed the column from the
  # caller's frame, so running the cell a second time raised KeyError.
  labels = df[label_col]
  # Same mapping `dict(df)` would give once the label column is excluded.
  features = {name: df[name] for name in df.columns if name != label_col}
  ds = tf.data.Dataset.from_tensor_slices((features, labels))
  return ds.batch(BATCH_SIZE if batch_size is None else batch_size)
# Build the batched tf.data pipelines for both splits (train first, then val).
ds_train, ds_val = df_to_dataset(train), df_to_dataset(val)

CPU times: user 1.4 s, sys: 220 ms, total: 1.62 s
Wall time: 4.26 s


In [12]:
# ex = next(iter(ds_val))[0]
# print(ex['user_times_answered'])
# def demo(col):
#   feature_layer = layers.DenseFeatures(col, dtype=tf.int64)
#   bla = feature_layer(ex)
#   print(bla)
# demo_times_answered = tf.feature_column.numeric_column('user_times_answered')
# demo(demo_times_answered)

In [13]:
# Per-feature integer settings keyed by column name.
# NOTE(review): not referenced by any visible cell — presumably the number of
# bucket boundaries per feature; confirm before relying on it.
num_bounds = dict(
  user_accuracy=20,
  user_times_answered=19,
  question_accuracy=16,
  question_times_asked=13,
  prior_question_elapsed_time=11,
  part_accuracy=3,
)

# Accumulator for the tf.feature_column objects appended by a later cell.
cols_feat = []

# Columns fed to the model as plain numeric features.
cols_num = [
  'prior_question_elapsed_time',
  'user_times_answered',
  'user_accuracy',
  'question_times_asked',
  'question_accuracy',
  'part_accuracy',
]
# cols_bucketized = [
#             'prior_question_elapsed_time',
#             'user_times_answered',
#             'user_accuracy',
#             'question_times_asked',
#             'question_accuracy',
#             'part_accuracy'
#             ]

# Categorical columns: indicator-encoded vs. embedded.
cols_indicator = ['part']
cols_embed = ['question_id']

# Vocabularies for the categorical columns, taken from the question table.
vocab_question = df_question['question_id'].unique()
vocab_part = df_question['part'].unique()

In [None]:
def get_normalization_layer(name, dataset):
  """Create a Normalization layer adapted to a single feature of ``dataset``.

  Args:
    name: Key of the feature inside each element's feature dict.
    dataset: Dataset yielding ``(features, label)`` pairs, where ``features``
      is indexable by ``name``.

  Returns:
    The Normalization layer after ``adapt`` has been run on that feature.
  """
  # The notebook never imports a `preprocessing` module, so the original
  # `preprocessing.Normalization()` raised NameError. Resolve the class through
  # the already-imported `layers` module instead: newer Keras exposes it
  # directly, older releases under `layers.experimental.preprocessing`.
  normalization_cls = getattr(layers, 'Normalization', None)
  if normalization_cls is None:
    normalization_cls = layers.experimental.preprocessing.Normalization
  normalizer = normalization_cls()

  # Prepare a Dataset that only yields our feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer

In [14]:
all_inputs = []
encoded_features = []

# Rebuild the column list from scratch. Appending to the list created in the
# configuration cell meant every re-run of this cell duplicated all columns.
cols_feat = []

# Numeric features are passed through as-is.
for header in cols_num:
  cols_feat.append(tf.feature_column.numeric_column(header))

# Indicator-encoded categorical features, using the part vocabulary.
for header in cols_indicator:
  col = tf.feature_column.categorical_column_with_vocabulary_list(
      header, vocab_part)
  col = tf.feature_column.indicator_column(col)
  cols_feat.append(col)

# Categorical features mapped to a 100-dimensional embedding, using the
# question vocabulary.
for header in cols_embed:
  col = tf.feature_column.categorical_column_with_vocabulary_list(
      header, vocab_question)
  col = tf.feature_column.embedding_column(col, dimension=100)
  cols_feat.append(col)

# layer_feat = layers.DenseFeatures(cols_feat)
# len(cols_feat)

In [None]:
# Functional-API model over the feature columns.
#
# The original cell called Dense on the DenseFeatures *layer object* (not on a
# tensor), never applied the output layer, and passed feature columns / a layer
# to tf.keras.Model, so it could not build. A functional model needs symbolic
# inputs: one Keras Input per raw feature, keyed by the feature's column name.
feature_inputs = {}
for header in cols_num:
  feature_inputs[header] = tf.keras.Input(shape=(1,), name=header, dtype=tf.float32)
# NOTE(review): assumes the categorical columns hold integer ids — confirm
# against the dtypes of `part` and `question_id` in the training frame.
for header in cols_indicator + cols_embed:
  feature_inputs[header] = tf.keras.Input(shape=(1,), name=header, dtype=tf.int64)

# NOTE(review): dataset elements may carry keys that are not model inputs
# (e.g. `tags`); verify they are ignored rather than rejected at fit time.
layer_feat = layers.DenseFeatures(cols_feat)
dense1 = layers.Dense(1024, activation='relu')(layer_feat(feature_inputs))
dense2 = layers.Dense(512, activation='relu')(dense1)
drop = layers.Dropout(0.1)(dense2)
# Single unit with no activation: the model emits one raw score per row.
final = layers.Dense(1)(drop)
model = tf.keras.Model(inputs=feature_inputs, outputs=final)
model.summary()

In [None]:
# model = tf.keras.Sequential([
#   layer_feat,
#   layers.Dense(1024, activation='relu'),
#   layers.Dense(512, activation='relu'),
#   layers.Dropout(.1),
#   layers.Dense(1)
# ])

# model.compile(optimizer='adam',
#               loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
#               metrics=['accuracy'])

# model.fit(ds_train,
#           validation_data=ds_val,
#           epochs=1)

In [None]:
layer