In [None]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
style.use('fivethirtyeight')
import seaborn as sns
import lightgbm
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import gc
import sys
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Train

In [None]:
# Read the pickled training frame and keep only the columns used downstream.
train_pickle_path = "/content/drive/MyDrive/riiid_train.pkl.gzip"
cols = [
    'user_id',
    'answered_correctly',
    'content_id',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
train = pd.read_pickle(train_pickle_path)[cols]

## Train Test Split

In [None]:
# Hold out the last five rows of every user as the evaluation set and
# remove exactly those rows from the training frame.
test = train.groupby('user_id').tail(5)
train = train.drop(index=test.index)

In [None]:
# Shorten the two "prior_question_*" column names.
train = train.rename(columns={
    'prior_question_elapsed_time': 'prior_time',
    'prior_question_had_explanation': 'prior_saw_explanation',
})

# prior_saw_explanation is none when prior_time is nan, so user hasn't seen prior explanation
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment and does nothing under pandas
# copy-on-write.
train['prior_saw_explanation'] = (
    train['prior_saw_explanation']
    .astype('boolean')   # nullable boolean so missing values survive the cast
    .fillna(False)
    .astype('int8')
)


# Feature Dataframes

## df_user (using train_latest)

In [None]:
# Source frame for the per-user aggregates. Currently the full training
# frame; the disabled alternative keeps only each user's latest 100 rows.
# train_latest = train.groupby('user_id').tail(100)
train_latest = train.copy(deep=True)

### df_user_lec

In [None]:
# Lecture metadata: drop the tag column, rename the remaining three columns,
# and one-hot encode the lecture part and type.
df_lec = pd.read_csv('/content/drive/MyDrive/lectures.csv').drop(columns='tag')
df_lec.columns = ['lec_id', 'user_part', 'user_type']
df_lec = pd.get_dummies(df_lec, columns=['user_part', 'user_type'])
df_lec.head()

Unnamed: 0,lec_id,user_part_1,user_part_2,user_part_3,user_part_4,user_part_5,user_part_6,user_part_7,user_type_concept,user_type_intention,user_type_solving question,user_type_starter
0,89,0,0,0,0,1,0,0,1,0,0,0
1,100,1,0,0,0,0,0,0,1,0,0,0
2,185,0,0,0,0,0,1,0,1,0,0,0
3,192,0,0,0,0,1,0,0,0,0,1,0
4,317,0,0,0,0,1,0,0,0,0,1,0


In [None]:
# Lecture rows are the ones with answered_correctly == -1; attach the
# one-hot lecture metadata to each of them.
is_lecture_row = train_latest['answered_correctly'] == -1
tmp = pd.merge(
    train_latest.loc[is_lecture_row, ['user_id', 'content_id']],
    df_lec,
    left_on='content_id',
    right_on='lec_id',
)

In [None]:
# Per-user totals of every one-hot lecture column (part and type).
parts = df_lec.columns[df_lec.columns.str.startswith('user_part')].tolist()
types = df_lec.columns[df_lec.columns.str.startswith('user_type')].tolist()
lecture_totals = tmp.groupby('user_id')[parts + types].sum()
df_user_lec = lecture_totals.reset_index()

In [None]:
# Release the intermediate lecture frames.
del tmp, df_lec

In [None]:
# Keep only question rows (answered_correctly != -1) and rename the id
# column accordingly.
is_question_row = train['answered_correctly'].ne(-1)
train = train.loc[is_question_row].rename(columns={'content_id': 'question_id'})
train.shape

(97330440, 5)

### df_user_question

In [None]:
# Per-user number of answered questions and mean correctness.
# train_latest still contains lecture rows (answered_correctly == -1), so
# they are filtered out here; otherwise each lecture would be counted as an
# answer and its -1 would be averaged into the accuracy.
answered_rows = train_latest.loc[
    train_latest['answered_correctly'] != -1,
    ['user_id', 'answered_correctly'],
]
df_user_question = (
    answered_rows
    .groupby('user_id')
    .agg({'answered_correctly': ['count', 'mean']})
    .reset_index()
)
df_user_question.columns = ['user_id', 'user_times_answered', 'user_accuracy']
del answered_rows
df_user_question.shape

(393382, 3)

### df_user_sees_explanation

In [None]:
# Per-user share of question rows whose prior question came with an explanation.
# Lecture rows (answered_correctly == -1) are still present in train_latest;
# they are excluded so the ratio is taken over questions only.
question_rows = train_latest.loc[
    train_latest['answered_correctly'] != -1,
    ['user_id', 'prior_saw_explanation'],
]
df_user_sees_explanation = (
    question_rows
    .groupby('user_id')
    .agg({'prior_saw_explanation': ['mean']})
    .reset_index()
)
df_user_sees_explanation.columns = ['user_id', 'user_sees_explanation']
del question_rows

### df_user_avg_time

In [None]:
# Per-user mean of prior_time (missing values are skipped by mean()).
df_user_avg_time = (
    train_latest
    .groupby('user_id')['prior_time']
    .mean()
    .rename('user_avg_time')
    .reset_index()
)
df_user_avg_time.head()

Unnamed: 0,user_id,user_avg_time
0,115,20500.0
1,124,21333.208984
2,2746,18307.691406
3,5382,36420.167969
4,8623,26452.427734


### df_user (combined)

In [None]:
# Every user in df_user_lec is expected to appear in df_user_question as
# well; the outer join keeps all users from both frames regardless.
df_user = pd.merge(df_user_lec, df_user_question, how='outer', on='user_id')

df_user.loc[df_user['user_id'] == 115]

Unnamed: 0,user_id,user_part_1,user_part_2,user_part_3,user_part_4,user_part_5,user_part_6,user_part_7,user_type_concept,user_type_intention,user_type_solving question,user_type_starter,user_times_answered,user_accuracy
141647,115,,,,,,,,,,,,41,0.731707


In [None]:
# Users absent from df_user_lec come out of the outer merge with NaN in the
# lecture columns; fill each of those columns with its mean over df_user_lec.
# The filled frame is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# is chained assignment and does nothing under pandas copy-on-write.
# NOTE(review): these users have no lecture rows in train_latest, so 0 may be
# a more faithful fill value than the mean — confirm the intent.
lecture_col_means = df_user_lec[df_user_lec.columns[1:]].mean().to_dict()
df_user = df_user.fillna(value=lecture_col_means)

df_user[df_user.user_id==115]

Unnamed: 0,user_id,user_part_1,user_part_2,user_part_3,user_part_4,user_part_5,user_part_6,user_part_7,user_type_concept,user_type_intention,user_type_solving question,user_type_starter,user_times_answered,user_accuracy
141647,115,1.205963,2.650109,0.694035,0.650102,6.51185,1.533396,0.396119,9.794757,0.736112,3.110684,2.1e-05,41,0.731707


In [None]:
# Attach the remaining per-user aggregates, keeping every row of df_user.
for user_aggregate in (df_user_sees_explanation, df_user_avg_time):
    df_user = pd.merge(df_user, user_aggregate, how='left', on='user_id')
del user_aggregate

In [None]:
# Downcast to save memory: int32 ids, float32 for every column between the
# first (user_id) and the last one.
df_user['user_id'] = df_user['user_id'].astype('int32')
cols_float32 = df_user.columns[1:-1]
df_user = df_user.astype({name: 'float32' for name in cols_float32})

In [None]:
# All per-user aggregates have been built; release the copy of train.
del train_latest

## df_question

In [None]:
# Per-question number of answers and mean correctness.
df_question = (
    train
    .groupby('question_id')['answered_correctly']
    .agg(question_times_asked='count', question_accuracy='mean')
    .reset_index()
)


In [None]:
# Pull the "prior_*" values of the following row back onto the current row.
# Presumably the following row's prior_* values describe the current
# question — verify against the dataset description.
# The shift is done per user: a frame-wide shift would hand the last row of
# every user the first row of the next user. With a per-user shift those
# boundary rows become NaN, which mean() skips.
rows_by_user = train.groupby('user_id', sort=False)
train['time'] = rows_by_user['prior_time'].shift(-1)
train['saw_explanation'] = rows_by_user['prior_saw_explanation'].shift(-1)
del rows_by_user

In [None]:
# Mean of the shifted time per question; the helper column is then removed.
tmp = train.groupby('question_id')['time'].mean().to_frame('question_time')

train = train.drop(columns=['time'])

In [None]:
# Inner join on question_id (tmp carries it as its index name).
df_question = pd.merge(df_question, tmp, on='question_id')
del tmp
df_question.head()

Unnamed: 0,question_id,question_times_asked,question_accuracy,question_time
0,0,6843,0.907789,19695.111328
1,1,7334,0.891055,19317.537109
2,2,44342,0.555162,24580.992188
3,3,22696,0.780534,21610.310547
4,4,31514,0.613918,22041.882812


In [None]:
# Mean of the shifted explanation flag per question; the helper column is
# then removed.
tmp = (
    train
    .groupby('question_id')['saw_explanation']
    .mean()
    .to_frame('question_saw_explanation')
)

train = train.drop(columns=['saw_explanation'])

In [None]:
# Inner join on question_id, then release the helper frame.
df_question = pd.merge(df_question, tmp, on='question_id')
del tmp

In [None]:
# Number of distinct users that were asked each question.
tmp = (
    train
    .groupby('question_id')['user_id']
    .nunique()
    .to_frame('question_times_asked_by_unique_users')
)

In [None]:
# Attach the unique-user count and derive the average number of times a
# question was asked per distinct user.
df_question = pd.merge(df_question, tmp, on='question_id')
del tmp
asked_total = df_question['question_times_asked']
asked_unique = df_question['question_times_asked_by_unique_users']
df_question['question_times_asked_per_user'] = asked_total.div(asked_unique)
del asked_total, asked_unique

In [None]:
# Downcast the question-level columns to compact dtypes.
df_question = df_question.astype({
    'question_id': 'int16',
    'question_times_asked': 'int32',
    'question_accuracy': 'float32',
    'question_saw_explanation': 'float32',
    'question_times_asked_by_unique_users': 'int32',
    'question_times_asked_per_user': 'float32',
})


## df_tags

In [None]:
# Question metadata: expand the whitespace-separated `tags` string into six
# integer columns tag0..tag5, right-padded with 0. A missing tag string
# (which stringifies to 'nan') becomes a single 0.
df_tags = pd.read_csv('/content/drive/MyDrive/questions.csv')[['question_id', 'part', 'tags']]
maxi = 6
tag_rows = []
for raw_tags in df_tags['tags'].astype(str).values:
    tokens = raw_tags.split()
    if tokens[0] == 'nan':
        tokens[0] = 0
    padded = np.pad(np.array(tokens, dtype=int), (0, maxi - len(tokens)))
    tag_rows.append(padded)
arr_tags = np.stack(tag_rows)
for position in range(maxi):
    df_tags[f'tag{position}'] = arr_tags[:, position]
df_tags = df_tags.drop(columns=['tags'])

In [None]:
# Downcast: int16 ids, int8 part, uint8 for every column from the third on.
cols_uint8 = df_tags.columns[2:]
compact_dtypes = {'question_id': 'int16', 'part': 'int8'}
compact_dtypes.update({name: 'uint8' for name in cols_uint8})
df_tags = df_tags.astype(compact_dtypes)

## df_part

In [None]:
# Look up the part of every answered question, then aggregate the number of
# answers and the mean correctness per part.
tmp = pd.merge(
    train[['question_id', 'answered_correctly']],
    df_tags[['question_id', 'part']],
    how='left',
    on='question_id',
)
df_part = (
    tmp
    .groupby('part')['answered_correctly']
    .agg(part_times_asked='count', part_accuracy='mean')
    .reset_index()
)

In [None]:
# df_part has been built; release the merged helper frame.
del tmp

In [None]:
# Downcast the part-level columns to compact dtypes.
df_part = df_part.astype({
    'part': 'int8',
    'part_times_asked': 'int32',
    'part_accuracy': 'float32',
})

# Merge on user_id (for fillnas)

In [None]:
%%time
# Left join: every training row keeps its place and gains the user features.
train = pd.merge(train, df_user, how='left', on='user_id')

CPU times: user 11.5 s, sys: 1.66 s, total: 13.2 s
Wall time: 13.2 s


# Fillna

In [None]:
# Row counts used for the average number of answers per user.
num_examples = len(train)
num_users = len(df_user_question)

In [None]:
# Default value per column, used to impute missing entries in train and test.
# Lecture columns: their mean over the merged training rows.
fillnas = {
    lecture_col: train[lecture_col].astype('float64').mean()
    for lecture_col in df_user_lec.columns[1:]
}

# The same training-wide mean of prior_time serves both time columns.
mean_prior_time = train['prior_time'].astype('float64').mean()

#df_user
fillnas.update({
    'user_times_answered': num_examples / num_users,
    'user_accuracy': train['answered_correctly'].astype('int64').mean(),
    'user_sees_explanation': train['user_sees_explanation'].astype('float64').mean(),
    'user_avg_time': mean_prior_time,
    'prior_time': mean_prior_time,
})

fillnas

{'prior_time': 25446.638436913097,
 'user_accuracy': 0.6595619725956238,
 'user_avg_time': 25446.638436913097,
 'user_part_1': 3.4841528285101804,
 'user_part_2': 7.51408065529831,
 'user_part_3': 2.529385112299997,
 'user_part_4': 2.584576515580094,
 'user_part_5': 21.228967425131245,
 'user_part_6': 5.676999370654873,
 'user_part_7': 1.4270841033787671,
 'user_sees_explanation': 0.8876898825826414,
 'user_times_answered': 247.4196582456747,
 'user_type_concept': 29.15073955410436,
 'user_type_intention': 1.6895308526399253,
 'user_type_solving question': 13.604943553959913,
 'user_type_starter': 3.2123824510533806e-05}

In [None]:
# Impute the two time columns with the training-wide defaults.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that is chained assignment and does nothing under pandas
# copy-on-write.
train['prior_time'] = train['prior_time'].fillna(fillnas['prior_time'])
train['user_avg_time'] = train['user_avg_time'].fillna(fillnas['user_avg_time'])

In [None]:
# Sanity check: no column of train may contain missing values at this point.
# Every offending column is reported, and a real exception is raised
# (`throw` is not a Python statement and only failed through a NameError).
train_cols_with_nulls = [col for col in train.columns if train[col].isnull().values.any()]
if train_cols_with_nulls:
    raise ValueError(f'train still has missing values in: {train_cols_with_nulls}')

# Decrease train size

In [None]:
# Optional ways to shrink train; both are disabled, so the full frame is used.

# By sampling
# train = train.sample(n=21810401, random_state = 1)

# By getting most recent examples
# train = train.groupby('user_id').tail(100)

train.shape


(97330440, 20)

In [None]:
# The user features are merged in; the raw id is no longer needed.
train = train.drop(columns=['user_id'])

# Merge on question_id, part

In [None]:
%%time
# Left-join the question, tag and part lookups in that order (df_tags
# supplies the `part` column that the df_part join keys on).
for lookup_frame, join_key in (
    (df_question, 'question_id'),
    (df_tags, 'question_id'),
    (df_part, 'part'),
):
    train = pd.merge(train, lookup_frame, how='left', on=join_key)
del lookup_frame, join_key

CPU times: user 1min 8s, sys: 1.61 s, total: 1min 10s
Wall time: 1min 10s


In [None]:
# The question features are merged in; the raw id is no longer needed.
train = train.drop(columns=['question_id'])

# Prepare Test set

In [None]:
# Apply the same preparation to the held-out rows as was applied to train.
# real test set doesn't have answered_correctly column
test = test[test.answered_correctly != -1]

test = test.rename(columns={
    'prior_question_elapsed_time': 'prior_time',
    'prior_question_had_explanation': 'prior_saw_explanation',
    'content_id': 'question_id',
})

# prior_saw_explanation is none when prior_time is nan, so user hasn't seen prior explanation
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment and does nothing under pandas
# copy-on-write.
test['prior_saw_explanation'] = (
    test['prior_saw_explanation']
    .astype('boolean')   # nullable boolean so missing values survive the cast
    .fillna(False)
    .astype('int8')
)


In [None]:
# Left-join the user, question, tag and part lookups in that order (df_tags
# supplies the `part` column that the df_part join keys on).
for lookup_frame, join_key in (
    (df_user, 'user_id'),
    (df_question, 'question_id'),
    (df_tags, 'question_id'),
    (df_part, 'part'),
):
    test = pd.merge(test, lookup_frame, how='left', on=join_key)
del lookup_frame, join_key

In [None]:
# The lookups are merged in; the raw id columns are no longer needed.
test = test.drop(columns=['question_id', 'user_id'])

In [None]:
# Impute test with the defaults computed on train: prior_time plus every
# user-level column (all columns of df_user except user_id).
# The filled frame is assigned back explicitly: `test[col].fillna(..., inplace=True)`
# is chained assignment and does nothing under pandas copy-on-write.
test_fill_values = {'prior_time': fillnas['prior_time']}
test_fill_values.update({col: fillnas[col] for col in df_user.columns[1:]})
test = test.fillna(value=test_fill_values)

In [None]:
# Sanity check: no column of test may contain missing values at this point.
# Every offending column is reported, and a real exception is raised
# (`throw` is not a Python statement and only failed through a NameError).
test_cols_with_nulls = [col for col in test.columns if test[col].isnull().values.any()]
if test_cols_with_nulls:
    raise ValueError(f'test still has missing values in: {test_cols_with_nulls}')

In [None]:
# Column/dtype summary of the prepared test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1940860 entries, 0 to 1940859
Data columns (total 33 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   answered_correctly                    int8   
 1   prior_time                            float32
 2   prior_saw_explanation                 int8   
 3   user_part_1                           float32
 4   user_part_2                           float32
 5   user_part_3                           float32
 6   user_part_4                           float32
 7   user_part_5                           float32
 8   user_part_6                           float32
 9   user_part_7                           float32
 10  user_type_concept                     float32
 11  user_type_intention                   float32
 12  user_type_solving question            float32
 13  user_type_starter                     float32
 14  user_times_answered                   float32
 15  user_accuracy  

In [None]:
# Column/dtype summary of the prepared train frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97330440 entries, 0 to 97330439
Data columns (total 33 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   answered_correctly                    int8   
 1   prior_time                            float32
 2   prior_saw_explanation                 int8   
 3   user_part_1                           float32
 4   user_part_2                           float32
 5   user_part_3                           float32
 6   user_part_4                           float32
 7   user_part_5                           float32
 8   user_part_6                           float32
 9   user_part_7                           float32
 10  user_type_concept                     float32
 11  user_type_intention                   float32
 12  user_type_solving question            float32
 13  user_type_starter                     float32
 14  user_times_answered                   float32
 15  user_accuracy

# Features

In [None]:
# Model inputs grouped by the table they come from. The concatenation order
# below is the column order handed to the network.

# "given"
given_features = ['prior_time', 'prior_saw_explanation']

# "df_user"
user_features = (
    [f'user_part_{part_no}' for part_no in range(1, 8)]
    + [
        'user_type_concept',
        'user_type_intention',
        'user_type_solving question',
        'user_type_starter',
    ]
    + [
        'user_times_answered',
        'user_accuracy',
        'user_avg_time',
        'user_sees_explanation',
    ]
)

# "df_part"
part_features = ['part', 'part_times_asked', 'part_accuracy']

# "df_question"
# 'question_id' is left out: useless (confirmed twice)
question_features = [
    'question_times_asked',
    'question_accuracy',
    'question_times_asked_per_user',
    'question_times_asked_by_unique_users',
    'question_saw_explanation',
    'question_time',
]

# "df_tags"
tag_features = [f'tag{tag_no}' for tag_no in range(6)]

features = (
    given_features
    + user_features
    + part_features
    + question_features
    + tag_features
)

len(features)

32

# Keras

In [None]:
import tensorflow as tf

# Fully connected binary classifier over the tabular features:
# two hidden ReLU layers (200 and 30 units) with batch normalisation and
# dropout, ending in a single sigmoid unit.
model = tf.keras.Sequential(
  [
    # `shape` is the per-sample shape and must be a tuple; newer Keras
    # releases reject a bare int.
    tf.keras.layers.Input(shape=(len(features),)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(200, activation="relu"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(30, activation="relu"),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation="sigmoid")
  ]
)
model.compile(optimizer='adam', loss="binary_crossentropy", metrics=['accuracy'])

In [None]:
# One pass over the training rows in large batches, validating on the
# held-out rows.
model.fit(
    x=train[features].to_numpy(),
    y=train['answered_correctly'].to_numpy(),
    batch_size=16384,
    epochs=1,
    validation_data=(
        test[features].to_numpy(),
        test['answered_correctly'].to_numpy(),
    ),
)

In [None]:
# Disabled evaluation scratch: it refers to `val` and `y_val`, which are not
# defined in the visible part of this notebook, and uses `throw`, which is
# not a Python statement. Fix both before re-enabling.
# #Scores using sklearn.metrics
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import roc_auc_score

# preds = model.predict(val)
# print(preds)
# for el in preds:
#   if el > 1 or el < -1:
#     throw
# score_roc_auc = roc_auc_score(y_val, preds)
# print(f'ROC AUC: {score_roc_auc}')

# preds[preds>0.5] = 1
# preds[preds<0.5] = 0
# score_acc = accuracy_score(y_val, preds)
# print(f'Accuracy: {score_acc}')