In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, GroupKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_score, recall_score, f1_score, auc, precision_recall_fscore_support

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import warnings
warnings.simplefilter("ignore")

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Features
- session_id - the ID of the session the event took place in
- index - the index of the event for the session
- elapsed_time - how much time has passed (in milliseconds) between the start of the session and when the event was recorded
- event_name - the name of the event type
- name - the event name (e.g. identifies whether a notebook_click is opening or closing the notebook)
- level - what level of the game the event occurred in (0 to 22)
- page - the page number of the event (only for notebook-related events)
- room_coor_x - the coordinates of the click in reference to the in-game room (only for click events)
- room_coor_y - the coordinates of the click in reference to the in-game room (only for click events)
- screen_coor_x - the coordinates of the click in reference to the player’s screen (only for click events)
- screen_coor_y - the coordinates of the click in reference to the player’s screen (only for click events)
- hover_duration - how long (in milliseconds) the hover happened for (only for hover events)
- text - the text the player sees during this event
- fqid - the fully qualified ID of the event
- room_fqid - the fully qualified ID of the room the event took place in
- text_fqid - the fully qualified ID of the text shown during the event
- fullscreen - whether the player is in fullscreen mode
- hq - whether the game is in high-quality
- music - whether the game music is on or off
- level_group - which group of levels - and group of questions - this row belongs to (0-4, 5-12, 13-22)

In [None]:
# Explicit column dtypes for reading the event log: narrow numeric types for
# the measurements and `category` for every string-like / flag column.
dtypes = {
    'elapsed_time': np.int32,
    'level': np.uint8,
    **{col: np.float32 for col in (
        'room_coor_x', 'room_coor_y',
        'screen_coor_x', 'screen_coor_y',
        'hover_duration',
    )},
    **{col: 'category' for col in (
        'event_name', 'name', 'text',
        'fqid', 'room_fqid', 'text_fqid',
        'fullscreen', 'hq', 'music', 'level_group',
    )},
}

In [None]:
# Read the raw event log with the explicit `dtypes` mapping, then report its size.
train = pd.read_csv(
    '/kaggle/input/predict-student-performance-from-game-play/train.csv',
    dtype=dtypes,
)
train_shape = train.shape
print("Full train dataset shape is {}".format(train_shape))

In [None]:
# Display the first 10 records of the raw event log to inspect its columns.
train.head(10)

In [None]:
# Load the training labels from train_labels.csv.
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

# Preview the first 5 label rows.
labels.head(5)

In [None]:
# Split `session_id` on '_': the first piece is the integer session, and the
# last piece with its leading character removed is the integer question number.
session_id_parts = labels.session_id.str.split('_')
labels['session'] = session_id_parts.str[0].astype('int64')
labels['q'] = session_id_parts.str[-1].str[1:].astype('int64')

labels.head(5)

For each `<session_id>_<question #>`, you are predicting the `correct` column: whether the user in that session will answer that question correctly, using only the information available earlier in the session.

In [None]:
# Bar chart of how often each value of `correct` occurs across all label rows.
labels['correct'].value_counts().plot.bar(color=["b", "r"])

In [None]:
# One bar chart per question (1-18), laid out on a 6x3 grid, showing the
# counts of each `correct` value for that question.
plt.figure(figsize=(10, 20))
plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.suptitle("\"Correct\" column values for each question", fontsize=14, y=0.94)
for n in range(1, 19):
    ax = plt.subplot(6, 3, n)

    # value_counts() orders by frequency, so without reindexing the bar order
    # (and therefore the colours) would flip for any question answered wrong
    # more often than right. Pin the order so 1 is always blue and 0 always red.
    # NOTE(review): assumes `correct` is coded as integers 0/1 - confirm.
    plot_df = (
        labels.loc[labels.q == n, 'correct']
        .value_counts()
        .reindex([1, 0], fill_value=0)
    )
    plot_df.plot(ax=ax, kind="bar", color=["b", "r"])

    # chart formatting
    ax.set_title("Question " + str(n))
    ax.set_xlabel("")

In [None]:
# Columns that feature_engineer() summarises by their number of distinct values
# within each (session_id, level_group) group.
categorical = ['event_name', 'name','fqid', 'room_fqid', 'text_fqid']
# Columns that feature_engineer() summarises by their mean and standard
# deviation within each (session_id, level_group) group.
numerical = ['elapsed_time','level','page','room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']

For each categorical column, we will first group the dataset by `session_id`  and `level_group`. We will then count the number of **distinct elements** in the column for each group and store it temporarily.

For all numerical columns, we will group the dataset by `session_id` and `level_group`. Instead of counting the number of distinct elements, we will calculate the `mean` and `standard deviation` of the numerical column for each group and store it temporarily. The mean keeps the original column name, while the standard deviation gets a `_std` suffix.

After this, we will concatenate the temporary data frames we generated in the earlier step for each column to create our new feature engineered dataset.

In [None]:
def feature_engineer(dataset_df, categorical_cols=None, numerical_cols=None):
    """Aggregate raw events into one feature row per (session_id, level_group).

    For every categorical column the number of distinct values in the group is
    stored as ``<col>_nunique``. For every numerical column the group mean is
    stored under the original column name and the group standard deviation as
    ``<col>_std``. Missing aggregates (for example the std of a single-row
    group) are replaced with -1.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Event-level data containing ``session_id``, ``level_group`` and all
        columns named in the two column lists.
    categorical_cols : list of str, optional
        Columns to summarise with ``nunique``. Defaults to the module-level
        ``categorical`` list.
    numerical_cols : list of str, optional
        Columns to summarise with ``mean`` and ``std``. Defaults to the
        module-level ``numerical`` list.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id``, with ``level_group`` as the first column
        followed by the nunique, mean and std features in that order.
    """
    if categorical_cols is None:
        categorical_cols = categorical
    if numerical_cols is None:
        numerical_cols = numerical

    # Build the grouping once; every aggregate below shares the same keys.
    grouped = dataset_df.groupby(['session_id', 'level_group'])

    parts = []
    for col in categorical_cols:
        parts.append(grouped[col].agg('nunique').rename(col + '_nunique'))
    for col in numerical_cols:
        # The mean deliberately keeps the original column name.
        parts.append(grouped[col].agg('mean'))
    for col in numerical_cols:
        parts.append(grouped[col].agg('std').rename(col + '_std'))

    features_df = pd.concat(parts, axis=1).fillna(-1)
    # Keep `level_group` as a regular column and index rows by session only.
    return features_df.reset_index().set_index('session_id')

In [None]:
# Build the per-(session_id, level_group) feature table from the raw events.
df = feature_engineer(train)

In [None]:
# Report the (rows, columns) shape of the engineered feature table.
print("Full prepared dataset shape is {}".format(df.shape))

# EDA

In [None]:
# Preview the first 10 rows of the engineered feature table.
df.head(10)

In [None]:
# Print the column dtypes and non-null counts of the feature table.
df.info()

In [None]:
# Summary statistics of the numeric feature columns.
# Note: feature_engineer() fills missing aggregates with -1, so those
# sentinel values are included in these statistics.
df.describe()

In [None]:
# Summary restricted to category-dtype columns.
# NOTE(review): presumably only `level_group` is categorical here - confirm.
df.describe(include='category')

In [None]:
# Total number of missing values across the whole feature table
# (feature_engineer() replaces NaN with -1 before returning).
df.isnull().sum().sum()

In [None]:
# Pairwise correlations between the numeric features. `level_group` holds
# string labels such as '0-4', and recent pandas versions raise in
# DataFrame.corr() when non-numeric columns are present, so select the
# numeric columns explicitly.
df.select_dtypes(include='number').corr()

# Data Visualization

In [None]:
# Plot each standard-deviation feature per level_group, one panel per feature.
# The x axis is simply the 1-based row position within the level_group.
std_columns = [
    'room_coor_x_std', 'room_coor_y_std',
    'screen_coor_x_std', 'screen_coor_y_std',
    'hover_duration_std', 'elapsed_time_std',
]

figure, axis = plt.subplots(3, 2, figsize=(10, 10))

for name, data in df.groupby('level_group'):
    positions = range(1, len(data) + 1)
    # axis.flat walks the 3x2 grid row by row, matching the order of std_columns.
    for ax, column in zip(axis.flat, std_columns):
        ax.plot(positions, data[column], label=name)

for ax, column in zip(axis.flat, std_columns):
    # Title each panel with the exact column that is plotted.
    ax.set_title(column)
    ax.legend()

plt.show()

# Data Modeling

We will train a model for each question to predict if the question will be answered correctly by a user. There are a total of 18 questions in the dataset. Hence, we will be training 18 models, one for each question.

In [None]:
# Every column except the `level_group` key is used as a model input.
features = df.columns.drop('level_group').tolist()
print(len(features) ,'features')

# Distinct index values of the feature table (it is indexed by session_id).
ls = df.index.unique()
print(len(ls) ,'users')

Before training the models, we have to understand how the level groups and the questions are associated with each other.

In this game the first quiz checkpoint (i.e., questions 1 to 3) comes after finishing levels 0 to 4. So for training questions 1 to 3 we will use data from the level_group 0-4. Similarly, we will use data from the level_group 5-12 to train questions from 4 to 13 and data from the level_group 13-22 to train questions from 14 to 18.

In [None]:
# 5-fold grouped cross-validation: rows are grouped by the df index so that
# all rows of one session land in the same fold. For every fold and every
# question (1-18) a random forest is fitted on the rows of the matching
# level_group, and its class-1 probabilities for the held-out sessions are
# written into `oof` (column n-1 holds question n).
gkf = GroupKFold(n_splits=5)
oof = pd.DataFrame(data=np.zeros((len(ls), 18)), index=ls)
# Keyed by '<level_group>_<question>'; each fold overwrites the previous
# fold's entry, so only the last fold's models remain after the loop.
models = {}

# Labels per question indexed by session, built once instead of twice per
# (fold, question) pair.
labels_by_question = {
    n: labels.loc[labels.q == n].set_index('session') for n in range(1, 19)
}

for i, (index_train, index_test) in enumerate(gkf.split(X=df, groups=df.index)):
    print('#' * 25)
    print('### Fold', i + 1)
    print('#' * 25)

    fold_train = df.iloc[index_train]
    fold_test = df.iloc[index_test]

    for n in range(1, 19):
        print(n,', ', end='')

        # Questions 1-3 use level_group 0-4, 4-13 use 5-12, the rest use 13-22.
        if n <= 3: grp = '0-4'
        elif n <= 13: grp = '5-12'
        else: grp = '13-22'

        x_train = fold_train.loc[fold_train.level_group == grp]
        users_train = x_train.index.values
        y_train = labels_by_question[n].loc[users_train]

        x_test = fold_test.loc[fold_test.level_group == grp]
        users_test = x_test.index.values
        y_test = labels_by_question[n].loc[users_test]

        # Fixed seed so that a Restart & Run All reproduces the same scores.
        model = RandomForestClassifier(random_state=42)
        model.fit(x_train[features].astype('float32'), y_train['correct'])

        models[f'{grp}_{n}'] = model
        oof.loc[users_test, n-1] = model.predict_proba(x_test[features].astype('float32'))[:,1]

    print()

Since the values of the `correct` column are fairly imbalanced, using the default threshold of 0.5 to map the predicted probabilities to classes 0 or 1 can result in poor performance. To improve on this, we calculate the F1 score over a range of thresholds and pick the best one, i.e. the threshold with the highest F1 score. We then use this threshold to map the predicted probabilities to class labels 0 or 1. Note that we use the F1 score because it is a better metric than accuracy for evaluating problems with class imbalance.

In [None]:
# Ground-truth matrix with the same shape and index as `oof`:
# column k holds the `correct` labels for question k+1.
true = oof.copy()
for k in range(18):
    # The original referenced undefined names (`targets`, `ALL_USERS`); the
    # labels live in `labels`, and rows must follow the order of the oof index.
    temp = labels.loc[labels.q == k+1].set_index('session').loc[true.index]
    true[k] = temp.correct.values

In [None]:
# Sweep the decision threshold from 0.40 to 0.80 in steps of 0.01 and record
# the macro F1 over all (session, question) pairs; keep the first threshold
# that achieves the highest score.
scores = []
thresholds = []
best_score = 0
best_threshold = 0

flat_true = true.values.reshape((-1))
flat_oof = oof.values.reshape((-1))

for threshold in np.arange(0.4, 0.81, 0.01):
    print(f'{threshold:.02f}, ', end='')
    preds = (flat_oof > threshold).astype('int')
    m = f1_score(flat_true, preds, average='macro')
    scores.append(m)
    thresholds.append(threshold)
    if m > best_score:
        best_score, best_threshold = m, threshold

In [None]:
# Macro F1 as a function of the decision threshold, with the best point
# highlighted by a large marker.
fig, ax = plt.subplots(figsize=(20,5))
ax.plot(thresholds, scores, '-o', color='blue')
ax.scatter([best_threshold], [best_score], color='blue', s=300, alpha=1)
ax.set_xlabel('Threshold', size=14)
ax.set_ylabel('Validation F1 Score', size=14)
ax.set_title(f'Threshold vs. F1_Score with Best F1_Score = {best_score:.3f} at Best Threshold = {best_threshold:.3}', size=18)
plt.show()

In [None]:
# Macro F1 for each question at the selected threshold, then over all
# (session, question) pairs combined.
for k in range(18):
    m = f1_score(true[k].values, (oof[k].values>best_threshold).astype('int'), average='macro')
    # Column k holds question k+1, so label the output with the 1-based number.
    print(f'q{k+1}: F1 =', m)

m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1))>best_threshold).astype('int'), average='macro')
print('Overall F1 =', m)

# Prediction