In [1]:
# Binary Classifier implementation (model architecture, training, testing, etc.) derived from
#     https://towardsdatascience.com/pytorch-tabular-binary-classification-a0368da5bb89

In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
# loading subset of training data as per https://www.kaggle.com/sohier/competition-api-detailed-introduction/notebook
train_df = pd.read_csv('data/train.csv', low_memory=False, nrows=10**5, 
                       dtype={'row_id': 'int64', 'timestamp': 'int64', 'user_id': 'int32', 'content_id': 'int16', 'content_type_id': 'int8',
                              'task_container_id': 'int16', 'user_answer': 'int8', 'answered_correctly': 'int8', 'prior_question_elapsed_time': 'float32', 
                             'prior_question_had_explanation': 'boolean',
                             }
                      )

In [4]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False
...,...,...,...,...,...,...,...,...,...,...
99995,99995,153647401,2078569,4334,0,275,3,0,6000.0,True
99996,99996,153692472,2078569,6436,0,276,3,0,9000.0,True
99997,99997,153722998,2078569,6446,0,277,2,1,21000.0,True
99998,99998,153759775,2078569,3715,0,278,3,0,12000.0,True


In [5]:
# get equal amount of correct and incorrect rows and shuffle data
num_correct_rows = train_df[train_df['answered_correctly'] == 1].shape[0]
num_incorrect_rows = train_df[train_df['answered_correctly'] == 0].shape[0]
num_train_rows = min(num_correct_rows, num_incorrect_rows) * 2

correct_df = train_df[train_df['answered_correctly'] == 1][:num_train_rows // 2]
incorrect_df = train_df[train_df['answered_correctly'] == 0][:num_train_rows // 2]
train_df = pd.concat([correct_df, incorrect_df]).dropna().sample(frac=1)

In [6]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
85540,85540,9112549570,1864702,3734,0,1618,2,0,15000.0,True
78960,78960,3072436049,1746406,5196,0,343,2,0,11000.0,True
8371,8371,956298,24600,9887,0,19,1,0,15000.0,True
9001,9001,502229,53842,6879,0,14,3,0,29000.0,False
49043,49043,5469865800,1282581,2975,0,441,1,0,15333.0,True
...,...,...,...,...,...,...,...,...,...,...
58559,58559,9575228678,1283420,9286,0,2866,0,0,28000.0,True
11939,11939,1187932394,107002,6870,0,523,1,0,36000.0,True
84293,84293,4153620962,1864702,2095,0,841,3,0,34000.0,True
87999,87999,31616728,1959138,4922,0,70,3,0,32000.0,True


In [7]:
X = train_df.iloc[:, np.r_[2:7, 8:10]]
y = train_df.iloc[:, 7]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [9]:
EPOCHS = 50
BATCH_SIZE = 64
LEARNING_RATE = 0.001

In [10]:
class BinaryClassifier(nn.Module):
    def __init__(self):
        super(BinaryClassifier, self).__init__()

        self.layer_1 = nn.Linear(7, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        
        return x

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BinaryClassifier()
model.to(device)

BinaryClassifier(
  (layer_1): Linear(in_features=7, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [12]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [13]:
class TrainDataset(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

train_data = TrainDataset(torch.FloatTensor(np.array(X_train.values, dtype=np.float64)), 
                          torch.FloatTensor(y_train.values))

In [14]:
class TestDataset(Dataset):
    
    def __init__(self, X_data):
        self.X_data = X_data
        
    def __getitem__(self, index):
        return self.X_data[index]
        
    def __len__ (self):
        return len(self.X_data)
    

test_data = TestDataset(torch.FloatTensor(np.array(X_test.values, dtype=np.float64)))

In [15]:
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [16]:
# calculates accuracy given predictions and labels
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [17]:
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss = 0
    epoch_acc = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)

        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

Epoch 001: | Loss: 0.53145 | Acc: 71.845
Epoch 002: | Loss: 0.51837 | Acc: 72.391
Epoch 003: | Loss: 0.51547 | Acc: 72.539
Epoch 004: | Loss: 0.51562 | Acc: 72.474
Epoch 005: | Loss: 0.51713 | Acc: 72.402
Epoch 006: | Loss: 0.51309 | Acc: 72.675
Epoch 007: | Loss: 0.51522 | Acc: 72.486
Epoch 008: | Loss: 0.51526 | Acc: 72.559
Epoch 009: | Loss: 0.51469 | Acc: 72.540
Epoch 010: | Loss: 0.51326 | Acc: 72.640
Epoch 011: | Loss: 0.51312 | Acc: 72.616
Epoch 012: | Loss: 0.51271 | Acc: 72.795
Epoch 013: | Loss: 0.51135 | Acc: 72.595
Epoch 014: | Loss: 0.51226 | Acc: 72.517
Epoch 015: | Loss: 0.51166 | Acc: 72.747
Epoch 016: | Loss: 0.51328 | Acc: 72.537
Epoch 017: | Loss: 0.51213 | Acc: 72.640
Epoch 018: | Loss: 0.51253 | Acc: 72.525
Epoch 019: | Loss: 0.51239 | Acc: 72.458
Epoch 020: | Loss: 0.51099 | Acc: 72.626
Epoch 021: | Loss: 0.51265 | Acc: 72.491
Epoch 022: | Loss: 0.51041 | Acc: 72.756
Epoch 023: | Loss: 0.50946 | Acc: 72.522
Epoch 024: | Loss: 0.51017 | Acc: 72.708
Epoch 025: | Los

In [18]:
# get predictions for test data
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        y_test_pred = model(X_batch)
        y_test_pred = torch.sigmoid(y_test_pred)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.cpu().numpy())

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [19]:
# get performance on test data
print(confusion_matrix(y_test, y_pred_list))
print(classification_report(y_test, y_pred_list))

[[ 4888  5162]
 [   65 10167]]
              precision    recall  f1-score   support

           0       0.99      0.49      0.65     10050
           1       0.66      0.99      0.80     10232

    accuracy                           0.74     20282
   macro avg       0.83      0.74      0.72     20282
weighted avg       0.82      0.74      0.72     20282

