In [None]:
import xgboost as xgb
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from utils import *
from sklearn.model_selection import KFold

In [None]:
def sigmoid(x):
    """ Apply the logistic sigmoid function element-wise.

    Computed as 1 / (1 + exp(-x)) for x >= 0 and exp(x) / (1 + exp(x))
    for x < 0, so the exponential is only ever taken of a non-positive
    number and cannot overflow to inf (which would produce NaN).

    :param x: scalar or array-like of real numbers
    :return: NumPy float scalar for scalar input, otherwise an ndarray
        with the same shape as x
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) always lies in (0, 1], whatever the magnitude of x.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Unwrap 0-d arrays so scalar callers still receive a scalar.
    return result[()] if result.ndim == 0 else result

In [None]:
def load_metadata(path, reference_year=2020):
  """Load per-student metadata from a CSV file.

  Each row is read as ``id, gender, date_of_birth, premium_pupil``. Rows
  whose first column is not an integer (for example a header line) and
  empty rows are skipped. Every accepted row yields a complete record:
  a field that is blank, missing or unparsable is stored as NaN instead
  of being left out of the record.

  :param path: path to the CSV file
  :param reference_year: year the birth year is subtracted from to get an
      age (defaults to 2020)
  :return: dict mapping the integer id to a dict with keys
      ``"gender"`` (int, NaN when the column is 0),
      ``"date_of_birth"`` (age in years, NaN when the age is 2 or less) and
      ``"premium_pupil"`` (int)
  :raises FileNotFoundError: if ``path`` does not exist
  """
  # Imported locally so the function does not depend on `from utils import *`
  # happening to re-export these modules.
  import csv
  import os

  if not os.path.exists(path):
    raise FileNotFoundError("The specified path {} does not exist.".format(path))

  def _field(row, column, convert):
    # A missing column, blank cell or malformed value all map to NaN.
    try:
      cell = row[column]
      if cell == "":
        return np.nan
      return convert(cell)
    except (IndexError, ValueError):
      return np.nan

  def _gender(cell):
    # 0 is treated as "not available".
    value = int(cell)
    return np.nan if value == 0 else value

  def _age(cell):
    # The first four characters of the date part are taken as the year.
    age = reference_year - int(cell.split(" ")[0][:4])
    # Ages of 2 or less are discarded.
    return age if age > 2 else np.nan

  def _premium(cell):
    return int(float(cell))

  data = {}
  with open(path, "r", newline="") as csv_file:
    for row in csv.reader(csv_file):
      try:
        student_id = int(row[0])
      except (ValueError, IndexError):
        # Header row or empty line.
        continue
      data[student_id] = {
        "gender": _field(row, 1, _gender),
        "date_of_birth": _field(row, 2, _age),
        "premium_pupil": _field(row, 3, _premium),
      }
  return data

In [None]:
# Load every data split from the shared data directory.
sparse_matrix = load_train_sparse("../data").toarray()
train_data = load_train_csv("../data")
val_data = load_valid_csv("../data")
test_data = load_public_test_csv("../data")
private_test_data = load_private_test_csv("../data")
# Disabled: merge the validation split into the training data.
# train_data["user_id"].extend(val_data["user_id"])
# train_data["question_id"].extend(val_data["question_id"])
# train_data["is_correct"].extend(val_data["is_correct"])

# One [user_id, question_id, is_correct] row per training observation.
train_label_np = []
train_obs_np = np.array([
    [train_data["user_id"][row],
     train_data["question_id"][row],
     train_data["is_correct"][row]]
    for row in range(len(train_data["user_id"]))
])

# Fixed seed so the fold assignment is the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
index = 1

# test_i maps (user_id, question_id) -> 0-based index of the fold in which
# that observation is held out.
test_i = {}
for train_index, test_index in kf.split(train_obs_np):
  held_out_fold = index - 1
  test_i.update({
      (train_data["user_id"][num], train_data["question_id"][num]): held_out_fold
      for num in test_index
  })
  #np.save("../data/cv2train" + str(index) + ".npy", train_obs_np[train_index])
  #np.save("../data/cv2test" + str(index) + ".npy", train_obs_np[test_index])
  index += 1

# Disabled: merge the public test split into the training data.
#train_data["user_id"].extend(test_data["user_id"])
#train_data["question_id"].extend(test_data["question_id"])
#train_data["is_correct"].extend(test_data["is_correct"])

In [None]:
# Integer accumulators, indexed by user id (542 slots) or question id
# (1774 slots).
user_acc = np.zeros(542, dtype=int)
user_num_q = np.zeros(542, dtype=int)
q_acc = np.zeros(1774, dtype=int)
q_num_user = np.zeros(1774, dtype=int)
q_age = np.zeros(1774, dtype=int)
q_age_num = np.zeros(1774, dtype=int)
user_q_age = np.zeros(542, dtype=int)
s_metadata = load_metadata("../data/student_meta.csv")

# Per-fold tallies: one dict per fold, keyed by user id / question id.
user_acc_list = [{} for _ in range(5)]
q_acc_list = [{} for _ in range(5)]

for u, q, c in zip(train_data["user_id"],
                   train_data["question_id"],
                   train_data["is_correct"]):
  # Totals over the whole training set.
  user_acc[u] += c
  user_num_q[u] += 1
  q_acc[q] += c
  q_num_user[q] += 1
  age = s_metadata[u]["date_of_birth"]
  if not np.isnan(age):
    q_age[q] += age
    q_age_num[q] += 1

  # [number correct, number seen] within the fold that holds this pair out.
  set_num = test_i[(u,q)]
  user_tally = user_acc_list[set_num].setdefault(u, [0, 0])
  user_tally[0] += c
  user_tally[1] += 1
  q_tally = q_acc_list[set_num].setdefault(q, [0, 0])
  q_tally[0] += c
  q_tally[1] += 1

# Turn each [correct, seen] tally into a fraction, in place.
for i in range(5):
  for tallies in (user_acc_list[i], q_acc_list[i]):
    for key, (hits, total) in list(tallies.items()):
      tallies[key] = float(hits) / total

# Convert the global sums into means; a slot with a zero count divides by
# zero here.
user_acc = user_acc / user_num_q
q_acc = q_acc / q_num_user
q_age = q_age / q_age_num

In [None]:
# IRT: load the per-fold parameters (files are numbered 1..5).
thetas = [np.load("../cvmodels/theta_cv" + str(fold) + ".npy") for fold in range(1, 6)]
betas = [np.load("../cvmodels/beta_cv" + str(fold) + ".npy") for fold in range(1, 6)]

users = train_data["user_id"]
questions = train_data["question_id"]
is_correct = train_data["is_correct"]

# in_data_matrix[u, q] == 1 exactly when (u, q) has a training label.
in_data_matrix = np.full((542,1774),0)
for u, q in zip(users, questions):
  in_data_matrix[u, q] = 1

# diffs[k] holds fold k's sigmoid(theta_u - beta_q) for every pair;
# irt_m starts out as the sum over the five folds.
irt_m = np.full((542,1774),0)
diffs = []
for fold_theta, fold_beta in zip(thetas, betas):
  fold_prob = sigmoid(np.subtract.outer(fold_theta[:,0], fold_beta[:,0]))
  diffs.append(fold_prob)
  irt_m = irt_m + fold_prob

# Pairs without a training label get the mean over the five folds; pairs
# with one get the value from the fold recorded for them in test_i.
seen = in_data_matrix == 1
irt_m[~seen] = irt_m[~seen] / 5
for u, q in zip(*np.nonzero(seen)):
  irt_m[u, q] = diffs[test_i[(u, q)]][u, q]
print(irt_m)

[[0.39660202 0.42578973 0.45482846 ... 0.42060309 0.49302693 0.42977169]
 [0.56909958 0.58203818 0.62612073 ... 0.59415681 0.66246785 0.60256774]
 [0.51558378 0.54585194 0.61506883 ... 0.54086796 0.61202098 0.54980951]
 ...
 [0.30501496 0.32972278 0.35720147 ... 0.32562024 0.39272543 0.3338585 ]
 [0.59180568 0.62116889 0.64843633 ... 0.6166389  0.68219424 0.62514121]
 [0.38994751 0.4183346  0.44741754 ... 0.41334934 0.48546107 0.42256609]]


In [None]:
# Matrix factorisation: load the five per-fold prediction matrices.
mfs = [np.load("../cvmodels/mf" + str(fold) + ".npy") for fold in range(1, 6)]

# Start from the element-wise sum of the fold matrices.
mf_m = np.full((542,1774),0)
for fold_pred in mfs:
  mf_m = mf_m + fold_pred

# Pairs without a training label get the five-fold mean; pairs with one
# get the value from the fold recorded for them in test_i.
seen = in_data_matrix == 1
mf_m[~seen] = mf_m[~seen] / 5
for u, q in zip(*np.nonzero(seen)):
  mf_m[u, q] = mfs[test_i[(u, q)]][u, q]
print(mf_m)

[[0.40907563 0.4089843  0.46233351 ... 0.47747023 0.55754158 0.48424846]
 [0.56071437 0.53011209 0.5844982  ... 0.59987543 0.71790746 0.56823893]
 [0.51769357 0.52393579 0.64469634 ... 0.56088241 0.68140935 0.534998  ]
 ...
 [0.25319577 0.24987541 0.29002778 ... 0.28092009 0.31739314 0.27425873]
 [0.51708427 0.55047206 0.57718069 ... 0.58016197 0.69463896 0.56475846]
 [0.32052839 0.32908193 0.32796212 ... 0.35835621 0.43050846 0.34065481]]


In [None]:
# import autoencoder
class AutoEncoder(nn.Module):
    """ Two-layer autoencoder over a user's question vector.

    The input is mapped to a k-dimensional code by the linear layer ``g``
    and back to ``num_question`` outputs by the linear layer ``b``, with a
    sigmoid after each layer.
    """

    def __init__(self, num_question, k=100,m = 25):
        """ Initialize a class AutoEncoder.

        :param num_question: int, size of the input and output vectors
        :param k: int, size of the hidden code
        :param m: int, unused; kept so existing call sites stay valid
        """
        super(AutoEncoder, self).__init__()

        # Define linear functions.
        self.g = nn.Linear(num_question, k)
        self.b = nn.Linear(k, num_question)

    def get_weight_norm(self):
        """ Return ||W^1|| + ||W^2||.

        :return: scalar tensor, sum of the 2-norms of both weight matrices
        """
        g_w_norm = torch.norm(self.g.weight, 2)
        b_w_norm = torch.norm(self.b.weight, 2)
        return g_w_norm + b_w_norm

    def forward(self, inputs):
        """ Return a forward pass given inputs.

        :param inputs: user vector.
        :return: user vector, every entry in (0, 1) from the final sigmoid.
        """
        hidden = torch.sigmoid(self.g(inputs))
        return torch.sigmoid(self.b(hidden))

# Dense (user, question) matrix of training labels; NaN marks pairs with
# no training label.
train_matrix = np.full((542, 1774), np.nan)
for u, q, c in zip(train_data["user_id"],
                   train_data["question_id"],
                   train_data["is_correct"]):
  train_matrix[u, q] = c

# The model input uses 0 in place of NaN.
zero_train_matrix = train_matrix.copy()
zero_train_matrix[np.isnan(train_matrix)] = 0
zero_train_matrix = torch.FloatTensor(zero_train_matrix)
train_matrix = torch.FloatTensor(train_matrix)

train_u = np.asarray(train_data["user_id"], dtype=int)
train_q = np.asarray(train_data["question_id"], dtype=int)

autoencoders = []
for fold in range(1, 6):
  # NOTE(review): torch.load unpickles a whole model object, which can run
  # arbitrary code; only load checkpoints from a trusted source.
  model = torch.load("../cvmodels/autoencoder" + str(fold) + ".pt")
  model.eval()
  # One batched forward pass gives the reconstruction for every user at
  # once, instead of one forward pass per observation. Values may differ
  # from per-row passes by float32 round-off.
  with torch.no_grad():
    fold_pred = model(zero_train_matrix).detach().numpy().astype(np.float64)

  # Keep predictions only for pairs that occur in one of the data splits.
  matrix = np.full((542,1774),0.0)
  matrix[train_u, train_q] = fold_pred[train_u, train_q]
  for split in (val_data, test_data, private_test_data):
    split_u = np.asarray(split["user_id"], dtype=int)
    split_q = np.asarray(split["question_id"], dtype=int)
    # add.at accumulates repeated (u, q) pairs, like a += loop would.
    np.add.at(matrix, (split_u, split_q), fold_pred[split_u, split_q])
  autoencoders.append(matrix)

# Sum the fold matrices first. Without this step every pair outside the
# training set stays at 0 and the division below is a no-op.
autoencoder_m = np.full((542,1774),0.0)
for fold_matrix in autoencoders:
  autoencoder_m = autoencoder_m + fold_matrix

# Pairs without a training label get the five-fold mean; pairs with one
# get the value from the fold recorded for them in test_i.
seen = in_data_matrix == 1
autoencoder_m[~seen] = autoencoder_m[~seen] / 5
for u, q in zip(*np.nonzero(seen)):
  autoencoder_m[u, q] = autoencoders[test_i[(u, q)]][u, q]
print(autoencoder_m)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.54760659 0.         ... 0.         0.         0.        ]
 [0.         0.         0.81632286 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
# XGBoost: load the five per-fold prediction matrices.
xgs = [np.load("../cvmodels/xgboost" + str(fold) + ".npy") for fold in range(1, 6)]

# Start from the element-wise sum of the fold matrices.
xg_m = np.full((542,1774),0.0)
for fold_pred in xgs:
  xg_m = xg_m + fold_pred

# Pairs without a training label get the five-fold mean; pairs with one
# get the value from the fold recorded for them in test_i.
seen = in_data_matrix == 1
xg_m[~seen] = xg_m[~seen] / 5
for u, q in zip(*np.nonzero(seen)):
  xg_m[u, q] = xgs[test_i[(u, q)]][u, q]
print(xg_m)

[[0.33202231 0.         0.         ... 0.         0.         0.        ]
 [0.         0.55141401 0.         ... 0.         0.         0.        ]
 [0.         0.         0.60529703 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [None]:
# Disabled outer search loop:
#for round in range(17,100):
#  recorded = []
#  for i in range(100):
data = []

avg_theta = np.zeros(542, dtype=int)
avg_beta = np.zeros(1774, dtype=int)
#for i in range(5):
  #avg_theta += thetas[i]
  #avg_beta += betas[i]

# Candidate meta-features, one value per training observation.
# f0: constant bias term
f0 = [1 for _ in users]
# f1: accuracy of the user
f1 = [user_acc[u] for u in users]
# f2: accuracy on the question
f2 = [q_acc[q] for q in questions]
# f3: number of questions the user answered
f3 = [user_num_q[u] for u in users]
# f4: number of users who answered the question
f4 = [q_num_user[q] for q in questions]
# f5: f3 + f4 for the pair
f5 = [user_num_q[u] + q_num_user[q] for u, q in zip(users, questions)]
# f6: question id
f6 = [q for q in questions]
# f7: mean age of the users who answered the question
f7 = [q_age[q] for q in questions]
# f8: gender of the user, 0 when it is NaN
f8 = [s_metadata[u]["gender"] if not np.isnan(s_metadata[u]["gender"]) else 0
      for u in users]
# f9: age of the user (may be NaN)
f9 = [s_metadata[u]["date_of_birth"] for u in users]
# f10: premium pupil flag of the user (may be NaN)
f10 = [s_metadata[u]["premium_pupil"] for u in users]
# Targets.
y = [c for c in is_correct]

# Only the bias feature is active; uncomment an entry to enable it.
f = [
  f0,
  #f1,
  #f2,
  #f3,
  #f4,
  #f5,
  #f6,
  #f7,
  #f8,
  #f9,
  #f10,
]
f = np.array(f)
y = np.array(y)

# Random 0/1 vector of length 21, stored as the first entry of `data`.
randint = np.random.randint(2, size=21)
data.append(randint)

# Base-model outputs for every training pair: a bias column plus one column
# per base model.
# NOTE(review): nn_m is not assigned in any cell visible here; it must
# already exist in the kernel for this cell to run - TODO confirm.
pairs = list(zip(users, questions))
g0 = [1 for _ in pairs]
g1 = [irt_m[u, q] for u, q in pairs]
g2 = [mf_m[u, q] for u, q in pairs]
g3 = [nn_m[u, q] for u, q in pairs]
g4 = [xg_m[u, q] for u, q in pairs]
g5 = [autoencoder_m[u, q] for u, q in pairs]
g = [g0, g1, g2, g3, g4, g5]

# Rows are observations, columns are models.
g = np.array(g).T
num_model = g.shape[1]
num_meta = f.shape[0]

# Design matrix: for each meta-feature k, a copy of g scaled row-wise by
# f[k], laid side by side. Column k*num_model + j pairs feature k with
# model j.
scaled_blocks = [np.multiply(g, f[k][:,None]) for k in range(num_meta)]
if len(scaled_blocks) == 1:
  A = scaled_blocks[0]
else:
  A = np.concatenate(scaled_blocks, axis=1)

# Ridge-regularised least squares: (A^T A + lambd * I) v = A^T y.
lambd = 0.1
identity = np.identity(num_model*num_meta)
gram = A.T.dot(A) + lambd*identity
v = np.linalg.solve(gram, A.T.dot(y))

# Training accuracy of the blended prediction.
pred = []
actual = []
threshold = 0.5
for i in range(len(train_data["user_id"])):
  # Accumulate under a name that does not shadow the builtin sum(), which
  # would otherwise stay overwritten for the rest of the kernel session.
  blended = 0.0
  for j in range(num_model):
    for k in range(num_meta):
      # v is laid out as num_meta consecutive groups of num_model weights.
      blended += v[k*num_model + j] * f[k,i] * g[i,j]
  pred.append(blended >= threshold)
  actual.append(blended)
#print(np.sum(np.array(pred) == 1))
#print(np.sum(np.array(pred) == 0))
#print(actual)
# Convert the labels explicitly so the comparison is element-wise.
acc = np.sum(np.array(train_data["is_correct"]) == np.array(pred)) / len(train_data["is_correct"])
data.append(acc)
#print(acc)

# Validation accuracy of the blended prediction.
pred = []
actual = []
# Full (user, question) matrices in the same column order as g: a bias
# matrix followed by one matrix per base model.
all_g = [
  np.ones((542, 1774)),
  irt_m,
  mf_m,
  nn_m,
  xg_m,
  autoencoder_m,
]

for i in range(len(val_data["user_id"])):
  u = val_data["user_id"][i]
  q = val_data["question_id"][i]
  # Accumulate under a name that does not shadow the builtin sum().
  blended = 0.0
  for j in range(num_model):
    # Only the first num_model weights (those paired with the bias
    # meta-feature) are applied; the disabled lines below show the extra
    # terms for other meta-features.
    blended += v[j] * all_g[j][u,q]
    #blended += v[1*num_model + j] * ((user_acc[u])) * all_g[j][u,q]
    #blended += v[2*num_model + j] * ((q_acc[q])) * all_g[j][u,q]
    #blended += v[1*num_model + j] * (user_num_q[u]) * all_g[j][u,q]
    #blended += v[2*num_model + j] * (q_num_user[q]) * all_g[j][u,q]
    #blended += v[3*num_model + j] * (user_num_q[u] + q_num_user[q]) * all_g[j][u,q]
    #blended += v[1*num_model + j] * q_age[q] * all_g[j][u,q]
    #blended += v[1*num_model + j] * s_metadata[u]["gender"] * all_g[j][u,q]
  pred.append(blended >= threshold)
  actual.append(blended)

#print(np.sum(np.array(pred) == 1))
#print(np.sum(np.array(pred) == 0))
#print(actual)
# Convert the labels explicitly so the comparison is element-wise.
acc = np.sum(np.array(val_data["is_correct"]) == np.array(pred)) / len(val_data["is_correct"])
print(acc)
data.append(acc)


  return array(a, dtype, copy=False, order=order, subok=True)


In [None]:

# Public-test accuracy of the blended prediction.
pred = []
actual = []
for i in range(len(test_data["user_id"])):
  u = test_data["user_id"][i]
  q = test_data["question_id"][i]
  # Accumulate under a name that does not shadow the builtin sum().
  blended = 0.0
  for j in range(num_model):
    # Only the first num_model weights (those paired with the bias
    # meta-feature) are applied; the disabled lines below show the extra
    # terms for other meta-features.
    blended += v[j] * all_g[j][u,q]
    #blended += v[1*num_model + j] * ((user_acc[u])) * all_g[j][u,q]
    #blended += v[2*num_model + j] * ((q_acc[q])) * all_g[j][u,q]
    #blended += v[1*num_model + j] * (user_num_q[u]) * all_g[j][u,q]
    #blended += v[2*num_model + j] * (q_num_user[q]) * all_g[j][u,q]
    #blended += v[3*num_model + j] * (user_num_q[u] + q_num_user[q]) * all_g[j][u,q]
    #blended += v[1*num_model + j] * q_age[q] * all_g[j][u,q]
    #blended += v[1*num_model + j] * s_metadata[u]["gender"] * all_g[j][u,q]
  # Same decision threshold (0.5) as the train and validation evaluation.
  pred.append(blended >= threshold)
  actual.append(blended)
#print(actual)
#print(np.sum(np.array(pred) == 1))
#print(np.sum(np.array(pred) == 0))
# Convert the labels explicitly so the comparison is element-wise.
acc = np.sum(np.array(test_data["is_correct"]) == np.array(pred)) / len(test_data["is_correct"])
print(acc)