# Part 1: KNN -- Doh

In [None]:
import numpy as np
from sklearn.impute import KNNImputer
from utils import *
import matplotlib.pyplot as plt


def knn_impute_by_user(matrix, valid_data, k):
    """ Impute missing answers from the k most similar students and score
    the completed matrix against valid_data.

    See https://scikit-learn.org/stable/modules/generated/sklearn.
    impute.KNNImputer.html for details.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int, number of neighbours used by the imputer
    :return: float, accuracy returned by sparse_matrix_evaluate
    """
    # The imputer compares rows with the NaN-Euclidean distance measure.
    completed = KNNImputer(n_neighbors=k).fit_transform(matrix)
    return sparse_matrix_evaluate(valid_data, completed)


def knn_impute_by_item(matrix, valid_data, k):
    """ Impute missing answers from the k most similar questions and score
    the completed matrix against valid_data.

    :param matrix: 2D sparse matrix
    :param valid_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param k: int, number of neighbours used by the imputer
    :return: float, accuracy returned by sparse_matrix_evaluate
    """
    # The imputer compares rows, so transpose to make each question a row,
    # then transpose the result back before scoring.
    by_question = np.transpose(matrix)
    completed = KNNImputer(n_neighbors=k).fit_transform(by_question)
    return sparse_matrix_evaluate(valid_data, np.transpose(completed))


def main():
    """ Run the KNN imputation experiment end to end.

    Loads the training matrix and the validation / public test sets from
    "../data", measures user-based and item-based validation accuracy over
    a fixed grid of k, plots and prints the results, and finally reports
    the test accuracy of each approach at its best-validating k.
    """
    sparse_matrix = load_train_sparse("../data").toarray()
    val_data = load_valid_csv("../data")
    test_data = load_public_test_csv("../data")

    print("Sparse matrix:")
    print(sparse_matrix)
    print("Shape of sparse matrix:")
    print(sparse_matrix.shape)

    #####################################################################
    # TODO:                                                             #
    # Compute the validation accuracy for each k. Then pick k* with     #
    # the best performance and report the test accuracy with the        #
    # chosen k*.                                                        #
    #####################################################################
    candidate_ks = [1, 6, 11, 16, 21, 26]
    user_scores = []
    item_scores = []
    for k in candidate_ks:
        user_scores.append(knn_impute_by_user(sparse_matrix, val_data, k))
        item_scores.append(knn_impute_by_item(sparse_matrix, val_data, k))

    # Plot k vs. validation accuracy for both approaches on one figure.
    figure = plt.figure()
    curves = (
        (user_scores, "Validation Accuracy (user)"),
        (item_scores, "Validation Accuracy (item)"),
    )
    for scores, label in curves:
        plt.plot(candidate_ks, scores, label=label)
    plt.title("k vs. Validation Accuracy")
    plt.xlabel("Value of k")
    plt.ylabel("Validation Accuracy")
    plt.legend()
    plt.show()
    figure.savefig("final_knn.pdf")

    # Report the accuracy obtained for every k.
    for k, score in zip(candidate_ks, user_scores):
        print("Validation Accuracy (user) for ", k, " : ", score)
    for k, score in zip(candidate_ks, item_scores):
        print("Validation Accuracy (item) for ", k, " : ", score)

    # k* is the first k that reaches the maximum validation accuracy.
    positions = range(len(candidate_ks))
    max_k_user = candidate_ks[max(positions, key=user_scores.__getitem__)]
    max_k_item = candidate_ks[max(positions, key=item_scores.__getitem__)]

    print("k with Maximum Validation Accuracy (user): ", max_k_user)
    print("k with Maximum Validation Accuracy (item): ", max_k_item)

    # Report test accuracies at the selected k*.
    test_acc_user = knn_impute_by_user(sparse_matrix, test_data, max_k_user)
    test_acc_item = knn_impute_by_item(sparse_matrix, test_data, max_k_item)
    print("Test Accuracy (user) with k* = ", max_k_user, " : ", test_acc_user)
    print("Test Accuracy (item) with k* = ", max_k_item, " : ", test_acc_item)
    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################

    # Report

    # d) The test accuracy of user-based collaborative filtering with k* = 11
    # was about 0.6842, and the test accuracy of item-based collaborative
    # filtering with k* = 21 was about 0.6816.
    # User-based collaborative filtering has slightly higher test accuracy,
    # by a difference of about 0.0026.

    # e) 1. It might return inaccurate predictions if no (or only a few)
    #       students answered the other questions similarly, or simply
    #       if there is not enough data (answers from other students).
    #    2. The computational costs, such as time and storage, might be
    #       expensive if the data is huge.


if __name__ == "__main__":
    main()


# Part 2: Item Response Theory -- Doh

In [None]:
import math

from utils import *

import numpy as np
import matplotlib.pyplot as plt


def sigmoid(x):
    """ Apply the logistic sigmoid 1 / (1 + exp(-x)) element-wise.

    The value is computed from exp(-|x|), which never exceeds 1, so large
    positive or negative inputs saturate to 1.0 / 0.0 instead of
    overflowing to inf and producing nan.

    :param x: scalar or array-like of real numbers
    :return: NumPy scalar for scalar input, otherwise an array with the
    same shape as x
    """
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use e^x / (1 + e^x).
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple unwraps a 0-d result into a NumPy scalar
    # and leaves higher-dimensional arrays untouched.
    return result[()]


def neg_log_likelihood(data, theta, beta):
    """ Compute the negative log-likelihood of the observed answers.

    For each observation the logit is z = theta[user][0] - beta[question][0].
    An answer with is_correct == 1 contributes -log(sigmoid(z)); any other
    value contributes -log(1 - sigmoid(z)). Both terms are evaluated as
    log(1 + e^z) - c * z with np.logaddexp, so extreme logits yield large
    finite values rather than nan or a math domain error.

    :param data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param theta: Column vector indexed as theta[user_id][0]
    :param beta: Column vector indexed as beta[question_id][0]
    :return: float
    """
    users = np.asarray(data["user_id"], dtype=int)
    questions = np.asarray(data["question_id"], dtype=int)
    answered_right = np.asarray(data["is_correct"]) == 1

    ability = np.asarray(theta, dtype=float)[users, 0]
    difficulty = np.asarray(beta, dtype=float)[questions, 0]
    logits = ability - difficulty

    # -log sigmoid(z) = log(1 + e^z) - z and -log(1 - sigmoid(z)) = log(1 + e^z).
    per_answer = np.logaddexp(0.0, logits) - np.where(answered_right, logits, 0.0)
    return float(np.sum(per_answer))


def update_theta_beta(data, lr, theta, beta):
    """ Sweep once over the observations, updating theta and beta in place.

    Observations are visited in the order they appear in data. For each one
    the student's theta entry is stepped first, and the question's beta
    entry is then stepped using that freshly updated theta. A correct
    answer raises theta and lowers beta; any other answer does the reverse.

    You may optionally replace the function arguments to receive a matrix.

    :param data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param lr: float
    :param theta: Vector, indexed as theta[user_id][0]; modified in place
    :param beta: Vector, indexed as beta[question_id][0]; modified in place
    :return: tuple of vectors (the same theta and beta objects)
    """
    #####################################################################
    # TODO:                                                             #
    # Implement the function as described in the docstring.             #
    #####################################################################
    for idx, student in enumerate(data["user_id"]):
        question = data["question_id"][idx]
        # Row handles: writing through them updates theta / beta in place.
        ability = theta[student]
        difficulty = beta[question]

        if data["is_correct"][idx] != 1:
            # Incorrect answer: step size is sigmoid(theta - beta).
            ability[0] = ability[0] - \
                lr * sigmoid(ability[0] - difficulty[0])
            difficulty[0] = difficulty[0] + \
                lr * sigmoid(ability[0] - difficulty[0])
        else:
            # Correct answer: step size is sigmoid(beta - theta).
            ability[0] = ability[0] + \
                lr * sigmoid((-1) * ability[0] + difficulty[0])
            difficulty[0] = difficulty[0] - \
                lr * sigmoid((-1) * ability[0] + difficulty[0])
    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################
    return theta, beta


def irt(data, val_data, lr, iterations):
    """ Train the IRT model and record its progress.

    theta and beta start as column vectors filled with 0.1, sized from the
    largest user / question id found in data. On every iteration the
    training negative log-likelihood and the accuracy on val_data are
    recorded and printed *before* the parameters are updated.

    You may optionally replace the function arguments to receive a matrix.

    :param data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param val_data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param lr: float
    :param iterations: int
    :return: (theta, beta, val_acc_lst, neg_lld_list)
    """
    num_students = max(data["user_id"]) + 1
    num_questions = max(data["question_id"]) + 1
    theta = np.full((num_students, 1), 0.1)
    beta = np.full((num_questions, 1), 0.1)

    val_acc_lst, neg_lld_list = [], []

    for _ in range(iterations):
        neg_lld = neg_log_likelihood(data, theta=theta, beta=beta)
        score = evaluate(data=val_data, theta=theta, beta=beta)
        neg_lld_list.append(neg_lld)
        val_acc_lst.append(score)
        print("NLLK: {} \t Score: {}".format(neg_lld, score))
        theta, beta = update_theta_beta(data, lr, theta, beta)

    return theta, beta, val_acc_lst, neg_lld_list


def evaluate(data, theta, beta):
    """ Evaluate the model given data and return the accuracy.

    An answer is predicted correct when theta[user] - beta[question] >= 0,
    which is the same decision as sigmoid(...) >= 0.5 but cannot be
    corrupted by floating-point overflow inside the sigmoid.

    :param data: A dictionary {user_id: list, question_id: list,
    is_correct: list}
    :param theta: Vector (entries per user are summed if there are several)
    :param beta: Vector (entries per question are summed if there are several)
    :return: float, fraction of predictions equal to is_correct; nan when
    data contains no observations
    """
    users = np.asarray(data["user_id"], dtype=int)
    questions = np.asarray(data["question_id"], dtype=int)
    labels = np.asarray(data["is_correct"])

    logits = (np.asarray(theta, dtype=float)[users]
              - np.asarray(beta, dtype=float)[questions])
    # Collapse any trailing axes so there is one logit per observation.
    logits = logits.sum(axis=tuple(range(1, logits.ndim)))

    predictions = logits >= 0
    return np.mean(predictions == labels)


def main():
    """ Train the IRT model and produce the assignment's plots.

    Trains once while monitoring the validation set, saves the negative
    log-likelihood curve, trains again from scratch while monitoring the
    test set, and finally plots the probability of a correct answer against
    theta for three fixed questions using the first run's parameters.
    """
    train_data = load_train_csv("../data")
    # You may optionally use the sparse matrix.
    # sparse_matrix = load_train_sparse("../data")
    val_data = load_valid_csv("../data")
    test_data = load_public_test_csv("../data")

    #####################################################################
    # TODO:                                                             #
    # Tune learning rate and number of iterations. With the implemented #
    # code, report the validation and test accuracy.                    #
    #####################################################################
    # Hyperparameters
    lr = 0.1
    iterations = 20

    val_theta, val_beta, val_accuracies, val_nll_curve = irt(
        train_data, val_data, lr, iterations)
    print("Validation accuracies: ", val_accuracies)

    nll_figure = plt.figure()
    plt.plot(list(range(iterations)), val_nll_curve)
    plt.title("Number of iteration vs. Negative log likelihood")
    plt.xlabel("Number of iteration")
    plt.ylabel("Negative log likelihood")
    nll_figure.savefig("csc311_nllk.png")

    # Second full training run; only its per-iteration test accuracy is used.
    _, _, test_accuracies, _ = irt(train_data, test_data, lr, iterations)
    print("Testing accuracies: ", test_accuracies)

    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################

    #####################################################################
    # TODO:                                                             #
    # Implement part (d)                                                #
    #####################################################################
    question_ids = [1, 10, 100]
    curve_labels = ("j_1 = 1", "j_2 = 10", "j_3 = 100")

    # Sort the learned abilities so each curve is drawn left to right.
    sorted_thetas = sorted(value for row in val_theta for value in row)
    difficulties = [val_beta[question][0] for question in question_ids]
    curves = [
        [sigmoid(ability - difficulty) for ability in sorted_thetas]
        for difficulty in difficulties
    ]

    curve_figure = plt.figure()
    for curve, label in zip(curves, curve_labels):
        plt.plot(sorted_thetas, curve, label=label)
    plt.title("Theta given j vs. Probability of correct answer")
    plt.xlabel("Theta given j")
    plt.ylabel("Probability of correct answer")
    plt.legend()
    plt.show()
    curve_figure.savefig("csc311_theta_prob.png")
    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################


if __name__ == "__main__":
    main()



# Part 3: Neural Networks -- Devansh



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/ so that the dataset paths under
# /content/drive/MyDrive used by later cells can be read.
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
from torch.autograd import Variable

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

import numpy as np
import torch

import matplotlib.pyplot as plt

In [None]:
from scipy.sparse import load_npz
import csv
import os

In [None]:
##UTILS FUNCTIONS

def _load_csv(path):
    # A helper function to load the csv file.
    if not os.path.exists(path):
        raise Exception("The specified path {} does not exist.".format(path))
    # Initialize the data.
    data = {
        "user_id": [],
        "question_id": [],
        "is_correct": []
    }
    # Iterate over the row to fill in the data.
    with open(path, "r") as csv_file:
        reader = csv.reader(csv_file)
        for row in reader:
            try:
                data["question_id"].append(int(row[0]))
                data["user_id"].append(int(row[1]))
                data["is_correct"].append(int(row[2]))
            except ValueError:
                # Pass first row.
                pass
            except IndexError:
                # is_correct might not be available.
                pass
    return data


def load_train_sparse(root_dir="/data"):
    """ Load the training data as a sparse matrix representation.

    Reads the file "train_sparse.npz" located inside root_dir.

    :param root_dir: str
    :return: 2D sparse matrix
    :raises Exception: if the npz file does not exist
    """
    npz_path = os.path.join(root_dir, "train_sparse.npz")
    if os.path.exists(npz_path):
        return load_npz(npz_path)
    raise Exception("The specified path {} "
                    "does not exist.".format(os.path.abspath(npz_path)))


def load_train_csv(root_dir="/data"):
    """ Load the training data ("train_data.csv" in root_dir) as a dictionary.

    :param root_dir: str
    :return: A dictionary {user_id: list, question_id: list, is_correct: list}
        WHERE
        user_id: a list of user id.
        question_id: a list of question id.
        is_correct: a list of binary value indicating the correctness of
        (user_id, question_id) pair.
    """
    return _load_csv(os.path.join(root_dir, "train_data.csv"))


def load_valid_csv(root_dir="/data"):
    """ Load the validation data ("valid_data.csv" in root_dir) as a dictionary.

    :param root_dir: str
    :return: A dictionary {user_id: list, question_id: list, is_correct: list}
        WHERE
        user_id: a list of user id.
        question_id: a list of question id.
        is_correct: a list of binary value indicating the correctness of
        (user_id, question_id) pair.
    """
    return _load_csv(os.path.join(root_dir, "valid_data.csv"))


def load_public_test_csv(root_dir="/data"):
    """ Load the test data ("test_data.csv" in root_dir) as a dictionary.

    :param root_dir: str
    :return: A dictionary {user_id: list, question_id: list, is_correct: list}
        WHERE
        user_id: a list of user id.
        question_id: a list of question id.
        is_correct: a list of binary value indicating the correctness of
        (user_id, question_id) pair.
    """
    return _load_csv(os.path.join(root_dir, "test_data.csv"))

In [None]:
def load_data(base_path="../starter_code/data"):
    """ Load the data, with the training matrix as PyTorch tensors.

    :param base_path: str, directory holding the data files
    :return: (zero_train_matrix, train_matrix, train_data, valid_data,
        test_data)
        WHERE:
        zero_train_matrix: 2D FloatTensor where NaN (missing) entries are
        replaced with 0.
        train_matrix: 2D FloatTensor with NaN entries left in place.
        train_data: A dictionary {user_id: list,
        question_id: list, is_correct: list}
        valid_data: A dictionary {user_id: list,
        question_id: list, is_correct: list}
        test_data: A dictionary {user_id: list,
        question_id: list, is_correct: list}
    """
    dense_matrix = load_train_sparse(base_path).toarray()
    train_data = load_train_csv(base_path)
    valid_data = load_valid_csv(base_path)
    test_data = load_public_test_csv(base_path)

    # Replace every missing (NaN) entry with 0 in a separate copy.
    zero_filled = np.where(np.isnan(dense_matrix), 0, dense_matrix)

    # Change to Float Tensor for PyTorch.
    return (
        torch.FloatTensor(zero_filled),
        torch.FloatTensor(dense_matrix),
        train_data,
        valid_data,
        test_data,
    )

In [None]:
class AutoEncoder(nn.Module):
    # Executed once, when the class body is evaluated (not per instance).
    torch.manual_seed(42)

    def __init__(self, num_question, k=100):
        """ Build a one-hidden-layer autoencoder over question vectors.

        :param num_question: int, size of the input and output layers
        :param k: int, size of the hidden (latent) layer
        """
        super().__init__()

        # Linear maps: num_question -> k (encoder), k -> num_question (decoder).
        self.encoder = nn.Linear(in_features=num_question, out_features=k)
        self.decoder = nn.Linear(in_features=k, out_features=num_question)

    def get_weight_norm(self):
        """ Return ||W^1||^2 + ||W^2||^2 for the encoder and decoder weights.

        :return: 0-dimensional tensor holding the sum of the squared 2-norms
        """
        squared_norms = [
            layer.weight.norm(2) ** 2
            for layer in (self.encoder, self.decoder)
        ]
        return squared_norms[0] + squared_norms[1]

    def forward(self, inputs):
        """ Return a forward pass given inputs.

        Both the encoder and the decoder are followed by a sigmoid.

        :param inputs: user vector.
        :return: user vector.
        """
        hidden = torch.sigmoid(self.encoder(inputs))
        return torch.sigmoid(self.decoder(hidden))

In [None]:
def evaluate(model, train_data, valid_data):
    """ Evaluate the valid_data on the current model.

    For every (user, question) pair in valid_data the model is run on that
    user's row of train_data, and the output for the question is
    thresholded at 0.5. Inference runs under torch.no_grad() so no autograd
    graph is built. The model is switched to eval mode and left there.

    :param model: Module
    :param train_data: 2D FloatTensor
    :param valid_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :return: float, fraction of pairs predicted correctly
    :raises ZeroDivisionError: if valid_data contains no pairs
    """
    # Tell PyTorch you are evaluating the model.
    model.eval()

    total = 0
    correct = 0

    with torch.no_grad():
        for i, u in enumerate(valid_data["user_id"]):
            output = model(train_data[u].unsqueeze(0))

            guess = output[0][valid_data["question_id"][i]].item() >= 0.5
            if guess == valid_data["is_correct"][i]:
                correct += 1
            total += 1
    return correct / float(total)

In [None]:
def train(model, lr, lamb, train_matrix, zero_train_data, train_data, valid_data, num_epoch):
    """ Train the neural network, where the objective also includes
    a regularizer.

    Each epoch runs one SGD step per student on the squared reconstruction
    error of that student's observed answers. When lamb is non-zero,
    (lamb / 2) * model.get_weight_norm() is added to every per-student
    loss; with lamb == 0 the objective is the plain reconstruction error.
    After each epoch the training and validation accuracy are recorded and
    a summary line is printed; two plots are shown once training finishes.

    :param model: Module
    :param lr: float
    :param lamb: float
    :param train_matrix: 2D FloatTensor with NaN for unobserved answers
    :param zero_train_data: 2D FloatTensor with unobserved answers set to 0
    :param train_data: Dict
    :param valid_data: Dict
    :param num_epoch: int
    :return: None
    """
    # Define optimizers and loss function.
    optimizer = optim.SGD(model.parameters(), lr=lr)
    num_student = train_matrix.shape[0]
    train_losses = []
    train_accs = []
    val_accs = []
    eps = []

    for epoch in range(0, num_epoch):
        # evaluate() switches the model to eval mode at the end of every
        # epoch, so training mode has to be restored here.
        model.train()
        train_loss = 0.
        eps.append(epoch)

        for user_id in range(num_student):
            inputs = Variable(zero_train_data[user_id]).unsqueeze(0)  #answers to all questions by a student
            target = inputs.clone()

            optimizer.zero_grad()
            output = model(inputs)

            # Mask the target to only compute the gradient of valid entries.
            nan_mask = np.isnan(train_matrix[user_id].unsqueeze(0).numpy())
            target[0][nan_mask] = output[0][nan_mask]

            loss = torch.sum((output - target) ** 2.)
            if lamb != 0:
                # Recomputed on every step so the penalty tracks the current
                # weights and contributes to the gradient.
                loss = loss + (lamb / 2) * model.get_weight_norm()
            loss.backward()

            train_loss += loss.item()
            optimizer.step()
        train_acc = evaluate(model, zero_train_data, train_data)
        valid_acc = evaluate(model, zero_train_data, valid_data)

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_accs.append(valid_acc)
        print("Epoch: {} \tTraining Cost: {:.6f}\t "
              "Valid Acc: {}".format(epoch, train_loss, valid_acc))

    # Plot the training loss, then the training / validation accuracy.
    plt.title("Training Loss vs. Epochs")
    plt.plot(eps, train_losses, label="Training Curve")
    plt.xlabel("Epochs")
    plt.ylabel("Training Loss")
    plt.show()

    plt.title("Accuracy vs. Epochs")
    plt.plot(eps, train_accs, label="Training Curve")
    plt.plot(eps, val_accs, label="Validation Curve")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()

    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################

In [None]:
if __name__ == "__main__":
    # Load the tensors and dictionaries from the mounted Drive folder.
    path = "/content/drive/MyDrive/starter_code/data"
    (zero_train_matrix, train_matrix, train_data,
     valid_data, test_data) = load_data(path)

    #####################################################################
    # TODO:                                                             #
    # Try out 5 different k and select the best k using the             #
    # validation set.                                                   #
    #####################################################################
    # Set model hyperparameters.
    k = 50
    num_questions = zero_train_matrix.shape[1]
    model = AutoEncoder(num_question=num_questions, k=k)

    # Set optimization hyperparameters (lamb = 0: no regularization).
    lr = 0.01
    num_epoch = 40
    lamb = 0

    train(model, lr, lamb, train_matrix, zero_train_matrix, train_data,
          valid_data, num_epoch)
    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################

Epoch: 0 	Training Cost: 13504.900812	 Valid Acc: 0.6195314705052216
Epoch: 1 	Training Cost: 12322.248103	 Valid Acc: 0.6344905447360993
Epoch: 2 	Training Cost: 11671.075566	 Valid Acc: 0.6536833192209992


KeyboardInterrupt: ignored

Regularized NN

In [None]:
def train_reg(model, lr, lamb, train_matrix, zero_train_data, train_data, valid_data, num_epoch):
    """ Train the neural network, where the objective also includes
    a regularizer.

    Each per-student loss is the squared reconstruction error on the
    observed answers plus (lamb / 2) * model.get_weight_norm(). The norm is
    recomputed from the live weights on every step so that it contributes
    to the gradient; a norm computed once and detached would only shift the
    reported loss by a constant.

    :param model: Module
    :param lr: float
    :param lamb: float
    :param train_matrix: 2D FloatTensor with NaN for unobserved answers
    :param zero_train_data: 2D FloatTensor with unobserved answers set to 0
    :param train_data: Dict
    :param valid_data: Dict
    :param num_epoch: int
    :return: None
    """
    # Report the weight norm before any training step, for reference only.
    initial_norm = model.get_weight_norm().detach().numpy()
    print("norm = ", initial_norm)

    # Define optimizers and loss function.
    optimizer = optim.SGD(model.parameters(), lr=lr)
    num_student = train_matrix.shape[0]

    eps = []
    train_losses = []
    train_accs = []
    val_accs = []

    for epoch in range(0, num_epoch):
        # evaluate() switches the model to eval mode at the end of every
        # epoch, so training mode has to be restored here.
        model.train()
        train_loss = 0.

        for user_id in range(num_student):
            inputs = Variable(zero_train_data[user_id]).unsqueeze(0)  #answers to all questions by a student
            target = inputs.clone()

            optimizer.zero_grad()
            output = model(inputs)

            # Mask the target to only compute the gradient of valid entries.
            nan_mask = np.isnan(train_matrix[user_id].unsqueeze(0).numpy())
            target[0][nan_mask] = output[0][nan_mask]

            # The penalty stays attached to the graph so it shrinks the weights.
            penalty = (lamb / 2) * model.get_weight_norm()
            loss = torch.sum((output - target) ** 2.) + penalty
            loss.backward()

            train_loss += loss.item()
            optimizer.step()

        train_acc = evaluate(model, zero_train_data, train_data)
        valid_acc = evaluate(model, zero_train_data, valid_data)

        eps.append(epoch)
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_accs.append(valid_acc)
        print("Epoch: {} \tTraining Cost: {:.6f}\t "
              "Valid Acc: {}".format(epoch, train_loss, valid_acc))

    # Plot the training loss, then the training / validation accuracy.
    plt.title("Training Loss vs. Epochs")
    plt.plot(eps, train_losses, label="Training Curve")
    plt.xlabel("Epochs")
    plt.ylabel("Training Loss")
    plt.show()

    plt.title("Accuracy vs. Epochs")
    plt.plot(eps, train_accs, label="Training Curve")
    plt.plot(eps, val_accs, label="Validation Curve")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()
    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################

In [None]:
if __name__ == "__main__":
    # Load the tensors and dictionaries from the mounted Drive folder.
    path = "/content/drive/MyDrive/starter_code/data"
    (zero_train_matrix, train_matrix, train_data,
     valid_data, test_data) = load_data(path)

    #####################################################################
    # TODO:                                                             #
    # Try out 5 different k and select the best k using the             #
    # validation set.                                                   #
    #####################################################################
    # Set model hyperparameters.
    k = 50
    num_questions = zero_train_matrix.shape[1]
    model_reg = AutoEncoder(num_question=num_questions, k=k)

    # Set optimization hyperparameters.
    lr = 0.01
    num_epoch = 40
    lamb = 0.001

    train_reg(model_reg, lr, lamb, train_matrix, zero_train_matrix,
              train_data, valid_data, num_epoch)
    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################

norm =  596.55023
Epoch: 0 	Training Cost: 13674.634931	 Valid Acc: 0.622495060683037
Epoch: 1 	Training Cost: 12463.829630	 Valid Acc: 0.6363251481795089
Epoch: 2 	Training Cost: 11755.400912	 Valid Acc: 0.6515664690939882
Epoch: 3 	Training Cost: 11162.529329	 Valid Acc: 0.6642675698560542
Epoch: 4 	Training Cost: 10671.385089	 Valid Acc: 0.6699125035280835
Epoch: 5 	Training Cost: 10273.309865	 Valid Acc: 0.6758396838837144
Epoch: 6 	Training Cost: 9935.320303	 Valid Acc: 0.6804967541631386
Epoch: 7 	Training Cost: 9642.127272	 Valid Acc: 0.6819079875811459
Epoch: 8 	Training Cost: 9389.196759	 Valid Acc: 0.6809201241885408
Epoch: 9 	Training Cost: 9156.072784	 Valid Acc: 0.6812023708721423
Epoch: 10 	Training Cost: 8945.150385	 Valid Acc: 0.6799322607959356
Epoch: 11 	Training Cost: 8771.104641	 Valid Acc: 0.6797911374541349
Epoch: 12 	Training Cost: 8592.795979	 Valid Acc: 0.678662150719729
Epoch: 13 	Training Cost: 8432.629589	 Valid Acc: 0.6775331639853232
Epoch: 14 	Training Co

#Part 4: Bagging -- Devansh

In [None]:
#import utils

def load_data_for_bagging(base_path="../starter_code/data"):
    """ Load the data as PyTorch tensors, plus the raw NumPy training matrix.

    :param base_path: str, directory holding the data files
    :return: (zero_train_matrix, train_matrix1, train_data, valid_data,
        test_data, train_matrix)
        WHERE:
        zero_train_matrix: 2D FloatTensor where NaN (missing) entries are
        replaced with 0.
        train_matrix1: 2D FloatTensor with NaN entries left in place.
        train_data: A dictionary {user_id: list,
        question_id: list, is_correct: list}
        valid_data: A dictionary {user_id: list,
        question_id: list, is_correct: list}
        test_data: A dictionary {user_id: list,
        question_id: list, is_correct: list}
        train_matrix: the dense NumPy training matrix the tensors were
        built from (NaN entries left in place).
    """
    dense_matrix = load_train_sparse(base_path).toarray()
    train_data = load_train_csv(base_path)
    valid_data = load_valid_csv(base_path)
    test_data = load_public_test_csv(base_path)

    # Replace every missing (NaN) entry with 0 in a separate copy.
    zero_filled = np.where(np.isnan(dense_matrix), 0, dense_matrix)

    # Change to Float Tensor for PyTorch; the NumPy matrix is returned too.
    return (
        torch.FloatTensor(zero_filled),
        torch.FloatTensor(dense_matrix),
        train_data,
        valid_data,
        test_data,
        dense_matrix,
    )

In [None]:
def train_bagging(model, lr, lamb, train_data, zero_train_data, valid_data, num_epoch):
    """ Train the neural network, where the objective also includes
    a regularizer.

    Each per-student loss is the squared reconstruction error on the
    observed answers plus (lamb / 2) * model.get_weight_norm(). The norm is
    recomputed from the live weights on every step so that it contributes
    to the gradient; a norm computed once and detached would only shift the
    reported loss by a constant.

    :param model: Module
    :param lr: float
    :param lamb: float
    :param train_data: 2D FloatTensor with NaN for unobserved answers
    :param zero_train_data: 2D FloatTensor with unobserved answers set to 0
    :param valid_data: Dict
    :param num_epoch: int
    :return: None
    """
    # Report the weight norm before any training step, for reference only.
    initial_norm = model.get_weight_norm().detach().numpy()
    print("norm = ", initial_norm)

    # Define optimizers and loss function.
    optimizer = optim.SGD(model.parameters(), lr=lr)
    num_student = train_data.shape[0]

    for epoch in range(0, num_epoch):
        # evaluate() switches the model to eval mode at the end of every
        # epoch, so training mode has to be restored here.
        model.train()
        train_loss = 0.

        for user_id in range(num_student):
            inputs = Variable(zero_train_data[user_id]).unsqueeze(0)  #answers to all questions by a student
            target = inputs.clone()

            optimizer.zero_grad()
            output = model(inputs)

            # Mask the target to only compute the gradient of valid entries.
            nan_mask = np.isnan(train_data[user_id].unsqueeze(0).numpy())
            target[0][nan_mask] = output[0][nan_mask]

            # The penalty stays attached to the graph so it shrinks the weights.
            penalty = (lamb / 2) * model.get_weight_norm()
            loss = torch.sum((output - target) ** 2.) + penalty
            loss.backward()

            train_loss += loss.item()
            optimizer.step()

        valid_acc = evaluate(model, zero_train_data, valid_data)
        print("Epoch: {} \tTraining Cost: {:.6f}\t "
              "Valid Acc: {}".format(epoch, train_loss, valid_acc))
    #####################################################################
    #                       END OF YOUR CODE                            #
    #####################################################################

In [None]:
  def evaluate_bagging(model1, model2, model3, train_data, valid_data):
      """ Evaluate valid_data with a majority vote over three models.

      Each model's output for a (user, question) pair is thresholded at
      0.5; the ensemble predicts "correct" when at least two of the three
      models do. All three models are switched to eval mode.

      :param model1: Module
      :param model2: Module
      :param model3: Module
      :param train_data: 2D FloatTensor
      :param valid_data: A dictionary {user_id: list,
      question_id: list, is_correct: list}
      :return: float
      """
      ensemble = (model1, model2, model3)

      # Tell PyTorch you are evaluating the models.
      for member in ensemble:
          member.eval()

      hits = 0
      seen = 0
      for i, u in enumerate(valid_data["user_id"]):
          inputs = Variable(train_data[u]).unsqueeze(0)
          outputs = [member(inputs) for member in ensemble]

          question = valid_data["question_id"][i]
          votes = sum(out[0][question].item() >= 0.5 for out in outputs)
          # Two or more positive votes out of three is a majority.
          guess = votes >= 2

          if guess == valid_data["is_correct"][i]:
              hits += 1
          seen += 1
      return hits / float(seen)

In [None]:
# Helper that builds two randomly resampled copies of the training matrix (one row per student) for bagging.
import random

def bootstrap(train_matrix):
    """Draw two bootstrap resamples of the training matrix for bagging.

    Each resample has as many rows as train_matrix, with rows drawn
    uniformly at random WITH replacement. A plain shuffle would only
    reorder the rows, so every "resample" would contain exactly the same
    data as the original and the ensemble members would see no data
    diversity. Row i of a resample is therefore NOT the data of student i.

    Indices are drawn with the `random` module, so `random.seed` controls
    reproducibility.

    :param train_matrix: 2D numpy array (one row per student) where missing
    entries are NaN. It is not modified.
    :return: tuple (zero_train_matrix2, train_matrix2, zero_train_matrix3,
    train_matrix3) of FloatTensors; the zero_* tensors have NaN replaced
    by 0, the others keep NaN.
    """
    num_rows = train_matrix.shape[0]

    def _resample():
        # Sample row indices with replacement; fancy indexing returns a copy.
        picks = random.choices(range(num_rows), k=num_rows)
        sample = train_matrix[np.asarray(picks, dtype=int)]

        # Fill in the missing entries to 0.
        zero_filled = sample.copy()
        zero_filled[np.isnan(sample)] = 0

        # Change to Float Tensor for PyTorch.
        return torch.FloatTensor(zero_filled), torch.FloatTensor(sample)

    zero_train_matrix2, train_matrix2 = _resample()
    zero_train_matrix3, train_matrix3 = _resample()

    return zero_train_matrix2, train_matrix2, zero_train_matrix3, train_matrix3

In [None]:
if __name__ == "__main__":
  path = "/content/drive/MyDrive/starter_code/data"

  # Dataset 1 is whatever load_data_for_bagging returns for the original
  # training matrix; datasets 2 and 3 are produced by bootstrap() from the
  # numpy version of that matrix.
  (zero_train_matrix1, train_matrix1, train_data,
   valid_data, test_data, np_train_matrix) = load_data_for_bagging(path)
  (zero_train_matrix2, train_matrix2,
   zero_train_matrix3, train_matrix3) = bootstrap(np_train_matrix)

  # Model hyperparameters: one autoencoder per training dataset, all with
  # the same latent size k.
  k = 50
  num_questions = zero_train_matrix1.shape[1]
  modelb1 = AutoEncoder(num_question=num_questions, k=k)
  modelb2 = AutoEncoder(num_question=num_questions, k=k)
  modelb3 = AutoEncoder(num_question=num_questions, k=k)

  # Optimization hyperparameters, one value per ensemble member.
  lr1, lr2, lr3 = 0.01, 0.02, 0.001
  num_epoch1, num_epoch2, num_epoch3 = 40, 30, 40
  lamb1, lamb2, lamb3 = 0.001, 0.001, 0.001

  # Train each model on its own training dataset.
  train_bagging(modelb1, lr1, lamb1, train_matrix1, zero_train_matrix1,
                valid_data, num_epoch1)
  train_bagging(modelb2, lr2, lamb2, train_matrix2, zero_train_matrix2,
                valid_data, num_epoch2)
  train_bagging(modelb3, lr3, lamb3, train_matrix3, zero_train_matrix3,
                valid_data, num_epoch3)

  # Ensemble accuracy on the validation set (one evaluation pass over the
  # fully trained models). evaluate_bagging looks up each model input by
  # user id, and zero_train_matrix1 is the matrix passed for that lookup.
  # NOTE(review): presumably only matrix 1 keeps row i == user i, since
  # bootstrap() reorders rows -- confirm against load_data_for_bagging.
  val_acc_b = evaluate_bagging(modelb1, modelb2, modelb3,
                               zero_train_matrix1, valid_data)

  # Ensemble accuracy on the test set, using the same input matrix.
  test_acc_b = evaluate_bagging(modelb1, modelb2, modelb3,
                                zero_train_matrix1, test_data)

  print("Validation Accuracy = ", val_acc_b, "     ", "Test Accuracy = ", test_acc_b)

rand_list1 type =  <class 'list'>
Epoch: 0 	Training Cost: 14308.366125	 Valid Acc: 0.5294947784363534
Epoch: 1 	Training Cost: 13931.602462	 Valid Acc: 0.5534857465424782
Epoch: 2 	Training Cost: 13590.435765	 Valid Acc: 0.5781823313576065
Epoch: 3 	Training Cost: 13226.372755	 Valid Acc: 0.5924357888794807
Epoch: 4 	Training Cost: 12889.105661	 Valid Acc: 0.602314422805532
Epoch: 5 	Training Cost: 12615.122409	 Valid Acc: 0.6095117132373694
Epoch: 6 	Training Cost: 12404.062513	 Valid Acc: 0.6148744002257973
Epoch: 7 	Training Cost: 12240.494475	 Valid Acc: 0.6189669771380186
Epoch: 8 	Training Cost: 12108.988413	 Valid Acc: 0.6241885407846458
Epoch: 9 	Training Cost: 11998.558067	 Valid Acc: 0.6243296641264465
Epoch: 10 	Training Cost: 11902.030702	 Valid Acc: 0.626728760937059
Epoch: 11 	Training Cost: 11814.823382	 Valid Acc: 0.6306802145074796
Epoch: 12 	Training Cost: 11733.940060	 Valid Acc: 0.6323736946090883
Epoch: 13 	Training Cost: 11657.408529	 Valid Acc: 0.634772791419700

In [None]:
# Compute the ensemble's accuracy on the test set with the already-trained
# models (a single evaluation pass; no training happens here).
# NOTE(review): this cell reads model1/model2/model3, while the cell above
# builds modelb1/modelb2/modelb3 -- confirm that model1..3 are defined by an
# earlier cell, otherwise this raises NameError.
if __name__ == "__main__":
  test_acc = evaluate_bagging(model1, model2, model3, zero_train_matrix1,test_data)

# Code Modification (Neural Networks) - Doh

In [None]:
from utils import *
from torch.autograd import Variable

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import matplotlib.pyplot as plt

import numpy as np
import torch


def load_data(base_path="../data"):
    """ Load the training matrix and the train/valid/test splits.

    :param base_path: str, data directory handed to the utils loaders
    :return: (zero_train_matrix, train_matrix, train_data, valid_data,
    test_data)
        WHERE:
        zero_train_matrix: 2D FloatTensor, the training matrix with
        missing (NaN) entries replaced by 0.
        train_matrix: 2D FloatTensor, the training matrix with NaN kept.
        train_data: whatever load_train_csv(base_path) returns
        valid_data: whatever load_valid_csv(base_path) returns
        test_data: whatever load_public_test_csv(base_path) returns
    """
    dense = load_train_sparse(base_path).toarray()
    train_data = load_train_csv(base_path)
    valid_data = load_valid_csv(base_path)
    test_data = load_public_test_csv(base_path)

    # Work on a copy so the NaN-preserving matrix stays untouched.
    filled = dense.copy()
    filled[np.isnan(dense)] = 0

    # Convert both variants to Float Tensors for PyTorch.
    zero_train_matrix = torch.FloatTensor(filled)
    train_matrix = torch.FloatTensor(dense)

    return (zero_train_matrix, train_matrix,
            train_data, valid_data, test_data)


class AutoEncoder(nn.Module):
    # Runs once, when the class body is executed: seeds PyTorch's global RNG.
    torch.manual_seed(42)

    def __init__(self, num_question, k=50, j=75):
        """ Initialize a class AutoEncoder with layer widths
        num_question -> j -> k -> j -> num_question.

        :param num_question: int, size of the input and output vectors
        :param k: int, width of the innermost hidden layer
        :param j: int, width of the two outer hidden layers
        """
        super().__init__()

        # Linear maps, created in encoder-to-decoder order.
        self.encoder = nn.Linear(num_question, j)
        self.layer1 = nn.Linear(j, k)
        self.layer2 = nn.Linear(k, j)
        self.dropout = nn.Dropout(p=0.25)
        self.decoder = nn.Linear(j, num_question)

    def get_weight_norm(self):
        """ Return the squared 2-norm of the encoder weights plus that of
        the decoder weights. layer1 and layer2 are not included.

        :return: 0-dim tensor
        """
        encoder_term = self.encoder.weight.norm(2).pow(2)
        decoder_term = self.decoder.weight.norm(2).pow(2)
        return encoder_term + decoder_term

    def forward(self, inputs):
        """ Return a forward pass given inputs.

        :param inputs: user vector.
        :return: user vector, with every entry squashed into (0, 1) by the
        final sigmoid.
        """
        hidden = torch.sigmoid(self.encoder(inputs))
        hidden = F.relu(self.layer1(hidden))
        hidden = F.relu(self.layer2(hidden))
        # Dropout is only active in training mode; a ReLU is applied to
        # its output before decoding.
        hidden = F.relu(self.dropout(hidden))
        return torch.sigmoid(self.decoder(hidden))


def train(model, lr, lamb, train_matrix, zero_train_data, train_data,
          valid_data, num_epoch):
    """ Train the neural network with an L2-regularized squared-error
    objective, stop early on stagnating validation accuracy, then plot the
    training loss and the train/validation accuracy per epoch.

    The per-student loss is
        sum over observed entries of (output - target)^2
        + (lamb / 2) * model.get_weight_norm()

    :param model: Module
    :param lr: float, SGD learning rate
    :param lamb: float, weight of the L2 regularizer
    :param train_matrix: 2D FloatTensor with NaN marking unobserved entries
    :param zero_train_data: 2D FloatTensor, train_matrix with NaN set to 0
    :param train_data: Dict passed to evaluate() to measure training accuracy
    :param valid_data: Dict passed to evaluate() for validation accuracy and
    early stopping
    :param num_epoch: int, maximum number of epochs
    :return: None
    """
    # Define optimizers and loss function.
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.1)
    num_student = train_matrix.shape[0]

    # Loop-invariant: which entries of each student's row are unobserved.
    missing = torch.isnan(train_matrix)

    # Parameters for early stopping: stop after `patience` consecutive
    # epochs whose validation accuracy did not beat the previous epoch.
    previous_acc = 0
    patience = 3
    count = 0

    eps = []
    train_accs = []
    val_accs = []
    train_losses = []

    for epoch in range(0, num_epoch):
        # evaluate() switches the model to eval mode at the end of every
        # epoch, so training mode (and thus dropout) must be re-enabled here.
        model.train()
        train_loss = 0.

        for user_id in range(num_student):
            inputs = Variable(zero_train_data[user_id]).unsqueeze(0)
            target = inputs.clone()

            optimizer.zero_grad()
            output = model(inputs)

            # Mask the target to only compute the gradient of valid entries:
            # unobserved entries get target == output, i.e. zero error.
            nan_mask = missing[user_id]
            target[0][nan_mask] = output[0][nan_mask]

            # The weight norm is recomputed from the current weights and kept
            # in the autograd graph; a value computed once and detached would
            # be a constant with zero gradient, i.e. no regularization.
            regularizer = (lamb / 2) * model.get_weight_norm()
            loss = torch.sum((output - target) ** 2.) + regularizer
            loss.backward()

            train_loss += loss.item()
            optimizer.step()

        train_acc = evaluate(model, zero_train_data, train_data)
        valid_acc = evaluate(model, zero_train_data, valid_data)
        print("Epoch: {} \tTraining Cost: {:.6f}\t "
              "Valid Acc: {}".format(epoch, train_loss, valid_acc))

        eps.append(epoch)
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_accs.append(valid_acc)

        # Early stopping
        if valid_acc > previous_acc:
            count = 0
        else:
            count += 1
            if count >= patience:
                break

        previous_acc = valid_acc

    # plotting
    plt.title("Training Loss vs. Epochs")
    plt.plot(eps, train_losses, label="Training Curve")
    plt.xlabel("Epochs")
    plt.ylabel("Training Loss")
    plt.show()

    plt.title("Accuracy vs. Epochs")
    plt.plot(eps, train_accs, label="Training Curve")
    plt.plot(eps, val_accs, label="Validation Curve")
    plt.xlabel("Epochs")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()


def evaluate(model, train_data, valid_data):
    """ Compute the model's accuracy on the entries listed in valid_data.

    For each (user, question) pair the model is run on that user's row of
    train_data, and the output for the question is thresholded at 0.5.

    :param model: Module
    :param train_data: 2D FloatTensor, indexed by user id
    :param valid_data: A dictionary {user_id: list,
    question_id: list, is_correct: list}
    :return: float, fraction of entries whose thresholded prediction
    equals is_correct
    """
    # Tell PyTorch you are evaluating the model.
    model.eval()

    hits = 0
    seen = 0

    for idx, user in enumerate(valid_data["user_id"]):
        prediction = model(Variable(train_data[user]).unsqueeze(0))
        question = valid_data["question_id"][idx]

        predicted_correct = prediction[0][question].item() >= 0.5
        hits += int(predicted_correct == valid_data["is_correct"][idx])
        seen += 1

    return hits / float(seen)


def main():
    """ Train the autoencoder, early-stopping on validation accuracy, then
    report the accuracy of the trained model on the test set.

    The test set is only evaluated once, after training. Calling train()
    again with test_data as the validation set would keep updating the
    already-trained model and pick the stopping epoch from test accuracy,
    so the reported number would no longer be an unbiased estimate.
    """
    zero_train_matrix, train_matrix, train_data, valid_data, test_data = \
        load_data()

    # Set model hyperparameters.
    k = 50
    j = 75
    num_questions = zero_train_matrix.shape[1]
    model = AutoEncoder(num_question=num_questions, k=k, j=j)

    # Set optimization hyperparameters.
    lr = 0.01
    num_epoch = 40
    lamb = 0.001

    print("Validation ========================================================")
    train(model, lr, lamb, train_matrix, zero_train_matrix, train_data,
          valid_data, num_epoch)

    print("Test ==============================================================")
    test_acc = evaluate(model, zero_train_matrix, test_data)
    print("Test Acc: {}".format(test_acc))


if __name__ == "__main__":
    main()


Exception: ignored