In [1]:
from datetime import datetime

import numpy as np
import pandas as pd

# from google.colab import files
# files.download("PATH/TO/FILE")

# tar xvf {file.zip}
# tar -xvf {file.zip}
# tar xvf {file.zip} -C /dest/directory/

# using python
# import zipfile
# with zipfile.ZipFile("file.zip","r") as zip_ref:
#     zip_ref.extractall("targetdir")

In [2]:
def dataframe_to_matrix(df, with_index=False):
    """Pivot a long-format ratings frame into a dense user x item array.

    :param df: DataFrame with the columns ``user_id``, ``item_id`` and ``rating``.
    :param with_index: when True, also return the row and column labels of the
        pivoted table.
    :return: a 2-D ``numpy.ndarray`` in which missing (user, item) pairs are 0;
        with ``with_index=True`` a tuple ``(array, user_index, item_index)``.
    """
    matrix = df.pivot_table(columns=['item_id'], index=['user_id'], values='rating')
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # supported replacement and returns the same dense ndarray.
    dense = matrix.fillna(0).to_numpy()
    if with_index:
        return dense, matrix.index, matrix.columns
    return dense

In [4]:
class FairnessRegALS:
    """Matrix-factorisation recommender trained with alternating least squares.

    The rating frame is pivoted into a dense user x item matrix ``R`` (0 means
    "no rating").  Users and items are each described by ``n_factor`` latent
    values (``P`` and ``Q``); ``train_data`` alternately solves a small linear
    system per user (P step) and per item (Q step).

    The commented-out "fairness regularization" experiment that used to live in
    the Q step was never active and is not part of the model.
    """

    def __init__(self, df_train, n_factor, user_factor=None, item_factor=None):
        """Build the rating matrix and initialise the latent factors.

        :param df_train: ratings frame with ``user_id``, ``item_id``, ``rating``.
        :param n_factor: number of latent factors (anything ``int()`` accepts,
            including the 0-d array that ``np.load`` returns for a saved scalar).
        :param user_factor: optional initial ``P`` of shape (n_user, n_factor);
            drawn uniformly from [0, 1) when omitted.
        :param item_factor: optional initial ``Q`` of shape (n_item, n_factor);
            drawn uniformly from [0, 1) when omitted.
        :raises ValueError: if a supplied factor matrix has the wrong shape.
        """
        # get data from dataframe
        self.data_frame = df_train
        R_matrix, user_index, item_index = dataframe_to_matrix(df_train, with_index=True)
        self.R = R_matrix
        self.user_index = user_index
        self.item_index = item_index

        self.R_prediction = None
        self.n_user, self.n_item = self.R.shape

        # common parameter
        self.n_factor = int(n_factor)

        # rank als parameter: when False every item gets weight 1
        self.is_support_weight = False

        # initialize user (P) & item (Q) latent model; each one is initialised
        # independently so passing only one of them no longer crashes on None.
        if user_factor is None:
            self.P = np.random.rand(self.n_user, self.n_factor)
        else:
            self.P = user_factor
        if item_factor is None:
            self.Q = np.random.rand(self.n_item, self.n_factor)
        else:
            self.Q = item_factor

        # check matrix shape dimension
        expected_p = (self.n_user, self.n_factor)
        expected_q = (self.n_item, self.n_factor)
        if self.P.shape != expected_p or self.Q.shape != expected_q:
            raise ValueError(
                "latent factor shapes {} / {} do not match the expected {} / {}".format(
                    self.P.shape, self.Q.shape, expected_p, expected_q))

        # building support weight: number of non-zero ratings per item, or 1
        if self.is_support_weight:
            self.support_weight_vector = np.count_nonzero(self.R, axis=0).astype(float)
        else:
            self.support_weight_vector = np.ones(self.n_item)
        self.support_weight_sum = float(self.support_weight_vector.sum())

    def train_data(self, iteration, directory=None):
        """Run ``iteration`` rounds of alternating P/Q updates.

        :param iteration: number of ALS rounds.
        :param directory: when given, the model is saved there after every round.

        ``R_prediction`` is rebuilt from the final factors once training ends.
        Raises ``numpy.linalg.LinAlgError`` if one of the per-user / per-item
        systems is singular.
        """
        # Only users with at least one non-zero rating take part; their rated
        # item indices depend on R alone, so they are computed once.
        users = np.flatnonzero(np.any(self.R, axis=1))
        rated_items = [np.flatnonzero(self.R[user]) for user in users]

        for iterate in range(iteration):
            print("als train_data iteration {} at {}".format(iterate + 1, datetime.now()))

            # q~ = Q^T s   and   A~ = Q^T diag(s) Q
            # (the previous loop omitted s from A~; identical while every
            # weight is 1, wrong as soon as support weights are enabled)
            weights = self.support_weight_vector
            q_tilde = self.Q.T.dot(weights)
            A_tilde = (self.Q.T * weights).dot(self.Q)

            # P STEP: UPDATE USER VECTORS
            summaries = self._update_user_factors(users, rated_items, q_tilde, A_tilde)

            # Q STEP: UPDATE ITEM VECTORS
            self._update_item_factors(users, summaries, q_tilde)

            if directory is not None:
                print("model saved to {} at {}".format(directory, datetime.now()))
                self.save_data(directory)

        # build matrix prediction after training
        self.R_prediction = self.P.dot(self.Q.T)

    def _update_user_factors(self, users, rated_items, q_tilde, A_tilde):
        """P step: solve for every active user's factor vector.

        Returns the per-user sums the Q step needs, all taken over the user's
        rated items with the item factors as they were *before* the Q step:
        ``(n_rated, r_bar, r_tilde, q_bar)``.
        """
        total_weight = self.support_weight_sum
        n_rated = np.zeros(len(users))
        r_bar_all = np.zeros(len(users))
        r_tilde_all = np.zeros(len(users))
        q_bar_all = np.zeros((len(users), self.n_factor))

        for pos, user in enumerate(users):
            items = rated_items[pos]
            ratings = self.R[user, items]
            item_weights = self.support_weight_vector[items]
            Q_rated = self.Q[items]

            I_bar = len(items)
            A_bar = Q_rated.T.dot(Q_rated)                   # sum of q_i q_i^T
            q_bar = Q_rated.sum(axis=0)                      # sum of q_i
            b_bar = Q_rated.T.dot(ratings)                   # sum of q_i r_ui
            b_tilde = Q_rated.T.dot(item_weights * ratings)  # sum of q_i s_i r_ui
            r_tilde = item_weights.dot(ratings)              # sum of s_i r_ui
            r_bar = ratings.sum()                            # sum of r_ui

            M = A_bar * total_weight \
                - np.outer(q_bar, q_tilde) \
                - np.outer(q_tilde, q_bar) \
                + A_tilde * I_bar

            y = b_bar * total_weight \
                - q_bar * r_tilde \
                - q_tilde * r_bar \
                + b_tilde * I_bar

            # solve() avoids forming the explicit inverse
            self.P[user, :] = np.linalg.solve(M, y)

            n_rated[pos] = I_bar
            r_bar_all[pos] = r_bar
            r_tilde_all[pos] = r_tilde
            q_bar_all[pos] = q_bar

        return n_rated, r_bar_all, r_tilde_all, q_bar_all

    def _update_item_factors(self, users, summaries, q_tilde):
        """Q step: solve for every item's factor vector using the fresh ``P``."""
        n_rated, r_bar, r_tilde, q_bar = summaries
        total_weight = self.support_weight_sum
        P_active = self.P[users]

        # These sums run over all active users and do not depend on the item,
        # so they are computed once instead of once per item.
        # NOTE(review): A_bar covers every active user, exactly as before.  For
        # a pairwise ranking loss one would expect it to cover only the users
        # who rated the item - confirm against the reference derivation before
        # changing, since that would alter the trained factors.
        A_bar = P_active.T.dot(P_active)
        A_tensor = (P_active.T * n_rated).dot(P_active)
        p2_tensor = P_active.T.dot(np.sum(P_active * q_bar, axis=1))
        p3_tensor = P_active.T.dot(r_bar)
        A_bar_q_tilde = A_bar.dot(q_tilde)

        for item in range(self.n_item):
            si = self.support_weight_vector[item]
            column = self.R[users, item]
            raters = np.flatnonzero(column > 0)
            P_raters = P_active[raters]
            item_ratings = column[raters]

            b_bar = P_raters.T.dot(item_ratings)
            p1_tensor = P_raters.T.dot(r_tilde[raters])
            b_tensor = P_raters.T.dot(item_ratings * n_rated[raters])

            M = (A_bar * total_weight) + (A_tensor * si)
            y = A_bar_q_tilde \
                + (b_bar * total_weight) \
                - p1_tensor \
                + (p2_tensor * si) \
                - (p3_tensor * si) \
                + (b_tensor * si)

            self.Q[item, :] = np.linalg.solve(M, y)

    def predict(self, user_id, item_id):
        """
        :return: predicted rating (float) for the given user / item ids,
                 i.e. the dot product of their latent vectors
        """
        user_idx = self.user_index.get_loc(user_id)
        item_idx = self.item_index.get_loc(item_id)
        return self.P[user_idx, :].dot(self.Q[item_idx, :].T)

    def matrix_prediction(self):
        """Return the full prediction matrix ``P Q^T``, building it on first use."""
        if self.R_prediction is None:
            self.R_prediction = self.P.dot(self.Q.T)
        return self.R_prediction

    def top_n_recommendation(self, user_id, n,
                             return_index=False,
                             with_index=False,
                             with_reviewed=True):
        """Return the ``n`` best-scoring items for a user, best first.

        :param user_id: label of the user in the training frame.
        :param n: number of items wanted; values <= 0 yield an empty result.
        :param return_index: return the item column indices instead of item ids.
        :param with_index: return ``(item_ids, item_indices)``.
        :param with_reviewed: when True (default) items the user already rated
            are excluded; when False every item is a candidate.
            NOTE(review): the flag name reads inverted - behaviour is kept as
            it was so existing callers see the same candidate set.
        :return: list of item ids, array of item indices, or both (see flags).
        """
        user_idx = self.user_index.get_loc(user_id)
        scores = self.matrix_prediction()[user_idx]

        if not with_reviewed:
            candidate_idx = np.arange(scores.shape[0])
        else:
            candidate_idx = np.flatnonzero(self.R[user_idx] == 0)

        # Rank inside the candidate set, then map back to real item columns.
        # Previously the positions inside the filtered vector were used as item
        # indices directly, which pointed at the wrong items.
        wanted = max(int(n), 0)  # a plain [-n:] slice returns everything for n == 0
        order = np.argsort(scores[candidate_idx])[::-1][:wanted]
        rec_item_idx = candidate_idx[order]

        # TODO: return_index option not awesome, will deprecated this
        if return_index:
            return rec_item_idx
        elif with_index:
            return [self.item_index[item] for item in rec_item_idx], rec_item_idx
        else:
            # convert list recommendation to id
            return [self.item_index[item] for item in rec_item_idx]

    def save_data(self, directory):
        """Write ``P``, ``Q``, the training frame and ``n_factor`` into ``directory``."""
        np.save(directory + "/P.npy", self.P)
        np.save(directory + "/Q.npy", self.Q)
        self.data_frame.to_pickle(directory + '/data_frame.pkl')
        np.save(directory + "/n_factor.npy", self.n_factor)

    # noinspection PyBroadException
    @staticmethod
    def load_data(directory):
        """Rebuild a model from files written by ``save_data``.

        :return: a ``FairnessRegALS`` instance, or ``None`` when anything goes
            wrong (missing files, shape mismatch, ...) - callers use ``None``
            as the signal to train from scratch.
        """
        try:
            P = np.load(directory + "/P.npy")
            Q = np.load(directory + "/Q.npy")
            # SECURITY: read_pickle executes arbitrary code from the file; only
            # load model directories you wrote yourself.
            data_frame = pd.read_pickle(directory + '/data_frame.pkl')
            # np.load returns a 0-d array for the saved scalar
            n_factor = int(np.load(directory + "/n_factor.npy"))

            return FairnessRegALS(data_frame, n_factor, P, Q)
        except Exception:
            return None

    @staticmethod
    def filter_row(vector):
        """Return ``(index, value)`` pairs for the non-zero entries of ``vector``."""
        return [(i, j) for i, j in enumerate(vector) if j != 0]


In [None]:
from sklearn.model_selection import train_test_split

from config import MODEL_LOCATION, DATASET_DIR
from recommender.fairness_reg_als import FairnessRegALS
from recommender.util import load_dataset, dataframe_to_matrix


def main():
    """Resume the saved model if one exists, otherwise build one, then train.

    The model is looked up in MODEL_LOCATION - the same directory it is saved
    to.  (The previous empty-string path resolved to '/P.npy' etc., so a saved
    model was never found and training always restarted from scratch.)
    """

    # load from previous model
    als = FairnessRegALS.load_data(MODEL_LOCATION)

    # create new if isn't available
    if als is None:

        # prepare dataset
        ratings_df = load_dataset(DATASET_DIR)
        train, test = train_test_split(ratings_df, test_size=0.2)
        print("total user dataset: {}, item dataset: {}"
              .format(ratings_df.user_id.unique().shape, ratings_df.item_id.unique().shape))
        print("total user training: {}, item training: {}"
              .format(train.user_id.unique().shape, train.item_id.unique().shape))
        print("total user test: {}, item test: {}"
              .format(test.user_id.unique().shape, test.item_id.unique().shape))
        print("total record training: {}, total record test: {}"
              .format(train.shape[0], test.shape[0]))

        # create new recommender instance
        als = FairnessRegALS(df_train=train, n_factor=50)

    # train the recommender; train_data checkpoints after every iteration and
    # the final save_data keeps the directory in sync with the trained model
    als.train_data(iteration=30, directory=MODEL_LOCATION)
    als.save_data(MODEL_LOCATION)

# run the main
main()


In [None]:
from google.colab import files

# Ask for files through the Colab `files` helper, then report each one.
uploaded = files.upload()

for fn in uploaded:
  # len() of the stored content is reported as the file size in bytes
  print('User uploaded file "{name}" with length {length} bytes'.format(
      length=len(uploaded[fn]), name=fn))

In [None]:
from google.colab import files

# Hand 'example.txt' to the Colab `files` helper for download
# (presumably a browser download of a file in the runtime's working directory - confirm).
files.download('example.txt')

In [None]:
import tensorflow as tf
# Abort early unless TensorFlow reports the first GPU device.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')
