## FDTree Class

In [6]:
import numpy as np
class DecisionTreeModel:
    def __init__(self, source, depth_threshold=10, plambda=7, MSP_item=200):
        '''
            sMatrix: I*U matrix
            depth_threshold: terminate depth
            plambda: regularization parameter
            self.rI: dict { itemid1, itemid2, itemid3 ... }
            self.rU: dict {
                        userid1: { itemid11: rating11, itemid12: rating12, ... } rating of user 1
                        userid2: { itemid21: rating21, itemid22: rating22, ... } rating of user 2
                        userid3: ...
                     }
            self.lr_bound: dict {
                                level 0: [[left_bound, right_bound]], users' bound for one level, each ele in dictionary represents one node
                                level 1: [[left_bound, right_bound], [left_bound, right_bound], [left_bound, right_bound]], 3
                                level 2: ..., 9
                            } (bound means index)
            self.tree: []  all of userid
            self.split_item: list [
                    level 0: []
                    level 1: []
                    level 2: []
            ]
            self.sum_cur_t: dict {
                        itemid1: {'rating': sum of ratings for item 1, 'cnt': sum of users rated item 1}
                        itemid2: {'rating': sum of ratings for item 2, 'cnt': sum of users rated item 2}
                        ...
                    }
            self.sum_2_cur_t: dict {
                        itemid1: sum of square ratings for item 1
                        itemid2: sum of square ratings for item 2
                        ...
                    }
            self.biasU: dict {
                        userid1: bias1
                        userid2: bias2
                        ...
                    }
            every element represents ratings for one item, its order decide the users in tree nodes
        '''
        self.sMatrix = source
        self.real_item_num = source.shape[0]
        self.global_mean = 0  # global average of ratings   
        self.depth_threshold = depth_threshold
        self.plambda = plambda
        self.cur_depth = 0
        self.MSP_item = MSP_item

        #### Calculate rate of progress ####
        self.node_num = 0
        self.cur_node = 0
        for i in range(self.depth_threshold):
            self.node_num += 3 ** i

        #### Generate rI, rU ####
#         self.rI = rI
        self.rI = list(set(source.nonzero()[0]))
        
        #### Initiate Tree, lr_bound ####
        self.tree = list(range(1, self.sMatrix.shape[1]))
        self.split_item = []
        self.lr_bound = {'0': [[0, len(self.tree) - 1]]}
        self.global_mean = source.sum()/source.getnnz()
        self.item_size = len(self.rI)
        self.user_size = len(self.tree)

        #### Generate bias, sum_cur_t, sum_2_cur_t ####
        
        print("bias, sum_cur_t, sum_2_cur_t Generation start:")
#         self.biasU = biasU
#         self.sum_cur_t = sum_cur_t
#         self.sum_2_cur_t = sum_2_cur_t
#         self.sum_cntt = sum_cntt
        self.biasU = {}
        self.sum_cur_t = np.zeros(self.real_item_num)
        self.sum_2_cur_t = np.zeros(self.real_item_num)
        self.sum_cntt = np.zeros(self.real_item_num)
        i = 0
        for userid in self.tree:
            if i % 5000 == 0:
                print("%.2f%%" % (100 * i / (0.75 * 480189)))
            i += 1

            self.biasU[userid] = (self.sMatrix.getcol(userid).sum() \
                                     + self.plambda * self.global_mean) /   \
                                 (self.plambda + self.sMatrix.getcol(userid).getnnz())
            user_all_rating_id = self.sMatrix.getcol(userid).nonzero()[0]
            # print('user_all_rating_id ', user_all_rating_id[:])
            user_all_rating = find(self.sMatrix.getcol(userid))[2]
            self.sum_cur_t[user_all_rating_id[:]] += user_all_rating[:] - self.biasU[userid]
            self.sum_2_cur_t[user_all_rating_id[:]] += (user_all_rating[:] - self.biasU[userid]) ** 2
            self.sum_cntt[user_all_rating_id[:]] += 1
        print("bias, sum_cur_t, sum_2_cur_t Generation DONE")

        print("Initiation DONE!")

    def calculate_error(self, sumt, sumt_2, cntt):
        ''' Calculate error for one item-split in one node '''
        Error_i = np.sum(sumt_2 - (sumt ** 2) / (cntt + 1e-9))

        return Error_i

    def generate_decision_tree(self, lr_bound_for_node, chosen_id):
        '''
            sumtL: dict {
                itemid1: {'rating': sum of ratings for item 1, 'cnt': sum of users rated item 1}
                itemid2: {'rating': sum of ratings for item 2, 'cnt': sum of users rated item 2}
                ...
            }
            sumtL_2: dict {
                itemid1: sum of square ratings for item 1
                itemid2: sum of square ratings for item 2
                ...
            }
            lr_bound_for_node: list [leftind, rightind] for one node
        '''

        #### Terminate ####
        self.cur_depth += 1
        if self.cur_depth > self.depth_threshold or len(chosen_id) == self.item_size:
            return

        #### Choose Most Popular Items of This Node ####
        num_rec = np.zeros(self.real_item_num)
        for userid in self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] + 1)]:
            user_all_rating_id = self.sMatrix.getcol(userid).nonzero()[0]
            num_rec[user_all_rating_id[:]] += 1
        sub_item_id = np.argsort(-num_rec)[:self.MSP_item]

        #### Find optimum item to split ####
        min_sumtL, min_sumtD, min_sumtL_2, min_sumtD_2, min_sumtU, min_sumtU_2, Error = {}, {}, {}, {}, {}, {}, {}
        min_Error = "None"
        print("optimum item choosing start:")
        i = 0
        for itemid in sub_item_id:
            print(i)
            i+=1
            if itemid in chosen_id:
                continue
            user_rating_item_in_nodet = list(set(self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] + 1)]).intersection(set(find(self.sMatrix[itemid, :])[1])))
            print("user_rating_item_in_nodet DONE")            
            sumt = np.zeros((self.real_item_num, 3))
            sumt_2 = np.zeros((self.real_item_num, 3))
            cntt = np.zeros((self.real_item_num, 3))
            
            item_row = find(self.sMatrix.getrow(itemid))
            user_rating_item_id = item_row[1]
            item_all_rating = item_row[2]
            lst_L = list(user_rating_item_id[item_all_rating >= 4])
            lst_D = list(user_rating_item_id[item_all_rating <= 3])

            sumt[:, 0] = self.sMatrix[:, lst_L].sum(axis=1) - self.biasU[user]
            sumt_2[:, 0] = (self.sMatrix[:, lst_L].sum(axis=1) - self.biasU[user]) ** 2
            cntt[:, 0] = self.sMatrix[:, lst_L].getnnz(axis=1)
            sumt[:, 1] = self.sMatrix[:, lst_D].sum(axis=1) - self.biasU[user]
            sumt_2[:, 1] = (self.sMatrix[:, lst_D].sum(axis=1) - self.biasU[user]) ** 2
            cntt[:, 1] = self.sMatrix[:, lst_D].getnnz(axis=1)
            
            #### calculate sumtU for node UNKNOWN ####
            sumt[:, 2] = self.sum_cur_t - sumt[:, 0] - sumt[:, 1]
            sumt_2[:, 2] = self.sum_2_cur_t - sumt_2[:, 0] - sumt_2[:, 1]
            cntt[:, 2] = self.sum_cntt - cntt[:, 0] - cntt[:, 1]
            Error[itemid] = self.calculate_error(sumt, sumt_2, cntt)

            if min_Error == "None" or Error[itemid] < min_Error:
                min_sumt = sumt
                min_sumt_2 = sumt_2
                min_cntt = cntt
                min_Error = Error[itemid]        
        print("optimum item choosing DONE")
        #### Find optimum split-item ####
        optimum_itemid = min(Error, key=Error.get)
        if len(self.split_item) == self.cur_depth - 1:
            self.split_item.append([optimum_itemid])
        else:
            self.split_item[self.cur_depth - 1].append(optimum_itemid)
        chosen_id.append(optimum_itemid)

        #### sort tree ####
        self.lr_bound.setdefault(str(self.cur_depth), []).append([])  # for LIKE
        self.lr_bound[str(self.cur_depth)].append([])  # for DISLIKE
        self.lr_bound[str(self.cur_depth)].append([])  # for UNKNOWN
        listU, listL, listD = [], [], []
        for userid in self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] + 1)]:
            if optimum_itemid not in self.sMatrix.getcol(userid).nonzero()[0]:
                listU.append(userid)
            elif self.sMatrix[optimum_itemid, userid] >= 4:
                listL.append(userid)
            elif self.sMatrix[optimum_itemid, userid] <= 3:
                listD.append(userid)
        self.tree[lr_bound_for_node[0]:(lr_bound_for_node[1] + 1)] = listL + listD + listU
        self.lr_bound[str(self.cur_depth)][-3] = [lr_bound_for_node[0],
                                                  lr_bound_for_node[0] + len(listL) - 1]  # for LIKE
        self.lr_bound[str(self.cur_depth)][-2] = [lr_bound_for_node[0] + len(listL),
                                                  lr_bound_for_node[0] + len(listL) + len(listD) - 1]  # for DISLIKE
        self.lr_bound[str(self.cur_depth)][-1] = [lr_bound_for_node[0] + len(listL) + len(listD),
                                                  lr_bound_for_node[0] + len(listL) + len(listD) + len(
                                                      listU) - 1]  # for UNKNOWN

        #### Generate Subtree of Node LIKE ####
        self.sum_cur_t = min_sumt[:, 0]
        self.sum_2_cur_t = min_sumt_2[:, 0]
        self.sum_cntt = min_cntt[:, 0]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-3], chosen_id[:])
        self.cur_depth -= 1

        #### Generate Subtree of Node DISLIKE ####
        self.sum_cur_t = min_sumt[:, 1]
        self.sum_2_cur_t = min_sumt_2[:, 1]
        self.sum_cntt = min_cntt[:, 1]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-2], chosen_id[:])
        self.cur_depth -= 1

        #### Generate Subtree of Node UNKNOWN ####
        self.sum_cur_t = min_sumt[:, 2]
        self.sum_2_cur_t = min_sumt_2[:, 2]
        self.sum_cntt = min_cntt[:, 2]
        self.generate_decision_tree(self.lr_bound[str(self.cur_depth)][-1], chosen_id[:])
        self.cur_depth -= 1

        #### Show Rating Progress ####
        for i in range(self.cur_depth - 1):
            print("┃", end="")
        print("┏", end="")
        self.cur_node += 1
        print("Current depth: " + str(self.cur_depth) + "        %.2f%%" % (100 * self.cur_node / self.node_num))

    def build_model(self):
        #### Construct the tree & get the prediction model ####
        self.generate_decision_tree(self.lr_bound['0'][0], [])

## Step 1: Input Dataset

In [7]:
from scipy.sparse import *

In [8]:
############### Load Data ##################
rating_matrix_csc = load_npz('netflix/sparse_matrix_100%.npz').tocsc()
rating_matrix_val_csc = load_npz('netflix/sparse_matrix_validation_75%.npz').tocsc()
print("file load DONE")
############################################

file load DONE


In [9]:
''' Save to file 'tree.pkl' '''
start = 0
end = int(rating_matrix_csc.shape[1] * 0.75)

## Tree Init

In [10]:
dtmodel_realdata = DecisionTreeModel(rating_matrix_csc[:, start:end], depth_threshold = 10)

bias, sum_cur_t, sum_2_cur_t Generation start:
0.00%
1.39%
2.78%
4.17%
5.55%
6.94%
8.33%
9.72%
11.11%
12.50%
13.88%
15.27%
16.66%
18.05%
19.44%
20.83%
22.21%
23.60%
24.99%
26.38%
27.77%
29.16%
30.54%
31.93%
33.32%
34.71%
36.10%
37.49%
38.87%
40.26%
41.65%
43.04%
44.43%
45.82%
47.20%
48.59%
49.98%
51.37%
52.76%
54.15%
55.53%
56.92%
58.31%
59.70%
61.09%
62.48%
63.86%
65.25%
66.64%
68.03%
69.42%
70.81%
72.19%
73.58%
74.97%
76.36%
77.75%
79.14%
80.52%
81.91%
83.30%
84.69%
86.08%
87.47%
88.85%
90.24%
91.63%
93.02%
94.41%
95.80%
97.18%
98.57%
99.96%
bias, sum_cur_t, sum_2_cur_t Generation DONE
Initiation DONE!


In [13]:
dtmodel_realdata.build_model()

optimum item choosing start:
0
user_rating_item_in_nodet DONE


NameError: name 'user' is not defined

In [12]:
# biasU = dtmodel_realdata.biasU
# sum_cur_t = dtmodel_realdata.sum_cur_t
# sum_2_cur_t = dtmodel_realdata.sum_2_cur_t
# sum_cntt = dtmodel_realdata.sum_cntt
# rI = dtmodel_realdata.rI

import klepto
Tree = klepto.archives.dir_archive('treeFile', cached=True, serialized=True)

Tree['biasU'] = dtmodel_realdata.biasU
Tree['sum_cur_t'] = dtmodel_realdata.sum_cur_t
Tree['sum_2_cur_t'] = dtmodel_realdata.sum_2_cur_t
Tree['sum_cntt'] = dtmodel_realdata.sum_cntt
Tree['rI'] = dtmodel_realdata.rI

Tree.dump()
Tree.clear()

In [None]:
biasU = dtmodel_realdata.biasU
sum_cur_t = dtmodel_realdata.sum_cur_t
sum_2_cur_t = dtmodel_realdata.sum_2_cur_t
sum_cntt = dtmodel_realdata.sum_cntt
rI = dtmodel_realdata.rI

import klepto
Tree = klepto.archives.dir_archive('treeFile', cached=True, serialized=True)

Tree['biasU'] = dtmodel_realdata.biasU
Tree['sum_cur_t'] = dtmodel_realdata.sum_cur_t
Tree['sum_2_cur_t'] = dtmodel_realdata.sum_2_cur_t
Tree['sum_cntt'] = dtmodel_realdata.sum_cntt

Tree['lr_bound'] = dtmodel_realdata.lr_bound
Tree['tree'] = dtmodel_realdata.tree
Tree['split_item'] = dtmodel_realdata.split_item
Tree['rI'] = dtmodel_realdata.rI


Tree.dump()
Tree.clear()

In [None]:
import klepto
Tree = klepto.archives.dir_archive('treeFile', cached=True, serialized=True)
Tree.dump()
Tree.clear()

In [13]:
del dtmodel_realdata

In [66]:
a = 3.0 - 3

## MF part

In [None]:
from pyspark.mllib.recommendation import ALS
from pyspark.sql import SparkSession

class MatrixFactorization:
    def __init__(self, maxIter=15, regParam=0.01, rank=10):
        self.maxIter = maxIter
        self.regParam = regParam
        self.rank = rank
        self.spark = SparkSession.builder.master("local[*]").appName("Example").getOrCreate()

    def change_parameter(self, regParam):
        self.regParam = regParam

    def matrix_factorization(self, train_lst):
        ratings = self.spark.createDataFrame(train_lst)
        model = ALS.train(ratings, self.rank, seed=10, \
                          iterations=self.maxIter, \
                          lambda_=self.regParam)
        print("MF DONE")
        userFeatures = sorted(model.userFeatures().collect(), key=lambda d: d[0], reverse=False)
        productFeatures = sorted(model.productFeatures().collect(), key=lambda d: d[0], reverse=False)
        userProfile = {each[0]: each[1].tolist() for each in userFeatures}
        itemProfile = {each[0]: each[1].tolist() for each in productFeatures}

        return userProfile, itemProfile

    def end(self):
        self.spark.stop()

## Tool Function

In [None]:
from scipy.sparse import find
val_num = rating_matrix_val_csc.getnnz(axis=None)
########################################## For Validation #############################################
def calculate_avg_rating_for_pesudo_user(pseudo_user_lst, sMatrix):

    ret_array = np.zeros(sMatrix.shape[0])
    ret_array = np.array(sMatrix[:, pseudo_user_lst].sum(axis=1))[:,0]/(sMatrix[:, pseudo_user_lst].getnnz(axis=1)+1e-9)

    return ret_array


def pred_RMSE_for_validate_user(user_node_ind, user_profile, item_profile, val_user_list, val_item_list, sMatrix):
    print("RMSE calculation on valset qstarted.")
    RMSE = 0
    i = 0
    for userid, itemid in zip(val_user_list, val_item_list):
        if i % 50000 == 0:
            print("%.2f%%" % (100 * i / val_num))        
        i += 1
        RMSE += (sMatrix[itemid, userid] - np.dot(user_profile[user_node_ind[userid]], item_profile[itemid]))**2
    return (RMSE / len(val_user_list))**0.5

def generate_prediction_model(lr_bound, tree, rI, sMatrix, plambda_candidates, validation_set):
    ''' lr_bound: dict {
                level 0: [[left_bound, right_bound]], users' bound for one level, each ele in dictionary represents one node
                level 1: [[left_bound, right_bound], [left_bound, right_bound], [left_bound, right_bound]], 3
                level 2: ..., 9
            } (bound means index)
        plambda_candidates: {
            level 0: [clambda1, clambda2, clambda3, ...]
            level 1: [clambda1, clambda2, clambda3, ...]
            level 2: [clambda1, clambda2, clambda3, ...]
        }
        prediction_model: dict {
                level 0: { 'best_lambda': x, 'user_profile': ..., 'item_profile': ...}
                level 1: { 'best_lambda': x, 'user_profile': ..., 'item_profile': ...}
                level 2: { 'best_lambda': x, 'user_profile': ..., 'item_profile': ...}
            }
    '''
    MF = MatrixFactorization()
    print("MF session started.")
    prediction_model = {}
    
    val_item_list = find(validation_set)[0]
    val_user_list = find(validation_set)[1]
    user_node_ind = np.zeros(sMatrix.shape[1])                  #### notice that index is not id
    
    for level in lr_bound:
        print("level:", level)
        prediction_model.setdefault(level, {})
        train_lst = []       
        for pseudo_user_bound, userid in zip(lr_bound[level], range(len(lr_bound[level]))):
#             print(str(userid) + "/" + str(pow(3,int(level))))
            if pseudo_user_bound[0] > pseudo_user_bound[1]:
                continue
            pseudo_user_lst = tree[pseudo_user_bound[0]:(pseudo_user_bound[1] + 1)]
            pseudo_user_for_item = calculate_avg_rating_for_pesudo_user(pseudo_user_lst, sMatrix)
            train_lst += [(userid, itemid, float(pseudo_user_for_item[itemid])) \
                          for itemid in range(pseudo_user_for_item.shape[0]) if pseudo_user_for_item[itemid]]    
            #### find node index for each validation user ####
            user_node_ind[pseudo_user_lst] = userid      

        print("level " + level + " training started.")
        #### Train MF and Do validation ####
#         min_RMSE = -1
#         for plambda in plambda_candidates[level]:
        MF.change_parameter(plambda_candidates[level])
        user_profile, item_profile = MF.matrix_factorization(train_lst)
#         RMSE = pred_RMSE_for_validate_user(user_node_ind, user_profile, item_profile, val_user_list, val_item_list, sMatrix)
#         if min_RMSE is -1 or RMSE < min_RMSE:
#         min_RMSE = RMSE
        min_user_profile, min_item_profile, min_lambda = user_profile, item_profile, plambda_candidates[level]
                
        prediction_model[level]['upro'], prediction_model[level]['ipro'], prediction_model[level]['plambda'] \
                                             = min_user_profile, min_item_profile, min_lambda
        print("level " + level + " training DONE")
    MF.end()   #### close MF spark session
    return prediction_model

## Load Tree from file

In [None]:
import klepto
import numpy as np
Tree = klepto.archives.dir_archive('treeFile', {}, serialized=True)
Tree.load()

In [None]:
plambda_candidates = {"0":0.001,
                     "1":0.001,
                     "2":0.001,
                     "3":0.001,
                     "4":0.003,
                     "5":0.003,
                     "6":0.01,
                     "7":0.01,
                     "8":0.02,
                     "9":0.02,
                     "10":0.02}
# for level in Tree["lr_bound"]:
#     plambda_candidates[level] = list(np.arange(0.001, 0.05, 0.005))    

In [None]:
prediction_model = generate_prediction_model \
            (Tree['lr_bound'], \
             Tree['tree'], \
             Tree['rI'], \
             rating_matrix_csc[:, start:end], 
             plambda_candidates, 
             rating_matrix_val_csc)

In [None]:
import klepto
Tree = klepto.archives.dir_archive('treeFile', cached=True, serialized=True)
Tree['prediction_model'] = prediction_model
Tree.dump()
Tree.clear()

In [None]:
def RMSE(real_rating, pred_rating, rated_item):
    
    non_zeros = list(np.nonzero(real_rating)[0])
    non_zeros = list(set(non_zeros).difference(set(rated_item)))
    rmse = (np.sum((pred_rating[non_zeros].T[0]-real_rating[non_zeros])**2) / (len(non_zeros)))**0.5
#     print(len(non_zeros))
#     print(real_rating[non_zeros])
#     print(pred_rating[non_zeros])
#     print((pred_rating[non_zeros]-real_rating[non_zeros])**2)
#     print(np.sum((pred_rating[non_zeros]-real_rating[non_zeros])**2))
#     print((pred_rating[non_zeros]-real_rating[non_zeros]))
#     print(rmse)
    return rmse


def predict(user_profile, item_profile):
    ''' 
        user_profile: array {
                        [k1, k2, k3, ... , kt]
                    } profile for certain user
        item_profile: dict {
                        itemid1: [k1, k2, k3, ... , kt], 
                        itemid2: [k1, k2, k3, ... , kt], 
                        itemid3: [k1, k2, k3, ... , kt], 
                    } profile for items in each node
     '''
    # item_profile_cont = np.array(list(item_profile.values()))  # shape: (I, k)
    #### Calculate predict rating ####
    pred_rating = np.dot(item_profile, user_profile)
    # pred_rating = { itemid: np.dot(item_profile_cont[i], user_profile) \
    #                     for itemid, i in zip(item_profile, range(item_profile_cont.shape[0])) }
    return pred_rating
def pred_RMSE_for_new_user(split_item, rI, prediction_model, sM_testing):
    '''
        sM_testing: 30% test dataset (sparse matrix)
        split_item: list [
                level 0: [112],
                level 1: [48, 0, 79],
                level 2: [15, 0, 17, 1, 1, 1, 61, 0, 50]
                ...
            ]
        User: dict {
                    userid1: { itemid11: rating11, itemid12: rating12, ... } rating of user 1
                    userid2: { itemid21: rating21, itemid22: rating22, ... } rating of user 2
                    userid3: ...
                }
        return : rmse value (float)
    '''

    sM_testing_0_discard = sM_testing[1:,:]
    rmse = 0
    for userid in range(sM_testing.shape[1]):
        if userid % 100 == 0:
            print("%.2f%%" % (100 * userid / sM_testing.shape[1]))  
        pred_index = 0
        final_level = 0
        rated_item = []
        user_all_ratings = sM_testing[:,userid].nonzero()[0]
#         print("Step1 start:")
        for level in range(len(split_item)):
            if split_item[level][pred_index] not in user_all_ratings:
                tmp_pred_index = 3*pred_index + 2
                if tmp_pred_index in prediction_model[str(int(level)+1)]['upro']:
                    final_level += 1
                    pred_index = tmp_pred_index
                else:
                    break
            elif sM_testing[split_item[level][pred_index], userid] >= 4:
                tmp_pred_index = 3*pred_index
                if tmp_pred_index in prediction_model[str(int(level)+1)]['upro']:
                    rated_item.append(split_item[level][pred_index]-1)
                    final_level += 1
                    pred_index = tmp_pred_index
                else:
                    break
            elif sM_testing[split_item[level][pred_index], userid] <= 3:
                tmp_pred_index = 3*pred_index + 1
                if tmp_pred_index in prediction_model[str(int(level)+1)]['upro']:
                    rated_item.append(split_item[level][pred_index]-1)
                    final_level += 1
                    pred_index = tmp_pred_index
                else:
                    break        
#         print("Step2 start:")
        pred_rating = predict(np.array(prediction_model[str(final_level)]['upro'][pred_index]), \
                                            np.array(list(prediction_model[str(final_level)]['ipro'].values()))) 
#         print("Step3 start:")
#         print(pred_rating)
#         print(sM_testing_0_discard[:, userid].toarray())
        rmse += RMSE(sM_testing_0_discard[:, userid].toarray(), pred_rating, rated_item)
    return rmse / sM_testing.shape[1]
#######################################################################################################

In [None]:
import klepto
import numpy as np
Tree = klepto.archives.dir_archive('treeFile', {}, serialized=True)
Tree.load()

In [None]:
rmse_result = pred_RMSE_for_new_user(Tree['split_item'], Tree["rI"], prediction_model, rating_matrix_csc[:, end:])

In [None]:
rmse_result

In [14]:
rmse_result

1.1750985993864183