In [14]:
import numpy as np
import pickle
from scipy.sparse import *
import shelve
import h5py

## FDTree Class (`DecisionTreeModel`)

In [15]:
class DecisionTreeModel:
    """Precompute per-user and per-item rating statistics from a sparse matrix.

    ``source`` is treated as an item-by-user matrix: row indices are item ids
    and column indices are user ids.

    Attributes set by the constructor:
        plambda:        smoothing weight used in the user-bias formula.
        sMatrix:        the ``source`` matrix itself (not copied).
        rI:             list of distinct row (item) ids with a non-zero entry.
        tree:           list of every column (user) index, ``0..n_users-1``.
        real_item_num:  number of rows (items) in ``source``.
        global_mean:    sum of all entries divided by the stored-entry count.
        rU:             ``{bucket: {userid: {itemid: int(rating)}}}`` where
                        ``bucket = int(userid / USERS_PER_BUCKET)``.
        biasU:          per-user bias,
                        ``(sum_of_user_ratings + plambda * global_mean)
                        / (plambda + number_of_user_ratings)``.
        sum_cur_t:      per-item sum of ``rating - biasU[user]``.
        sum_2_cur_t:    per-item sum of ``(rating - biasU[user]) ** 2``.
        sum_cntt:       per-item count of ratings.
    """

    # Number of consecutive user ids grouped under one key of ``rU``.
    USERS_PER_BUCKET = 5000
    # How many iterations pass between two progress lines in each loop.
    RATING_PROGRESS_EVERY = 100000
    USER_PROGRESS_EVERY = 50000

    def __init__(self, source, depth_threshold=10, plambda=7, MSP_item=200):
        """Build all statistics eagerly, printing progress to stdout.

        Args:
            source: scipy sparse matrix indexed as ``source[item, user]``.
            depth_threshold: accepted for interface compatibility; not used
                by this constructor.
            plambda: smoothing weight for the user bias.
            MSP_item: accepted for interface compatibility; not used by this
                constructor.
        """
        self.plambda = plambda
        self.rI = list(set(source.nonzero()[0]))
        self.sMatrix = source
        self.tree = list(range(self.sMatrix.shape[1]))
        self.real_item_num = self.sMatrix.shape[0]
        self.global_mean = self.sMatrix.sum() / self.sMatrix.getnnz()

        # find() yields aligned (row, col, value) triplets, so the rating is
        # read straight from the triplet instead of a per-element sparse lookup.
        itemset, userset, ratingset = find(source)
        self.rU = {}

        print("rU Generation start:")
        num_ratings = len(userset)
        triplets = zip(itemset, userset, ratingset)
        for i, (itemid, userid, rating) in enumerate(triplets):
            if i % self.RATING_PROGRESS_EVERY == 0:
                print("%.2f%%" % (100 * i / num_ratings))
            # Users are grouped into buckets of USERS_PER_BUCKET ids.
            n_i = int(userid / self.USERS_PER_BUCKET)
            self.rU.setdefault(n_i, {}).setdefault(userid, {})[itemid] = int(rating)
        print("rU Generation DONE")

        print("bias, sum_cur_t, sum_2_cur_t Generation start:")
        self.biasU = np.zeros(self.sMatrix.shape[1])
        self.sum_cur_t = np.zeros(self.real_item_num)
        self.sum_2_cur_t = np.zeros(self.real_item_num)
        self.sum_cntt = np.zeros(self.real_item_num)
        num_users = len(self.tree)
        for i, userid in enumerate(self.tree):
            if i % self.USER_PROGRESS_EVERY == 0:
                # Progress is relative to the actual number of users, not a
                # constant tied to one particular dataset.
                print("%.2f%%" % (100 * i / num_users))

            # Extract the user's column once and reuse it for every statistic.
            user_col = self.sMatrix.getcol(userid)
            self.biasU[userid] = (user_col.sum() + self.plambda * self.global_mean) / \
                                 (self.plambda + user_col.getnnz())

            # Item ids and ratings come from the same find() call so the two
            # arrays are guaranteed to be in matching order.
            user_all_rating_id, _, user_all_rating = find(user_col)
            centered = user_all_rating - self.biasU[userid]
            self.sum_cur_t[user_all_rating_id] += centered
            self.sum_2_cur_t[user_all_rating_id] += centered ** 2
            self.sum_cntt[user_all_rating_id] += 1

        print("bias, sum_cur_t, sum_2_cur_t Generation DONE")

        print("Initiation DONE!")

## Step 1: Input Dataset

In [16]:
# Dataset tag; it is spliced into the rating-matrix filename that is loaded below.
dataset = '1m'

In [17]:
# Read the test and train column-index lists from their .mat (HDF5) files.
# Each file is opened read-only inside a `with` block so the handle is closed
# as soon as the dataset has been copied into memory.
with h5py.File('movielens/test_list.mat', 'r') as feature:
    test_list = feature['test_list'][:]
# The first row of the transposed dataset holds the indices; they are shifted
# down by one (presumably 1-based MATLAB indices -- confirm against the
# script that produced the files).
test_list = [int(i) - 1 for i in test_list.T[0]]

with h5py.File('movielens/train_list.mat', 'r') as feature:
    train_list = feature['train_list'][:]
train_list = [int(i) - 1 for i in train_list.T[0]]

In [18]:
# Load the full rating matrix in CSC form, then carve out the train and test
# column subsets using the index lists read earlier.
rating_matrix_csc = load_npz(
    ''.join(('movielens/sparse_matrix_ml-', dataset, '_selected.npz'))
).tocsc()
rating_matrix_csc_train, rating_matrix_csc_test = (
    rating_matrix_csc[:, train_list],
    rating_matrix_csc[:, test_list],
)
print("file load DONE")

file load DONE


In [19]:
# Build the model from the training columns only; the constructor prints its own progress.
dt_model = DecisionTreeModel(rating_matrix_csc_train)

rU Generation start:
0.00%
14.37%
28.75%
43.12%
57.50%
71.87%
86.24%
rU Generation DONE
bias, sum_cur_t, sum_2_cur_t Generation start:
0.00%
bias, sum_cur_t, sum_2_cur_t Generation DONE
Initiation DONE!


In [20]:
# Show the training matrix's repr (shape, dtype and stored-element count).
rating_matrix_csc_train

<6040x2590 sparse matrix of type '<class 'numpy.float64'>'
	with 695708 stored elements in Compressed Sparse Column format>

In [21]:
# Write each rU bucket to its own shelf file under rU_data/, stored under the
# key 'content'. The `with` block closes the shelf even if the write fails.
for dictname, bucket in dt_model.rU.items():
    with shelve.open('rU_data/' + str(dictname), protocol=pickle.HIGHEST_PROTOCOL) as d:
        d['content'] = bucket
print("DONE!")

DONE!


In [22]:
# Read every bucket file back and merge them into a single {userid: ratings}
# dict. The `with` block closes each shelf even if the read fails.
# NOTE: shelve unpickles the stored values, so only open shelf files that this
# notebook (or another trusted source) produced.
dictt = {}
for dictname in dt_model.rU:
    print(dictname)
    with shelve.open('./rU_data/' + str(dictname), protocol=pickle.HIGHEST_PROTOCOL) as d:
        dictt.update(d['content'])

0


In [23]:
# Number of distinct user ids recovered from the shelf files.
print(len(dictt))

2590


In [24]:
import klepto

# Store the precomputed model statistics in a klepto directory archive named
# 'treeFile', one entry per attribute, keyed by the attribute's own name.
Tree = klepto.archives.dir_archive('treeFile', cached=True, serialized=True)

for attr_name in ('biasU', 'sum_cur_t', 'sum_2_cur_t', 'sum_cntt', 'rI'):
    Tree[attr_name] = getattr(dt_model, attr_name)

# Presumably dump() writes the cached entries out to the archive and clear()
# then empties the in-memory copy -- confirm against the klepto documentation.
Tree.dump()
Tree.clear()

# save_npz('netflix/biasUM', dt_model.biasUM.tocsr())