In [1]:
import os
import pickle
import shelve

import h5py
import numpy as np
from scipy.sparse import *

## FDTree Class (`DecisionTreeModel`)

In [2]:
class DecisionTreeModel:
    """Pre-compute per-user rating dictionaries and bias statistics.

    The constructor treats the rows of ``source`` as items and its columns
    as users, and fills the following attributes:

    * ``plambda``       -- regularisation weight used in the user bias.
    * ``sMatrix``       -- the ``source`` matrix itself.
    * ``rI``            -- list of row (item) indices that hold a non-zero entry.
    * ``tree``          -- list of all column (user) indices.
    * ``real_item_num`` -- number of rows of ``source``.
    * ``global_mean``   -- sum of all entries divided by the stored-entry count.
    * ``rU``            -- ``{bucket: {userid: {itemid: int(rating)}}}`` where
      ``bucket`` is ``userid // USERS_PER_BUCKET``.
    * ``biasU``         -- per-user bias,
      ``(user_sum + plambda * global_mean) / (plambda + user_nnz)``.
    * ``sum_cur_t``, ``sum_2_cur_t``, ``sum_cntt`` -- per-item sum of
      ``rating - biasU[user]``, sum of its square, and rating count.
    """

    # Number of consecutive user ids grouped under one key of ``rU``.
    USERS_PER_BUCKET = 5000

    def __init__(self, source, depth_threshold=10, plambda=7, MSP_item=200):
        """Build every statistic from ``source``, printing progress as it goes.

        Args:
            source: scipy sparse rating matrix (items x users). It must
                support ``getcol``/``getnnz``; the notebook passes a CSC matrix.
            depth_threshold: accepted for interface compatibility; not used
                by this constructor.
            plambda: regularisation weight of the user bias.
            MSP_item: accepted for interface compatibility; not used by this
                constructor.
        """
        self.plambda = plambda
        self.rI = list(set(source.nonzero()[0]))
        self.sMatrix = source
        self.tree = list(range(self.sMatrix.shape[1]))
        self.real_item_num = self.sMatrix.shape[0]
        self.global_mean = self.sMatrix.sum()/self.sMatrix.getnnz()

        # find() yields aligned (row, col, value) triples, so the rating is
        # read directly instead of doing one sparse lookup per entry.
        item_ids, user_ids, ratings = find(source)
        self.rU = {}

        print("rU Generation start:")
        num_ratings = len(user_ids)
        for i, (itemid, userid, rating) in enumerate(zip(item_ids, user_ids, ratings)):
            if i % 100000 == 0:
                print("%.2f%%" % (100 * i / num_ratings))
            # Group users into buckets of USERS_PER_BUCKET consecutive ids.
            bucket = self.rU.setdefault(int(userid // self.USERS_PER_BUCKET), {})
            bucket.setdefault(userid, {})[itemid] = int(rating)
        print("rU Generation DONE")

        print("bias, sum_cur_t, sum_2_cur_t Generation start:")
        self.biasU = np.zeros(self.sMatrix.shape[1])
        self.sum_cur_t = np.zeros(self.real_item_num)
        self.sum_2_cur_t = np.zeros(self.real_item_num)
        self.sum_cntt = np.zeros(self.real_item_num)
        # Progress is reported against the real number of users rather than
        # a constant tied to one particular dataset.
        num_users = len(self.tree)
        for i, userid in enumerate(self.tree):
            if i % 50000 == 0:
                print("%.2f%%" % (100 * i / num_users))

            # Extract the user's column once and reuse it below.
            user_col = self.sMatrix.getcol(userid)
            self.biasU[userid] = (user_col.sum() + self.plambda * self.global_mean) / \
                                 (self.plambda + user_col.getnnz())

            # A single find() call keeps item ids and ratings aligned.
            rated_items, _, user_ratings = find(user_col)
            residual = user_ratings - self.biasU[userid]
            self.sum_cur_t[rated_items] += residual
            self.sum_2_cur_t[rated_items] += residual ** 2
            self.sum_cntt[rated_items] += 1

        print("bias, sum_cur_t, sum_2_cur_t Generation DONE")

        print("Initiation DONE!")

## Step 1: Input Dataset

In [3]:
# Dataset tag; it is concatenated into the 'movielens/...' file paths used below.
dataset = '20m'

In [4]:
# Read the test/train column indices from the .mat (HDF5) files.
# Each file is opened read-only and closed as soon as its dataset has been
# copied into memory. The stored indices are shifted by -1 to become
# 0-based positions.
with h5py.File('movielens/' + dataset + '/test_list.mat', 'r') as feature:
    test_list = [int(idx) - 1 for idx in feature['test_list'][:].T[0]]
with h5py.File('movielens/' + dataset + '/train_list.mat', 'r') as feature:
    train_list = [int(idx) - 1 for idx in feature['train_list'][:].T[0]]

In [5]:
# Load the full rating matrix in CSC form, then slice out the columns that
# belong to the train and test index lists.
rating_matrix_csc = load_npz(
    'movielens/sparse_matrix_ml-' + dataset + '_selected.npz'
).tocsc()
rating_matrix_csc_train, rating_matrix_csc_test = (
    rating_matrix_csc[:, train_list],
    rating_matrix_csc[:, test_list],
)
print("file load DONE")

file load DONE


In [12]:
# Display the training matrix summary (shape, dtype, number of stored elements).
rating_matrix_csc_train

<138493x17889 sparse matrix of type '<class 'numpy.float64'>'
	with 13915986 stored elements in Compressed Sparse Column format>

In [6]:
# Build the model on the training split; the constructor prints its progress.
dt_model = DecisionTreeModel(rating_matrix_csc_train)

rU Generation start:
0.00%
0.72%
1.44%
2.16%
2.87%
3.59%
4.31%
5.03%
5.75%
6.47%
7.19%
7.90%
8.62%
9.34%
10.06%
10.78%
11.50%
12.22%
12.93%
13.65%
14.37%
15.09%
15.81%
16.53%
17.25%
17.96%
18.68%
19.40%
20.12%
20.84%
21.56%
22.28%
23.00%
23.71%
24.43%
25.15%
25.87%
26.59%
27.31%
28.03%
28.74%
29.46%
30.18%
30.90%
31.62%
32.34%
33.06%
33.77%
34.49%
35.21%
35.93%
36.65%
37.37%
38.09%
38.80%
39.52%
40.24%
40.96%
41.68%
42.40%
43.12%
43.83%
44.55%
45.27%
45.99%
46.71%
47.43%
48.15%
48.86%
49.58%
50.30%
51.02%
51.74%
52.46%
53.18%
53.89%
54.61%
55.33%
56.05%
56.77%
57.49%
58.21%
58.93%
59.64%
60.36%
61.08%
61.80%
62.52%
63.24%
63.96%
64.67%
65.39%
66.11%
66.83%
67.55%
68.27%
68.99%
69.70%
70.42%
71.14%
71.86%
72.58%
73.30%
74.02%
74.73%
75.45%
76.17%
76.89%
77.61%
78.33%
79.05%
79.76%
80.48%
81.20%
81.92%
82.64%
83.36%
84.08%
84.79%
85.51%
86.23%
86.95%
87.67%
88.39%
89.11%
89.82%
90.54%
91.26%
91.98%
92.70%
93.42%
94.14%
94.85%
95.57%
96.29%
97.01%
97.73%
98.45%
99.17%
99.89%
rU Generation

In [9]:
# Persist every rU bucket to its own shelf under rU_data/.
# The directory is created first so the cell also works on a fresh checkout,
# and the context manager closes each shelf even if the write raises.
os.makedirs('rU_data', exist_ok=True)
for dictname, bucket in dt_model.rU.items():
    with shelve.open('rU_data/' + str(dictname), protocol = pickle.HIGHEST_PROTOCOL) as d:
        d['content'] = bucket
print("DONE!")

DONE!


In [10]:
# Read every bucket back from disk and merge them into one user dictionary.
# NOTE(review): shelve unpickles its content; only open shelves written by
# this notebook.
dictt = {}
for dictname in dt_model.rU:
    print(dictname)
    # The context manager closes the shelf even if reading or merging raises.
    with shelve.open('./rU_data/'+str(dictname), protocol = pickle.HIGHEST_PROTOCOL) as d:
        dictt.update(d['content'])

0
1
2
3


In [11]:
# Number of users recovered from the shelves.
print(len(dictt))

17889


In [13]:
import klepto

# Store the pre-computed model statistics in a klepto directory archive.
Tree = klepto.archives.dir_archive('treeFile', cached=True, serialized=True)

# Each archive key is the name of the dt_model attribute it stores.
for key in ('biasU', 'sum_cur_t', 'sum_2_cur_t', 'sum_cntt', 'rI'):
    Tree[key] = getattr(dt_model, key)

# Write the cached entries to disk, then drop them from the in-memory cache.
Tree.dump()
Tree.clear()

# save_npz('netflix/biasUM', dt_model.biasUM.tocsr())