In [6]:
import numpy as np
import scipy.io as sio

datasets = ['books', 'dvd', 'electronics', 'kitchen']


def _read_svmlight(path, n_features=5000):
    """Parse one svmlight-style text file into dense rows and one-hot labels.

    Each non-blank line is expected to look like ``<label> <idx>:<count> ...``.

    Parameters
    ----------
    path : str
        File to read.
    n_features : int
        Width of every bag-of-words row; feature indices must be < n_features.

    Returns
    -------
    rows : list of ndarray, each of shape (1, n_features)
    labels : list of [1, 0] (label == 1) or [0, 1] (any other label)
    """
    rows = []
    labels = []
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(path, 'r') as fid:
        for raw_line in fid:
            fields = raw_line.split()
            if not fields:
                # Skip blank lines instead of treating the first one as
                # end-of-file, which would silently truncate the dataset.
                continue
            label = int(fields[0])
            labels.append([1, 0] if label == 1 else [0, 1])
            bag_of_words = np.zeros((1, n_features))
            for token in fields[1:]:
                word_freq = token.split(':')
                bag_of_words[0, int(word_freq[0])] = int(word_freq[1])
            rows.append(bag_of_words)
    return rows, labels


for dataset in datasets:
    attrb = []
    group = []

    # Training samples come first, followed by the testing samples.
    for split in ('train', 'test'):
        rows, labels = _read_svmlight('{}_{}.svmlight'.format(dataset, split))
        attrb.extend(rows)
        group.extend(labels)

    # Stack the (1, n_features) rows into one (n_samples, n_features) matrix.
    attrb = np.concatenate(attrb, axis=0)
    group = np.array(group)

    print(attrb.shape)
    print(group.shape)

    sio.savemat('{}.mat'.format(dataset), {'attrb': attrb, 'group': group})
            

(6465, 5000)
(6465, 2)
(5586, 5000)
(5586, 2)
(7681, 5000)
(7681, 2)
(7945, 5000)
(7945, 2)


In [9]:
from scipy.sparse import csc_matrix

# Read the stored feature matrix for the "books" dataset and hold it in
# compressed-sparse-column form.
books_mat = sio.loadmat('books.mat')
attrb = csc_matrix(books_mat['attrb'])

# Print the stored entries of the first row as (row, col) value lines.
first_row = attrb[0, :]
print(first_row)

  (0, 0)	6.0
  (0, 1)	8.0
  (0, 2)	8.0
  (0, 3)	2.0
  (0, 4)	3.0
  (0, 6)	5.0
  (0, 7)	3.0
  (0, 8)	3.0
  (0, 9)	1.0
  (0, 10)	6.0
  (0, 12)	2.0
  (0, 13)	1.0
  (0, 14)	1.0
  (0, 16)	2.0
  (0, 17)	4.0
  (0, 19)	8.0
  (0, 20)	2.0
  (0, 21)	1.0
  (0, 22)	4.0
  (0, 23)	1.0
  (0, 24)	1.0
  (0, 25)	2.0
  (0, 27)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  :	:
  (0, 3960)	1.0
  (0, 3996)	1.0
  (0, 4225)	1.0
  (0, 4261)	1.0
  (0, 4297)	1.0
  (0, 4304)	2.0
  (0, 4308)	1.0
  (0, 4362)	1.0
  (0, 4363)	1.0
  (0, 4379)	1.0
  (0, 4380)	1.0
  (0, 4398)	1.0
  (0, 4439)	1.0
  (0, 4440)	1.0
  (0, 4444)	1.0
  (0, 4464)	3.0
  (0, 4465)	1.0
  (0, 4473)	1.0
  (0, 4551)	1.0
  (0, 4570)	1.0
  (0, 4655)	1.0
  (0, 4697)	1.0
  (0, 4939)	1.0
  (0, 4949)	1.0
  (0, 4962)	1.0


In [29]:
import numpy as np

# Scratch experiment: scale every row of a small matrix by the reciprocal
# of that row's maximum.
A = np.array([[1, 2], [3, 4]])
print(A)
print(A.max(axis=1))

# Column vector of 1 / row-max, repeated across all columns of A.
inv_row_max = np.reshape(1 / A.max(axis=1), (-1, 1))
most_freq = np.tile(inv_row_max, (1, A.shape[1]))
A = A * most_freq

print(A)

print(np.log(4))

# 0/1 indicator of the strictly positive entries.
print((A > 0).astype(np.int32))

[[1 2]
 [3 4]]
[2 4]
[[0.5  1.  ]
 [0.75 1.  ]]
1.3862943611198906
[[1 1]
 [1 1]]


# KNN Graph Construction

In [10]:
import numpy as np

# Scratch experiment: detect whether any row maximum is (almost) zero.
A = np.array([[0, 0], [3, 0]])
B = A.max(axis=1)
print(B)

has_tiny_max = (B < 0.001).any()
if has_tiny_max:
    print('Hi')

# B itself is not modified by the check above.
print(B)

[0 3]
Hi
[0 3]


In [8]:
import numpy as np
import scipy.io as sio
from scipy.sparse import csc_matrix, issparse
from sklearn.neighbors import kneighbors_graph


datasets = ['electronics', 'books']

for dataset in datasets:
    # Read the .mat file once and pull both arrays from the same dict.
    mat = sio.loadmat('{}.mat'.format(dataset))
    attrb = mat['attrb']
    group = mat['group']

    # The savemat call at the end of this loop overwrites the input file with
    # a sparse `attrb`, so a second run of this cell gets a sparse matrix back
    # from loadmat. Densify so the TF-IDF maths below works on every run.
    if issparse(attrb):
        attrb = attrb.toarray()

    # Term frequency: each row scaled by 1 / (largest count in that row).
    # Rows whose maximum is 0 use a divisor of 1 to avoid dividing by zero.
    max_freq = np.max(attrb, 1)
    max_freq[max_freq == 0] = 1
    # Broadcasting a column vector replaces an explicit full-size tiled copy.
    TF = attrb * (1 / max_freq)[:, np.newaxis]

    # Inverse document frequency: log(n_rows / (rows where the column value
    # exceeds 0.01, plus 1)).
    doc_freq = np.sum(attrb > 0.01, axis=0)
    IDF = np.log(attrb.shape[0] / (doc_freq + 1))

    # IDF has one value per column and broadcasts across all rows.
    TF_IDF = TF * IDF
    network = kneighbors_graph(TF_IDF, 5, mode='connectivity', metric='cosine', include_self=False)

    network = csc_matrix(network)
    attrb = csc_matrix(attrb)

    print(dataset)
    print(network.shape)
    print(attrb.shape)
    print(group.shape)

    sio.savemat('{}.mat'.format(dataset), {'network': network, 'group': group, 'attrb': attrb})


electronics
(7681, 7681)
(7681, 5000)
(7681, 2)
books
(6465, 6465)
(6465, 5000)
(6465, 2)
