In [1]:
import numpy as np
import logging
import sys
from multiprocessing import Pool
logging.basicConfig(format='%(asctime)s %(message)s',level=logging.INFO)


def update_lda(freq, np_topic, dkm, kwm, row, col, k):
    prop_list = []
    old_topic = np_topic[row, col]
    dkm[row, old_topic] -= freq
    kwm[col, old_topic] -= freq
    for i in range(k):
        if np.sum(dkm[row, :]) and np.sum(kwm[col, :]):
            prop = (dkm[row, i]/np.sum(dkm[row, :]))*(kwm[col, i]/np.sum(kwm[col, :]))
        else:
            prop = 0
        prop_list.append(prop)
    new_topic = np.argmax(prop_list)
    np_topic[row, col] = new_topic
    dkm[row, new_topic] += freq
    kwm[col, new_topic] += freq

def construct_matrix(np_data, np_topic, k):
    n_row, n_col = np_data.shape
    dkm = np.zeros([n_row, k])
    kwm = np.zeros([n_col, k])
    for n in range(n_row):
        topic_count = []
        for i in range(k):
            row = np_data[n, :]
            counted = np.sum(row[np_topic[n, :]==i])
            topic_count.append(counted)
        dkm[n, :] = np.array(topic_count)
    for n in range(n_col):
        topic_count = []
        for i in range(k):
            row = np_data[:, n]
            counted = np.sum(row[np_topic[:, n]==i])
            topic_count.append(counted)
        kwm[n, :] = np.array(topic_count)
    return dkm, kwm

def lda(np_data, k):
    n_row, n_col = np_data.shape
    np_topic = np.random.rand(n_row, n_col)*k
    np_topic = np_topic.astype(np.int)
    dkm, kwm = construct_matrix(np_data, np_topic, k)
    for row in range(n_row):
        for col in range(n_col):
            if np_data[row, col]:
                freq = np_data[row, col]
                update_lda(freq, np_topic, dkm, kwm, row, col, k)
    return dkm, kwm, np_topic

def perplexity(np_data, np_topic, dkm, kwm):
    n_row, n_col = np_data.shape
    accu_prop = []
    for row in range(n_row):
        for col in range(n_col):
            if np_data[row, col]:
                freq = np_data[row, col]
                total_prop = []
                for i in range(k):
                    if np.sum(dkm[row, :]) and np.sum(kwm[col, :]) and dkm[row, i] and kwm[col, i]:
                        prop = (
                            dkm[row, i]/np.sum(dkm[row, :]))*(kwm[col, i]/np.sum(kwm[col, :])
                        )
                        total_prop.append(np.log(prop)*freq)
                    else:
                        total_prop.append(0)
                ans_prop = np.max(total_prop)
                accu_prop.append(ans_prop)
    return np.exp(np.sum(accu_prop)/np.sum(np_data)*-1)

def main(k):
    file_name = 'out_file/tiny_set.csv'
    logging.info("Loading input file : {}".format(file_name))
    with open(file_name) as f:
        total = []
        for i, line in enumerate(f):
            lin = line.split(',')
            total.append(lin)
            if i % 1000 == 0:
                logging.info("Loaded: {}".format(i))
    np_data = np.array(total, dtype=np.int)
    logging.info("Loading completed")
    for i in range(100):
        if (i+1)%20 == 0:
            logging.info('Running iteration {}'.format(i+1))
        dkm, kwm, np_topic = lda(np_data, k)
    perp = perplexity(np_data, np_topic, dkm, kwm)
    logging.info("Perplexity k={} : {}".format(k, perp))
    return dkm, kwm

if __name__=="__main__":
    # k = [2,3,4,5,6,7,8,9,10]
    # p = Pool(9)
    # perp_list = p.map(main, k)
    # print(perp_list)
    # try:
#     k = sys.argv[1]
#     k = int(k)
    k = 6
    dkm, kwm = main(k)
    # except:
    #     print("Please specify K")


2017-12-10 14:11:25,176 Loading input file : out_file/tiny_set.csv
2017-12-10 14:11:25,176 Loaded: 0
2017-12-10 14:11:25,274 Loaded: 1000
2017-12-10 14:11:26,331 Loading completed
2017-12-10 14:13:04,396 Running iteration 20
2017-12-10 14:15:00,880 Running iteration 40
2017-12-10 14:17:01,259 Running iteration 60
2017-12-10 14:19:06,203 Running iteration 80
2017-12-10 14:21:01,670 Running iteration 100
2017-12-10 14:21:11,121 Perplexity k=6 : 1.0748621458109895


In [2]:
print(dkm.shape)
print(kwm.shape)

(2000, 6)
(4936, 6)


In [40]:
import json
vocab = json.load(open('out_file/tiny_vocab.txt'))

In [56]:
word_topic = np.argmax(kwm, axis=1)
word_prop = np.max(kwm, axis=1)/np.sum(kwm, axis=1)

In [20]:
np.unique(word_topic, return_counts=True)

(array([0, 1, 2, 3, 4, 5]), array([3666,  348,  224,  284,  175,  239]))

In [45]:
# print(word_topic.shape) - 4936
# print(word_prop.shape) - 4936
# print(len(vocab)) - 4936
group = {0:[], 1:[], 2:[], 3:[], 4:[], 5:[]}
for i, v in enumerate(vocab):
    group[word_topic[i]].append(v)

In [31]:
prop_group_4 = word_prop[word_topic==4]
prop_group_4

array([  5.,   2.,   2.,   5.,  10.,  25.,   2.,   3.,   2.,   2.,   6.,
         3.,   4.,   4.,  10.,   2.,   3.,   2.,   2.,   5.,  27.,   3.,
         2.,   5.,   2.,   3.,   2.,   6.,   2.,   3.,   4.,   9.,   2.,
         3.,   8.,  10.,   6.,   3.,   2.,   2.,  12.,  11.,   2.,   3.,
         4.,   5.,   3.,   4.,   2.,   2.,   3.,   5.,   2.,   5.,   4.,
        15.,   3.,   3.,   2.,   3.,   3.,   8.,   3.,   2.,   5.,  13.,
         9.,   4.,   3.,   2.,   2.,   2.,   3.,   4.,   6.,   6.,   2.,
         4.,   7.,   7.,   3.,   6.,   4.,   4.,   2.,   2.,   5.,   2.,
         2.,   2.,   4.,   4.,  10.,   2.,   2.,   7.,   3.,   2.,   4.,
         2.,   2.,   3.,   5.,   2.,   2.,   2.,   2.,   6.,   4.,   3.,
         2.,   4.,   6.,   3.,  10.,   9.,   6.,   2.,   2.,   3.,   2.,
         3.,   2.,   2.,   2.,   6.,   4.,   2.,   2.,   3.,   3.,   2.,
         7.,   2.,   2.,   5.,   4.,   2.,   2.,   6.,   3.,   2.,   2.,
         4.,   2.,   6.,   2.,   4.,   2.,   6.,   

In [36]:
prop_group_4[np.argsort(-prop_group_4)]

array([ 27.,  25.,  15.,  13.,  12.,  11.,  10.,  10.,  10.,  10.,  10.,
         9.,   9.,   9.,   8.,   8.,   7.,   7.,   7.,   7.,   6.,   6.,
         6.,   6.,   6.,   6.,   6.,   6.,   6.,   6.,   6.,   6.,   6.,
         5.,   5.,   5.,   5.,   5.,   5.,   5.,   5.,   5.,   5.,   5.,
         5.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,
         4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,   4.,
         4.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,
         3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,
         3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,   3.,
         3.,   3.,   3.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,
         2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,
         2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,
         2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,   2.,
         2.,   2.,   2.,   2.,   2.,   2.,   2.,   

In [50]:
order = np.argsort(-prop_group_4)
order

array([ 20,   5,  55,  65,  40,  41,  35,  14,  92, 114,   4,  31, 115,
        66,  34,  61,  79,  78, 132,  95, 145, 125,  27,  81, 149,  74,
       139, 107,  10, 112, 116,  75,  36,  45,  51,  53,  64,   0,   3,
        86, 162,  23, 135,  19, 102,  83, 111, 108, 136,  73,  98,  82,
       143,  77,  91, 147,  67,  90, 126,  30, 159,  54, 157,  12,  13,
        44,  47,  29, 161,  25,  21,  96, 140, 166,  59, 173,  11, 109,
       168, 113,   7, 119, 121, 130, 129, 101,  16, 163,  43,  50, 156,
        72,  46,  68,  56, 155,  62,  57,  60,  37,  33,  80, 131, 172,
       133, 171, 170, 154, 128, 169, 160, 150, 151, 146, 167, 137, 138,
       148, 165, 141, 158, 142, 164, 153, 144, 134, 152,  87, 124,  58,
        52,  49,  48,  42,  39,  38,  32,  28,  63,  26,  22,  18,  17,
        15,   9,   8,   6,   2,   1,  24, 127,  69,  71, 123, 122, 120,
       118, 117, 110, 106, 105, 104,  70, 103,  99,  97,  94,  93,  89,
        88,  85,  84,  76, 100, 174])

In [64]:
for i, v in enumerate(vocab):
    if word_topic[i] == 0:
        print(v+'-'+str(word_prop[i]))

01-1.0
02-1.0
09-1.0
0i-1.0
10-0.46875
10000-1.0
101h-1.0
1080-1.0
1080p30-1.0
1080p60-1.0
10hours-1.0
10mins-1.0
10watt-1.0
10y-1.0
1110-1.0
115gb-1.0
118gb-1.0
11min-1.0
120fps-1.0
12640-1.0
127801-1.0
128077-1.0
128512-1.0
128513-1.0
128522-1.0
128524-1.0
128526-1.0
128545-1.0
128557-1.0
12min-1.0
12th-1.0
1334x750-1.0
143-1.0
149-1.0
15-0.9
1575-1.0
1576-1.0
1578-1.0
1582-1.0
1583-1.0
1585-1.0
1587-1.0
15min-1.0
1604-1.0
1605-1.0
1606-1.0
1607-1.0
1608-1.0
16mp-1.0
17-1.0
1711-1.0
1740-1.0
18-1.0
180-1.0
1810-1.0
18hr-1.0
18k-1.0
190-1.0
1920-1.0
1920x1080-1.0
199-1.0
1a-1.0
1gbhauwei-1.0
1yr-1.0
2000-1.0
20000-1.0
2000mah-1.0
2009-1.0
200hp-1.0
2012-1.0
2013-1.0
201611-1.0
2016why-1.0
2017-0.8
2019-1.0
20600-1.0
20fps-1.0
20k-1.0
20mins-1.0
20mp-1.0
21-1.0
2160p-1.0
21oct-1.0
22min-1.0
23-1.0
230-1.0
24-1.0
240-1.0
240fps-1.0
24k-1.0
25k-1.0
25mb-1.0
25th-1.0
26-1.0
260-1.0
2600-1.0
26k-1.0
27gb-1.0
2800-1.0
28gb-1.0
28th-1.0
290-1.0
299-1.0
2a-1.0
2hrs-1.0
2l-1.0
2mp-1.0
2yr-1.0
