In [17]:
import numpy as np
import gzip
from sklearn.cluster import KMeans
from scipy.optimize import linear_sum_assignment
# from ids import *
from block_rect_maxvol import *
import warnings

In [2]:
# Input data file; the '.gz' suffix makes LoadRawData read it through gzip.
fn = 'Channel120.txt.gz'

In [23]:
def LoadRawData(fn):
    """Read a tab-separated text file into a list of rows of string fields.

    Parameters
    ----------
    fn : str
        Path to the input file. A name ending in '.gz' is read through
        ``gzip.open``; any other name through the builtin ``open``.

    Returns
    -------
    list of list of str
        One entry per line of the file: the line with surrounding whitespace
        stripped, then split on tab characters.
    """
    if fn.endswith('.gz'):
        # gzip.open in 'r'/'rb' mode is a binary read: on Python 3 it yields
        # bytes, which cannot be split on a str separator.
        opener, mode = gzip.open, 'rb'
    else:
        opener, mode = open, 'r'

    data = []
    with opener(fn, mode) as f:
        # Iterate the file directly instead of materialising readlines().
        for line in f:
            if not isinstance(line, str):
                # Only reached on Python 3 for the gzip path (bytes -> str);
                # on Python 2 the lines are already str and are left alone.
                line = line.decode('utf-8')
            data.append(line.strip().split("\t"))
    return data

def ProssData(d):
    """Group parsed rows by the integer triple formed from their first three fields.

    Each row is a sequence of strings laid out as:
      * fields 0-2: integers, combined into the tuple used as the group key;
      * field 3:    an integer, collected per key in ``nums``;
      * fields 4+:  floats alternating real part (even index) and imaginary
                    part (odd index) of one complex vector.

    Returns
    -------
    (ans, nums_arr)
        ans      : dict mapping key -> ``np.array`` of the complex vectors
                   collected for that key, in input order.
        nums_arr : ``np.array(nums)`` where ``nums`` is the dict
                   key -> list of field-3 integers.
                   NOTE(review): np.array applied to a dict gives a 0-d object
                   array wrapping the dict (reach it with ``.item()``) --
                   confirm this is what callers expect.
    """
    ans = {}
    nums = {}
    for row in d:
        key = tuple(int(field) for field in row[:3])
        num = int(row[3])
        re_part = np.array([float(v) for v in row[4::2]])
        im_part = np.array([float(v) for v in row[5::2]])
        sample = re_part + im_part*1j

        # setdefault creates the per-key list on first sight of the key.
        ans.setdefault(key, []).append(sample)
        nums.setdefault(key, []).append(num)

    # Stack each key's list of vectors into a single array.
    ans = {key: np.array(vectors) for key, vectors in ans.items()}

    return ans, np.array(nums)

def EqualClusters(data, num_clusters, ToReIdx=False):
    """Assign the rows of ``data`` to ``num_clusters`` clusters of equal size.

    K-means is run on the rows, the squared distance of every row to every
    cluster centre is computed, and each centre's column is repeated ``m``
    times (``m = n // num_clusters``) so that every cluster offers exactly
    ``m`` slots. ``linear_sum_assignment`` then matches rows to slots at
    minimal total squared distance.

    Parameters
    ----------
    data : array with rows as samples (``data.shape[0]`` is the sample count).
    num_clusters : int
        Number of clusters. If it does not divide the number of rows, the
        trailing rows are dropped and "Data will be cut" is printed.
    ToReIdx : bool, optional
        When true, the result is passed through ``ReIdx(cl, num_clusters)``
        before being returned.

    Returns
    -------
    cl : 1-D integer array of length ``m * num_clusters``
        ``cl[i]`` is the slot (column of the tiled cost matrix) assigned to
        row ``i``; slot ``j`` belongs to cluster ``j % num_clusters``.
    """
    # See https://stats.stackexchange.com/questions/8744/clustering-procedure-where-each-cluster-has-an-equal-number-of-points
    n = data.shape[0]
    m = n // num_clusters  # slots (rows) per cluster
    if n != m*num_clusters:
        # Call form of print: same output on Python 2, and valid on Python 3.
        print("Data will be cut")
        n = m*num_clusters
        data = data[:n]

    # fit_transform fits the model itself; a separate .fit() beforehand only
    # ran k-means a second time with the same seed and the same result.
    dist = KMeans(n_clusters=num_clusters, random_state=0).fit_transform(data)

    # Tile the (n, num_clusters) squared distances m times along the columns
    # to get a square n x n cost matrix with m slots per cluster.
    cost = np.tile(dist**2, m)
    rr, cl = linear_sum_assignment(cost)
    # Every row must have been matched, in order.
    assert(np.all(rr == np.arange(n)))

    if ToReIdx:
        cl = ReIdx(cl, num_clusters)

    return cl

def ReIdx(idx, m):
    """Regroup ``idx`` by position modulo ``m``.

    Returns the concatenation of the strided slices ``idx[0::m]``,
    ``idx[1::m]``, ..., ``idx[m-1::m]`` as a single NumPy array.
    """
    # np.hstack needs a real sequence: passing a generator has been
    # deprecated since NumPy 1.16 and is rejected by newer releases.
    return np.hstack([idx[i::m] for i in range(m)])

    

In [4]:
# Read the raw tab-separated rows from the data file.
d = LoadRawData(fn)
# res maps each 3-integer key to the array of complex vectors collected for it;
# nums is the second value returned by ProssData (built from the 4th field).
res, nums = ProssData(d)

In [5]:
# Sanity check: print the key and array shape of the first 11 entries of res.
cnt = 0  # stays 0 if res is empty
for cnt, i in enumerate(res, 1):
    print("{} {}".format(i, res[i].shape))
    if cnt > 10:
        break

(3, 26, 3) (173, 64)
(1, 38, 2) (144, 64)
(5, 48, 3) (171, 64)
(2, 33, 1) (161, 64)
(2, 14, 2) (145, 64)
(2, 8, 2) (145, 64)
(2, 25, 3) (174, 64)
(4, 39, 3) (173, 64)
(2, 19, 3) (174, 64)
(2, 40, 1) (161, 64)
(1, 48, 1) (162, 64)


In [24]:
# Block size: each matrix with m rows is split into m // nblock clusters.
nblock = 8
cnt = 0
# num_clusters = 10
# Process only the first 6 entries of res (the loop breaks once cnt > 5).
for i in res:
    A = res[i]
    m, sz = A.shape
    # NOTE(review): num_cl is 0 when m < nblock, which makes EqualClusters
    # divide by zero -- confirm every matrix has at least nblock rows.
    num_cl = m // nblock
    # Trailing comma (Python 2 print statement): no newline, so the output of
    # the following prints continues on the same line.
    print i, m, sz, num_cl, 
    # NOTE(review): A is complex (built in ProssData) and is handed to KMeans
    # inside EqualClusters -- confirm the imaginary part is treated as intended.
    idx = EqualClusters(A, num_cl)
    print len(idx)
    # rect_block_maxvol is not defined in this file; presumably it comes from
    # the star import of block_rect_maxvol -- confirm. Rows of A are reordered
    # by idx before the call.
    bm = rect_block_maxvol(A[idx], nblock-1, Kmax = sz, max_iters=100, rect_tol = 0.05, tol = 0.0, debug = False, ext_debug = False)
    print bm
    
    cnt += 1
    if cnt > 5:
        break


(3, 26, 3) 173 64 21 Data will be cut
168
[120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143   0   1   2   3   4   5   6   7 160 161 162 163
 164 165 166 167 112 113 114 115 116 117 118 119  40  41  42  43  44  45
  46  47  88  89  90  91  92  93  94  95  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  56  57
  58  59  60  61  62  63  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111  48  49  50  51  52  53  54  55  24  25  26  27  28  29
  30  31   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22  23
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159  32  33
  34  35  36  37  38  39]
(1, 38, 2) 144 64 18 144
[ 40  41  42  43  44  45  46  47  16  17  18  19  20  21  22  23 136 137
 138 139 140 141 142 143  72  73  74  75  76  77  78  79  88  89  90  91
  92  93  94  95  48  49  50  51  52  53  54  55  56  57  58  59  60  61
  62  63 128 129 130 131 132 13

In [47]:
# Toy example: six 2-D points, three at x=1 and three at x=4.
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])

# Plain k-means labels for reference (cluster sizes are not constrained).
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X)
print(kmeans.labels_)

# Equal-size assignment of the same points, without and with re-indexing.
print(EqualClusters(X, 3, ToReIdx=False))
print(EqualClusters(X, 3, ToReIdx=True))


[0 0 0 1 2 1]
[3 2 0 1 5 4]
[3 1 2 5 0 4]
