In [91]:
import numpy as np
import random
import copy
import itertools
import time
from sklearn.metrics import jaccard_score
from scipy.stats import hmean

class Box:
    def __init__(self, data_matrix, initial_i):
        self.dim = len(data_matrix.shape)
        self.data = data_matrix
        self.lambda_0 = data_matrix.mean()
        self.data_shift = self.data - self.lambda_0 
        self.initialize_sets(initial_i)
    
    def initialize_sets(self, initial_i):
        self.sets = [set() for i in range(self.dim)]
        self.sets[0].add(initial_i)
        idx = []
        idx.append([initial_i])
        for i in range(1, self.dim):
            idx.append(slice(None))
        non_zero_args = np.nonzero(self.data[idx])
        for dim in range(1, self.dim):
            self.sets[dim] = set(non_zero_args[dim].tolist())
    
    def update(self, dif):
        if dif['inside']:
            self.sets[dif['dim']].remove(dif['entity'])
        else:
            self.sets[dif['dim']].add(dif['entity'])
            
    def try_update(self, dif):
        sets = copy.deepcopy(self.sets)
        if dif['inside']:
            sets[dif['dim']].remove(dif['entity'])
        else:
            sets[dif['dim']].add(dif['entity'])
        box_hash = self.get_hash(sets)
        return box_hash
    
    def get_cur_hash(self):
        return self.get_hash(self.sets)
    
    def get_hash(self, sets):
        common_list = []
        for cur_set in sets:
            common_list.extend(sorted(cur_set))
        box_hash = hash(tuple(common_list))
        return box_hash

class Clusterization:
    
    def get_clusters(self, data_matrix, euristic=0):
        boxes = []
        data_matrix_cur = copy.deepcopy(data_matrix)
        not_used_idx = [i for i in range(0, data_matrix.shape[0])]
        for i in range(data_matrix.shape[0]):
            if euristic == 1:
                if i not in not_used_idx:
                    continue
            cur_box = self.FourClusterBox(i, data_matrix_cur, not_used_idx)
            new_box = True
            for box_set in cur_box.sets:
                if len(box_set) == 0:
                    new_box = False
            if new_box:
                boxes.append(cur_box)
                if euristic == 1:
                    not_used_idx_upd = []
                    for idx in not_used_idx:
                        if idx not in cur_box.sets[0]:
                            not_used_idx_upd.append(idx)
                    not_used_idx = not_used_idx_upd
        return boxes

    def find_dif(self, box, dim, entity, inside):
        def get_z(inside):
            if inside:
                return -1
            return 1
        
        def get_cur_sets_sum(box):
            idx = []
            for box_set in box.sets:
                idx.append(list(box_set))
            return box.data_shift[np.ix_(*idx)].sum()
        
        def get_entity_sets_sum(box, dim, entity):
            idx = []
            for i, box_set in enumerate(box.sets):
                if i == dim:
                    idx.append([entity])
                else:
                    idx.append(list(box_set))
            return box.data_shift[np.ix_(*idx)].sum()
        
        z = get_z(inside)
        cur_sets_sum = get_cur_sets_sum(box)
        entity_sets_sum = get_entity_sets_sum(box, dim, entity)
        terms = []
        terms.append(entity_sets_sum**2)
        if box.dim == 2:
            terms[0] = terms[0] * z
        terms.append(2 * z * cur_sets_sum * entity_sets_sum)
        terms.append(((-1) * cur_sets_sum **2)/len(box.sets[dim]))
        if box.dim != 2:
            terms[2] = terms[2] * z
        numerator = np.array(terms).sum()
        set_lengths = 1
        for i in range(box.dim):
            if i != dim:
                set_lengths *= len(box.sets[i])
        denominator = (len(box.sets[dim]) + z) * set_lengths
        if denominator != 0:
            dif = numerator/denominator
        else:
            dif = -np.inf
        return dif

    def FourClusterBox(self, initial_i, data_matrix, not_used_idx):
        box = Box(data_matrix, initial_i)
        ind = 0
        start = time.time()
        uniq_boxes = set()#avoid repeating boxes
        uniq_boxes.add(box.get_cur_hash())
        while(True):
            ind += 1
            max_dif = {'value': -np.inf, 'dim': -1, 'entity': -1, 'inside': -1}
            for dim in range(box.dim):
                for entity in range(box.data.shape[dim]):
                    if dim == 0 and entity not in not_used_idx:
                        continue
                    inside = entity in box.sets[dim]
                    if inside and len(box.sets[dim]) == 1:
                        continue
                    possible_dif = {'dim': dim, 'entity': entity, 'inside': inside}
                    cur_hash = box.try_update(possible_dif)
                    if cur_hash in uniq_boxes:#avoid box repeating
                        continue 
                    cur_dif = self.find_dif(box, dim, entity, inside)
                    if cur_dif > max_dif['value']:
                        max_dif['value'] = cur_dif
                        max_dif['dim'] = dim
                        max_dif['entity'] = entity
                        max_dif['inside'] = inside
            if max_dif['value'] <= 0:
                break
            box.update(max_dif)
            uniq_boxes.add(box.get_cur_hash())
        return box

class NonOverlappingCluster:
    def __init__(self, data_matrix_size, clusters_num_borders={'min':4, 'max':5}):
        initial_sets = self.generate_sets(data_matrix_size)
        K = np.random.randint(clusters_num_borders['min'], clusters_num_borders['max'])
        self.tricluster_sets = self.generate_triclusters(initial_sets, K)
        self.data_matrix = self.generate_data_matrix(data_matrix_size)
        
    def generate_sets(self, size_sets):
        sets = []
        for size_set in size_sets:
            cur_set = [i for i in range(size_set)]
            sets.append(cur_set)
        return sets
    
    def generate_triclusters(self, initial_sets, K):
        tricluster_sets = []
        for k in range(K):
            tricluster_set = []
            for initial_set in initial_sets:
                max_tricluster_size = int(29/K)
                tricluster_size = np.random.randint(4, max_tricluster_size - 1)
                tricluster_elements = [initial_set.pop(random.randrange(len(initial_set))) for _ in range(tricluster_size)]
                tricluster_set.append(tricluster_elements)
            tricluster_sets.append(tricluster_set)
        return tricluster_sets
    
    def generate_data_matrix(self, data_matrix_size):
        data_matrix = np.zeros(data_matrix_size)
        for tricluster_set in self.tricluster_sets:
            elements = list(itertools.product(*tricluster_set))
            for element in elements:
                data_matrix[element] = 1
        return data_matrix

In [92]:
%%time
start_time = time.time()
t=29
data_matrix_size = [t, t+1, t+2, t+3]
data = NonOverlappingCluster(data_matrix_size)

CPU times: user 2.08 ms, sys: 1.75 ms, total: 3.83 ms
Wall time: 2.13 ms


In [93]:
data.data_matrix.sum()/data.data_matrix.size * 100

0.20856507230255839

In [94]:
len(data.tricluster_sets)

4

In [95]:
%%time
df_0 = make_experiment(0, data.data_matrix, data.tricluster_sets)



uniform_noise experiment  0  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  1  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  2  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  3  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  4  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  5  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  6  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  7  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  8  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  9  mean : 1.0  std:  0.0 number of clusters 4
CPU times: user 5.16 s, sys: 104 ms, total: 5.26 s
Wall time: 5.28 s


In [96]:
%%time
df_05 = make_experiment(0.05, data.data_matrix, data.tricluster_sets)



uniform_noise experiment  0  mean : 0.4953665614356404  std:  0.21190634933070474 number of clusters 5
uniform_noise experiment  1  mean : 0.47659025297713214  std:  0.19826543935674099 number of clusters 6
uniform_noise experiment  2  mean : 0.5976073087804623  std:  0.2646751977833349 number of clusters 7
uniform_noise experiment  3  mean : 0.5549695662127727  std:  0.23967021445039996 number of clusters 8
uniform_noise experiment  4  mean : 0.5024833058453414  std:  0.23217714493635 number of clusters 11
uniform_noise experiment  5  mean : 0.5008378811590886  std:  0.2570309222191543 number of clusters 6
uniform_noise experiment  6  mean : 0.6937389272243811  std:  0.2911280114819827 number of clusters 5
uniform_noise experiment  7  mean : 0.5628584184256453  std:  0.31041305970278454 number of clusters 7
uniform_noise experiment  8  mean : 0.5407640897114582  std:  0.26356624719014987 number of clusters 5
uniform_noise experiment  9  mean : 0.5747959760060324  std:  0.2853817078200

In [97]:
%%time
df_10 = make_experiment(0.1, data.data_matrix, data.tricluster_sets)



uniform_noise experiment  0  mean : 0.42927101434212506  std:  0.22249103697060704 number of clusters 10
uniform_noise experiment  1  mean : 0.4930363650797087  std:  0.2806873657367821 number of clusters 10
uniform_noise experiment  2  mean : 0.5425485926223377  std:  0.28052684443998643 number of clusters 8
uniform_noise experiment  3  mean : 0.6139364956508918  std:  0.29792879153733204 number of clusters 6
uniform_noise experiment  4  mean : 0.6049937042124541  std:  0.32474543957489765 number of clusters 6
uniform_noise experiment  5  mean : 0.6320400085091246  std:  0.28937776277307914 number of clusters 6
uniform_noise experiment  6  mean : 0.5264313850332325  std:  0.28016157863111973 number of clusters 9
uniform_noise experiment  7  mean : 0.5515509652536791  std:  0.2880965432885452 number of clusters 8
uniform_noise experiment  8  mean : 0.5128059904468447  std:  0.27485559151171324 number of clusters 10
uniform_noise experiment  9  mean : 0.619215146887003  std:  0.31331298

In [98]:
%%time
df_30 = make_experiment(0.3, data.data_matrix, data.tricluster_sets)



uniform_noise experiment  0  mean : 0.44990906298062766  std:  0.21811435853192251 number of clusters 13
uniform_noise experiment  1  mean : 0.4723853506815968  std:  0.2338956669854041 number of clusters 15
uniform_noise experiment  2  mean : 0.5365629825648208  std:  0.24706132030559932 number of clusters 8
uniform_noise experiment  3  mean : 0.41552922338526055  std:  0.11471620062577458 number of clusters 15
uniform_noise experiment  4  mean : 0.43028697679872746  std:  0.16296195681760586 number of clusters 11
uniform_noise experiment  5  mean : 0.40156636946787866  std:  0.10654076736372776 number of clusters 15
uniform_noise experiment  6  mean : 0.45827061209146236  std:  0.19717294423090675 number of clusters 13
uniform_noise experiment  7  mean : 0.477775589505689  std:  0.23645098770924697 number of clusters 13
uniform_noise experiment  8  mean : 0.5229103226943961  std:  0.3001557032106092 number of clusters 11
uniform_noise experiment  9  mean : 0.4012117738328592  std:  0

In [99]:
np.save('series1_matrix.npy', data.data_matrix)
df_0.to_excel('series1_prob00.xlsx', index=False)
df_05.to_excel('series1_prob05.xlsx', index=False)
df_10.to_excel('series1_prob10.xlsx', index=False)
df_30.to_excel('series1_prob30.xlsx', index=False)

In [37]:
make_experiment(0.1, data_matrix, tetraclusters, )

[[0], slice(None, None, None), slice(None, None, None), slice(None, None, None)]
[[1], slice(None, None, None), slice(None, None, None), slice(None, None, None)]




[[2], slice(None, None, None), slice(None, None, None), slice(None, None, None)]
[[6], slice(None, None, None), slice(None, None, None), slice(None, None, None)]
[{19, 12, 5}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}]
aaaaaaa
[{13, 14, 15, 16, 17, 18}, {9, 10, 11, 12, 13, 14, 15, 16, 17}, {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}, {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}]
aaaaaaa
[{0, 1, 2, 3, 4}, {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}]
aaaaaaa
[{6, 7, 8, 9, 10, 11}, {3, 4, 5, 6, 7, 8, 9, 10, 11}, {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}]
aaaaaaa
uniform_noise experiment  0  mean : 0.815624999

In [76]:
class OverlappingCluster:
    def __init__(self, num_clusters, size=[29, 30, 31, 32]):
        '''
            create size matrix with clusters not overlapping by the first coordinate (wlog) and with minimum sparsity
        '''
        tetraclusters = [[[] for j in range(len(size))] for i in range(num_clusters)]
        self.generate_non_overlapping(size[0], num_clusters, tetraclusters, 0)
        for i in range(1, len(size)):
            self.generate_overlapping(size[i], num_clusters, tetraclusters, i)
        self.tetraclusters = tetraclusters
        self.data_matrix = self.generate_data_matrix(size)
    
    def generate_non_overlapping(self, size, num_clusters, tetraclusters, dim):
        cluster_size = int(size/num_clusters) - 1
        initial_set = [i for i in range(size)]
        for cur_cluster in range(num_clusters):
            cluster_elements = [initial_set.pop(random.randrange(len(initial_set))) for _ in range(cluster_size)]
            tetraclusters[cur_cluster][dim] = cluster_elements

    def generate_overlapping(self, size, num_clusters, tetraclusters, dim):
        initial_list = [i for i in range(size)]
        for cur_cluster in range(num_clusters):
            min_cluster_size = 2
            max_cluster_size = int(size/2) - num_clusters
            cluster_size = np.random.randint(min_cluster_size, max_cluster_size)
            cluster_elements = random.sample(initial_list, cluster_size)
            tetraclusters[cur_cluster][dim] = cluster_elements
            
    def generate_data_matrix(self, data_matrix_size):
        data_matrix = np.zeros(data_matrix_size)
        for tetracluster in self.tetraclusters:
            elements = list(itertools.product(*tetracluster))
            for element in elements:
                data_matrix[element] = 1
        return data_matrix

In [100]:
cluster = OverlappingCluster(4)

In [101]:
cluster.tetraclusters

[[[5, 18, 10, 19, 3, 4],
  [0, 1, 6, 14, 13, 18, 2, 10, 17],
  [3, 23, 0, 2, 9, 18, 25],
  [10, 24, 6, 9, 4, 0, 17, 29]],
 [[21, 27, 11, 1, 22, 9],
  [0, 10],
  [17, 14, 23, 16, 11, 27, 30, 1, 13, 0],
  [18, 26, 8, 27, 20, 14, 22]],
 [[25, 0, 20, 15, 16, 7],
  [2, 29, 4, 26, 5, 10, 1, 22, 28],
  [12, 27, 19, 22, 4, 0, 28, 7],
  [27, 1, 14]],
 [[2, 12, 24, 13, 28, 6], [21, 26, 12], [6, 3], [21, 4, 13, 29, 0, 11, 9]]]

In [57]:
len(cluster.tetraclusters)

4

In [102]:
cluster.data_matrix.sum()/cluster.data_matrix.size * 100

0.6270856507230256

In [103]:
%%time
df_0 = make_experiment(0, cluster.data_matrix, cluster.tetraclusters)



uniform_noise experiment  0  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  1  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  2  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  3  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  4  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  5  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  6  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  7  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  8  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  9  mean : 1.0  std:  0.0 number of clusters 4
CPU times: user 5.74 s, sys: 108 ms, total: 5.84 s
Wall time: 5.89 s


In [104]:
%%time
df_5 = make_experiment(0.05, cluster.data_matrix, cluster.tetraclusters)



uniform_noise experiment  0  mean : 0.6752739731629674  std:  0.2313671891955107 number of clusters 3
uniform_noise experiment  1  mean : 0.6032815380765131  std:  0.2319009733414801 number of clusters 4
uniform_noise experiment  2  mean : 0.6622664835164834  std:  0.2821281969265921 number of clusters 5
uniform_noise experiment  3  mean : 0.6974859775641026  std:  0.3041047005432564 number of clusters 4
uniform_noise experiment  4  mean : 0.7176679867349238  std:  0.284769040819147 number of clusters 4
uniform_noise experiment  5  mean : 0.7103473681139122  std:  0.24052388659316723 number of clusters 4
uniform_noise experiment  6  mean : 0.8237165555877524  std:  0.24930243790964426 number of clusters 3
uniform_noise experiment  7  mean : 0.6319078144078144  std:  0.21567796410959117 number of clusters 4
uniform_noise experiment  8  mean : 0.8222201273671862  std:  0.2514187069942867 number of clusters 3
uniform_noise experiment  9  mean : 0.6702297276337392  std:  0.2743404619937764

In [63]:
import pandas as pd

In [105]:
%%time
df_10 = make_experiment(0.1, cluster.data_matrix, cluster.tetraclusters)



uniform_noise experiment  0  mean : 0.7177886530518004  std:  0.24321753171422078 number of clusters 4
uniform_noise experiment  1  mean : 0.5485028437348947  std:  0.2658686502978247 number of clusters 4
uniform_noise experiment  2  mean : 0.6715741329426508  std:  0.2963359614816881 number of clusters 4
uniform_noise experiment  3  mean : 0.7599023701079622  std:  0.21921169816847313 number of clusters 4
uniform_noise experiment  4  mean : 0.6202336530759452  std:  0.312843766408164 number of clusters 5
uniform_noise experiment  5  mean : 0.6039468667583993  std:  0.29274681384218376 number of clusters 5
uniform_noise experiment  6  mean : 0.7467082289055973  std:  0.27793332582007535 number of clusters 5
uniform_noise experiment  7  mean : 0.7262241872119969  std:  0.2632159727284131 number of clusters 5
uniform_noise experiment  8  mean : 0.6699712975195683  std:  0.2734071987171584 number of clusters 5
uniform_noise experiment  9  mean : 0.642106338836602  std:  0.2624636734325676

In [106]:
%%time
df_30 = make_experiment(0.3, cluster.data_matrix, cluster.tetraclusters)



uniform_noise experiment  0  mean : 0.5203932427927617  std:  0.30502896406238356 number of clusters 7
uniform_noise experiment  1  mean : 0.48260228369554997  std:  0.2324629678197856 number of clusters 6
uniform_noise experiment  2  mean : 0.5918264353841463  std:  0.29570596049627157 number of clusters 8
uniform_noise experiment  3  mean : 0.7538438644688645  std:  0.29307802963977836 number of clusters 5
uniform_noise experiment  4  mean : 0.52360702293738  std:  0.274917492643149 number of clusters 7
uniform_noise experiment  5  mean : 0.6246898883504584  std:  0.29624297238134045 number of clusters 7
uniform_noise experiment  6  mean : 0.44291461724281483  std:  0.21304325787158543 number of clusters 8
uniform_noise experiment  7  mean : 0.5076516643305858  std:  0.26032282708420895 number of clusters 9
uniform_noise experiment  8  mean : 0.6740578885539722  std:  0.29262568479930534 number of clusters 6
uniform_noise experiment  9  mean : 0.6643564205653022  std:  0.310619736121

In [107]:
np.save('series2_matrix.npy', cluster.data_matrix)
df_0.to_excel('series2_prob00.xlsx', index=False)
df_5.to_excel('series2_prob05.xlsx', index=False)
df_10.to_excel('series2_prob10.xlsx', index=False)
df_30.to_excel('series2_prob30.xlsx', index=False)

In [4]:
import numpy as np
import random
import copy
import itertools
import time
from sklearn.metrics import jaccard_score
from scipy.stats import hmean

In [5]:
def add_noise(data_matrix, distribution, prob):
    if distribution == 'uniform':# prob = probablitiy to change value in datamatrix
        np.random.seed(random.randint(1, 1e9))
        noise = np.random.uniform(0, 1, data_matrix.shape)
        data_matrix[(noise <= prob) & (data_matrix == 1)] = 0
        data_matrix[(noise <= prob) & (data_matrix == 0)] = 1
    if distribution == 'normal':# prob = deviation in normal distribution
        noise = np.random.normal(0, prob, data_matrix.shape)
        data_matrix = data_matrix + noise
    return data_matrix

In [6]:
class BoxResult:
    def __init__(self, cluster_predict):
        self.score = -np.inf
        self.predict = cluster_predict
        self.test = np.nan
        self.test_id = -np.inf
        
    def update(self, cluster_test, cluster_test_id):
        #cur_score = self.jaccard(cluster_test)
        cur_score = self.kluch(cluster_test)
        if cur_score > self.score:
            self.score = cur_score
            self.test = cluster_test
            self.test_id = cluster_test_id
    
    def jaccard(self, cluster_test):
        dimension_score = []
        for dim in range(len(self.predict)):
            s1 = self.predict[dim]
            s2 = set(cluster_test[dim])
            dim_score = float(len(s1.intersection(s2)) / len(s1.union(s2)))
            dimension_score.append(dim_score)
        score = np.mean(dimension_score)
        return score
    
    def kluch(self, cluster_test):
        dimension_score = []
        for dim in range(len(self.predict)):
            s1 = self.predict[dim]
            s2 = set(cluster_test[dim])
            intersect = len(s1.intersection(s2))
            dim_score = (float(intersect / len(s1)) + float(intersect / len(s2)))/2
            dimension_score.append(dim_score)
        score = np.mean(dimension_score)
        return score
    
def check_similarity_2(clusters_test, clusters_predict):#находим для каждого изначального кластера макимально похожий на него
    box_results = []
    for i, cluster_test in enumerate(clusters_test):
        for cluster_predict in clusters_predict:
            box_result = BoxResult(cluster_predict.sets)
            box_result.update(cluster_test, i)
        box_results.append(box_result)
    return box_results
        
def check_similarity(clusters_test, clusters_predict):
    box_results = []
    for cluster_predict in clusters_predict:
        box_result = BoxResult(cluster_predict.sets)
        for i, cluster_test in enumerate(clusters_test):
            box_result.update(cluster_test, i)
        box_results.append(box_result)
    return box_results

In [108]:
dim_size = {
    0: {1:[1, 4], 2:[6, 10], 3:[12, 16], 4:[17, 20]},
    1: {1:[1, 6], 2:[3, 9], 3:[6, 12], 4:[10, 16]},
    2: {1:[1, 9], 2:[3, 12], 3:[6, 11], 4:[10, 15]},
    3: {1:[2, 10], 2:[4, 12], 3:[7, 12], 4:[12, 17]}
}# 4 clusters
dim_size_3 = {
    0: {1:[1, 5], 2:[7, 12], 3:[14, 19]},
    1: {1:[1, 8], 2:[4, 12], 3:[10, 18]},
    2: {1:[1, 11], 2:[3, 14], 3:[6, 18]},
    3: {1:[2, 14], 2:[3, 16], 3:[5, 20]}
}#3 clusters
matrix_size = [20, 20, 20, 20]
def create_fixed_data_matrix(matrix_size, dim_size, num_clusters):
    tetraclusters = [[[] for j in range(0, 4)] for i in range(0, num_clusters)]
    for i in range(4): #all dims
        for j in range(1, num_clusters+1):#all clusters
            for u in range(dim_size[i][j][0], dim_size[i][j][1] + 1):
                tetraclusters[j-1][i].append(u-1)
    data_matrix = np.zeros(matrix_size)
    for tetracluster in tetraclusters:
        elements = list(itertools.product(*tetracluster))
        for element in elements:
            data_matrix[element] = 1
    return data_matrix, tetraclusters
data_matrix, tetraclusters = create_fixed_data_matrix(matrix_size, dim_size, 4)

In [61]:
def make_experiment(prob, data_matrix, tetraclusters):
    number_exeperiments = 10
    data_matrix_uniform_noise_prev = 0
    df = pd.DataFrame(columns=['number_experiment', 'mean', 'std','number of clusters'])
    for i in range(number_exeperiments):
        start_time = time.time()
        data_matrix_uniform_noise = add_noise(copy.deepcopy(data_matrix), 'uniform', prob)
        clusters_uniform_noise = Clusterization().get_clusters(data_matrix_uniform_noise, 1)
        uniform_noise_results = check_similarity(tetraclusters, clusters_uniform_noise)
        uniform_noise_results = np.array([uniform_noise_result.score for uniform_noise_result in uniform_noise_results])
        data_matrix_uniform_noise_prev = data_matrix_uniform_noise
        df.loc[i, :] = [i+1, uniform_noise_results.mean(), uniform_noise_results.std(), uniform_noise_results.shape[0]]
        print('uniform_noise experiment ', i,  ' mean :', uniform_noise_results.mean(), ' std: ', uniform_noise_results.std(), 
             'number of clusters', uniform_noise_results.shape[0])
    return df


In [111]:
data_matrix.sum()/data_matrix.size * 100

4.601249999999999

In [112]:
%%time
df_0 = make_experiment(0, data_matrix, tetraclusters)



uniform_noise experiment  0  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  1  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  2  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  3  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  4  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  5  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  6  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  7  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  8  mean : 1.0  std:  0.0 number of clusters 4
uniform_noise experiment  9  mean : 1.0  std:  0.0 number of clusters 4
CPU times: user 2.86 s, sys: 61.1 ms, total: 2.92 s
Wall time: 2.95 s


In [113]:
%%time
df_05 = make_experiment(0.05, data_matrix, tetraclusters)



uniform_noise experiment  0  mean : 0.7657986111111111  std:  0.2342385791244292 number of clusters 4
uniform_noise experiment  1  mean : 0.736498538011696  std:  0.21630766988131814 number of clusters 5
uniform_noise experiment  2  mean : 0.7657986111111111  std:  0.2342385791244292 number of clusters 4
uniform_noise experiment  3  mean : 0.762375992063492  std:  0.23779492236412222 number of clusters 4
uniform_noise experiment  4  mean : 0.74  std:  0.2128893491934249 number of clusters 5
uniform_noise experiment  5  mean : 0.7690876831501832  std:  0.23091281067270836 number of clusters 4
uniform_noise experiment  6  mean : 0.74  std:  0.2128893491934249 number of clusters 5
uniform_noise experiment  7  mean : 0.74  std:  0.2128893491934249 number of clusters 5
uniform_noise experiment  8  mean : 0.7657986111111111  std:  0.2342385791244292 number of clusters 4
uniform_noise experiment  9  mean : 0.7816964285714285  std:  0.21866844325137447 number of clusters 4
CPU times: user 27.8

In [114]:
%%time
df_10 = make_experiment(0.1, data_matrix, tetraclusters)



uniform_noise experiment  0  mean : 0.8434095860566448  std:  0.22145228713630982 number of clusters 3
uniform_noise experiment  1  mean : 0.7690876831501832  std:  0.23091281067270836 number of clusters 4
uniform_noise experiment  2  mean : 0.7631983604845447  std:  0.23702023572380262 number of clusters 4
uniform_noise experiment  3  mean : 0.7822720864661654  std:  0.21814239634206142 number of clusters 4
uniform_noise experiment  4  mean : 0.7855769230769232  std:  0.21454188886649236 number of clusters 4
uniform_noise experiment  5  mean : 0.740657894736842  std:  0.21226671649959164 number of clusters 5
uniform_noise experiment  6  mean : 0.762797619047619  std:  0.2377843021779295 number of clusters 4
uniform_noise experiment  7  mean : 0.7657986111111111  std:  0.2342385791244292 number of clusters 4
uniform_noise experiment  8  mean : 0.7801339285714286  std:  0.22011271517569198 number of clusters 4
uniform_noise experiment  9  mean : 0.7830827067669173  std:  0.2172523979218

In [115]:
%%time
df_30 = make_experiment(0.3, data_matrix, tetraclusters)



uniform_noise experiment  0  mean : 0.770704334365325  std:  0.2298784221614642 number of clusters 4
uniform_noise experiment  1  mean : 0.8272546897546897  std:  0.24429876058526567 number of clusters 3
uniform_noise experiment  2  mean : 0.8419241433947316  std:  0.22355302029491514 number of clusters 3
uniform_noise experiment  3  mean : 0.6623309566895093  std:  0.2783325143664657 number of clusters 5
uniform_noise experiment  4  mean : 0.7349358974358975  std:  0.2659188463427079 number of clusters 4
uniform_noise experiment  5  mean : 0.7475816462948816  std:  0.2524206899046249 number of clusters 4
uniform_noise experiment  6  mean : 0.7833508403361344  std:  0.21697808306763344 number of clusters 4
uniform_noise experiment  7  mean : 0.832033326687932  std:  0.2375407474246176 number of clusters 3
uniform_noise experiment  8  mean : 0.7689764776524646  std:  0.23119463345143573 number of clusters 4
uniform_noise experiment  9  mean : 0.7592209690893902  std:  0.2414163253588211

In [120]:
%%time
df_60 = make_experiment(0.5, data_matrix, tetraclusters)



uniform_noise experiment  0  mean : 0.6073186733901019  std:  0.24934906396058543 number of clusters 7
uniform_noise experiment  1  mean : 0.6781638071895425  std:  0.26770879077412707 number of clusters 5
uniform_noise experiment  2  mean : 0.7512310172466422  std:  0.24935265176581123 number of clusters 4
uniform_noise experiment  3  mean : 0.8361365758424583  std:  0.23173787682049068 number of clusters 3
uniform_noise experiment  4  mean : 0.8401829481792717  std:  0.22594978245535333 number of clusters 4
uniform_noise experiment  5  mean : 0.6691849816849818  std:  0.2617142921003691 number of clusters 7
uniform_noise experiment  6  mean : 0.7164314273689274  std:  0.2622974415826793 number of clusters 6
uniform_noise experiment  7  mean : 0.5773003082351822  std:  0.2679557692967851 number of clusters 7
uniform_noise experiment  8  mean : 0.6474465859576154  std:  0.25987111038901783 number of clusters 6
uniform_noise experiment  9  mean : 0.847806686777275  std:  0.2152338476620

In [121]:
np.save('series3_matrix.npy', data_matrix)
df_0.to_excel('series3_prob00.xlsx', index=False)
df_5.to_excel('series3_prob05.xlsx', index=False)
df_10.to_excel('series3_prob10.xlsx', index=False)
df_30.to_excel('series3_prob30.xlsx', index=False)
df_60.to_excel('series3_prob50.xlsx', index=False)