In [17]:
import pickle
import os
import itertools
import copy
import numpy as np

In [18]:
def unknown_configuration(modules_per_size, max_number_modules):
    """Collect, per cascade size, the cascades that use a module never seen alone.

    Args:
        modules_per_size: sequence of dicts indexed by cascade size; each dict is
            keyed by a cascade description whose module names are joined by "-".
        max_number_modules: exclusive upper bound on the cascade sizes inspected.

    Returns:
        A list of ``max_number_modules`` lists. Entry ``size`` (for size >= 2)
        holds every cascade key of that size containing at least one module
        that is not a key of ``modules_per_size[1]``. Entries 0 and 1 stay empty.
    """
    # Modules that were observed on their own (cascades of size 1).
    known_modules = set(modules_per_size[1].keys())
    unknown_by_size = [[] for _ in range(max_number_modules)]

    for size in range(2, max_number_modules):
        unknown_by_size[size].extend(
            cascade
            for cascade in list(modules_per_size[size].keys())
            if any(name not in known_modules for name in cascade.split("-"))
        )
    return unknown_by_size

In [19]:
def equal_input(modules_per_size, max_number_modules):
    """Count how often each input vector is repeated, per cascade size.

    Args:
        modules_per_size: sequence of dicts indexed by cascade size; each dict maps
            a cascade key to a list of ``(input, output)`` samples.
        max_number_modules: exclusive upper bound on the cascade sizes inspected.

    Returns:
        A list of ``max_number_modules`` dicts. Entry ``size`` maps the input
        rendered as ``"v0_v1_..."`` to its number of *repetitions*: 0 for an
        input seen once, 1 for an input seen twice, and so on. Entry 0 stays
        empty.

    Note:
        Counts are pooled over every cascade of a given size; the cascade key is
        not part of the dictionary key.
    """
    equal_config = [{} for _ in range(max_number_modules)]
    for size in range(1, max_number_modules):
        repetitions = equal_config[size]
        for samples in modules_per_size[size].values():
            for inp, _ in samples:
                str_inp = '_'.join(str(value) for value in inp)
                # Explicit membership test instead of a bare ``except:``, which
                # would also hide unrelated errors (even KeyboardInterrupt).
                if str_inp in repetitions:
                    repetitions[str_inp] += 1
                else:
                    repetitions[str_inp] = 0  # start at 0 because we count only repetitions
    return equal_config

In [20]:
def equal_sample(modules_per_size, max_number_modules):
    """Count how often each full (input, output) sample is repeated, per cascade size.

    Args:
        modules_per_size: sequence of dicts indexed by cascade size; each dict maps
            a cascade key to a list of ``(input, output)`` samples.
        max_number_modules: exclusive upper bound on the cascade sizes inspected.

    Returns:
        A list of ``max_number_modules`` dicts. Entry ``size`` maps a sample
        rendered as ``"i0_i1_...|o0_o1_..."`` to its number of *repetitions*
        (0 for a sample seen once). Entry 0 stays empty.

    Note:
        Counts are pooled over every cascade of a given size.
    """
    equal_config = [{} for _ in range(max_number_modules)]
    for size in range(1, max_number_modules):
        repetitions = equal_config[size]
        for samples in modules_per_size[size].values():
            for inp, out in samples:
                str_inp = '_'.join(str(value) for value in inp)
                str_out = '_'.join(str(value) for value in out)
                # The input and output parts must be delimited: plain
                # concatenation makes ([12], [3]) and ([1], [23]) share a key
                # and inflates the repetition count.
                sample_key = str_inp + '|' + str_out
                if sample_key in repetitions:
                    repetitions[sample_key] += 1
                else:
                    repetitions[sample_key] = 0  # start at 0 because we count only repetitions
    return equal_config

In [21]:
def active_channel(inp):
    """Return a 0/1 mask with 1 wherever the corresponding value of ``inp`` is > 0."""
    mask = []
    for level in inp:
        if level > 0:
            mask.append(1)
        else:
            mask.append(0)
    return mask

In [22]:
def number_channel_permutation(modules_per_size, max_number_modules):
    """Count the occurrences of each active-channel pattern, per cascade size.

    Args:
        modules_per_size: sequence of dicts indexed by cascade size; each dict maps
            a cascade key to a list of ``(input, output)`` samples.
        max_number_modules: exclusive upper bound on the cascade sizes inspected.

    Returns:
        A list of ``max_number_modules`` dicts. Entry ``size`` maps the 0/1
        pattern produced by ``active_channel`` (rendered as ``"1_0_1..."``) to
        the number of samples showing that pattern. Entry 0 stays empty.
    """
    different_permutations = [{} for _ in range(max_number_modules)]
    for size in range(1, max_number_modules):
        occurrences = different_permutations[size]
        for samples in modules_per_size[size].values():
            for inp, _ in samples:
                pattern = '_'.join(str(flag) for flag in active_channel(inp))
                # dict.get replaces the bare ``except:``; the first sighting
                # counts as 1 because occurrences (not repetitions) are counted.
                occurrences[pattern] = occurrences.get(pattern, 0) + 1
    return different_permutations

In [23]:
def create_data(modules_per_size, extra_module, max_number_modules):
    """Augment ``modules_per_size`` by peeling the first module off longer cascades.

    For a cascade ``M1-M2-...-Mk`` with a sample ``(x, y)``: if ``M1`` alone was
    observed with the same input ``x`` and produced ``m1_out``, then
    ``(m1_out, y)`` is a valid sample for the shorter cascade ``M2-...-Mk``. It
    is appended to ``modules_per_size[k-1]``.

    Args:
        modules_per_size: list of dicts indexed by cascade size, mapping a cascade
            key (module names joined by "-") to a list of ``[input, output]``
            samples. Mutated in place with the newly created samples.
        extra_module: working copy with the same layout. Samples that have been
            used are removed from it, so repeated passes never create the same
            sample twice. Mutated in place.
        max_number_modules: exclusive upper bound on the cascade sizes processed.

    Returns:
        None.

    Fixes over the previous version:
        * The cascade key is split in order. ``list(set(...))`` scrambled the
          order and dropped repeated modules, so both the "first module" and the
          remaining-cascade key were arbitrary.
        * Used samples are really removed from ``extra_module``. The old filter
          compared each sample with the whole sample list and removed nothing.
        * Input lookup keys use a "_" separator so different vectors cannot
          collide (e.g. [1, 23] vs [12, 3]).
    """
    for j in range(2, max_number_modules):
        print("\r [.] Iteration number", j, "of", max_number_modules-1, end='')
        for i in range(2, max_number_modules):
            for test_metadata in list(extra_module[i].keys()):
                cascade_modules = test_metadata.split("-")
                test_module = cascade_modules[0]

                # Skip cascades whose first module was never observed on its own.
                seen_data = modules_per_size[1].get(test_module)
                if seen_data is None:
                    continue
                remaining_key = '-'.join(cascade_modules[1:])

                # Map each known input of the first module to its output. Only the
                # first output per input is kept: the module is assumed to be
                # deterministic, so later ones should be identical.
                first_output_by_input = {}
                for data in seen_data:
                    d_key = '_'.join(str(a) for a in data[0])
                    first_output_by_input.setdefault(d_key, data[1])

                unused_samples = []
                for t_data in extra_module[i][test_metadata]:
                    d_key = '_'.join(str(a) for a in t_data[0])
                    if d_key in first_output_by_input:
                        s_data = first_output_by_input[d_key]
                        modules_per_size[i-1].setdefault(remaining_key, []).append([s_data, t_data[1]])
                    else:
                        unused_samples.append(t_data)

                # Keep only the samples that were not used during this pass.
                extra_module[i][test_metadata] = unused_samples

In [24]:
def animate(t):
    """Redraw both panels for frame ``t``.

    Relies on notebook globals: ``fig``, ``ax1``, ``ax2``, ``module``,
    ``X_train`` and ``y_train``. The figure title is the current ``module``;
    the first axes show ``X_train[t]`` and the second axes show ``y_train[t]``.
    """
    fig.suptitle(module, fontsize=16)
    for axis, series in ((ax1, X_train), (ax2, y_train)):
        axis.cla()  # wipe the previous frame before drawing the new one
        axis.plot(series[t])

In [25]:
# Root folder; each dataset lives in a sub-folder named after the dataset.
data_folder = "data/"
# Dataset sub-folders to analyse; X.pkl and y.pkl are read from each one.
datasets = ['c1', 'c2', 'c3', 'c4']
# NOTE(review): not referenced by the visible cells — presumably meant for a quick
# single-dataset run; confirm before removing.
reduced_datasets = ['c1']
# Exclusive upper bound on cascade size: the analysis loops use
# range(1, max_number_modules), i.e. cascade sizes 1..8.
max_number_modules = 9

In [26]:
# Per-dataset report: load each dataset separately, group its samples by cascade,
# print statistics, augment the data with create_data and print them again.
for dataset in datasets:

    modules_per_size = [{} for _ in range(max_number_modules)]

    dataset_folder = os.path.join(data_folder, dataset)
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    # The context managers close the file handles (bare open() calls leaked them).
    with open(os.path.join(dataset_folder, "X.pkl"), "rb") as x_file:
        X = pickle.load(x_file)
    with open(os.path.join(dataset_folder, "y.pkl"), "rb") as y_file:
        y = pickle.load(y_file)

    # Group the samples by cascade size, then by cascade description.
    for x_, y_ in zip(X, y):
        sequence_size = len(x_[0])
        # One "<name>_<value0>_<value1>" token per module, joined with "-".
        dict_key = "-".join(
            module_spec[0] + "_" + str(module_spec[1][0]) + "_" + str(module_spec[1][1])
            for module_spec in x_[0]
        )
        # setdefault replaces the bare ``except:`` that created missing buckets.
        modules_per_size[sequence_size].setdefault(dict_key, []).append([x_[1], list(y_)])

    print(" [+] Dataset", dataset, " information")
    print(" [=] Number of different cascades of each size")
    for i in range(1, max_number_modules):
        different_cascades = len(modules_per_size[i])
        print(" [Cascade size " + str(i) + "] ", different_cascades, " possibilities")

    # A known cascade is composed of modules already seen alone in cascades
    # of size 1. Thus, an unknown cascade contains modules that weren't found
    # alone in cascades of size 1.
    never_seen_config = unknown_configuration(modules_per_size, max_number_modules)
    print(" [=] Number of unknown cascades of each size")
    for i in range(2, max_number_modules):
        never_seen = len(never_seen_config[i])
        print(" [Cascade size " + str(i) + "] ", never_seen, " unseen")

    print(" [=] Biggest cascade")
    # The largest size bucket follows max_number_modules (was hard-coded as 8).
    # NOTE: still raises IndexError if no cascade of that size exists.
    print(" [.]", list(modules_per_size[max_number_modules - 1].keys())[0].split('-'))

    print(" [=] Cascades of size 1")
    print(" [.]", list(modules_per_size[1].keys()))

    print(" [=] Number samples of each size of cascades")
    total_total = 0
    for i in range(1, max_number_modules):
        total_elem = sum(len(samples) for samples in modules_per_size[i].values())
        total_total += total_elem
        print(" [Cascade size " + str(i) + "] ", total_elem, " ocurrences")
    print(" [.] Total number of samples:", total_total)

    equal_config = equal_input(modules_per_size, max_number_modules)
    print(" [=] Number of equal input samples for the same cascade of each size of cascade")
    for i in range(1, max_number_modules):
        total_elem = sum(equal_config[i].values())
        number_keys = len(equal_config[i])
        print(" [Cascade size " + str(i) + "] ", total_elem, " repeated inputs for the same cascade. Total different inputs:", number_keys)

    channel_permutations = number_channel_permutation(modules_per_size, max_number_modules)
    print(" [=] Channel activation for different cascade size")
    for i in range(1, max_number_modules):
        total_elem = sum(channel_permutations[i].values())
        number_keys = len(channel_permutations[i])
        print(" [Cascade size " + str(i) + "] ", total_elem, " same active channels for the same cascade size. Different channel activation:", number_keys)

    # extra_module is used to repeat create_data without creating the same data
    extra_module = [copy.deepcopy(size_bucket) for size_bucket in modules_per_size]
    print(" [=] Data creation")
    create_data(modules_per_size, extra_module, max_number_modules)

    print("")
    print(" [=] Number samples of each size of cascades after data augmentation")
    total_total = 0
    for i in range(1, max_number_modules):
        total_elem = sum(len(samples) for samples in modules_per_size[i].values())
        total_total += total_elem
        print(" [Cascade size " + str(i) + "] ", total_elem, " ocurrences")
    print(" [.] Total number of samples:", total_total)

    equal_config = equal_sample(modules_per_size, max_number_modules)
    print(" [=] Number of equal data samples of each size of cascade after data augmentation")
    total_total = 0
    total_different = 0
    for i in range(1, max_number_modules):
        total_elem = sum(equal_config[i].values())
        number_keys = len(equal_config[i])
        total_total += total_elem
        total_different += number_keys
        print(" [Cascade size " + str(i) + "] ", total_elem, " repeated data samples for the same cascade. Total different inputs:", number_keys)
    print(" [.] Total number of repeated samples:", total_total)
    print(" [.] Total number of unique samples:", total_different)

 [+] Dataset c1  information
 [=] Number of different cascades of each size
 [Cascade size 1]  5  possibilities
 [Cascade size 2]  7  possibilities
 [Cascade size 3]  6  possibilities
 [Cascade size 4]  5  possibilities
 [Cascade size 5]  4  possibilities
 [Cascade size 6]  3  possibilities
 [Cascade size 7]  2  possibilities
 [Cascade size 8]  1  possibilities
 [=] Number of unknown cascades of each size
 [Cascade size 2]  0  unseen
 [Cascade size 3]  0  unseen
 [Cascade size 4]  0  unseen
 [Cascade size 5]  0  unseen
 [Cascade size 6]  0  unseen
 [Cascade size 7]  0  unseen
 [Cascade size 8]  0  unseen
 [=] Biggest cascade
 [.] ['EDFA_24.0_20.0', 'SMF_5.6_0', 'EDFA_24.0_20.0', 'SMF_5.2_0', 'EDFA_24.0_20.0', 'SMF_0_5.0', 'EDFA_24.0_20.0', 'SMF_0_5.4']
 [=] Cascades of size 1
 [.] ['EDFA_24.0_20.0', 'SMF_5.6_0', 'SMF_5.2_0', 'SMF_0_5.0', 'SMF_0_5.4']
 [=] Number samples of each size of cascades
 [Cascade size 1]  1728  ocurrences
 [Cascade size 2]  1512  ocurrences
 [Cascade size 3]  1

 [+] Dataset c3  information
 [=] Number of different cascades of each size
 [Cascade size 1]  4  possibilities
 [Cascade size 2]  6  possibilities
 [Cascade size 3]  6  possibilities
 [Cascade size 4]  5  possibilities
 [Cascade size 5]  4  possibilities
 [Cascade size 6]  3  possibilities
 [Cascade size 7]  2  possibilities
 [Cascade size 8]  1  possibilities
 [=] Number of unknown cascades of each size
 [Cascade size 2]  0  unseen
 [Cascade size 3]  0  unseen
 [Cascade size 4]  0  unseen
 [Cascade size 5]  0  unseen
 [Cascade size 6]  0  unseen
 [Cascade size 7]  0  unseen
 [Cascade size 8]  0  unseen
 [=] Biggest cascade
 [.] ['EDFA_20.0_10.0', 'SMF_1.6_0', 'EDFA_20.0_10.0', 'SMF_1.3_0', 'EDFA_20.0_10.0', 'SMF_0_1.4', 'EDFA_20.0_10.0', 'SMF_0_1.4']
 [=] Cascades of size 1
 [.] ['EDFA_20.0_10.0', 'SMF_1.6_0', 'SMF_1.3_0', 'SMF_0_1.4']
 [=] Number samples of each size of cascades
 [Cascade size 1]  5568  ocurrences
 [Cascade size 2]  4872  ocurrences
 [Cascade size 3]  4176  ocurrenc

In [27]:
    
# Merged report: pool the samples of every dataset into one modules_per_size,
# print statistics, augment the data with create_data and print them again.
modules_per_size = [{} for _ in range(max_number_modules)]

for dataset in datasets:
    dataset_folder = os.path.join(data_folder, dataset)
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    # The context managers close the file handles (bare open() calls leaked them).
    with open(os.path.join(dataset_folder, "X.pkl"), "rb") as x_file:
        X = pickle.load(x_file)
    with open(os.path.join(dataset_folder, "y.pkl"), "rb") as y_file:
        y = pickle.load(y_file)

    # Group the samples by cascade size, then by cascade description.
    for x_, y_ in zip(X, y):
        sequence_size = len(x_[0])
        # One "<name>_<value0>_<value1>" token per module, joined with "-".
        dict_key = "-".join(
            module_spec[0] + "_" + str(module_spec[1][0]) + "_" + str(module_spec[1][1])
            for module_spec in x_[0]
        )
        # setdefault replaces the bare ``except:`` that created missing buckets.
        modules_per_size[sequence_size].setdefault(dict_key, []).append([x_[1], list(y_)])

print(" [+] Datasets", datasets, " information")
print(" [=] Number of different cascades of each size")
for i in range(1, max_number_modules):
    different_cascades = len(modules_per_size[i])
    print(" [Cascade size " + str(i) + "] ", different_cascades, " possibilities")


# A known cascade is composed of modules already seen alone in cascades
# of size 1. Thus, an unknown cascade contains modules that weren't found
# alone in cascades of size 1.
never_seen_config = unknown_configuration(modules_per_size, max_number_modules)
print(" [=] Number of unknown cascades of each size")
for i in range(2, max_number_modules):
    never_seen = len(never_seen_config[i])
    print(" [Cascade size " + str(i) + "] ", never_seen, " unseen")

print(" [=] Biggest cascade")
# The largest size bucket follows max_number_modules (was hard-coded as 8).
# NOTE: still raises IndexError if no cascade of that size exists.
print(" [.]", list(modules_per_size[max_number_modules - 1].keys())[0].split('-'))

print(" [=] Cascades of size 1")
print(" [.]", list(modules_per_size[1].keys()))

print(" [=] Number samples of each size of cascades")
total_total = 0
for i in range(1, max_number_modules):
    total_elem = sum(len(samples) for samples in modules_per_size[i].values())
    total_total += total_elem
    print(" [Cascade size " + str(i) + "] ", total_elem, " ocurrences")
print(" [.] Total number of samples:", total_total)

equal_config = equal_input(modules_per_size, max_number_modules)
print(" [=] Number of equal input samples for the same cascade of each size of cascade")
for i in range(1, max_number_modules):
    total_elem = sum(equal_config[i].values())
    number_keys = len(equal_config[i])
    print(" [Cascade size " + str(i) + "] ", total_elem, " repeated inputs for the same cascade. Total different inputs:", number_keys)

channel_permutations = number_channel_permutation(modules_per_size, max_number_modules)
print(" [=] Channel activation for different cascade size")
for i in range(1, max_number_modules):
    total_elem = sum(channel_permutations[i].values())
    number_keys = len(channel_permutations[i])
    print(" [Cascade size " + str(i) + "] ", total_elem, " same active channels for the same cascade size. Different channel activation:", number_keys)


# extra_module is used to repeat create_data without creating the same data
extra_module = [copy.deepcopy(size_bucket) for size_bucket in modules_per_size]
print(" [=] Data creation")
create_data(modules_per_size, extra_module, max_number_modules)

print("")
print(" [=] Number samples of each size of cascades after data augmentation")
total_total = 0
for i in range(1, max_number_modules):
    total_elem = sum(len(samples) for samples in modules_per_size[i].values())
    total_total += total_elem
    print(" [Cascade size " + str(i) + "] ", total_elem, " ocurrences")
print(" [.] Total number of samples:", total_total)

equal_config = equal_sample(modules_per_size, max_number_modules)
print(" [=] Number of equal data samples of each size of cascade after data augmentation")
total_total = 0
total_different = 0
for i in range(1, max_number_modules):
    total_elem = sum(equal_config[i].values())
    number_keys = len(equal_config[i])
    total_total += total_elem
    total_different += number_keys
    print(" [Cascade size " + str(i) + "] ", total_elem, " repeated data samples for the same cascade. Total different inputs:", number_keys)
print(" [.] Total number of repeated samples:", total_total)
print(" [.] Total number of unique samples:", total_different)

 [+] Datasets ['c1', 'c2', 'c3', 'c4']  information
 [=] Number of different cascades of each size
 [Cascade size 1]  13  possibilities
 [Cascade size 2]  25  possibilities
 [Cascade size 3]  24  possibilities
 [Cascade size 4]  20  possibilities
 [Cascade size 5]  16  possibilities
 [Cascade size 6]  12  possibilities
 [Cascade size 7]  8  possibilities
 [Cascade size 8]  4  possibilities
 [=] Number of unknown cascades of each size
 [Cascade size 2]  0  unseen
 [Cascade size 3]  0  unseen
 [Cascade size 4]  0  unseen
 [Cascade size 5]  0  unseen
 [Cascade size 6]  0  unseen
 [Cascade size 7]  0  unseen
 [Cascade size 8]  0  unseen
 [=] Biggest cascade
 [.] ['EDFA_24.0_20.0', 'SMF_5.6_0', 'EDFA_24.0_20.0', 'SMF_5.2_0', 'EDFA_24.0_20.0', 'SMF_0_5.0', 'EDFA_24.0_20.0', 'SMF_0_5.4']
 [=] Cascades of size 1
 [.] ['EDFA_24.0_20.0', 'SMF_5.6_0', 'SMF_5.2_0', 'SMF_0_5.0', 'SMF_0_5.4', 'EDFA_20.0_0.0', 'SMF_1.6_0', 'SMF_1.3_0', 'SMF_0_1.4', 'EDFA_20.0_10.0', 'EDFA_20.0_17.0', 'EDFA_20.0_18.0'

In [28]:
import matplotlib.animation
import matplotlib.pyplot as plt
import numpy as np
# Render animations as interactive JavaScript/HTML players in the notebook output.
plt.rcParams["animation.html"] = "jshtml"
plt.rcParams['figure.dpi'] = 100  
# Interactive mode off: figures are only drawn when explicitly displayed.
plt.ioff()
plt.style.use('ggplot')

In [29]:
# Show the keys of the size-1 bucket, i.e. the single-module cascades available.
modules_per_size[1].keys()

dict_keys(['EDFA_24.0_20.0', 'SMF_5.6_0', 'SMF_5.2_0', 'SMF_0_5.0', 'SMF_0_5.4', 'EDFA_20.0_0.0', 'SMF_1.6_0', 'SMF_1.3_0', 'SMF_0_1.4', 'EDFA_20.0_10.0', 'EDFA_20.0_17.0', 'EDFA_20.0_18.0', 'EDFA_20.0_20.0'])

In [30]:
# Cascade to visualise: `module` must be a key of modules_per_size[module_size].
#module = 'EDFA_20.0_0.0'
module = 'SMF_1.6_0'
module_size = 1

In [31]:
# Samples of the selected cascade as an array; along the second axis, index 0 is
# the input and index 1 is the output of each sample.
# NOTE(review): assumes inputs and outputs have equal length so the samples stack
# into a regular array — confirm for every cascade.
data = np.asarray(modules_per_size[module_size][module])

X_train = data[:, 0]
y_train = data[:, 1]

# Sanity check that the slicing kept input/output aligned with the raw samples.
assert (data[0][0] == X_train[0]).all()
assert (data[0][1] == y_train[0]).all()

print(" [.] Current module in analysis:", module, ". Module size:", module_size)

plt.close('all')
fig, (ax1, ax2) = plt.subplots(1,2)
# animate() indexes X_train[t] / y_train[t], so never request more frames than
# there are samples (a fixed 100 raised IndexError for smaller cascades).
# The animation is the cell's last expression so the notebook displays it.
matplotlib.animation.FuncAnimation(fig, animate, frames=min(100, len(X_train)))

 [.] Current module in analysis: SMF_1.6_0 . Module size: 1
