In [1]:
import sys
sys.path.insert(0, '..')
import numpy as np
import os
import yaml
import sequenceanalyzer as sa
import partition as pt
import partitionset as ps
import sequence_generator as sg
import dmarkov as dm
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as k
import save_plot as sp
import eigenvectorcalcs as eig
import moore
from scipy.special import lambertw
from scipy.stats.mstats import gmean
# sys.path.insert(0, 'Code')

# path = 'logistic_map'
# L = 10000000

# Base directory of the active dataset; the sequence file handled below lives
# at f'{path}/sequences/lm_{L}.yaml'.
path = '10dbq1'
# Tag embedded in the sequence file name (presumably the sequence length -- TODO confirm).
L = 50000000

## Machine initialization

#### Generate sequence

In [2]:
def logistic_map(x0 = 0.5, r = 3.75, n = 10000000):
    """Generate a ternary symbol sequence from the logistic map x -> r*x*(1-x).

    Each visited state x (starting with ``x0`` itself) is quantised into one
    symbol: '0' if x <= 0.67, '1' if 0.67 < x <= 0.79, '2' otherwise.

    Parameters
    ----------
    x0 : float
        Initial state of the map.
    r : float
        Growth parameter of the map.
    n : int
        Number of symbols to emit. Defaults to 10000000, the length that was
        previously hard-coded.

    Returns
    -------
    str
        String of length ``n`` over the alphabet {'0', '1', '2'}.
    """
    symbols = []
    x = x0
    for _ in range(n):
        # Classify the current state before advancing, so the first symbol
        # always describes x0.
        if x <= 0.67:
            symbols.append('0')
        elif x <= 0.79:
            symbols.append('1')
        else:
            symbols.append('2')
        x = r*x*(1-x)
    # Only the current state is kept and the string is joined once, instead
    # of storing the whole trajectory and concatenating per symbol.
    return ''.join(symbols)

# One-off directory setup for the logistic-map experiment (kept commented out).
# os.makedirs('logistic_map/sequences')
# os.makedirs('logistic_map/machine')
# os.makedirs('logistic_map/probabilities/conditional')
# Generate the symbol sequence with the default initial state and growth rate.
s = logistic_map()

# Persist the sequence as YAML under the active dataset directory.
# NOTE(review): the file name carries L, but logistic_map() as called here
# yields 10000000 symbols -- confirm the name matches the content when
# L != 10000000.
with open(f'{path}/sequences/lm_{L}.yaml', 'w') as f:
    yaml.dump(s, f)

#### Load sequence

In [3]:
# Load the stored symbol sequence for the active dataset.
# safe_load is used because yaml.load() without an explicit Loader raises a
# TypeError on PyYAML >= 6 and can construct arbitrary objects on older
# versions; the file only holds a plain string.
with open(f'{path}/sequences/lm_{L}.yaml', 'r') as f:
    s = yaml.safe_load(f)

In [None]:
# Estimate subsequence probabilities (p) and the alphabet (a) from the
# sequence, then derive the conditional probabilities.
# NOTE(review): 10 is presumably the maximum subsequence length -- confirm
# against sequenceanalyzer.
p, a = sa.calc_probs(s, 10)
p_cond = sa.calc_cond_probs(p, a, 10)

# NOTE(review): these output paths are hard-coded to the logistic_map
# experiment and ignore `path` / `L` used for the sequence file -- confirm
# this is intended when working on another dataset.
with open('logistic_map/probabilities/lm_10000000.yaml', 'w') as f:
    yaml.dump(p, f)
with open('logistic_map/probabilities/conditional/lm_10000000.yaml', 'w') as f:
    yaml.dump(p_cond, f)

In [5]:
# Build the D-Markov machine from the conditional probabilities.
# NOTE(review): the second argument (4) is presumably the memory depth D --
# confirm against dmarkov.DMarkov.
m = dm.DMarkov(p_cond, 4, a, p)
# Number of states in the resulting machine.
len(m.states)

21

In [6]:
# Map each state name to its position in m.states.
idx = dict((s.name, m.states.index(s)) for s in m.states)
morphs = []
all_oedges = [state.outedges for state in m.states]

# Build one "morph" row per state: a vector with one slot per label in
# m.index_labels, filled from that state's outgoing edges.
for oedges in all_oedges:
    # Labels without an outgoing edge keep the value 0.
    curr_morph = [0] * len(m.index_labels)
    for oedge in oedges:
        # Edge tuples are printed in this notebook as (label, destination, value);
        # the first element selects the slot, the last element is stored in it.
        label = oedge[0]
        curr_morph[m.index_labels[label]] = oedge[-1]
    morphs.append(curr_morph)
    
morphs = np.array(morphs)
# Transposed view of the morph matrix: rows are labels, columns are states.
data = morphs.T
print(f'Data shape: {data.shape}')

In [9]:
import skfuzzy as fuzz

In [10]:
ncenters = 5

# Fuzzy c-means over the state morphs (fuzzifier m = 2).
# The sixth return value (iterations run) is bound to `n_iter` rather than
# `p`, so the sequence probabilities `p` that dm.DMarkov(..., p) needs are not
# overwritten in the kernel namespace.
# NOTE(review): no `seed` is passed, so the partition can differ between runs.
cntr, u, u0, d, jm, n_iter, fpc = fuzz.cluster.cmeans(data, ncenters, 2, error=0.005, maxiter=1000, init=None)

In [46]:
# Zero out negligible memberships (in place) and view them state-by-state.
u[u < 1e-03] = 0
w = u.T
closest_cluster = []
maxlen = 0

from itertools import product

# For every state keep the indices of all clusters whose membership exceeds
# 0.1; a state may therefore keep more than one candidate cluster.
for i in w:
    c = np.where(i > 0.1)[-1]
    # Track the largest number of candidate clusters seen for a single state
    # (the original line here was a bare `maxlen` expression with no effect).
    maxlen = max(maxlen, len(c))
    closest_cluster.append(np.array(c))

print(closest_cluster)

[array([1]), array([1]), array([0]), array([2, 4]), array([0]), array([1]), array([1]), array([0]), array([4]), array([0]), array([2]), array([1]), array([0]), array([3]), array([1]), array([0]), array([0]), array([1]), array([1]), array([0]), array([4])]


In [47]:
import copy
# Enumerate every hard assignment compatible with the per-state candidate
# clusters: `clusters` is a list of assignments, each a list holding one
# cluster index per processed state.
clusters = [[]]

for i in closest_cluster:
    if len(i) > 1:
        # Ambiguous state: fork every partial assignment once per candidate.
        n = len(i)  # unused below
        new_clusters = []
        # NOTE: `idx` rebinds the name used for the state-name -> index dict
        # built in an earlier cell.
        for idx in i:
            temp = copy.deepcopy(clusters)
            for j in range(len(temp)):
                temp[j].append(idx)
                new_clusters.append(temp[j])
#             print(new_clusters)
        clusters = copy.deepcopy(new_clusters)
#         print(new_clusters)
    elif len(i) == 1:
        # Unambiguous state: extend every partial assignment with its only candidate.
        idx = i[0]
        for j in range(len(clusters)):
            clusters[j].append(idx)
#             print(clusters)
    # States with no candidate (len(i) == 0) add nothing, which would shift
    # the positions of all later states within each assignment.

In [48]:
# Snapshot the enumerated assignments under their own name; `clusters` is
# reassigned by the partition-printing cell further down.
closest_clusters = copy.deepcopy(clusters)

In [49]:
# Display every enumerated assignment (one cluster index per state).
closest_clusters

[[1, 1, 0, 2, 0, 1, 1, 0, 4, 0, 2, 1, 0, 3, 1, 0, 0, 1, 1, 0, 4],
 [1, 1, 0, 4, 0, 1, 1, 0, 4, 0, 2, 1, 0, 3, 1, 0, 0, 1, 1, 0, 4]]

### TODO: generate a separate "cluster" array for each combination of the cluster memberships returned by c-means

In [52]:
def imprime_clusters(cluster):
    """Print a partition: one '<name>: <outedges>' line per state.

    ``cluster`` is an iterable of groups; each group is an iterable of
    objects exposing ``name`` and ``outedges``. Every group is followed by a
    blank line and the whole partition by an 'end of partition' footer.
    """
    for group in cluster:
        report = [f'{st.name}: {st.outedges}' for st in group]
        # The trailing empty entry yields the blank separator line.
        report.append('')
        print('\n'.join(report))
    print('end of partition\n\n')

# Build and print one partition of the machine states per enumerated
# assignment. The loop variable is not called `closest_cluster`, so the
# per-state candidate list of that name stays intact and the enumeration cell
# can be re-run afterwards.
for assignment in closest_clusters:
    # One bucket per possible index; unused buckets are dropped below.
    clusters = [[] for _ in assignment]

    for i in range(len(morphs)):
        cluster_index = assignment[i]
        clusters[cluster_index].append(m.states[i])
    # Fix empty clusters problem
    clusters = [c for c in clusters if c]
    imprime_clusters(clusters)


0202: [('0', '2020', 1.0)]
0102: [('0', '1020', 1.0)]
0112: [('0', '1120', 1.0)]
2002: [('0', '0020', 1.0)]
2012: [('0', '0120', 1.0)]
1202: [('0', '2020', 1.0)]
1102: [('0', '1020', 1.0)]
1112: [('0', '1120', 1.0)]

0020: [('2', '0202', 1.0)]
0200: [('2', '2002', 1.0)]
0120: [('2', '1202', 1.0)]
0110: [('2', '1102', 1.0)]
2010: [('2', '0102', 1.0)]
1020: [('2', '0202', 1.0)]
1120: [('2', '1202', 1.0)]
1110: [('2', '1102', 1.0)]

0201: [('0', '2010', 0.10968483560021094), ('1', '2011', 0.5061025023943421), ('2', '2012', 0.38421266200544696)]
2020: [('0', '0200', 0.2324569222273915), ('1', '0201', 0.37181498406459385), ('2', '0202', 0.39572809370801465)]

2011: [('0', '0110', 0.43114312055066095), ('1', '0111', 0.42728673787919746), ('2', '0112', 0.14157014157014158)]

0111: [('0', '1110', 0.18592943085034475), ('1', '1111', 0.5710747600378533), ('2', '1112', 0.24299580911180213)]
1111: [('0', '1110', 0.21032626553138323), ('1', '1111', 0.5716076376071427), ('2', '1112', 0.2180660968614

In [8]:
def dist(a, b):
    """Row-wise Euclidean distance between ``a`` and ``b``.

    Both arguments are converted to arrays and subtracted with NumPy
    broadcasting; the norm is taken along axis 1 of the difference, so at
    least one of them has to be two-dimensional.
    """
    delta = np.subtract(np.array(a), np.array(b))
    return np.linalg.norm(delta, axis=1)

# def dist(a, b):
#     eps = 1e-15
#     a = np.array(a, dtype='float64')
#     b = np.array(b, dtype='float64')
#     a[a == 0] = eps
#     b[b == 0] = eps
#     kl = (a*np.log(a/b) + b*np.log(b/a))/2
    
#     if a.shape != b.shape:
#         return np.sum(kl, axis = 0)
#     if a.shape == b.shape:
#         return np.sum(kl, axis = 1)

# def dist(vec1, vec2):
#     kl = [0, 0]
#     print("Calculating Kullback-Leibler divergence")
#     if len(vec1) and len(vec1) == len(vec2):
#         #Probabilities of subsequences of length K are stored in probabilities[K-1]
#         for i in range(len(vec1)):
#             p = vec1[i] or 1e-15
#             q = vec2[i] or 1e-15
#             # print(f'p={p}, q={q}')
#             kl[0] += p*np.log2(p/q)
#             kl[1] += p*np.log2(p/q)
#     else:
#         print ("[error] Probabilities not computed.")
#     print("*****************")
#     print("Kullback-Leibler divergence calculated!")
#     print("*****************")
#     return (kl[0]+kl[1])/2

def custom_kmeans(matrix, k, centroids, distance=None, verbose=True):
    """Cluster the rows of ``matrix`` with a basic k-means loop.

    Parameters
    ----------
    matrix : array-like, shape (n_samples, n_features)
        Samples to cluster.
    k : int
        Number of clusters; ``centroids`` is expected to hold ``k`` rows.
    centroids : array-like, shape (k, n_features)
        Initial centroids. They are copied, never modified in place.
    distance : callable, optional
        ``distance(a, b)`` returning one distance per row. Defaults to the
        module-level ``dist`` function.
    verbose : bool, optional
        When True (default) print the per-iteration diagnostics.

    Returns
    -------
    (nearest_clusters, current_centroids)
        ``nearest_clusters`` is a float array with the cluster index of each
        sample; ``current_centroids`` is the final (k, n_features) array.
    """
    if distance is None:
        distance = dist
    # Force a float copy: with integer initial centroids the cluster means
    # would otherwise be truncated when written back.
    current_centroids = np.array(centroids, dtype=float)
    matrix = np.array(matrix)
    previous_centroids = np.zeros(current_centroids.shape)

    # The first "error" is measured against all-zero centroids so the loop
    # starts whenever the initial centroids are not (almost) at the origin.
    error = distance(current_centroids, previous_centroids)
    nearest_clusters = np.zeros(len(matrix))
    if verbose:
        print('~~~ Starting K-Means ~~~')
    while sum(error) > 0.01:
        if verbose:
            print(f'Error = {sum(error)}')
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(len(matrix)):
            distances = distance(matrix[i], current_centroids)
            nearest_clusters[i] = np.argmin(distances)
            if verbose:
                print(f'Distance to centroids : {distances}\n Nearest_cluster: {nearest_clusters[i]}\n')

        # Snapshot once per iteration (it used to be re-copied for every sample).
        previous_centroids = current_centroids.copy()

        # Update step: move each centroid to the mean of its members.
        for i in range(k):
            members = matrix[nearest_clusters == i]
            # An empty cluster keeps its previous centroid; averaging no rows
            # would give NaN, which also silently ended the loop.
            if len(members):
                current_centroids[i] = np.mean(members, axis=0)
        if verbose:
            print(f'New centroids: {current_centroids}\n Old centroids: {previous_centroids}\n\n')

        error = distance(current_centroids, previous_centroids)
    return (nearest_clusters, current_centroids)

def get_center(features, target):
    """Return the mean feature vector of every label present in ``target``.

    Parameters
    ----------
    features : array-like, shape (n_samples, n_features)
    target : array-like, shape (n_samples,)
        Label of each sample.

    Returns
    -------
    numpy.ndarray
        One row per distinct label, ordered by ascending label value. For
        labels 0..n-1 this matches the previous output; labels with gaps no
        longer produce NaN rows or drop the highest labels.
    """
    features = np.asarray(features)
    target = np.asarray(target)
    # np.unique yields the labels that actually occur, in sorted order.
    return np.array([np.mean(features[target == label], 0) for label in np.unique(target)])

def get_initial_centroids(samples = [], K = 5):
    """Pick the first ``K`` distinct samples as initial centroids.

    Parameters
    ----------
    samples : iterable of array-like
        Candidate samples, scanned in order. Never modified.
    K : int
        Number of distinct samples required.

    Returns
    -------
    numpy.ndarray or list
        Array with the first ``K`` distinct samples, or an empty list when
        fewer than ``K`` distinct samples exist.
    """
    centroids = []
    seen = set()

    for sample in samples:
        # Compare by a hashable key: `sample not in centroids` raises
        # ValueError for NumPy rows because element-wise == has no single
        # truth value.
        key = tuple(np.ravel(sample).tolist())
        if key not in seen:
            seen.add(key)
            centroids.append(sample)
        if len(centroids) == K:
            return np.array(centroids)
    return []

In [12]:
# Inspect the outgoing edges of the first state of the current machine.
m.states[0].outedges

[('0', '11110', 0.2057681799149818),
 ('1', '11111', 0.5714032261498393),
 ('2', '11112', 0.22282859393517895)]