In [1]:
import random
import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
%matplotlib inline
%config InlineBackend.figure_format='retina'
import tensorflow_federated as tff
from collections import OrderedDict
import linecache
tf.executing_eagerly()
import nest_asyncio
nest_asyncio.apply()
from os import listdir
from os.path import isfile, join
from sklearn.cluster import KMeans
from fcmeans import FCM
from sklearn.cluster import AgglomerativeClustering
from tslearn.clustering import TimeSeriesKMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import SpectralClustering

2023-04-12 04:36:05.494051: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
NUM_EPOCHS = 10
# should be 5
BATCH_SIZE = 40
# should be 40
SHUFFLE_BUFFER = 100
PREFETCH_BUFFER = 10
time_steps = 48
interval = 1000
future_steps = 12
split = 0.8

def preprocess(dataset):

    def batch_format_fn(x_d, y_d):
        return OrderedDict(
            x=x_d,
            y=y_d)

    return dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER, seed=1).batch(
      BATCH_SIZE).map(batch_format_fn).prefetch(PREFETCH_BUFFER)

def read_subset(file,lower,upper):
    data = []
    for i in range(lower,upper):
        line = linecache.getline(file, i)
        line = line.strip('\n')
        data.append(line)
    return data  

def check_data(data):
    if 'Null' in data:
        return False
    else:
        for i in range(len(data)):
            data[i]=float(data[i])
        return data
    
def create_dataset_fed(files, lower, upper, time_steps=1):
    Xs, ys = [], []
    for file in files:
        x_t, y_t=[], []
        data = read_subset(f'./ExperementData/{file}',lower,upper)
#         data = read_subset(f'./HomesClean/{file}',lower,upper)
        if data!=False:
            for i in range(len(data) - time_steps -1-future_steps):
                v = data[i:(i + time_steps)] 
                z = data[(i + time_steps):(i + time_steps+future_steps)]
                if check_nulls(z) and check_nulls(v):
                    x_t.append(conv_float(v))
                    y_t.append(conv_float(z))
            x_t = np.array(x_t)[:,:,np.newaxis]
            y_t = np.array(y_t)[:,:,np.newaxis]
            Xs.append(x_t)
            ys.append(y_t)
    Xs = np.array(Xs)
    ys = np.array(ys)
    #return [tf.data.Dataset.from_tensor_slices((Xs[x],  np.array(ys[x]))) for x in range(len(Xs))]
    return [ tf.data.Dataset.from_tensor_slices((Xs[x],  np.array(ys[x]))) for x in range(len(Xs))]

def make_federated_data(files, lower, upper):
    data = create_dataset_fed(files,lower, upper,time_steps)
    return [
      preprocess( x ) for x in data if x!=False
    ]

def create_keras_model():
    return tf.keras.models.Sequential([
      keras.layers.LSTM(64, input_shape=(time_steps, 1)),
      keras.layers.Dense(12),
    ])

def model_fn():
    # We _must_ create a new model here, and _not_ capture it from an external
    # scope. TFF will call this within different graph contexts.
    keras_model = create_keras_model()
    return tff.learning.from_keras_model(
      keras_model,
      input_spec=preprocessed_example_dataset.element_spec,
      loss=tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()])

def check_nulls(data):
    if not(all(is_float(ele) for ele in data)):
        return False
    else:
        return True
    
def conv_float(data):
    for i in range(len(data)):
        data[i]=float(data[i])
    return(data)
    
def is_float(element):
    try:
        float(element)
        return True
    except ValueError:
        return False

In [3]:
def initialize_fn():
    model = model_fn()
    return model.trainable_variables

def next_fn(server_weights, federated_dataset):
    # Broadcast the server weights to the clients.
    server_weights_at_client = broadcast(server_weights)

    # Each client computes their updated weights.
    client_weights = client_update(federated_dataset, server_weights_at_client)

    # The server averages these updates.
    mean_client_weights = mean(client_weights)

    # The server updates its model.
    server_weights = server_update(mean_client_weights)

    return server_weights

@tf.function
def client_update(model, dataset, server_weights, client_optimizer):
    """Performs training (using the server model weights) on the client's dataset."""
    # Initialize the client model with the current server weights.
    client_weights = model.trainable_variables
    # Assign the server weights to the client model.
    tf.nest.map_structure(lambda x, y: x.assign(y),
                        client_weights, server_weights)

    # Use the client_optimizer to update the local model.
    for batch in dataset:
        with tf.GradientTape() as tape:
          # Compute a forward pass on the batch of data
            outputs = model.forward_pass(batch)

        # Compute the corresponding gradient
        grads = tape.gradient(outputs.loss, client_weights)
        grads_and_vars = zip(grads, client_weights)

        # Apply the gradient using a client optimizer.
        client_optimizer.apply_gradients(grads_and_vars)

    return client_weights

@tf.function
def server_update(model, mean_client_weights):
    """Updates the server model weights as the average of the client model weights."""
    model_weights = model.trainable_variables
    # Assign the mean client weights to the server model.
    tf.nest.map_structure(lambda x, y: x.assign(y),
                        model_weights, mean_client_weights)
    return model_weights

example_dataset = create_dataset_fed(['1086.txt'],1, 500,time_steps)[0]
preprocessed_example_dataset = preprocess(example_dataset)

@tff.tf_computation
def server_init():
    model = model_fn()
    return model.trainable_variables

@tff.federated_computation
def initialize_fn():
  return tff.federated_value(server_init(), tff.SERVER)

whimsy_model = model_fn()
tf_dataset_type = tff.SequenceType(whimsy_model.input_spec)
model_weights_type = server_init.type_signature.result

@tff.tf_computation(tf_dataset_type, model_weights_type)
def client_update_fn(tf_dataset, server_weights):
  model = model_fn()
  client_optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
  return client_update(model, tf_dataset, server_weights, client_optimizer)

@tff.tf_computation(model_weights_type)
def server_update_fn(mean_client_weights):
  model = model_fn()
  return server_update(model, mean_client_weights)

federated_server_type = tff.FederatedType(model_weights_type, tff.SERVER)
federated_dataset_type = tff.FederatedType(tf_dataset_type, tff.CLIENTS)

@tff.federated_computation(federated_server_type, federated_dataset_type)
def next_fn(server_weights, federated_dataset):
  # Broadcast the server weights to the clients.
  server_weights_at_client = tff.federated_broadcast(server_weights)

  # Each client computes their updated weights.
  client_weights = tff.federated_map(
      client_update_fn, (federated_dataset, server_weights_at_client))
  
  # The server averages these updates.
  mean_client_weights = tff.federated_mean(client_weights)

  # The server updates its model.
  server_weights = tff.federated_map(server_update_fn, mean_client_weights)

  return (server_weights, client_weights)

In [6]:
federated_algorithm = tff.templates.IterativeProcess(
    initialize_fn=initialize_fn,
    next_fn=next_fn
)
state = federated_algorithm.initialize()
updated_server_state_phase_1 = state
onlyfiles = [f for f in listdir('./ExperementData/') if isfile(join('./ExperementData/', f)) and f[-4:]==".txt"]
# onlyfiles = [f for f in listdir('./HomesClean/') if isfile(join('./HomesClean/', f)) and f[-4:]==".txt"]
# epoch_info={}
for i in range(NUM_EPOCHS):
    selected_files = onlyfiles
    location = random.randint(1,int(25727*split)-interval)
    print(location)
    print(i)
    federated_train_data = make_federated_data(selected_files, location, location+interval)
    result = federated_algorithm.next(state, federated_train_data)
    updated_server_state_phase_1 = result[0]
    updated_client_weights = result[1]
#     epoch_info[i]=(location, updated_server_state_phase_1, updated_client_weights)
# epoch_info
# example 1086.txt file???
    

7277
0
16630
1
2999
2
13888
3
18589
4
12312
5
5419
6
5674
7
2179
8
12370
9


In [6]:
# def is_file_ok(PATH, f, N=False):
#     with open(PATH + f,'r') as reader:
#         temp = []
#         nulls = []
#         for l, line in enumerate(reader):
#             if line.strip() == 'Null':
#                 temp.append('Null')
#                 # print(f, l, line)
#                 nulls.append(l)
        
#         if N:
#             return nulls

#         if len(nulls) < 10:
#             return True
#         else:
#             return False
        
#         #print(f, len(nulls))
#         #if len(nulls) < 51:
#          #   print(nulls)

In [11]:
# import os, random
# from os import listdir
# from os.path import isfile, join

# source = './HomesClean/'
# dest = './ExperementData/'


In [12]:
# test = []
# randomDict = {}

# only_files = [f for f in listdir(source) if isfile(join(source, f))and f[-4:]==".txt"]
# for f in only_files:
#     nulls = is_file_ok(source, f, True)
#     N = len(nulls)
#     test.append((N, f))

#     if N not in randomDict:
#         randomDict[N] = 0
#     randomDict[N] += 1

# test.sort()

In [13]:
# print(len(test))
# for t in test:
#     print(t)

3198
(2, '1000.txt')
(2, '1001.txt')
(2, '1002.txt')
(2, '1003.txt')
(2, '1004.txt')
(2, '1005.txt')
(2, '1006.txt')
(2, '1009.txt')
(2, '1013.txt')
(2, '1014.txt')
(2, '1015.txt')
(2, '1016.txt')
(2, '1018.txt')
(2, '1019.txt')
(2, '1020.txt')
(2, '1021.txt')
(2, '1022.txt')
(2, '1024.txt')
(2, '1026.txt')
(2, '1027.txt')
(2, '1028.txt')
(2, '1030.txt')
(2, '1031.txt')
(2, '1032.txt')
(2, '1033.txt')
(2, '1035.txt')
(2, '1036.txt')
(2, '1037.txt')
(2, '1039.txt')
(2, '1041.txt')
(2, '1042.txt')
(2, '1044.txt')
(2, '1045.txt')
(2, '1047.txt')
(2, '1049.txt')
(2, '1050.txt')
(2, '1052.txt')
(2, '1053.txt')
(2, '1054.txt')
(2, '1055.txt')
(2, '1056.txt')
(2, '1057.txt')
(2, '1058.txt')
(2, '1059.txt')
(2, '1060.txt')
(2, '1061.txt')
(2, '1062.txt')
(2, '1063.txt')
(2, '1064.txt')
(2, '1065.txt')
(2, '1067.txt')
(2, '1069.txt')
(2, '1071.txt')
(2, '1072.txt')
(2, '1073.txt')
(2, '1074.txt')
(2, '1075.txt')
(2, '1076.txt')
(2, '1077.txt')
(2, '1079.txt')
(2, '1081.txt')
(2, '1082.txt')
(2,

(2, '2943.txt')
(2, '2944.txt')
(2, '2945.txt')
(2, '2950.txt')
(2, '2951.txt')
(2, '2952.txt')
(2, '2953.txt')
(2, '2954.txt')
(2, '2955.txt')
(2, '2956.txt')
(2, '2957.txt')
(2, '2960.txt')
(2, '2963.txt')
(2, '2964.txt')
(2, '2965.txt')
(2, '2967.txt')
(2, '2968.txt')
(2, '2969.txt')
(2, '2970.txt')
(2, '2971.txt')
(2, '2973.txt')
(2, '2978.txt')
(2, '2979.txt')
(2, '2980.txt')
(2, '2983.txt')
(2, '2984.txt')
(2, '2985.txt')
(2, '2987.txt')
(2, '2988.txt')
(2, '2989.txt')
(2, '2991.txt')
(2, '2992.txt')
(2, '2993.txt')
(2, '2994.txt')
(2, '2995.txt')
(2, '2996.txt')
(2, '3000.txt')
(2, '3002.txt')
(2, '3003.txt')
(2, '3004.txt')
(2, '3008.txt')
(2, '3009.txt')
(2, '3010.txt')
(2, '3011.txt')
(2, '3012.txt')
(2, '3013.txt')
(2, '3018.txt')
(2, '3019.txt')
(2, '3020.txt')
(2, '3021.txt')
(2, '3022.txt')
(2, '3023.txt')
(2, '3024.txt')
(2, '3025.txt')
(2, '3028.txt')
(2, '3029.txt')
(2, '3030.txt')
(2, '3031.txt')
(2, '3032.txt')
(2, '3034.txt')
(2, '3036.txt')
(2, '3040.txt')
(2, '304

(98, '1796.txt')
(98, '1838.txt')
(98, '1864.txt')
(98, '1869.txt')
(98, '1944.txt')
(98, '1945.txt')
(98, '2026.txt')
(98, '2097.txt')
(98, '2107.txt')
(98, '2251.txt')
(98, '2386.txt')
(98, '2472.txt')
(98, '2515.txt')
(98, '2702.txt')
(98, '2745.txt')
(98, '2862.txt')
(98, '2879.txt')
(98, '2908.txt')
(98, '3027.txt')
(98, '3093.txt')
(98, '3204.txt')
(98, '3257.txt')
(98, '3276.txt')
(98, '3339.txt')
(98, '3410.txt')
(98, '3433.txt')
(98, '3467.txt')
(98, '3557.txt')
(98, '3578.txt')
(98, '3630.txt')
(98, '3712.txt')
(98, '3730.txt')
(98, '3890.txt')
(98, '3955.txt')
(98, '3968.txt')
(98, '3984.txt')
(98, '4001.txt')
(98, '4054.txt')
(98, '4073.txt')
(98, '4132.txt')
(98, '4137.txt')
(146, '2100.txt')
(146, '2728.txt')
(146, '2764.txt')
(146, '2998.txt')
(146, '4007.txt')
(194, '2982.txt')
(338, '2139.txt')
(385, '1897.txt')
(385, '2212.txt')
(385, '2439.txt')
(385, '3907.txt')
(529, '1508.txt')
(529, '3944.txt')
(577, '2697.txt')
(626, '3350.txt')
(721, '2645.txt')
(770, '1125.txt

In [28]:
# tikva = []
# for key, value in randomDict.items():
#     # print(key, value)
#     # how many nulls in how many files
#     tikva.append((key, value))

# tikva.sort()
# for t in tikva:
#     print(t)

In [29]:
# no_of_files = 99
# added_files = 0

# only_files = [f for f in listdir(source) if isfile(join(source, f))and f[-4:]==".txt"]
# only_files.remove("1086.txt")
# random_file = "1086.txt"
# source_file = source + '/' + random_file
# dest_file = dest+'/'+random_file
# destination_for_file = open(dest_file,'wb+')
# source_for_file = open(source_file,'rb')
# destination_for_file.write(source_for_file.read())
# destination_for_file.close()

# while added_files < no_of_files:
#     random_file = random.choice(only_files)
#     only_files.remove(random_file)

#     if not is_file_ok(source, random_file):
#         continue
    
#     added_files += 1
#     source_file = source + '/' + random_file
#     dest_file = dest + '/' + random_file
#     destination_for_file = open(dest_file,'wb+')
#     source_for_file = open(source_file,'rb')
#     destination_for_file.write(source_for_file.read())
#     destination_for_file.close()


In [14]:
# print(epoch_info[0][2])    
number_of_clusters = 3
flats = [client_weight[0].reshape(-1) for client_weight in updated_client_weights]

In [15]:
def print_clusters(clustered_weights_indexes, clustering_method):
    clusters = [[] for i in range(number_of_clusters)]
    for i in range (len(clustered_weights_indexes)):
        clusters[clustered_weights_indexes[i]].append(onlyfiles[i])
    print(clustering_method, clustered_weights_indexes, "\n")
    lengths = {}
    b=0
    for i in clusters:
        lengths[b]=len(i)
        b+=1
    print("Length of clusters: ", lengths, "\n")
    print("Clusters according to " + clustering_method + "\n")
    print(clusters)

In [16]:
# K-MEANS
kmeans = KMeans(n_clusters=number_of_clusters)
kmeans.fit(flats)
clustered_weights_indexes_kmeans = kmeans.predict(flats)
print_clusters(clustered_weights_indexes_kmeans, "K-means")

# clusters_kmeans = [[] for i in range(number_of_clusters)]
# for i in range (len(clustered_weights_indexes_kmeans)):
#     clusters_kmeans[clustered_weights_indexes_kmeans[i]].append(onlyfiles[i])
# print("K-means clustering: ", clustered_weights_indexes_kmeans, "\n")
# lengths_kmeans = {}
# n=0
# for i in clusters_kmeans:
#     lengths_kmeans[n]=len(i)
#     n+=1
# print("Length of clusters: ", lengths_kmeans, "\n")
# print("Clusters according to k-means: ")
# clusters_kmeans

K-means [0 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 2 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0] 

Length of clusters:  {0: 93, 1: 1, 2: 6} 

Clusters according to K-means

[['3081.txt', '4076.txt', '2407.txt', '3296.txt', '1318.txt', '3447.txt', '4049.txt', '1904.txt', '1086.txt', '2202.txt', '4129.txt', '2574.txt', '4111.txt', '1727.txt', '1055.txt', '3871.txt', '2776.txt', '3497.txt', '3050.txt', '3910.txt', '2239.txt', '3133.txt', '1081.txt', '3046.txt', '1839.txt', '3195.txt', '3585.txt', '3036.txt', '2315.txt', '2301.txt', '2922.txt', '3427.txt', '2067.txt', '1180.txt', '1143.txt', '1619.txt', '2304.txt', '3346.txt', '1343.txt', '2474.txt', '1627.txt', '3781.txt', '2893.txt', '2065.txt', '2529.txt', '2501.txt', '1950.txt', '2685.txt', '2121.txt', '2647.txt', '3405.txt', '3820.txt', '2519.txt', '3349.txt', '1827.txt', '3599.txt', '2081.txt', '2536.txt', '2522.txt', '1

In [17]:
# FUZZY_C_MEANS (need to pip install fuzzy c means)
fuzzy_c_means = FCM(n_clusters=number_of_clusters) 
flats_fcm = np.array(flats)
fuzzy_c_means.fit(flats_fcm)
clustered_weights_indexes_fuzzycmeans = fuzzy_c_means.predict(flats_fcm)  
print_clusters(clustered_weights_indexes_fuzzycmeans, "Fuzzy C-means")

# clusters_fuzzycmeans = [[] for i in range(number_of_clusters)]
# for i in range (len(clustered_weights_indexes_fuzzycmeans)):
#     clusters_fuzzycmeans[clustered_weights_indexes_fuzzycmeans[i]].append(onlyfiles[i])
# print("Fuzzy c-means clustering: ", clustered_weights_indexes_fuzzycmeans, "\n")
# lengths_fuzzycmeans = {}
# m=0
# for i in clusters_fuzzycmeans:
#     lengths_fuzzycmeans[m]=len(i)
#     m+=1
# print("Length of clusters: ", lengths_fuzzycmeans, "\n")
# print("Clusters according to fuzzy c-means: ")
# clusters_fuzzycmeans


Fuzzy C-means [1 1 0 0 1 2 0 2 0 0 0 0 0 1 1 1 0 1 1 0 1 0 0 1 0 1 2 2 0 2 1 1 0 0 0 1 0
 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0
 0 1 0 0 1 1 0 0 1 0 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1 0] 

Length of clusters:  {0: 58, 1: 37, 2: 5} 

Clusters according to Fuzzy C-means

[['2407.txt', '3296.txt', '3447.txt', '4049.txt', '1904.txt', '1086.txt', '2202.txt', '4129.txt', '1055.txt', '3497.txt', '3910.txt', '2239.txt', '1081.txt', '1839.txt', '3036.txt', '2315.txt', '2301.txt', '3427.txt', '2067.txt', '1180.txt', '1619.txt', '2304.txt', '3346.txt', '3781.txt', '2893.txt', '2065.txt', '2501.txt', '1950.txt', '2685.txt', '2121.txt', '2519.txt', '3349.txt', '1827.txt', '2081.txt', '2522.txt', '1610.txt', '3359.txt', '1980.txt', '3617.txt', '1404.txt', '1809.txt', '1834.txt', '2235.txt', '1113.txt', '3843.txt', '2593.txt', '1515.txt', '2424.txt', '4055.txt', '1312.txt', '3931.txt', '1115.txt', '3884.txt', '1063.txt', '2595.txt', '2436.txt', '1117.txt', '2153.txt'], 

In [18]:
# Hierarchical Clustering
hierarchical_cluster = AgglomerativeClustering(n_clusters=number_of_clusters, affinity='euclidean', linkage='ward')
clustered_weights_indexes_hierarchical = hierarchical_cluster.fit_predict(flats)
print_clusters(clustered_weights_indexes_hierarchical, "Heirarchical clustering")



# clusters_hierarchical = [[] for i in range(number_of_clusters)]
# for i in range (len(clustered_weights_indexes_hierarchical)):
#     clusters_hierarchical[clustered_weights_indexes_hierarchical[i]].append(onlyfiles[i])
# print("Hierarchical clustering: ", clustered_weights_indexes_hierarchical, "\n")
# lengths_hierarchical = {}
# l=0
# for i in clusters_hierarchical:
#     lengths_hierarchical[l]=len(i)
#     l+=1
# print("Length of clusters: ", lengths_hierarchical, "\n")
# print("Clusters according to hierarchical: ")
# clusters_hierarchical


Heirarchical clustering [0 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 2 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 2 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0] 

Length of clusters:  {0: 93, 1: 1, 2: 6} 

Clusters according to Heirarchical clustering

[['3081.txt', '4076.txt', '2407.txt', '3296.txt', '1318.txt', '3447.txt', '4049.txt', '1904.txt', '1086.txt', '2202.txt', '4129.txt', '2574.txt', '4111.txt', '1727.txt', '1055.txt', '3871.txt', '2776.txt', '3497.txt', '3050.txt', '3910.txt', '2239.txt', '3133.txt', '1081.txt', '3046.txt', '1839.txt', '3195.txt', '3585.txt', '3036.txt', '2315.txt', '2301.txt', '2922.txt', '3427.txt', '2067.txt', '1180.txt', '1143.txt', '1619.txt', '2304.txt', '3346.txt', '1343.txt', '2474.txt', '1627.txt', '3781.txt', '2893.txt', '2065.txt', '2529.txt', '2501.txt', '1950.txt', '2685.txt', '2121.txt', '2647.txt', '3405.txt', '3820.txt', '2519.txt', '3349.txt', '1827.txt', '3599.txt', '2081.

In [19]:
# Time Series K means
model = TimeSeriesKMeans(n_clusters=4, metric="dtw", max_iter=10)
clustered_weights_indexes_kmeans_timeseries = model.fit_predict(flats)
print_clusters(clustered_weights_indexes_kmeans_timeseries, "K-means with Time Series")


# clusters_kmeans_timeseries = [[] for i in range(number_of_clusters)]
# for i in range (len(clustered_weights_indexes_kmeans_timeseries)):
#     clusters_kmeans_timeseries[clustered_weights_indexes_kmeans_timeseries[i]].append(onlyfiles[i])
# print("K-means clustering with time series: ", clustered_weights_indexes_kmeans_timeseries, "\n")
# lengths_kmeans_timeseries = {}
# a=0
# for i in clusters_kmeans_timeseries:
#     lengths_kmeans_timeseries[a]=len(i)
#     a+=1
# print("Length of clusters: ", lengths_kmeans_timeseries, "\n")
# print("Clusters according to k-means with time series: ")
# clusters_kmeans_timeseries

IndexError: list index out of range

In [20]:
# Mini Batch K-Means (batch size 6)

minibatchkmeans = MiniBatchKMeans(n_clusters=number_of_clusters, random_state=0, batch_size=6)
# have a reason as to why it is batch_size=6
clustered_weights_indexes_minibatchkmeans = minibatchkmeans.fit_predict(flats)
print_clusters(clustered_weights_indexes_minibatchkmeans, "Mini Batch K-means")

# clusters_minibatchkmeans = [[] for i in range(number_of_clusters)]
# for i in range (len(clustered_weights_indexes_minibatchkmeans)):
#     clusters_minibatchkmeans[clustered_weights_indexes_minibatchkmeans[i]].append(onlyfiles[i])
# print("Mini Batch K-means clustering: ", clustered_weights_indexes_minibatchkmeans, "\n")
# lengths_minibatchkmeans = {}
# b=0
# for i in clusters_minibatchkmeans:
#     lengths_minibatchkmeans[b]=len(i)
#     b+=1
# print("Length of clusters: ", lengths_minibatchkmeans, "\n")
# print("Clusters according to mini batch k-means: ")
# clusters_minibatchkmeans

Mini Batch K-means [0 0 2 2 1 1 2 1 2 2 2 0 0 1 0 1 0 0 0 2 0 2 2 0 2 0 1 1 2 1 1 0 0 0 2 1 0
 2 0 1 0 2 2 0 1 1 2 2 0 0 2 2 2 2 1 1 0 2 2 2 0 2 1 2 2 2 0 0 0 2 0 1 0 2
 2 1 2 2 1 1 2 2 0 2 2 0 2 0 1 0 1 2 2 0 2 2 2 0 0 2] 

Length of clusters:  {0: 34, 1: 22, 2: 44} 

Clusters according to Mini Batch K-means

[['3081.txt', '4076.txt', '2202.txt', '4129.txt', '4111.txt', '1055.txt', '3871.txt', '2776.txt', '3050.txt', '3133.txt', '3046.txt', '3585.txt', '3036.txt', '2315.txt', '3427.txt', '1180.txt', '1619.txt', '1343.txt', '2065.txt', '2529.txt', '3820.txt', '3599.txt', '3167.txt', '1980.txt', '3617.txt', '2680.txt', '1809.txt', '2791.txt', '2828.txt', '2964.txt', '3931.txt', '1711.txt', '1117.txt', '3073.txt'], ['1318.txt', '1330.txt', '3041.txt', '2574.txt', '1727.txt', '1447.txt', '3330.txt', '4163.txt', '3195.txt', '2922.txt', '1143.txt', '2474.txt', '1627.txt', '2647.txt', '3405.txt', '2536.txt', '3777.txt', '1660.txt', '1059.txt', '4121.txt', '2838.txt', '1840.txt'], ['2407.txt'

In [None]:
# Spectral Clustering
clustered_weights_indexes_spectral = SpectralClustering(n_clusters=number_of_clusters, assign_labels='discretize', random_state=0).fit_predict(flats)
print_clusters(clustered_weights_indexes_spectral, "Spectral")