In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import math
from scipy.stats import kruskal
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
from itertools import combinations
from collections import Counter


from scipy import linalg
from scipy.sparse.csgraph import laplacian
from sklearn.cluster import KMeans
from sklearn.manifold import spectral_embedding
from sklearn.metrics import silhouette_score

# Clustering functions

In [None]:
# Shared, mutable clustering configuration. `cluster` reads the cluster count
# from here and `train` overwrites it on every iteration of its k sweep.
model_settings = dict(ml=dict(model=dict(n_clusters=2)))

In [None]:
def cluster(adjacency, n_clusters=None, normed=True):
    """Spectral clustering of sequences from their adjacency matrix.

    The adjacency matrix is embedded with ``spectral_embedding`` and the
    embedded points are grouped with k-means. The Laplacian eigenvalues are
    computed separately and only returned for inspection; they are not used
    to build the embedding.

    Args:
        adjacency: adjacency matrix of the sequences to cluster.
        n_clusters: number of clusters; also used as the number of embedding
            components. When None (default), the value is read from
            ``model_settings['ml']['model']['n_clusters']``.
        normed: whether the Laplacian used for the returned eigenvalues is
            normalised. Defaults to True. It does not affect the embedding.

    Returns:
        labels: k-means cluster label of each row of ``adjacency``.
        details: dict with
            'model': the fitted ``KMeans`` object,
            'projection': the spectral embedding the k-means was fitted on,
            'eigenvalues': real parts of the Laplacian eigenvalues, ascending.
    """
    if n_clusters is None:
        # Backward-compatible path: callers such as `train` set the global
        # configuration before calling cluster(adjacency).
        n_clusters = model_settings['ml']['model']['n_clusters']

    graph_laplacian = laplacian(adjacency, normed=normed)
    eigenvals, _ = linalg.eig(graph_laplacian)
    # eig returns complex values; keep the real parts, sorted ascending.
    eigenvals_sorted = np.sort(np.real(eigenvals))

    # Create embedding. One seeded RandomState is shared by the embedding and
    # the k-means below, so repeated calls give the same result.
    random_state = np.random.RandomState(193)
    proj_X = spectral_embedding(adjacency, n_components=n_clusters,
                                random_state=random_state,
                                drop_first=False)

    # Cluster the points using k-means clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    kmeans.fit(proj_X)
    labels = kmeans.labels_

    details = {
        'model': kmeans,
        'projection': proj_X,
        'eigenvalues': eigenvals_sorted
    }

    return labels, details

In [None]:
def train(sequences):
    """Run `cluster` for every k in the sweep and collect labels and scores.

    For each k from 2 to 19 (the upper bound 20 of 'k-range' is exclusive)
    the global ``model_settings`` cluster count is overwritten, ``cluster`` is
    called on ``sequences`` and a silhouette score is computed and printed.
    No "best" k is selected here; all results are returned.

    NOTE(review): ``sorted_session_codes`` is read from the notebook's global
    namespace, so it must be defined before this function is called.
    NOTE(review): ``sequences`` is handed to ``cluster`` as an adjacency
    matrix and to ``silhouette_score`` as ``X`` with the default metric --
    confirm that this is the intended scoring.

    Args:
        sequences: already formatted sequences (passed to ``cluster`` as the
            adjacency matrix).

    Returns:
        dict with the keys 'sequences', 'lids' and 'k-range', plus one entry
        per tested k holding its 'labels', 'details' and 'scores'.
    """
    k_bounds = [
        2,
        20
    ]
    results = {
        'sequences': sequences,
        'lids': sorted_session_codes,  # A list of students unique ids
        'k-range': k_bounds
    }

    lowest_k, stop_k = results['k-range']
    for k in range(lowest_k, stop_k):
        # `cluster` picks the cluster count up from the shared settings.
        model_settings['ml']['model']['n_clusters'] = k
        labels, details = cluster(sequences)
        scores = silhouette_score(sequences, labels)

        results[k] = dict(labels=labels, details=details, scores=scores)
        print('    scores for {}: {}'.format(k, scores))

    return results

# Analysis functions

In [None]:
# Post-test table; the functions in this notebook read its columns 'bll_prior',
# 'session_code_sim1' and 'bll_weighed (12)'.
# NOTE(review): '///.xlsx' looks like a redacted placeholder path -- point it
# at the real workbook before running.
df_fin = pd.read_excel('///.xlsx') # A table containing students post-test scores

In [None]:
def sigmoid(x, L, k, x0):
    """Logistic curve rescaled to run from 1 up to L.

    Args:
        x: input value (scalar or numpy array).
        L: value approached as x grows (for k > 0).
        k: steepness of the transition.
        x0: midpoint, where the result equals 1 + (L - 1) / 2.

    Returns:
        1 + (L - 1) / (1 + exp(-k * (x - x0))).
    """
    decay = np.exp(-k * (x - x0))
    return 1 + (L - 1) / (1 + decay)

# Module-level parameter values for `sigmoid`; they are not bound to the
# function automatically and have to be passed in explicitly.
L = 2  # Upper asymptote: sigmoid(x) approaches L for large x (the factor approaches 2)
k = 1  # Steepness of the curve; larger values give a sharper transition
x0 = 3  # Midpoint of the curve: sigmoid(x0) == 1 + (L - 1) / 2

In [None]:
def plot_two_histograms(data1, data2, xlabel, bins, range_hist):
    """Show two stacked percentage histograms, one per cluster.

    Each observation is weighted by 100 / len(data) so the bar heights of a
    panel are percentages of that cluster. Both panels share their axes.

    Args:
        data1: values of cluster 1 (green, top panel).
        data2: values of cluster 2 (orange, bottom panel).
        xlabel: label of the shared x axis.
        bins: forwarded to ``Axes.hist``.
        range_hist: forwarded to ``Axes.hist`` as ``range``.
    """
    _fig, axs = plt.subplots(2, 1, figsize=(6, 5), sharex=True, sharey=True)  # 2 rows, 1 column

    samples = (data1, data2)
    # Per-observation weights, computed for every sample before drawing.
    weights = [np.ones_like(sample) / len(sample) * 100 for sample in samples]
    colours = ('green', 'orange')
    titles = ('Cluster 1', 'Cluster 2')

    for ax, sample, weight, colour, title in zip(axs, samples, weights, colours, titles):
        ax.hist(sample, bins=bins, range=range_hist, weights=weight, color=colour, alpha=0.7, edgecolor='black', linewidth=1)
        ax.set_title(title)
        ax.set_ylabel('Percentage of students')

    # Only the bottom panel carries the x label.
    axs[1].set_xlabel(xlabel)

    plt.tight_layout()  # Adjust layout for better spacing
    plt.show()

In [None]:
def plot_three_histograms(data1, data2, data3, xlabel, bins, range_hist):
    """Show three stacked percentage histograms, one per cluster.

    Each observation is weighted by 100 / len(data) so the bar heights of a
    panel are percentages of that cluster. All panels share their axes.

    Args:
        data1: values of cluster 1 (green).
        data2: values of cluster 2 (orange).
        data3: values of cluster 3 (blue).
        xlabel: label of the shared x axis.
        bins: forwarded to ``Axes.hist``.
        range_hist: forwarded to ``Axes.hist`` as ``range``.
    """
    _fig, axs = plt.subplots(3, 1, figsize=(6, 5), sharex=True, sharey=True)  # 3 rows, 1 column

    samples = (data1, data2, data3)
    # Per-observation weights, computed for every sample before drawing.
    weights = [np.ones_like(sample) / len(sample) * 100 for sample in samples]
    colours = ('green', 'orange', 'blue')
    titles = ('Cluster 1', 'Cluster 2', 'Cluster 3')

    for ax, sample, weight, colour, title in zip(axs, samples, weights, colours, titles):
        ax.hist(sample, bins=bins, range=range_hist, weights=weight, color=colour, alpha=0.7, edgecolor='black', linewidth=1)
        ax.set_title(title)

    # Only the middle panel gets the y label, only the bottom one the x label.
    axs[1].set_ylabel('Percentage of students')
    axs[2].set_xlabel(xlabel)

    plt.tight_layout()  # Adjust layout for better spacing
    plt.show()

In [None]:
def plot_four_histograms(data1, data2, data3, data4, xlabel, bins, range_hist):
    """Show four stacked percentage histograms, one per cluster.

    Each observation is weighted by 100 / len(data) so the bar heights of a
    panel are percentages of that cluster. All panels share their axes.

    Args:
        data1: values of cluster 1 (green).
        data2: values of cluster 2 (orange).
        data3: values of cluster 3 (blue).
        data4: values of cluster 4 (red).
        xlabel: label of the shared x axis.
        bins: forwarded to ``Axes.hist``.
        range_hist: forwarded to ``Axes.hist`` as ``range``.
    """
    _fig, axs = plt.subplots(4, 1, figsize=(8, 7), sharex=True, sharey=True)  # 4 rows, 1 column

    samples = (data1, data2, data3, data4)
    # Per-observation weights, computed for every sample before drawing.
    weights = [np.ones_like(sample) / len(sample) * 100 for sample in samples]
    colours = ('green', 'orange', 'blue', 'red')
    titles = ('Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4')

    for ax, sample, weight, colour, title in zip(axs, samples, weights, colours, titles):
        ax.hist(sample, bins=bins, range=range_hist, weights=weight, color=colour, alpha=0.7, edgecolor='black', linewidth=1)
        ax.set_title(title)

    # The y label is shown on the second and fourth panels only.
    for labelled_ax in (axs[1], axs[3]):
        labelled_ax.set_ylabel('Percentage of students')
    axs[3].set_xlabel(xlabel)

    plt.tight_layout()  # Adjust layout for better spacing
    plt.show()

# General functions for clusters analysis

In [None]:
def build_vector(original_vector, sorted_session_codes, cluster_codes):
    """Keep the entries of `original_vector` that belong to one cluster.

    Entry i is kept when ``sorted_session_codes[i]`` appears in
    ``cluster_codes``; the original order is preserved.

    Args:
        original_vector: per-student values, indexed like
            ``sorted_session_codes``.
        sorted_session_codes: student ids, one per entry of the vector.
        cluster_codes: list containing the student ids of a particular cluster.

    Returns:
        list of the selected entries.
    """
    return [
        original_vector[position]
        for position in range(len(original_vector))
        if sorted_session_codes[position] in cluster_codes
    ]

In [None]:
def count_three_instructions(new_labels, instructions, sorted_session_codes, cluster_codes):
    """Count the instruction conditions of the students in one cluster.

    Only the length of ``new_labels`` is used (it sets how many positions are
    scanned). A student counts as 'A_DC' if that text occurs in their
    instruction, else as 'A_DI' if that text occurs, and as 'B' otherwise.

    Args:
        new_labels: sequence whose length gives the number of students.
        instructions: instruction of each student, indexed like the codes.
        sorted_session_codes: student ids, one per position.
        cluster_codes: student ids belonging to the cluster of interest.

    Returns:
        tuple (count_A_DC, count_A_DI, count_B).
    """
    tallies = {'A_DC': 0, 'A_DI': 0, 'B': 0}

    for position in range(len(new_labels)):
        if sorted_session_codes[position] not in cluster_codes:
            continue

        instruction = instructions[position]
        if 'A_DC' in instruction:
            condition = 'A_DC'
        elif 'A_DI' in instruction:
            condition = 'A_DI'
        else:
            condition = 'B'
        tallies[condition] += 1

    return tallies['A_DC'], tallies['A_DI'], tallies['B']

In [None]:
def count_prior_knowledge(new_labels, df_fin, sorted_session_codes, cluster_codes):
    """Count the 'Ja' / 'Nein' answers in `bll_prior` for one cluster.

    Only the length of ``new_labels`` is used. Values of ``bll_prior`` other
    than 'Ja' and 'Nein' are ignored.

    NOTE(review): row i of ``df_fin`` is paired with
    ``sorted_session_codes[i]``, i.e. the table is assumed to be in the same
    order as the code list -- confirm against the data.

    Args:
        new_labels: sequence whose length gives the number of students.
        df_fin: table with a 'bll_prior' column.
        sorted_session_codes: student ids, one per position.
        cluster_codes: student ids belonging to the cluster of interest.

    Returns:
        tuple (count_ja, count_nein).
    """
    yes_total = 0
    no_total = 0

    for position in range(len(new_labels)):
        if sorted_session_codes[position] not in cluster_codes:
            continue

        answer = df_fin['bll_prior'][position]
        if answer == 'Ja':
            yes_total += 1
        elif answer == 'Nein':
            no_total += 1

    return yes_total, no_total

In [None]:
def bll_cluster(df_fin, cluster_codes):
    """Collect the non-missing 'bll_weighed (12)' values of one cluster.

    Rows are matched on the 'session_code_sim1' column; NaN values are
    dropped.

    Args:
        df_fin: table with the columns 'session_code_sim1' and
            'bll_weighed (12)'.
        cluster_codes: student ids belonging to the cluster of interest.

    Returns:
        list of the values, in table order.
    """
    values = []

    for row in range(len(df_fin['session_code_sim1'])):
        if df_fin['session_code_sim1'][row] not in cluster_codes:
            continue

        value = df_fin['bll_weighed (12)'][row]
        if not math.isnan(value):
            values.append(value)

    return values

In [None]:
def get_asterisks_for_pval(p_val):
    """Receives the p-value and returns asterisks string.

    Args:
        p_val: p-value to label.

    Returns:
        "ns" when p_val is above 0.05 or is NaN, '****' below 1e-4,
        '***' below 1e-3, '**' below 1e-2 and '*' otherwise (this includes
        p_val == 0.05 exactly).
    """
    # NaN fails every comparison below; without this guard an undefined
    # p-value would fall through to '*' and be reported as significant.
    if math.isnan(p_val) or p_val > 0.05:
        return "ns"  # above threshold (or undefined) => not significant

    # Strictest threshold first.
    for threshold, stars in ((1e-4, '****'), (1e-3, '***'), (1e-2, '**')):
        if p_val < threshold:
            return stars

    return '*'

In [None]:
def chisq_and_posthoc_corrected(df):
    """Receives a dataframe and performs chi2 test and then post hoc.

    Prints the chi2 statistic and p-value of the whole contingency table,
    then, for every pair of rows, the pairwise p-value, its FDR
    (Benjamini-Hochberg) corrected value, a significance label and the reject
    decision. The label is derived from the corrected p-value so that it
    agrees with the corrected value printed next to it.

    Args:
        df: contingency table as a DataFrame; each pair of index labels is
            compared in the post-hoc step.

    Returns:
        None; all results are printed.
    """
    # start by running chi2 test on the matrix
    chi2, p, dof, ex = chi2_contingency(df, correction=True)
    print(f"Chi2 result of the contingency table: {chi2}, p-value: {p}")

    # post-hoc
    all_combinations = list(combinations(df.index, 2))  # gathering all combinations for post-hoc chi2
    print("Significance results:")
    if not all_combinations:
        # Fewer than two rows: there is nothing to compare or to correct.
        return

    p_vals = []
    for comb in all_combinations:
        new_df = df[(df.index == comb[0]) | (df.index == comb[1])]
        _, pair_p, _, _ = chi2_contingency(new_df, correction=True)
        p_vals.append(pair_p)

    # checking significance
    # correction for multiple testing using BH
    reject_list, corrected_p_vals, _, _ = multipletests(p_vals, method='fdr_bh')
    for p_val, corr_p_val, reject, comb in zip(p_vals, corrected_p_vals, reject_list, all_combinations):
        print(f"{comb}: p_value: {p_val:5f}; corrected: {corr_p_val:5f} ({get_asterisks_for_pval(corr_p_val)}) reject: {reject}")

# Functions 2 clusters

In [None]:
def two_cluster_codes(sorted_session_codes, labels_2):
    """Split the student ids into two lists according to their labels.

    Args:
        sorted_session_codes: student ids, one per position.
        labels_2: cluster label of each position.

    Returns:
        tuple (cluster_1_codes, cluster_2_codes): ids whose label equals 0,
        and ids with any other label, both in their original order.
    """
    groups = ([], [])

    for position in range(len(sorted_session_codes)):
        target = 0 if labels_2[position] == 0 else 1
        groups[target].append(sorted_session_codes[position])

    return groups[0], groups[1]

## One vector

In [None]:
def two_clusters_one_vector(new_labels, df_fin, instructions, sorted_session_codes, labels_2, original_vector1, original_vector1_name=None):
    """Compare the two clusters on one feature, instruction, prior knowledge and bll.

    Prints, in this order: the cluster sizes; histograms and a Kruskal-Wallis
    test of ``original_vector1`` per cluster; chi-square tests of the
    instruction counts (A_DC / A_DI / B, then A vs. B); a chi-square test of
    the 'Ja' / 'Nein' prior-knowledge counts; and a Kruskal-Wallis test,
    mean +- standard deviation and raw values of 'bll_weighed (12)'.

    Args:
        new_labels: sequence whose length gives the number of students.
        df_fin: table with the columns read by ``count_prior_knowledge`` and
            ``bll_cluster``.
        instructions: instruction of each student, indexed like the codes.
        sorted_session_codes: student ids, one per position.
        labels_2: cluster label of each position (0 is cluster 1, anything
            else is cluster 2).
        original_vector1: per-student feature values; plotted on a 0..1 range.
        original_vector1_name: name of the feature used in the x-axis label.
            When None (default) the name is asked for interactively, as
            before; pass it explicitly so the notebook can run unattended.

    Returns:
        None; all results are printed or plotted.
    """
    if original_vector1_name is None:
        original_vector1_name = input("Please enter an original_vector1 name: ")

    cluster_1_codes, cluster_2_codes = two_cluster_codes(sorted_session_codes, labels_2)
    print("Cluster 1:", len(cluster_1_codes), "students,", "Cluster 2:", len(cluster_2_codes), "students")

    # Feature distribution per cluster
    vector_1 = build_vector(original_vector1, sorted_session_codes, cluster_1_codes)
    vector_2 = build_vector(original_vector1, sorted_session_codes, cluster_2_codes)
    plot_two_histograms(vector_1, vector_2, f'Percentage of {original_vector1_name} actions', bins=10, range_hist=(0, 1))
    print(kruskal(vector_1, vector_2))
    print("")

    # Instruction condition vs. cluster: three conditions
    count_A_DC_1, count_A_DI_1, count_B_1 = count_three_instructions(new_labels, instructions, sorted_session_codes, cluster_1_codes)
    count_A_DC_2, count_A_DI_2, count_B_2 = count_three_instructions(new_labels, instructions, sorted_session_codes, cluster_2_codes)

    data = np.array([[count_A_DC_1, count_A_DC_2], [count_A_DI_1, count_A_DI_2], [count_B_1, count_B_2]])
    chi2, p, _, _ = chi2_contingency(data)
    print(f'A_DC_1: {count_A_DC_1},', f'A_DC_2: {count_A_DC_2}')
    print(f'A_DI_1: {count_A_DI_1},', f'A_DI_2: {count_A_DI_2}')
    print(f'B_1: {count_B_1},', f'B_2: {count_B_2}')
    print(f"Chi-square statistic: {chi2},", f"P-value: {p}")
    print("")

    # Instruction condition vs. cluster: A_DC and A_DI merged into A
    data = np.array([[count_A_DC_1 + count_A_DI_1, count_B_1], [count_A_DC_2 + count_A_DI_2, count_B_2]])
    chi2, p, _, _ = chi2_contingency(data)
    print(f'A_1: {count_A_DC_1 + count_A_DI_1},', f'A_2: {count_A_DC_2 + count_A_DI_2}')
    print(f'B_1: {count_B_1},', f'B_2: {count_B_2}')
    print(f"Chi-square statistic: {chi2},", f"P-value: {p}")
    print("")

    # Prior knowledge ('Ja' / 'Nein') vs. cluster
    count_ja_1, count_nein_1 = count_prior_knowledge(new_labels, df_fin, sorted_session_codes, cluster_1_codes)
    count_ja_2, count_nein_2 = count_prior_knowledge(new_labels, df_fin, sorted_session_codes, cluster_2_codes)

    data = np.array([[count_ja_1, count_ja_2], [count_nein_1, count_nein_2]])
    chi2, p, _, _ = chi2_contingency(data)
    print(f'Ja_1: {count_ja_1},', f'Ja_2: {count_ja_2}')
    print(f'Nein_1: {count_nein_1},', f'Nein_2: {count_nein_2}')
    print(f"Chi-square statistic: {chi2},", f"P-value: {p}")
    print("")

    # 'bll_weighed (12)' values per cluster
    bll_1 = bll_cluster(df_fin, cluster_1_codes)
    bll_2 = bll_cluster(df_fin, cluster_2_codes)
    print(kruskal(bll_1, bll_2))
    print(np.mean(bll_1), "+-", np.std(bll_1))
    print(np.mean(bll_2), "+-", np.std(bll_2))
    print("bll_1:", bll_1)
    print("bll_2:", bll_2)

# K-modes functions

In [None]:
def k_modes_count(vector):
    """Count how often each distinct row occurs.

    Args:
        vector: iterable of rows; every row is converted to a tuple so that
            it can be used as a counter key.

    Returns:
        collections.Counter mapping each row (as a tuple) to its number of
        occurrences.
    """
    return Counter(tuple(row) for row in vector)

## K-modes 2 clusters function

In [None]:
def k_modes_two_clusters(sorted_session_codes, labels_2, original_vector, new_labels, df_fin, instructions):
    """Print the summary of a two-cluster solution.

    Printed in order: the cluster sizes; for each cluster, how often every
    distinct row of ``original_vector`` occurs; chi-square tests of the
    instruction counts (A_DC / A_DI / B, then A vs. B) and of the
    'Ja' / 'Nein' prior-knowledge counts across clusters; and a
    Kruskal-Wallis test plus the raw 'bll_weighed (12)' values per cluster.

    Args:
        sorted_session_codes: student ids, one per position.
        labels_2: cluster label of each position.
        original_vector: per-student rows, indexed like the codes.
        new_labels: sequence whose length gives the number of students.
        df_fin: table read by ``count_prior_knowledge`` and ``bll_cluster``.
        instructions: instruction of each student, indexed like the codes.

    Returns:
        None; all results are printed.
    """
    def show_counts(label, values):
        # One line "<label>_1: v1, <label>_2: v2 ..."; only the first cell
        # ends with a comma.
        cells = [f'{label}_{slot}: {value}' for slot, value in enumerate(values, start=1)]
        print(cells[0] + ',', *cells[1:])

    def report(table, rows):
        # The test runs before anything is printed for this table.
        statistic, p_value, _, _ = chi2_contingency(np.array(table))
        for label, values in rows:
            show_counts(label, values)
        print(f"Chi-square statistic: {statistic},", f"P-value: {p_value}")
        print("")

    first_codes, second_codes = two_cluster_codes(sorted_session_codes, labels_2)
    clusters = (first_codes, second_codes)

    header = []
    for slot, codes in enumerate(clusters, start=1):
        header.extend((f"Cluster {slot}:", len(codes), "students,"))
    header[-1] = "students"  # the last cluster has no trailing comma
    print(*header)
    print("")

    # Row patterns per cluster
    vectors = [build_vector(original_vector, sorted_session_codes, codes) for codes in clusters]
    for slot, rows in enumerate(vectors, start=1):
        pattern_counts = k_modes_count(rows)
        print(f"Cluster {slot}:")
        for pattern, occurrences in pattern_counts.items():
            print(f"{pattern}: {occurrences} times")
    print("")

    # Instruction condition vs. cluster
    dc_counts, di_counts, b_counts = zip(*[
        count_three_instructions(new_labels, instructions, sorted_session_codes, codes)
        for codes in clusters
    ])
    report([dc_counts, di_counts, b_counts],
           [('A_DC', dc_counts), ('A_DI', di_counts), ('B', b_counts)])

    # A_DC and A_DI merged into A; one table row per cluster
    a_counts = [dc + di for dc, di in zip(dc_counts, di_counts)]
    report(list(zip(a_counts, b_counts)), [('A', a_counts), ('B', b_counts)])

    # Prior knowledge vs. cluster; one table row per answer
    ja_counts, nein_counts = zip(*[
        count_prior_knowledge(new_labels, df_fin, sorted_session_codes, codes)
        for codes in clusters
    ])
    report([ja_counts, nein_counts], [('Ja', ja_counts), ('Nein', nein_counts)])

    # 'bll_weighed (12)' values per cluster
    bll_values = [bll_cluster(df_fin, codes) for codes in clusters]
    print(kruskal(*bll_values))
    for slot, values in enumerate(bll_values, start=1):
        print(f"bll_{slot}:", values)

## K-modes 3 clusters function

In [None]:
def k_modes_three_clusters(sorted_session_codes, labels_3, original_vector, new_labels, df_fin, instructions):
    """Print the summary of a three-cluster solution.

    Printed in order: the cluster sizes; for each cluster, how often every
    distinct row of ``original_vector`` occurs; chi-square tests of the
    instruction counts (A_DC / A_DI / B, then A vs. B) and of the
    'Ja' / 'Nein' prior-knowledge counts across clusters; and a
    Kruskal-Wallis test plus the raw 'bll_weighed (12)' values per cluster.

    NOTE(review): ``three_cluster_codes`` is not defined in the part of the
    notebook shown here; presumably it mirrors ``two_cluster_codes`` and
    returns three lists of ids -- confirm.

    Args:
        sorted_session_codes: student ids, one per position.
        labels_3: cluster label of each position.
        original_vector: per-student rows, indexed like the codes.
        new_labels: sequence whose length gives the number of students.
        df_fin: table read by ``count_prior_knowledge`` and ``bll_cluster``.
        instructions: instruction of each student, indexed like the codes.

    Returns:
        None; all results are printed.
    """
    def show_counts(label, values):
        # One line "<label>_1: v1, <label>_2: v2 ..."; only the first cell
        # ends with a comma.
        cells = [f'{label}_{slot}: {value}' for slot, value in enumerate(values, start=1)]
        print(cells[0] + ',', *cells[1:])

    def report(table, rows):
        # The test runs before anything is printed for this table.
        statistic, p_value, _, _ = chi2_contingency(np.array(table))
        for label, values in rows:
            show_counts(label, values)
        print(f"Chi-square statistic: {statistic},", f"P-value: {p_value}")
        print("")

    first_codes, second_codes, third_codes = three_cluster_codes(sorted_session_codes, labels_3)
    clusters = (first_codes, second_codes, third_codes)

    header = []
    for slot, codes in enumerate(clusters, start=1):
        header.extend((f"Cluster {slot}:", len(codes), "students,"))
    header[-1] = "students"  # the last cluster has no trailing comma
    print(*header)
    print("")

    # Row patterns per cluster
    vectors = [build_vector(original_vector, sorted_session_codes, codes) for codes in clusters]
    for slot, rows in enumerate(vectors, start=1):
        pattern_counts = k_modes_count(rows)
        print(f"Cluster {slot}:")
        for pattern, occurrences in pattern_counts.items():
            print(f"{pattern}: {occurrences} times")
    print("")

    # Instruction condition vs. cluster
    dc_counts, di_counts, b_counts = zip(*[
        count_three_instructions(new_labels, instructions, sorted_session_codes, codes)
        for codes in clusters
    ])
    report([dc_counts, di_counts, b_counts],
           [('A_DC', dc_counts), ('A_DI', di_counts), ('B', b_counts)])

    # A_DC and A_DI merged into A; one table row per cluster
    a_counts = [dc + di for dc, di in zip(dc_counts, di_counts)]
    report(list(zip(a_counts, b_counts)), [('A', a_counts), ('B', b_counts)])

    # Prior knowledge vs. cluster; one table row per cluster
    ja_counts, nein_counts = zip(*[
        count_prior_knowledge(new_labels, df_fin, sorted_session_codes, codes)
        for codes in clusters
    ])
    report(list(zip(ja_counts, nein_counts)), [('Ja', ja_counts), ('Nein', nein_counts)])

    # 'bll_weighed (12)' values per cluster
    bll_values = [bll_cluster(df_fin, codes) for codes in clusters]
    print(kruskal(*bll_values))
    for slot, values in enumerate(bll_values, start=1):
        print(f"bll_{slot}:", values)

## K-modes 4 clusters function

In [None]:
def k_modes_four_clusters(sorted_session_codes, labels_4, original_vector, new_labels, df_fin, instructions):
    """Print the summary of a four-cluster solution.

    Printed in order: the cluster sizes; for each cluster, how often every
    distinct row of ``original_vector`` occurs; chi-square tests of the
    instruction counts (A_DC / A_DI / B, then A vs. B) and of the
    'Ja' / 'Nein' prior-knowledge counts across clusters; and a
    Kruskal-Wallis test plus the raw 'bll_weighed (12)' values per cluster.

    NOTE(review): ``four_cluster_codes`` is not defined in the part of the
    notebook shown here; presumably it mirrors ``two_cluster_codes`` and
    returns four lists of ids -- confirm.

    Args:
        sorted_session_codes: student ids, one per position.
        labels_4: cluster label of each position.
        original_vector: per-student rows, indexed like the codes.
        new_labels: sequence whose length gives the number of students.
        df_fin: table read by ``count_prior_knowledge`` and ``bll_cluster``.
        instructions: instruction of each student, indexed like the codes.

    Returns:
        None; all results are printed.
    """
    def show_counts(label, values):
        # One line "<label>_1: v1, <label>_2: v2 ..."; only the first cell
        # ends with a comma.
        cells = [f'{label}_{slot}: {value}' for slot, value in enumerate(values, start=1)]
        print(cells[0] + ',', *cells[1:])

    def report(table, rows):
        # The test runs before anything is printed for this table.
        statistic, p_value, _, _ = chi2_contingency(np.array(table))
        for label, values in rows:
            show_counts(label, values)
        print(f"Chi-square statistic: {statistic},", f"P-value: {p_value}")
        print("")

    first_codes, second_codes, third_codes, fourth_codes = four_cluster_codes(sorted_session_codes, labels_4)
    clusters = (first_codes, second_codes, third_codes, fourth_codes)

    header = []
    for slot, codes in enumerate(clusters, start=1):
        header.extend((f"Cluster {slot}:", len(codes), "students,"))
    header[-1] = "students"  # the last cluster has no trailing comma
    print(*header)
    print("")

    # Row patterns per cluster
    vectors = [build_vector(original_vector, sorted_session_codes, codes) for codes in clusters]
    for slot, rows in enumerate(vectors, start=1):
        pattern_counts = k_modes_count(rows)
        print(f"Cluster {slot}:")
        for pattern, occurrences in pattern_counts.items():
            print(f"{pattern}: {occurrences} times")
    print("")

    # Instruction condition vs. cluster
    dc_counts, di_counts, b_counts = zip(*[
        count_three_instructions(new_labels, instructions, sorted_session_codes, codes)
        for codes in clusters
    ])
    report([dc_counts, di_counts, b_counts],
           [('A_DC', dc_counts), ('A_DI', di_counts), ('B', b_counts)])

    # A_DC and A_DI merged into A; one table row per cluster
    a_counts = [dc + di for dc, di in zip(dc_counts, di_counts)]
    report(list(zip(a_counts, b_counts)), [('A', a_counts), ('B', b_counts)])

    # Prior knowledge vs. cluster; one table row per cluster
    ja_counts, nein_counts = zip(*[
        count_prior_knowledge(new_labels, df_fin, sorted_session_codes, codes)
        for codes in clusters
    ])
    report(list(zip(ja_counts, nein_counts)), [('Ja', ja_counts), ('Nein', nein_counts)])

    # 'bll_weighed (12)' values per cluster
    bll_values = [bll_cluster(df_fin, codes) for codes in clusters]
    print(kruskal(*bll_values))
    for slot, values in enumerate(bll_values, start=1):
        print(f"bll_{slot}:", values)

## K-modes 5 clusters function

In [None]:
def k_modes_five_clusters(sorted_session_codes, labels_5, original_vector, new_labels, df_fin, instructions):
    """Print the summary of a five-cluster solution.

    Printed in order: the cluster sizes; for each cluster, how often every
    distinct row of ``original_vector`` occurs; chi-square tests of the
    instruction counts (A_DC / A_DI / B, then A vs. B) and of the
    'Ja' / 'Nein' prior-knowledge counts across clusters; and a
    Kruskal-Wallis test plus the raw 'bll_weighed (12)' values per cluster.

    NOTE(review): ``five_cluster_codes`` is not defined in the part of the
    notebook shown here; presumably it mirrors ``two_cluster_codes`` and
    returns five lists of ids -- confirm.

    Args:
        sorted_session_codes: student ids, one per position.
        labels_5: cluster label of each position.
        original_vector: per-student rows, indexed like the codes.
        new_labels: sequence whose length gives the number of students.
        df_fin: table read by ``count_prior_knowledge`` and ``bll_cluster``.
        instructions: instruction of each student, indexed like the codes.

    Returns:
        None; all results are printed.
    """
    def show_counts(label, values):
        # One line "<label>_1: v1, <label>_2: v2 ..."; only the first cell
        # ends with a comma.
        cells = [f'{label}_{slot}: {value}' for slot, value in enumerate(values, start=1)]
        print(cells[0] + ',', *cells[1:])

    def report(table, rows):
        # The test runs before anything is printed for this table.
        statistic, p_value, _, _ = chi2_contingency(np.array(table))
        for label, values in rows:
            show_counts(label, values)
        print(f"Chi-square statistic: {statistic},", f"P-value: {p_value}")
        print("")

    first_codes, second_codes, third_codes, fourth_codes, fifth_codes = five_cluster_codes(sorted_session_codes, labels_5)
    clusters = (first_codes, second_codes, third_codes, fourth_codes, fifth_codes)

    header = []
    for slot, codes in enumerate(clusters, start=1):
        header.extend((f"Cluster {slot}:", len(codes), "students,"))
    header[-1] = "students"  # the last cluster has no trailing comma
    print(*header)
    print("")

    # Row patterns per cluster
    vectors = [build_vector(original_vector, sorted_session_codes, codes) for codes in clusters]
    for slot, rows in enumerate(vectors, start=1):
        pattern_counts = k_modes_count(rows)
        print(f"Cluster {slot}:")
        for pattern, occurrences in pattern_counts.items():
            print(f"{pattern}: {occurrences} times")
    print("")

    # Instruction condition vs. cluster
    dc_counts, di_counts, b_counts = zip(*[
        count_three_instructions(new_labels, instructions, sorted_session_codes, codes)
        for codes in clusters
    ])
    report([dc_counts, di_counts, b_counts],
           [('A_DC', dc_counts), ('A_DI', di_counts), ('B', b_counts)])

    # A_DC and A_DI merged into A; one table row per cluster
    a_counts = [dc + di for dc, di in zip(dc_counts, di_counts)]
    report(list(zip(a_counts, b_counts)), [('A', a_counts), ('B', b_counts)])

    # Prior knowledge vs. cluster; one table row per answer
    ja_counts, nein_counts = zip(*[
        count_prior_knowledge(new_labels, df_fin, sorted_session_codes, codes)
        for codes in clusters
    ])
    report([ja_counts, nein_counts], [('Ja', ja_counts), ('Nein', nein_counts)])

    # 'bll_weighed (12)' values per cluster
    bll_values = [bll_cluster(df_fin, codes) for codes in clusters]
    print(kruskal(*bll_values))
    for slot, values in enumerate(bll_values, start=1):
        print(f"bll_{slot}:", values)

## K-modes 6 clusters function

In [None]:
def k_modes_six_clusters(sorted_session_codes, labels_6, original_vector, new_labels, df_fin, instructions):
    """Prints the summary report for a six-cluster K-modes solution

    The report consists of, in order: the cluster sizes, the pattern counts
    returned by k_modes_count for every cluster, a chi-square test on the
    A_DC / A_DI / B instruction counts, a chi-square test on A (A_DC + A_DI)
    versus B, a chi-square test on the Ja / Nein prior-knowledge counts, and a
    Kruskal-Wallis test on the per-cluster values returned by bll_cluster.

    Args:
        sorted_session_codes: session codes, passed through to the helper functions
        labels_6: cluster labels handed to six_cluster_codes
        original_vector: per-student vectors handed to build_vector
        new_labels: per-student label data handed to the counting helpers
        df_fin: dataframe handed to count_prior_knowledge and bll_cluster
        instructions: per-student instruction labels handed to count_three_instructions

    Returns:
        None, everything is printed

    """
    cluster_1_codes, cluster_2_codes, cluster_3_codes, cluster_4_codes, cluster_5_codes, cluster_6_codes = six_cluster_codes(sorted_session_codes, labels_6)
    codes_per_cluster = [cluster_1_codes, cluster_2_codes, cluster_3_codes, cluster_4_codes, cluster_5_codes, cluster_6_codes]

    def print_counts(prefix, values):
        # One report row: "<prefix>_1: v1, <prefix>_2: v2 ..." (only the first entry carries a comma)
        cells = [f'{prefix}_{number}: {value}' for number, value in enumerate(values, start=1)]
        cells[0] += ','
        print(*cells)

    def print_chi_square(table):
        # Chi-square test of independence on the contingency table
        chi2, p, _, _ = chi2_contingency(table)
        return chi2, p

    # Cluster sizes; the output only separates clusters 1-4 with a trailing comma
    header = []
    for number, codes in enumerate(codes_per_cluster, start=1):
        suffix = "students," if number <= 4 else "students"
        header += [f"Cluster {number}:", len(codes), suffix]
    print(*header)
    print("")

    # Pattern counts per cluster (all vectors are built before anything is counted)
    vectors = [build_vector(original_vector, sorted_session_codes, codes) for codes in codes_per_cluster]
    for number, vector in enumerate(vectors, start=1):
        sublist_counts = k_modes_count(vector)
        print(f"Cluster {number}:")
        for sublist, count in sublist_counts.items():
            print(f"{sublist}: {count} times")
    print("")

    # Instruction counts per cluster
    a_dc_counts, a_di_counts, b_counts = [], [], []
    for codes in codes_per_cluster:
        n_a_dc, n_a_di, n_b = count_three_instructions(new_labels, instructions, sorted_session_codes, codes)
        a_dc_counts.append(n_a_dc)
        a_di_counts.append(n_a_di)
        b_counts.append(n_b)

    # Three instruction groups (rows) x six clusters (columns)
    chi2, p = print_chi_square(np.array([a_dc_counts, a_di_counts, b_counts]))
    print_counts('A_DC', a_dc_counts)
    print_counts('A_DI', a_di_counts)
    print_counts('B', b_counts)
    print(f"Chi-square statistic: {chi2},", f"P-value: {p}")
    print("")

    # A (A_DC + A_DI) versus B: six clusters (rows) x two groups (columns)
    a_counts = [n_a_dc + n_a_di for n_a_dc, n_a_di in zip(a_dc_counts, a_di_counts)]
    chi2, p = print_chi_square(np.array([[n_a, n_b] for n_a, n_b in zip(a_counts, b_counts)]))
    print_counts('A', a_counts)
    print_counts('B', b_counts)
    print(f"Chi-square statistic: {chi2},", f"P-value: {p}")
    print("")

    # Prior knowledge (Ja / Nein) per cluster
    ja_counts, nein_counts = [], []
    for codes in codes_per_cluster:
        n_ja, n_nein = count_prior_knowledge(new_labels, df_fin, sorted_session_codes, codes)
        ja_counts.append(n_ja)
        nein_counts.append(n_nein)

    chi2, p = print_chi_square(np.array([ja_counts, nein_counts]))
    print_counts('Ja', ja_counts)
    print_counts('Nein', nein_counts)
    print(f"Chi-square statistic: {chi2},", f"P-value: {p}")
    print("")

    # Kruskal-Wallis test across the six clusters, followed by the raw values
    bll_values = [bll_cluster(df_fin, codes) for codes in codes_per_cluster]
    print(kruskal(*bll_values))
    for number, values in enumerate(bll_values, start=1):
        print(f"bll_{number}:", values)

# -------------------------------------------------------------------------------------------

# Clustering

In [None]:
# Load every labelled-session pickle found (recursively) under the directory below and
# collect three parallel lists: the per-student labels, the session codes and the
# instruction group ("<group>_<subgroup>").
# NOTE(review): '///' is a placeholder and must be replaced by the real data directory.
path_of_the_directory = '///' # Directory with Labeled files
# NOTE(review): the glob result is consumed in whatever order the file system yields it,
# while the hard-coded label lists further down are matched to students by position —
# confirm the iteration order is the one those lists were produced with.
file = Path(path_of_the_directory).glob('**/*.pkl')

new_labels = []
sorted_session_codes = []
instructions = []

for i in file:
    # NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
    with open(i, 'rb') as fp:
        separate_actions_dict = pickle.load(fp)

    new_labels.append(separate_actions_dict['single_exp'])
    sorted_session_codes.append(separate_actions_dict['session_code'])
    instructions.append(separate_actions_dict['group'] + '_' + separate_actions_dict['subgroup'])

In [None]:
# Action labels that feed each per-student feature (defined once, outside the loop).
cvs_action_labels = ['CVS_explore_width', 'CVS_explore_concentration', 'CVS_explore_wavelength', 'CVS_explore_solution', 'CVS_record_width', 'CVS_record_concentration', 'CVS_record_wavelength', 'CVS_record_solution']
cvs_or_non_cvs_action_labels = cvs_action_labels + ['Non_CVS_explore_width', 'Non_CVS_explore_concentration', 'Non_CVS_explore_wavelength', 'Non_CVS_explore_solution', 'Non_CVS_record_width', 'Non_CVS_record_concentration', 'Non_CVS_record_wavelength', 'Non_CVS_record_solution']
non_optimal_action_labels = ['Non_Optimal_explore_width', 'Non_Optimal_explore_concentration', 'Non_Optimal_explore_wavelength', 'Non_Optimal_record_width', 'Non_Optimal_record_concentration', 'Non_Optimal_record_wavelength']
optimal_or_non_optimal_action_labels = non_optimal_action_labels + ['Optimal_explore_width', 'Optimal_explore_concentration', 'Optimal_explore_wavelength', 'Optimal_record_width', 'Optimal_record_concentration', 'Optimal_record_wavelength']
explore_action_labels = ['CVS_explore_width', 'CVS_explore_concentration', 'CVS_explore_wavelength', 'Non_CVS_explore_width', 'Non_CVS_explore_concentration', 'Non_CVS_explore_wavelength']

# One value per student, in the order of new_labels.
vector_non_opt, vector_cvs, vector_range_steps = [], [], []

for student in new_labels:

    # Share of CVS actions among all CVS / Non_CVS actions.
    # NOTE(review): raises ZeroDivisionError for a student without any such action.
    cvs_column = student['CVS']
    cvs_positions = range(len(cvs_column))
    n_cvs = sum(1 for pos in cvs_positions if cvs_column[pos] in cvs_action_labels)
    n_cvs_or_non_cvs = sum(1 for pos in cvs_positions if cvs_column[pos] in cvs_or_non_cvs_action_labels)
    vector_cvs.append(n_cvs / n_cvs_or_non_cvs)

    # Share of Non_Optimal actions among all Optimal / Non_Optimal actions.
    # NOTE(review): same ZeroDivisionError caveat as above.
    optimal_column = student['Optimal']
    optimal_positions = range(len(optimal_column))
    n_non_optimal = sum(1 for pos in optimal_positions if optimal_column[pos] in non_optimal_action_labels)
    n_optimal_or_non_optimal = sum(1 for pos in optimal_positions if optimal_column[pos] in optimal_or_non_optimal_action_labels)
    vector_non_opt.append(n_non_optimal / n_optimal_or_non_optimal)

    # For every explore action on width / concentration / wavelength: first entry of the
    # range record times sigmoid(second entry); the per-student mean is divided by 200.
    weighted_ranges = [
        student['Range: percentage, steps, stops'][pos][0] * sigmoid(student['Range: percentage, steps, stops'][pos][1], L, k, x0)
        for pos in range(len(student['CVS']))
        if student['CVS'][pos] in explore_action_labels
    ]
    vector_range_steps.append(np.mean(weighted_ranges) / 200)

## perc_cvs

In [None]:
# Turn the per-student CVS fractions into a column vector (one row per student).
vector_cvs_array = np.array(vector_cvs)
vector_cvs_reshaped = vector_cvs_array.reshape(-1, 1)

# Pairwise Euclidean distances between students, followed by an RBF kernel.
# NOTE(review): pairwise_kernels treats its input as a feature matrix unless
# metric='precomputed', so this appears to compute the RBF kernel between rows of the
# distance matrix rather than exp(-gamma * d**2) on the distances — confirm this is intended.
distance_matrix = pairwise_distances(vector_cvs_reshaped, metric='euclidean')
similarity_matrix_cvs = pairwise_kernels(distance_matrix, metric='rbf', gamma=0.00001)

In [None]:
# Run train() on the CVS similarity matrix; as the last expression of the cell its return value is displayed.
train(similarity_matrix_cvs)

### 2 clusters

In [None]:
# Hard-coded two-cluster assignment for the CVS feature, matched to new_labels by position.
# NOTE(review): presumably copied from the output of train(similarity_matrix_cvs) — confirm,
# and confirm the students are still loaded in the same order.
labels_2_cvs = [1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
         1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
         0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
         0, 0, 1, 0, 1, 0, 0]

In [None]:
# Report on the two CVS clusters via two_clusters_one_vector.
# NOTE(review): relies on df_fin having been created by an earlier cell — confirm for Restart & Run All.
two_clusters_one_vector(new_labels, df_fin, instructions, sorted_session_codes, labels_2_cvs, vector_cvs)

## perc_non_opt

In [None]:
# Turn the per-student non-optimal fractions into a column vector (one row per student).
vector_non_opt_array = np.array(vector_non_opt)
vector_non_opt_reshaped = vector_non_opt_array.reshape(-1, 1)

# Pairwise Euclidean distances between students, followed by an RBF kernel.
# NOTE(review): pairwise_kernels treats its input as a feature matrix unless
# metric='precomputed', so this appears to compute the RBF kernel between rows of the
# distance matrix rather than exp(-gamma * d**2) on the distances — confirm this is intended.
distance_matrix = pairwise_distances(vector_non_opt_reshaped, metric='euclidean')
similarity_matrix_non_opt = pairwise_kernels(distance_matrix, metric='rbf', gamma=0.00001)

In [None]:
# Run train() on the non-optimal similarity matrix; as the last expression of the cell its return value is displayed.
train(similarity_matrix_non_opt)

In [None]:
# Silhouette score against the number of clusters (presumably for the non-optimal feature).
x = range(2, 20)
# NOTE(review): `y` is an empty placeholder. It needs one silhouette score per value of
# `x` (18 values); with an empty list the plot call below fails on the length mismatch.
y = []

fig, ax = plt.subplots()
ax.plot(x, y)

# Axis labels, x range and one tick per candidate cluster count
ax.set_xlabel('N clusters')
ax.set_ylabel('Silhouette score')
ax.set_xlim(1, 20)
ax.set_xticks(list(range(2, 21)))

plt.show()

### 2 clusters

In [None]:
# Hard-coded two-cluster assignment for the non-optimal feature, matched to new_labels by position.
# NOTE(review): presumably copied from the output of train(similarity_matrix_non_opt) — confirm,
# and confirm the students are still loaded in the same order.
labels_2_non_opt = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]

In [None]:
# Report on the two non-optimal clusters via two_clusters_one_vector.
# NOTE(review): relies on df_fin having been created by an earlier cell — confirm for Restart & Run All.
two_clusters_one_vector(new_labels, df_fin, instructions, sorted_session_codes, labels_2_non_opt, vector_non_opt)

## Range*sigmoid(steps)

In [None]:
# Turn the per-student range*sigmoid(steps) values into a column vector (one row per student).
vector_range_steps_array = np.array(vector_range_steps)
vector_range_steps_reshaped = vector_range_steps_array.reshape(-1, 1)

# Pairwise Euclidean distances between students, followed by an RBF kernel.
# NOTE(review): pairwise_kernels treats its input as a feature matrix unless
# metric='precomputed', so this appears to compute the RBF kernel between rows of the
# distance matrix rather than exp(-gamma * d**2) on the distances — confirm this is intended.
distance_matrix = pairwise_distances(vector_range_steps_reshaped, metric='euclidean')
similarity_matrix_range_steps = pairwise_kernels(distance_matrix, metric='rbf', gamma=0.00001)

In [None]:
# Run train() on the range/steps similarity matrix; as the last expression of the cell its return value is displayed.
train(similarity_matrix_range_steps)

In [None]:
# Silhouette score against the number of clusters (presumably for the range/steps feature).
x = range(2, 20)
# NOTE(review): `y` is an empty placeholder. It needs one silhouette score per value of
# `x` (18 values); with an empty list the plot call below fails on the length mismatch.
y = []

fig, ax = plt.subplots()
ax.plot(x, y)

# Axis labels, x range and one tick per candidate cluster count
ax.set_xlabel('N clusters')
ax.set_ylabel('Silhouette score')
ax.set_xlim(1, 20)
ax.set_xticks(list(range(2, 21)))

plt.show()

### 2 clusters

In [None]:
# Hard-coded two-cluster assignment for the range/steps feature, matched to new_labels by position.
# NOTE(review): presumably copied from the output of train(similarity_matrix_range_steps) — confirm,
# and confirm the students are still loaded in the same order.
labels_2_range_steps = [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0,
         0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
         0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
         0, 1, 1, 0, 0, 0, 0]

In [None]:
# Report on the two range/steps clusters via two_clusters_one_vector.
# NOTE(review): relies on df_fin having been created by an earlier cell — confirm for Restart & Run All.
two_clusters_one_vector(new_labels, df_fin, instructions, sorted_session_codes, labels_2_range_steps, vector_range_steps)

## K-modes

CVS: 0 - High, 1 - Low

Optimal: 0 - Low Non_opt, 1 - High Non_opt

Range: 0 - Low, 1 - High

In [None]:
# One [CVS, Optimal, Range] cluster-label triple per student, in the order of new_labels.
current_sequences = [
    [labels_2_cvs[idx], labels_2_non_opt[idx], labels_2_range_steps[idx]]
    for idx in range(len(new_labels))
]

In [None]:
# Pairwise Euclidean distances between the three-label vectors, followed by an RBF kernel.
# NOTE(review): pairwise_kernels treats its input as a feature matrix unless
# metric='precomputed', so this appears to compute the RBF kernel between rows of the
# distance matrix rather than exp(-gamma * d**2) on the distances — confirm this is intended.
distance_matrix = pairwise_distances(current_sequences, metric='euclidean')
similarity_matrix_multiple = pairwise_kernels(distance_matrix, metric='rbf', gamma=0.00001)

In [None]:
silhouette_scores = []
all_clusters = []

# Candidate numbers of clusters; shared by the fit loop and the report loop so that the
# two can no longer get out of sync.
cluster_range = range(2, 8)
# Fixed seed so that Restart & Run All reproduces the same K-modes initialisations
# (same seed value as the RandomState used in cluster()).
kmodes_seed = 193

# NOTE(review): K-modes is fitted on the float similarity matrix, not on the categorical
# vectors in current_sequences, and the silhouette score treats that matrix as features —
# confirm this is intended.
for n_clusters in cluster_range:
    km = KModes(n_clusters=n_clusters, init='Huang', n_init=10, verbose=1, random_state=kmodes_seed)
    clusters = km.fit_predict(similarity_matrix_multiple)
    all_clusters.append(clusters)

    # Compute silhouette score and append to the list
    silhouette_avg = silhouette_score(similarity_matrix_multiple, clusters)
    silhouette_scores.append(silhouette_avg)

# Print the silhouette score and the cluster assignment for every candidate
for n_clusters, score, clusters in zip(cluster_range, silhouette_scores, all_clusters):
    print(f"Number of clusters: {n_clusters}, Silhouette Score: {score}")
    print(f"Clusters: {clusters}")

In [None]:
# Silhouette score of the K-modes runs against the number of clusters.
x = range(2, 8)
y = silhouette_scores

fig, ax = plt.subplots()
ax.plot(x, y)

# Axis labels, x range and one tick per candidate cluster count
ax.set_xlabel('N clusters')
ax.set_ylabel('Silhouette score')
ax.set_xlim(1, 8)
ax.set_xticks(list(range(2, 9)))

plt.show()

### 2 clusters

In [None]:
# Print the report for the two-cluster K-modes solution.
# NOTE(review): multiple_labels_k_2_codes and df_fin must already exist in the kernel — confirm they are defined by an earlier cell.
k_modes_two_clusters(sorted_session_codes, multiple_labels_k_2_codes, current_sequences, new_labels, df_fin, instructions)

### 3 clusters

In [None]:
# Print the report for the three-cluster K-modes solution.
# NOTE(review): multiple_labels_k_3_codes and df_fin must already exist in the kernel — confirm they are defined by an earlier cell.
k_modes_three_clusters(sorted_session_codes, multiple_labels_k_3_codes, current_sequences, new_labels, df_fin, instructions)

### 4 clusters

In [None]:
# Print the report for the four-cluster K-modes solution.
# NOTE(review): multiple_labels_k_4_codes and df_fin must already exist in the kernel — confirm they are defined by an earlier cell.
k_modes_four_clusters(sorted_session_codes, multiple_labels_k_4_codes, current_sequences, new_labels, df_fin, instructions)

### 5 clusters

In [None]:
# Placeholder: presumably to be filled with the five-cluster K-modes label of every student — TODO confirm.
# While it is empty, the report cell that consumes it receives no labels.
multiple_labels_k_5_codes = []

In [None]:
# Print the report for the five-cluster K-modes solution.
# NOTE(review): relies on df_fin having been created by an earlier cell — confirm for Restart & Run All.
k_modes_five_clusters(sorted_session_codes, multiple_labels_k_5_codes, current_sequences, new_labels, df_fin, instructions)

### 6 clusters

In [None]:
# Placeholder: presumably to be filled with the six-cluster K-modes label of every student — TODO confirm.
# While it is empty, the report cell that consumes it receives no labels.
multiple_labels_k_6_codes = []

In [None]:
# Print the report for the six-cluster K-modes solution.
# NOTE(review): relies on df_fin having been created by an earlier cell — confirm for Restart & Run All.
k_modes_six_clusters(sorted_session_codes, multiple_labels_k_6_codes, current_sequences, new_labels, df_fin, instructions)