In [2]:
import pandas as pd
import numpy as np
import pm4py
from variant_stats import get_variants_stats
from pm4py.util import ml_utils
import importlib.util
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer
from sklearn.cluster import SpectralClustering
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import community

In [3]:
def calculate_similarity_sequence_alignment_metric(path1, path2):
    def node_similarity(node1, node2):
        try:
            if node1['feature_value'] == node2['feature_value']:
                return 1  # High similarity score for exact matches in feature_value
            elif node1['feature_abbrv'] == node2['feature_abbrv']:
                return 0.5  # Partial similarity score for matching feature_abbrv
            else:
                return 0  # No similarity
        except:
            return 0
    len1, len2 = len(path1), len(path2)
    dp = [[0] * (len2 + 1) for _ in range(len1 + 1)]

    for i in range(len1 + 1):
        dp[i][0] = i * -0.1  # Penalty for adding nodes
    for j in range(len2 + 1):
        dp[0][j] = j * -0.1  # Penalty for deleting nodes

    for i in range(1, len1 + 1):
        for j in range(1, len2 + 1):
            match_score = dp[i-1][j-1] + node_similarity(path1[i-1], path2[j-1])
            delete_score = dp[i-1][j] - 0.1  # Penalty for deletion
            insert_score = dp[i][j-1] - 0.1  # Penalty for insertion
            dp[i][j] = max(match_score, delete_score, insert_score)

    raw_score = dp[-1][-1]
    
    # Calculate maximum and minimum possible scores
    max_score = min(len1, len2) * 1  # Assuming 1 is the maximum similarity per node
    min_score = -(len1 + len2) * 0.1  # Penalty for complete mismatch
    
    # Normalize the score to a [0, 1] range
    normalized_score = (raw_score - min_score) / (max_score - min_score)
    
    return raw_score, normalized_score

In [4]:
df2 = pd.read_pickle('decision_paths/subset_optimized_simple_size_0.2_fitnessweights_p0.34_f0.33_c0.33_weightmodel_weight_positive_simplified.pickle')

dists = []
for i in tqdm(range(len(df2))):
    data1 = df2['rule_to_simplified_rules'].iloc[i]
    for path1 in data1:
        temp_dist = []
        for j in range(len(df2)):
            data2 = df2['rule_to_simplified_rules'].iloc[j]
            for path2 in data2:
                raw_score, normalized_score = calculate_similarity_sequence_alignment_metric(path1, path2)
                temp_dist.append(normalized_score)
        dists.append(temp_dist)

100%|██████████████████████████████████████████████████████████████████████████████████| 64/64 [03:01<00:00,  2.83s/it]


In [42]:
np_dist = np.array(dists)
np.fill_diagonal(np_dist, 0)

In [40]:
zero_count = np.count_nonzero(np_dist == 0)
print(f"Number of zeros: {zero_count}")

Number of zeros: 2138


In [41]:
np_dist.shape

(2138, 2138)

In [37]:
threshold = 0.55
np_dist[np_dist>threshold] = 1
np_dist[np_dist<1] = 0

In [44]:
G = nx.from_numpy_array(np_dist)
for u, v, d in G.edges(data=True):
    d['weight'] = np_dist[u, v]

In [50]:
import community as community_louvain

partition = community.best_partition(G, weight='weight')

NameError: name 'community' is not defined

In [1]:
import community as community_louvain