In [None]:
# Show the current working directory. Nothing is printed explicitly: the value of
# os.getcwd() is the last expression in the cell, so the notebook displays it as
# the cell's output. The data directory below is built relative to this path.
import os
os.getcwd()

In [None]:
# Directory holding the input files, built relative to the current working directory.
# Candidate locations: '../data/CSMiningData' (used here) or '../data/test'.
data_dir = os.path.join(os.getcwd(), '../data/CSMiningData')
print(data_dir)

# Count the regular files in the data directory (sub-directories are not counted).
num_files = sum(
    1
    for entry in os.listdir(data_dir)
    if os.path.isfile(os.path.join(data_dir, entry))
)
print(num_files)

In [None]:
import os
import numpy as np
from scipy.stats import entropy
from collections import Counter

def construct_markov_chain(trace):
    """
    Build a first-order Markov chain from an ordered sequence of actions.

    Args:
    trace (list): sequence of actions, in the order they occurred.

    Returns:
    dict: mapping from each action that has at least one successor to a
    dictionary {next_action: probability}. The probabilities for a given
    action are its observed transition counts divided by the total number of
    transitions leaving that action. An action that only ever appears as the
    final element of the trace does not get a key of its own.
    """

    # Pass 1: tally how often each (action -> next action) transition occurs.
    transition_counts = {}
    for position in range(1, len(trace)):
        source, target = trace[position - 1], trace[position]
        outgoing = transition_counts.setdefault(source, {})
        outgoing[target] = outgoing.get(target, 0) + 1

    # Pass 2: turn each row of counts into a row of relative frequencies.
    markov_chain = {}
    for source, outgoing in transition_counts.items():
        row_total = sum(outgoing.values())
        markov_chain[source] = {
            target: count / row_total for target, count in outgoing.items()
        }

    return markov_chain

def _transition_counts(path):
    """Return a Counter of consecutive whitespace-separated token pairs read from ``path``."""
    with open(path, 'r') as f:
        tokens = f.read().split()
    # zip(tokens, tokens[1:]) yields every (token, next token) transition.
    return Counter(zip(tokens, tokens[1:]))


def _kl_divergence(p, q):
    """Return KL(p || q) using the natural log; entries with p == 0 contribute 0."""
    support = p > 0
    return np.sum(p[support] * np.log(p[support] / q[support]))


def calc_kld_jsd(file1, file2, epsilon=1e-10):
    """
    Calculates Kullback-Leibler divergence and Jensen-Shannon Divergence for two files.

    Each file is split on whitespace, and the distribution that is compared is
    the relative frequency of consecutive token pairs (transitions) over the
    union of pairs seen in either file.

    Args:
    file1 (str): path to the first file.
    file2 (str): path to the second file.
    epsilon (float): a small value added to every probability (followed by
        re-normalisation) so that pairs missing from one file do not produce a
        division by zero in the KL divergence calculation.

    Returns:
    tuple: (KL(file1 || file2), KL(file2 || file1), JSD(file1, file2)), all
    computed with the natural logarithm. The JSD is
    0.5 * KL(P || M) + 0.5 * KL(Q || M) with M = 0.5 * (P + Q), so it is
    symmetric and never exceeds ln(2).

    If neither file contains a transition (fewer than two tokens each) the
    result is (0.0, 0.0, 0.0). If exactly one of them contains no transition
    its distribution is undefined and (nan, nan, nan) is returned.
    """

    # Count frequency of transitions in each file.
    pair_freq1 = _transition_counts(file1)
    pair_freq2 = _transition_counts(file2)

    # Degenerate inputs: handle them explicitly instead of dividing 0 by 0.
    if not pair_freq1 and not pair_freq2:
        return 0.0, 0.0, 0.0
    if not pair_freq1 or not pair_freq2:
        return np.nan, np.nan, np.nan

    # Union of pairs from both files. Sorting fixes the element order, so the
    # floating-point sums below are reproducible from run to run.
    unique_pairs = sorted(set(pair_freq1) | set(pair_freq2))

    # Counter returns 0 for a pair that does not occur in a file.
    counts1 = np.array([pair_freq1[pair] for pair in unique_pairs], dtype=np.float64)
    counts2 = np.array([pair_freq2[pair] for pair in unique_pairs], dtype=np.float64)

    # Normalise, smooth with epsilon, then re-normalise so each vector is a
    # proper probability distribution (sums to 1) again.
    file1_probs = counts1 / counts1.sum() + epsilon
    file1_probs = file1_probs / file1_probs.sum()
    file2_probs = counts2 / counts2.sum() + epsilon
    file2_probs = file2_probs / file2_probs.sum()

    # Kullback-Leibler divergence in both directions (it is not symmetric).
    kld12 = _kl_divergence(file1_probs, file2_probs)
    kld21 = _kl_divergence(file2_probs, file1_probs)

    # Jensen-Shannon divergence: average KL of each distribution from their
    # mixture. (Averaging kld12 and kld21 would give the Jeffreys divergence,
    # which is a different, unbounded quantity.)
    mixture = 0.5 * (file1_probs + file2_probs)
    jsd = 0.5 * _kl_divergence(file1_probs, mixture) + 0.5 * _kl_divergence(file2_probs, mixture)

    return kld12, kld21, jsd

def compare_all_files(data_dir):
    """
    Compares all files in a directory using Kullback-Leibler divergence and Jensen-Shannon Divergence.

    The regular files directly inside ``data_dir`` are processed in sorted path
    order, so row/column ``k`` of the returned matrices always refers to the
    k-th file of that sorted listing. (``os.listdir`` alone returns entries in
    arbitrary order, which would make the matrix layout unreproducible.)

    Args:
    data_dir (str): path to the directory containing the files.

    Returns:
    tuple: Matrix of Kullback-Leibler divergences and Matrix of Jensen-Shannon Divergences.
    Both are square with one row/column per file and a zero diagonal. For
    i < j, KLDMatrix[i, j] holds the first value returned by
    ``calc_kld_jsd(file_i, file_j)`` and KLDMatrix[j, i] the second;
    JSDMatrix is symmetric and holds the third value in both positions.
    """

    # Sorted list of all regular files in the directory (sub-directories are skipped).
    file_names = sorted(
        os.path.join(data_dir, f)
        for f in os.listdir(data_dir)
        if os.path.isfile(os.path.join(data_dir, f))
    )

    # get number of files
    num_files = len(file_names)

    # initialize matrices to store KLD and JSD values
    KLDMatrix = np.zeros((num_files, num_files))
    JSDMatrix = np.zeros((num_files, num_files))

    # Each unordered pair is evaluated once; a single call yields both KL
    # directions, which fill the two mirrored cells of KLDMatrix.
    for i in range(num_files):
        for j in range(i+1, num_files):
            # calculate KLD and JSD for files i and j
            KL1, KL2, JSD = calc_kld_jsd(file_names[i], file_names[j])

            # store KLD and JSD values in the matrices
            KLDMatrix[i, j] = KL1
            KLDMatrix[j, i] = KL2
            JSDMatrix[i, j] = JSD
            JSDMatrix[j, i] = JSD

            print(f"Comparison complete for files: {file_names[i]} and {file_names[j]}")

    return KLDMatrix, JSDMatrix

# Run the pairwise comparison over every file in data_dir.
KLDMatrix, JSDMatrix = compare_all_files(data_dir)

# Persist both matrices as CSV. np.savetxt does not create missing directories,
# so the output folder is created first; otherwise a fresh checkout would lose
# the results of the comparison above to a caught "file not found" error.
try:
    os.makedirs("output", exist_ok=True)
    np.savetxt("output/KLDMatrix.csv", KLDMatrix, delimiter=",")
    np.savetxt("output/JSDMatrix.csv", JSDMatrix, delimiter=",")
except IOError as err:
    # Report the underlying reason instead of a generic message only.
    print(f"Error: Failed to write output to file. ({err})")

In [None]:
# import numpy as np
# from scipy.sparse import csr_matrix
# from scipy.sparse.csgraph import minimum_spanning_tree

# # read in the KLD and JSD matrices
# KLDMatrix = np.genfromtxt("output/CSMiningData/KLDMatrix.csv", delimiter=",")
# JSDMatrix = np.genfromtxt("output/CSMiningData/JSDMatrix.csv", delimiter=",")

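# # Minimum spanning tree via SciPy (originally labelled "Prim's Algorithm")
# # Prim's algorithm constructs the minimum spanning tree by adding edges with the minimum weight at each step.
# # NOTE: scipy.sparse.csgraph.minimum_spanning_tree is documented as using Kruskal's algorithm, not Prim's, and it
# # treats the input as an undirected graph, so for the asymmetric KLDMatrix only the smaller non-zero of [i, j] / [j, i] is used.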

# # JSDMatrix
# matrix_sparse = csr_matrix(JSDMatrix)
# Tcsr = minimum_spanning_tree(matrix_sparse)

# # save the minimum spanning tree as a csv file
# try:
#     np.savetxt("output/CSMiningData/prim_mst_JSD.csv", Tcsr.toarray(), delimiter=",")
# except IOError:
#     print("Error: Failed to write output to file.")

# # KLDMatrix
# matrix_sparse = csr_matrix(KLDMatrix)
# Tcsr = minimum_spanning_tree(matrix_sparse)

# # save the minimum spanning tree as a csv file
# try:
#     np.savetxt("output/CSMiningData/prim_mst_KLD.csv", Tcsr.toarray(), delimiter=",")
# except IOError:
#     print("Error: Failed to write output to file.")

In [None]:
# import numpy as np
# import networkx as nx

# # read in the KLD and JSD matrices
# KLDMatrix = np.genfromtxt("output/CSMiningData/KLDMatrix.csv", delimiter=",")
# JSDMatrix = np.genfromtxt("output/CSMiningData/JSDMatrix.csv", delimiter=",")

# # Kruskal's Algorithm
# # networkx's minimum_spanning_tree uses Kruskal's algorithm by default, so no manual implementation is needed:

# # Create a graph from your matrices
# G_KLD = nx.from_numpy_matrix(KLDMatrix)
# G_JSD = nx.from_numpy_matrix(JSDMatrix)

# # Compute the minimum spanning tree for both matrices
# MST_KLD = nx.minimum_spanning_tree(G_KLD)
# MST_JSD = nx.minimum_spanning_tree(G_JSD)

# # Convert back to numpy matrices
# MST_matrix_KLD = nx.to_numpy_matrix(MST_KLD)
# MST_matrix_JSD = nx.to_numpy_matrix(MST_JSD)

# # save the minimum spanning trees as csv files
# try:
#     np.savetxt("output/CSMiningData/kruskals_mst_KLD.csv", MST_matrix_KLD, delimiter=",")
#     np.savetxt("output/CSMiningData/kruskals_mst_JSD.csv", MST_matrix_JSD, delimiter=",")
# except IOError:
#     print("Error: Failed to write output to file.")
