In [1]:
# print cwd
import os
os.getcwd()

'c:\\Users\\cayde\\Desktop\\Grad_School_stuff\\DataBaseManagement\\Project\\Analysis_Scripts\\mycode'

In [26]:
# ---------------------------------------------------------------------------
# toy example from 'TraceDoc.pdf'
# ---------------------------------------------------------------------------

import numpy as np
from scipy.stats import entropy
from collections import Counter

def calculate_divergence(trace1_probs, trace2_probs):
    kld = np.sum(trace1_probs * np.log(trace1_probs / trace2_probs))
    jsd = 0.5 * (np.sum(trace1_probs * np.log(trace1_probs / ((trace1_probs + trace2_probs) / 2))) 
                 + np.sum(trace2_probs * np.log(trace2_probs / ((trace1_probs + trace2_probs) / 2))))
    return kld, jsd

def get_pairs(trace):
    return [(trace[i], trace[i+1]) for i in range(len(trace) - 1)]

# Define 3 traces
trace1 = ["a1", "a2", "a3", "a1", "a3", "a2"]
trace2 = ["a1", "a4", "a3", "a1", "a2", "a1", "a2"]
trace3 = ["a1", "a4", "a2", "a3", "a2","a1", "a2", "a3"]

# ____ creating trace matrix _____
# we are trying to construct a adjacency matrix for each trace
# also there is no self-loops in the matrix.  so default the diagnoal to 0.  everything else in matrix should be 1 
# add the frequcny of the pair occuring in the trace to the matrix
# sum the rows after adding the frequency of the pair occuring in the trace to the matrix
# use this sum to take row wise probability of the matrix for every row. 
# make a matrix like this for every trace
# then between the matrix compute KLD and JSD

# ___ computing kld and jsd from trace matrixes ___
# need to take union of all the pairs from the traces we are comparing
# create 2 new matrix that is comprise of the similarity between the traces using KLD and JSD

# ___ MST ___
# run MST on the matrix


# Get pairs from each trace
pairs1 = get_pairs(trace1)
pairs2 = get_pairs(trace2)
pairs3 = get_pairs(trace3)

# Print the pairs
print(f"Pairs from trace 1: {pairs1}")
print(f"Pairs from trace 2: {pairs2}")
print(f"Pairs from trace 3: {pairs3}")

# Get the pair frequencies for each trace
pair_freq1 = Counter([tuple(pair) for pair in pairs1])
pair_freq2 = Counter([tuple(pair) for pair in pairs2])
pair_freq3 = Counter([tuple(pair) for pair in pairs3])

print(f'Pair frequencies for trace1: {pair_freq1}') 
print(f'Pair frequencies for trace2: {pair_freq2}')  
print(f'Pair frequencies for trace3: {pair_freq3}')  

# Get the unique pairs from all traces
unique_pairs = list(set(pair_freq1.keys()).union(set(pair_freq2.keys())).union(set(pair_freq3.keys())))

print(f'Unique pairs: {unique_pairs}')

# Create the probability distribution for each trace
counts1 = np.array([pair_freq1.get(pair, 0) for pair in unique_pairs], dtype=np.float64)
counts2 = np.array([pair_freq2.get(pair, 0) for pair in unique_pairs], dtype=np.float64)
counts3 = np.array([pair_freq3.get(pair, 0) for pair in unique_pairs], dtype=np.float64)

# Normalize the distributions
trace1_probs = counts1 / counts1.sum()
trace2_probs = counts2 / counts2.sum()
trace3_probs = counts3 / counts3.sum()

print(f'Normalized probabilities for trace1: {trace1_probs}')  
print(f'Normalized probabilities for trace2: {trace2_probs}')  
print(f'Normalized probabilities for trace3: {trace3_probs}')  

# Add epsilon to prevent division by zero
trace1_probs = trace1_probs + 1e-10
trace2_probs = trace2_probs + 1e-10
trace3_probs = trace3_probs + 1e-10

print(f'Normalized probabilities for trace1: {trace1_probs}')
print(f'Normalized probabilities for trace2: {trace2_probs}')
print(f'Normalized probabilities for trace3: {trace3_probs}')

# Calculate the KLD and JSD between each pair of traces
kld_matrix = np.zeros((3, 3))
jsd_matrix = np.zeros((3, 3))

traces_probs = [trace1_probs, trace2_probs, trace3_probs]

for i in range(3):
    for j in range(3):
        if i != j:
            kld, jsd = calculate_divergence(traces_probs[i], traces_probs[j])
            kld_matrix[i, j] = kld
            jsd_matrix[i, j] = jsd

print(f'KLD matrix:\n{kld_matrix}')
print(f'JSD matrix:\n{jsd_matrix}') 

Pairs from trace 1: [('a1', 'a2'), ('a2', 'a3'), ('a3', 'a1'), ('a1', 'a3'), ('a3', 'a2')]
Pairs from trace 2: [('a1', 'a4'), ('a4', 'a3'), ('a3', 'a1'), ('a1', 'a2'), ('a2', 'a1'), ('a1', 'a2')]
Pairs from trace 3: [('a1', 'a4'), ('a4', 'a2'), ('a2', 'a3'), ('a3', 'a2'), ('a2', 'a1'), ('a1', 'a2'), ('a2', 'a3')]
Pair frequencies for trace1: Counter({('a1', 'a2'): 1, ('a2', 'a3'): 1, ('a3', 'a1'): 1, ('a1', 'a3'): 1, ('a3', 'a2'): 1})
Pair frequencies for trace2: Counter({('a1', 'a2'): 2, ('a1', 'a4'): 1, ('a4', 'a3'): 1, ('a3', 'a1'): 1, ('a2', 'a1'): 1})
Pair frequencies for trace3: Counter({('a2', 'a3'): 2, ('a1', 'a4'): 1, ('a4', 'a2'): 1, ('a3', 'a2'): 1, ('a2', 'a1'): 1, ('a1', 'a2'): 1})
Unique pairs: [('a2', 'a3'), ('a2', 'a1'), ('a1', 'a4'), ('a3', 'a1'), ('a4', 'a2'), ('a4', 'a3'), ('a3', 'a2'), ('a1', 'a2'), ('a1', 'a3')]
Normalized probabilities for trace1: [0.2 0.  0.  0.2 0.  0.  0.2 0.2 0.2]
Normalized probabilities for trace2: [0.         0.16666667 0.16666667 0.1666666

In [14]:
# store path to data directory
data_dir = os.path.join(os.getcwd(), '../data/toyExampleData') # '../data/CSMiningData'  '../data/toyExampleData'
print(data_dir)

# store the number of files in the data directory
num_files = len([name for name in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, name))])
print(num_files)

c:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\mycode\../data/toyExampleData
3


In [28]:
import os
import numpy as np
from scipy.stats import entropy
from collections import Counter

def calc_kld_jsd(file1, file2, epsilon=1e-10):
    """
    Calculates Kullback-Leibler divergence and Jensen-Shannon Divergence for two files.

    Args:
    file1 (str): path to the first file.
    file2 (str): path to the second file.
    epsilon (float): a small value to prevent division by zero in KL divergence calculation.

    Returns:
    tuple: Kullback-Leibler divergence of file1 from file2, Kullback-Leibler divergence of file2 from file1, and Jensen-Shannon Divergence between file1 and file2.
    """

    # Read files and split into words
    with open(file1, 'r') as f:
        words1 = f.read().strip().split()
    with open(file2, 'r') as f:
        words2 = f.read().strip().split()

    # Create a list of pairs for each file. Each pair is a transition from one action to another.
    pairs1 = [(words1[i], words1[i+1]) for i in range(len(words1) - 1) if words1[i] and words1[i+1]]
    pairs2 = [(words2[i], words2[i+1]) for i in range(len(words2) - 1) if words2[i] and words2[i+1]]


    # Count frequency of pairs in each file. This will be the probability distribution of each file.
    pair_freq1 = Counter(pairs1)
    pair_freq2 = Counter(pairs2)

    # Get unique pairs from both files. This will be the set of all pairs in both files.
    unique_pairs = list(set(pair_freq1.keys()).union(set(pair_freq2.keys())))

    # Calculate probability distribution for each file. If a pair is not present in a file, its probability is 0.
    counts1 = np.array([pair_freq1.get(pair, 0) for pair in unique_pairs], dtype=np.float64)
    counts2 = np.array([pair_freq2.get(pair, 0) for pair in unique_pairs], dtype=np.float64)

    # Normalize the probability distributions to sum to 1.
    file1_probs = counts1 / counts1.sum()
    file2_probs = counts2 / counts2.sum()

    # Add epsilon to the probabilities to prevent division by zero in KL divergence calculation.
    file1_probs = file1_probs + epsilon
    file2_probs = file2_probs + epsilon

    # Calculate Kullback-Leibler divergence for each file from the other.
    kld12 = np.sum(np.where(file1_probs != 0, file1_probs * np.log2(file1_probs / file2_probs), 0))
    kld21 = np.sum(np.where(file2_probs != 0, file2_probs * np.log2(file2_probs / file1_probs), 0))

    # Calculate Jensen-Shannon Divergence between the two files.
    jsd = 0.5 * (kld12 + kld21)

    return kld12, kld21, jsd

def compare_all_files(data_dir):
    """
    Compares all files in a directory using Kullback-Leibler divergence and Jensen-Shannon Divergence.

    Args:
    data_dir (str): path to the directory containing the files.

    Returns:
    tuple: Matrix of Kullback-Leibler divergences and Matrix of Jensen-Shannon Divergences.
    """

    # get list of all files in the directory
    file_names = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, f))]

    # get number of files
    num_files = len(file_names)

    # initialize matrices to store KLD and JSD values
    KLDMatrix = np.zeros((num_files, num_files))
    JSDMatrix = np.zeros((num_files, num_files))

    # calculate KLD and JSD for all pairs of files
    for i in range(num_files):
        for j in range(i+1, num_files):
            # calculate KLD and JSD for files i and j
            KL1, KL2, JSD = calc_kld_jsd(file_names[i], file_names[j])

            # store KLD and JSD values in the matrices
            KLDMatrix[i, j] = KL1
            KLDMatrix[j, i] = KL2
            # jsd is symmetric
            JSDMatrix[i, j] = JSD
            JSDMatrix[j, i] = JSD

            print(f"Comparison complete for files: {file_names[i]} and {file_names[j]}")

    return KLDMatrix, JSDMatrix

# run comparison on all files in the data directory

KLDMatrix, JSDMatrix = compare_all_files(data_dir)

try:
    np.savetxt("output/KLDMatrix.csv", KLDMatrix, delimiter=",")
    np.savetxt("output/JSDMatrix.csv", JSDMatrix, delimiter=",")
except IOError:
    print("Error: Failed to write output to file.")

print(f"KLD Matrix:\n{KLDMatrix}")
print(f"JSD Matrix:\n{JSDMatrix}")

Comparison complete for files: c:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\mycode\../data/toyExampleData\Trace1.txt and c:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\mycode\../data/toyExampleData\Trace2.txt
Comparison complete for files: c:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\mycode\../data/toyExampleData\Trace1.txt and c:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\mycode\../data/toyExampleData\Trace3.txt
Comparison complete for files: c:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\mycode\../data/toyExampleData\Trace2.txt and c:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\mycode\../data/toyExampleData\Trace3.txt
KLD Matrix:
[[ 0.         18.44362548 12.45019724]
 [15.51897535  0.         10.69303443]
 [13.04201011 17.42578955  0.        ]]
JSD Matri

In [None]:
# import numpy as np
# from scipy.sparse import csr_matrix
# from scipy.sparse.csgraph import minimum_spanning_tree

# # read in the KLD and JSD matrices
# KLDMatrix = np.genfromtxt("output/CSMiningData/KLDMatrix.csv", delimiter=",")
# JSDMatrix = np.genfromtxt("output/CSMiningData/JSDMatrix.csv", delimiter=",")

# # Prim's Algorithm
# # Prim's algorithm constructs the minimum spanning tree by adding edges with the minimum weight at each step.

# # JSDMatrix
# matrix_sparse = csr_matrix(JSDMatrix)
# Tcsr = minimum_spanning_tree(matrix_sparse)

# # save the minimum spanning tree as a csv file
# try:
#     np.savetxt("output/CSMiningData/prim_mst_JSD.csv", Tcsr.toarray(), delimiter=",")
# except IOError:
#     print("Error: Failed to write output to file.")

# # KLDMatrix
# matrix_sparse = csr_matrix(KLDMatrix)
# Tcsr = minimum_spanning_tree(matrix_sparse)

# # save the minimum spanning tree as a csv file
# try:
#     np.savetxt("output/CSMiningData/prim_mst_KLD.csv", Tcsr.toarray(), delimiter=",")
# except IOError:
#     print("Error: Failed to write output to file.")

In [None]:
# import numpy as np
# import networkx as nx

# # read in the KLD and JSD matrices
# KLDMatrix = np.genfromtxt("output/CSMiningData/KLDMatrix.csv", delimiter=",")
# JSDMatrix = np.genfromtxt("output/CSMiningData/JSDMatrix.csv", delimiter=",")

# # Kruskal's Algorithm
# # If you decide to implement Kruskal's algorithm, the pseudocode is as follows:

# # Create a graph from your matrices
# G_KLD = nx.from_numpy_matrix(KLDMatrix)
# G_JSD = nx.from_numpy_matrix(JSDMatrix)

# # Compute the minimum spanning tree for both matrices
# MST_KLD = nx.minimum_spanning_tree(G_KLD)
# MST_JSD = nx.minimum_spanning_tree(G_JSD)

# # Convert back to numpy matrices
# MST_matrix_KLD = nx.to_numpy_matrix(MST_KLD)
# MST_matrix_JSD = nx.to_numpy_matrix(MST_JSD)

# # save the minimum spanning trees as csv files
# try:
#     np.savetxt("output/CSMiningData/kruskals_mst_KLD.csv", MST_matrix_KLD, delimiter=",")
#     np.savetxt("output/CSMiningData/kruskals_mst_JSD.csv", MST_matrix_JSD, delimiter=",")
# except IOError:
#     print("Error: Failed to write output to file.")
