In [10]:
# ___ Libraries ___
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy
from scipy.special import kl_div

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import community as community_louvain # For Louvain community detection


In [2]:
# ___ Read data ___
# NOTE(review): both paths are absolute and machine-specific; the notebook only
# runs unchanged where these exact files exist.
data_path = r'C:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\data\mal-api-2019\all_analysis_data.txt'
labels_path = r'C:\Users\cayde\Desktop\Grad_School_stuff\DataBaseManagement\Project\Analysis_Scripts\data\mal-api-2019\labels.txt'


def _read_lines(path):
    """Return the newline-separated records of the text file at ``path``.

    Only a trailing empty entry (the artifact of a final newline) is removed.
    Unconditionally slicing off the last element would silently discard the
    final record of a file that does not end with a newline.
    """
    with open(path, "r") as handle:
        lines = handle.read().split('\n')
    if lines and lines[-1] == '':
        lines.pop()
    return lines


# Read the data from all_analysis_data.txt and labels.txt
all_traces = _read_lines(data_path)    # Array of all untokenized trace documents
all_labels = _read_lines(labels_path)  # Label of each trace document, same order

# Shuffle and limit data and labels to a smaller subsection for testing
# (fixed random_state keeps the selected subset the same on every run)
all_traces, all_labels = shuffle(all_traces, all_labels, random_state=42)
all_traces = all_traces[:100]
all_labels = all_labels[:100]

# Generate trace document names: each document is named by its position
tracedoc_names = {i: i for i in range(len(all_traces))}

# Dictionary of trace names and corresponding traces
name_trace_dict = {name: trace for name, trace in zip(tracedoc_names.values(), all_traces)}

# List of trace document names
trace_document_name_list = list(name_trace_dict.keys())

# Sorted distinct set list of labels
distinct_labels = sorted(set(all_labels))

In [3]:
# ___ Functions ___ 

# Calculate similarity matrix using Cosine Similarity
def SM_cosine(documents):
    """Return the pairwise cosine-similarity matrix of ``documents``.

    The documents are vectorized with a default ``TfidfVectorizer``, the
    resulting feature matrix is cast to float, and ``cosine_similarity`` is
    evaluated on that matrix.
    """
    tfidf_features = TfidfVectorizer().fit_transform(documents)
    return cosine_similarity(tfidf_features.astype(float))

# Invert similarity matrix values for Cosine MST
def inverted_matrix(similarity_matrix):
    """Return the element-wise reciprocal of ``similarity_matrix`` as a list of lists.

    Entries equal to zero have no reciprocal; they are mapped to the
    sentinel value 9999 instead.
    """
    inverted_rows = []
    for row in similarity_matrix:
        inverted_row = []
        for value in row:
            if value == 0:
                inverted_row.append(9999)
            else:
                inverted_row.append(1 / value)
        inverted_rows.append(inverted_row)
    return inverted_rows

# Calculate inverted similarity matrix using Jaccard Similarity
def SM_invjaccard(documents):
    """Return the pairwise inverted Jaccard similarity matrix of ``documents``.

    Each document is tokenized by splitting on single spaces. For every
    ordered pair (x, y) the Jaccard similarity ``|A & B| / |A | B|`` of the
    two token sets is computed and stored as its reciprocal; a similarity of
    zero is stored as the sentinel 9999.

    Parameters
    ----------
    documents : sequence of str
        Untokenized, space-separated documents.

    Returns
    -------
    list of list
        Square matrix (row x, column y) of inverted Jaccard values.
    """
    # Tokenize every document exactly once; the pairwise loop below would
    # otherwise re-split both documents for each of the n*n pairs.
    token_sets = [set(document.split(' ')) for document in documents]

    matrix = []
    for tokens_x in token_sets:
        row = []
        for tokens_y in token_sets:
            shared = len(tokens_x & tokens_y)
            # Union size via inclusion-exclusion. It is never zero because
            # str.split(' ') always yields at least one (possibly empty) token.
            j = float(shared) / (len(tokens_x) + len(tokens_y) - shared)
            row.append(9999 if j == 0 else float(1 / j))  # Inverse jaccard values
        matrix.append(row)
    return matrix
     
# Generate a graph for similarity between trace documents and find MST
def networkX_graph(matrix, labels):
    """Build a graph from a square weight matrix and return its minimum spanning tree.

    Parameters
    ----------
    matrix : 2-D array-like
        Square matrix of edge weights; it is converted with ``np.array``
        before being handed to NetworkX.
    labels : dict
        Mapping from the integer node index (row/column position) to the
        node name used in the returned tree.

    Returns
    -------
    networkx.Graph
        Minimum spanning tree of the relabeled graph.
    """
    # nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is
    # its replacement for building a graph from a 2-D adjacency array.
    G = nx.from_numpy_array(np.array(matrix))  # Create a graph from the similarity matrix
    H = nx.relabel_nodes(G, labels)  # Label nodes with names of each trace
    return nx.minimum_spanning_tree(H)  # Minimum spanning tree of the labeled graph

In [4]:
# ORIGINAL FAMILIES: FAMILY LABEL AND TRACES THEY CONTAIN 

original_families = []  # Trace document names belonging in each original family
all_original_fam_names = []  # List of all names

# Limiting families analyzed to smaller number of families
distinct_labels = distinct_labels[:2]

for family_label in distinct_labels:  # For each distinct family label
    family_members = [
        trace_document_name_list[j]
        for j in range(len(all_labels))
        if all_labels[j] == family_label
    ]
    original_families.append(family_members)
    all_original_fam_names.extend(family_members)

original_families_traces_untokenized = [[name_trace_dict[y] for y in x] for x in original_families]

# ALL SELECTED FAMILY (ONE, TWO, OR THREE+ FAMILIES) TRACES, LABELS, AND TRACE NAMES
all_sel_fam_traces = []
all_sel_fam_labels = []

# Flatten the per-family trace lists; each trace is labeled with the index of
# its family within distinct_labels.
for family_idx, family_traces in enumerate(original_families_traces_untokenized):
    for trace in family_traces:
        all_sel_fam_traces.append(trace)
        all_sel_fam_labels.append(family_idx)

# Position in the flattened list -> original trace document name
all_sel_fam_names = {x: name for x, name in enumerate(all_original_fam_names)}

all_traces_list = [all_sel_fam_traces, all_sel_fam_labels, all_sel_fam_names]

# SAVE Original Family Trace Document Data
np.savetxt("output/Original_traceDocs.csv", all_traces_list[0], delimiter=",", fmt='%s')
# Labels and names are integers: '%.0f' writes them exactly, whereas '%.3e'
# keeps only four significant digits and corrupts values >= 10000.
np.savetxt("output/Original_labels.csv", all_traces_list[1], delimiter=",", fmt='%.0f')
# all_traces_list[2] is a dict; np.savetxt needs a 1-D/2-D array-like, so the
# trace names (the dict values, in insertion order) are written instead.
np.savetxt("output/Original_traceNames.csv", list(all_traces_list[2].values()), delimiter=",", fmt='%.0f')


In [5]:
# INVERTED JACCARD MATRIX
# Pairwise inverted Jaccard values between all selected-family trace documents
ismj = SM_invjaccard(all_sel_fam_traces)

# SAVE Inverted Jaccard Matrix
# SMj holds the matrix as an array; it is written as comma-separated text
SMj = np.asarray(ismj)
np.savetxt(fname="output/SMj.csv", X=SMj, delimiter=",")

In [6]:
# INVERTED GRAPH
# Minimum spanning tree of the inverted-Jaccard graph, nodes named by trace
gij = networkX_graph(ismj, all_sel_fam_names)

# SAVE Minimum Spanning Tree Nodes and Edges
# Node names go one per row; edges go one (u, v) pair per row
MSTjnodes = np.asarray(list(gij.nodes()))
np.savetxt(fname="output/MSTjNodes.csv", X=MSTjnodes, fmt='%.0f', delimiter=",")

MSTjedges = np.asarray(list(gij.edges()))
np.savetxt(fname="output/MSTjEdges.csv", X=MSTjedges, fmt='%.0f', delimiter=",")

In [13]:
# Families created from MST and Girvan Newman 
def all_traces_clusters_girvan_newman(G, name_trace_dict, maxgroups):
    """Split ``G`` into ``maxgroups`` communities with Girvan-Newman and collect their traces.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes are trace names; each node must be convertible
        with ``int()`` to a key of ``name_trace_dict``.
    name_trace_dict : dict
        Mapping from integer trace name to its trace document.
    maxgroups : int
        Exact number of communities wanted.

    Returns
    -------
    list
        ``[all_traces, all_labels, all_traces_names]``: three parallel lists
        holding, per trace, its document, its community index and its name.

    Raises
    ------
    ValueError
        If Girvan-Newman never produces a partition with exactly
        ``maxgroups`` communities.
    """
    gn = nx.algorithms.community.centrality.girvan_newman(G)

    # Take the first partition with the desired number of groups.
    # Girvan-Newman only removes edges, so the community count never shrinks
    # between levels: once it exceeds maxgroups no later level can match and
    # the (expensive) iteration can stop.
    sets_of_cluster = None
    for partition in gn:
        if len(partition) == maxgroups:
            sets_of_cluster = partition
            break
        if len(partition) > maxgroups:
            break
    if sets_of_cluster is None:
        # A bare next() on the generator would leak StopIteration here.
        raise ValueError(
            f"Girvan-Newman produced no partition with exactly {maxgroups} communities"
        )

    all_traces, all_labels, all_traces_names = [], [], []
    names_by_cluster = []  # per-community trace names, in insertion order, for printing
    for idx, cluster in enumerate(sets_of_cluster):
        cluster_names = []
        for name in cluster:
            name = int(name)
            cluster_names.append(name)
            all_traces_names.append(name)
            all_traces.append(name_trace_dict[name])
            all_labels.append(idx)
        names_by_cluster.append(cluster_names)

    print("\nGIRVAN NEWMAN COMMUNITY DETECTION FAMILIES: \n")
    for label, cluster_names in enumerate(names_by_cluster):
        print(f"\n\nGroup {label}:  ", end=" ")
        print(", ".join(str(name) for name in cluster_names))

    return [all_traces, all_labels, all_traces_names]


# Families created from MST and Louvain 
def all_traces_clusters_louvain(G, name_trace_dict, random_state=None):
    """Partition ``G`` with Louvain community detection and collect the traces per community.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes are trace names; each node must be convertible
        with ``int()`` to a key of ``name_trace_dict``.
    name_trace_dict : dict
        Mapping from integer trace name to its trace document.
    random_state : optional
        Forwarded to ``community_louvain.best_partition`` when given, so the
        partition can be made reproducible. When left as ``None`` the call is
        made exactly as before, with no extra argument.

    Returns
    -------
    list
        ``[all_traces, all_labels, all_traces_names]``: three parallel lists
        holding, per trace, its document, its community index and its name.
    """
    # NOTE(review): in this notebook G is the MST of the *inverted* Jaccard
    # matrix, so edge weights are large for dissimilar documents. Louvain
    # presumably reads the 'weight' attribute as connection strength --
    # confirm this is the intended weighting.
    extra_args = {} if random_state is None else {"random_state": random_state}
    lv = community_louvain.best_partition(G, **extra_args)

    # Group node names by community id in one pass; members keep the order in
    # which they appear in the partition mapping.
    members_by_group = {}
    for trace, group in lv.items():
        members_by_group.setdefault(group, []).append(trace)
    lv_families = [members_by_group[group] for group in sorted(members_by_group)]

    print("\n\nLOUVAIN COMMUNITY DETECTION FAMILIES: \n")
    all_traces, all_labels, all_traces_names = [], [], []
    for idx, family in enumerate(lv_families):
        print(f"\nGroup {idx}: ", family)

        for name in family:
            name = int(name)
            all_traces.append(name_trace_dict[name])
            all_traces_names.append(name)
            all_labels.append(idx)

    return [all_traces, all_labels, all_traces_names]


In [15]:
# JACCARD MST GIRVAN NEWMAN 
# Ask for as many communities as there are selected original families
gnj_all_traces = all_traces_clusters_girvan_newman(gij, name_trace_dict, len(distinct_labels))

# Save Jaccard Girvan Newman Communities 
# Labels and trace names are integers: '%.0f' writes them exactly and matches
# the Louvain files below ('%.3e' keeps only four significant digits).
np.savetxt("output/GNJ_traceDocs.csv", gnj_all_traces[0], delimiter=",", fmt='%s')
np.savetxt("output/GNJ_labels.csv", gnj_all_traces[1], delimiter=",", fmt='%.0f')
np.savetxt("output/GNJ_traceNames.csv", gnj_all_traces[2], delimiter=",", fmt='%.0f')


# JACCARD MST LOUVAIN 
lvj_all_traces = all_traces_clusters_louvain(gij, name_trace_dict)

# Save Jaccard Louvain Communities
np.savetxt("output/LVJ_traceDocs.csv", lvj_all_traces[0], delimiter=",", fmt='%s')
np.savetxt("output/LVJ_labels.csv", lvj_all_traces[1], delimiter=",", fmt='%.0f')
np.savetxt("output/LVJ_traceNames.csv", lvj_all_traces[2], delimiter=",", fmt='%.0f')


GIRVAN NEWMAN COMMUNITY DETECTION FAMILIES: 



Group 0:   65, 69, 6, 7, 45, 17, 19, 51, 85, 22, 23


Group 1:   73, 15, 48, 20, 55, 60, 30


LOUVAIN COMMUNITY DETECTION FAMILIES: 


Group 0:  [65, 7, 23, 85]

Group 1:  [20, 55, 15, 30, 60, 73]

Group 2:  [6, 17, 22, 48]

Group 3:  [19, 45, 51, 69]
