In [1]:
from directed_graph.graph import Graph, load_graph
import tqdm

# Empty graph; the triplet-loading cell further down adds vertices and edges to it.
graph = Graph()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os

def extract_txt_files(root_dir, max_files_per_dir=23, extension='.txt'):
    """Collect paths of matching files found anywhere under ``root_dir``.

    The tree is walked with ``os.walk``; within each directory only the first
    ``max_files_per_dir`` matching names (in the order ``os.walk`` reports
    them) are kept, which bounds how much any single directory contributes.

    Args:
        root_dir: Directory to walk recursively.
        max_files_per_dir: Maximum number of matching files taken from each
            directory. ``None`` disables the cap. Defaults to 23, the cap the
            notebook has always used.
        extension: File-name suffix a file must end with to be collected.

    Returns:
        List of paths (``dirpath`` joined with the file name), grouped by
        directory in walk order.

    Raises:
        ValueError: If ``max_files_per_dir`` is negative.
    """
    if max_files_per_dir is not None and max_files_per_dir < 0:
        raise ValueError("max_files_per_dir must be non-negative or None")

    txt_files = []

    for dirpath, _, filenames in os.walk(root_dir):
        matching = [
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(extension)
        ]
        # Slicing with None keeps the whole list, so one expression covers
        # both the capped and the uncapped case.
        txt_files.extend(matching[:max_files_per_dir])

    return txt_files

# Example usage: walk the data directory and keep the collected file paths
# in `txt_file_paths`, which the triplet-loading cell iterates over.
root_directory = 'process_text/data/arxiv-txt-cs/'  # Replace with your root directory path
txt_file_paths = extract_txt_files(root_directory)

# Report how many paths were collected.
print(len(txt_file_paths))

In [2]:
# Manual override: rebinds `txt_file_paths` to a single hand-picked file, replacing
# whatever the directory scan produced. Swap the commented entry to use the other file.
txt_file_paths = [
    # "process_text/data/arxiv-txt-cs/cs.NE/2310.12538v2.txt"
    "process_text/data/arxiv-txt-cs/cs.NE/2310.12541v3.txt"
]

In [3]:
import csv

# A triplet is discarded when any of its three fields contains one of these
# substrings.
# NOTE(review): these are plain substring tests, so "id" and "im" also reject
# ordinary terms such as "hidden" or "optimization" — confirm this is intended.
NOISE_SUBSTRINGS = ("id", "im", "3mb", "10mb", "nsga", "%")
MAX_SPACES_PER_FIELD = 2  # a field with more spaces than this is rejected
MIN_AGENT_LENGTH = 4      # agents shorter than this are rejected


def is_usable_triplet(agent_1, action, agent_2):
    """Return True when a triplet passes every filter applied before insertion.

    A triplet is rejected when any field is empty, any field contains one of
    ``NOISE_SUBSTRINGS``, any field has more than ``MAX_SPACES_PER_FIELD``
    spaces, or either agent is shorter than ``MIN_AGENT_LENGTH`` characters.
    """
    fields = (agent_1, agent_2, action)
    if any(len(field) == 0 for field in fields):
        return False
    if any(noise in field for noise in NOISE_SUBSTRINGS for field in fields):
        return False
    if any(field.count(" ") > MAX_SPACES_PER_FIELD for field in fields):
        return False
    if len(agent_1) < MIN_AGENT_LENGTH or len(agent_2) < MIN_AGENT_LENGTH:
        return False
    return True


# NOTE(review): `graph` is created in an earlier cell, so re-running only this
# cell resets `added_edges` but not the graph; already-inserted edges would
# then be passed to `graph.add_edge` a second time.
added_edges = set()  # (agent_1, agent_2, action) tuples already inserted
skipped_rows = 0     # rows that did not have exactly three fields

for path in tqdm.tqdm(txt_file_paths):
    # newline='' is what the csv module asks for; the encoding is explicit so
    # the result does not depend on the platform's default locale.
    with open(path, newline='', encoding='utf-8') as csvfile:
        for triplet in csv.reader(csvfile, delimiter=";"):
            # csv.reader yields [] for blank lines and shorter/longer lists
            # for malformed rows; unpacking those would raise ValueError.
            if len(triplet) != 3:
                skipped_rows += 1
                continue

            # File column order is (agent_1, action, agent_2); edges are
            # stored as (agent_1, agent_2, action).
            agent_1, action, agent_2 = triplet
            if not is_usable_triplet(agent_1, action, agent_2):
                continue

            edge = (agent_1, agent_2, action)
            if edge in added_edges:
                continue
            added_edges.add(edge)

            for agent in (agent_1, agent_2):
                if agent not in graph.vertices:
                    graph.add_vertex(agent)
            graph.add_edge(*edge)

100%|██████████| 1/1 [00:00<00:00,  2.24it/s]


In [4]:
"""
Cluster vertices
"""

from process_graph.squeezing import squeeze

# squeeze() is unpacked as three pairs, named here for words, bigrams and trigrams.
# Later cells index the `*_to_cluster` objects by an original vertex label to get
# its replacement, and use the keys of the `merged_*_map` objects as vertices of
# the clustered graph.
# NOTE(review): the first argument presumably carries one threshold per n-gram
# size (0.46 scaled by 1.41 and 1.71) — confirm against process_graph.squeezing.
(
    (words_to_cluster, merged_words_map),
    (bigrams_to_cluster, merged_bigrams_map),
    (trigrams_to_cluster, merged_trigrams_map),
) = squeeze((0.46, 0.46 * 1.41, 0.46 * 1.71), graph.vertices.values())

In [5]:
import numpy as np

# Parallel lists appended to by find_squeeze_params(): one entry per tested eps
# value that yielded at least one word cluster with more than one member.
epsilon = []               # the eps value itself
max_cluster_size = []      # np.max of the multi-member cluster sizes
average_cluster_size = []  # np.mean of the multi-member cluster sizes
median_cluster_size = []   # np.median of the multi-member cluster sizes

def find_squeeze_params(eps_values=None):
    """Sweep the squeeze threshold and record word-cluster size statistics.

    For every eps in ``eps_values`` the graph vertices are re-clustered with
    ``squeeze((eps, eps * 1.41, eps * 1.71), ...)``. Only the word-level merge
    map is inspected: sizes of its clusters with more than one member are
    summarised and appended to the module-level lists ``epsilon``,
    ``max_cluster_size``, ``average_cluster_size`` and ``median_cluster_size``.
    An eps that produces no multi-member cluster contributes no entry.

    The four lists are emptied first, so calling the function again replaces
    the previous sweep instead of appending a second one after it.

    Args:
        eps_values: Iterable of eps values to try. Defaults to
            ``np.arange(0.2, 0.5, 0.02)``.

    Returns:
        None. Results are delivered through the module-level lists.
    """
    if eps_values is None:
        eps_values = np.arange(0.2, 0.5, 0.02)

    # Reset in place (not rebind) so existing references to the lists stay valid.
    for series in (epsilon, max_cluster_size, average_cluster_size, median_cluster_size):
        series.clear()

    for eps in eps_values:
        # Same three-pair shape as the main squeeze call; only the word-level
        # merge map is needed here.
        (
            (_, merged_words),
            (_, _),
            (_, _),
        ) = squeeze((eps, eps * 1.41, eps * 1.71), graph.vertices.values())

        cluster_sizes = [
            len(cluster) for cluster in merged_words.values() if len(cluster) > 1
        ]
        if not cluster_sizes:
            continue

        epsilon.append(eps)
        max_cluster_size.append(np.max(cluster_sizes))
        average_cluster_size.append(np.mean(cluster_sizes))
        median_cluster_size.append(np.median(cluster_sizes))

# Run the sweep; results land in the module-level lists defined at the top of this cell.
find_squeeze_params()

In [None]:
import matplotlib.pyplot as plt

fig, axs = plt.subplots(ncols=2, figsize=(10, 2.5), layout='constrained')

# One panel per statistic, each plotted against the swept eps values and
# clamped to the same fixed y-range.
panels = (
    ("max cluster size", max_cluster_size),
    ("average cluster size", average_cluster_size),
)
for axis, (panel_title, sizes) in zip(axs, panels):
    axis.plot(epsilon, sizes)
    axis.set_title(panel_title)
    axis.set_ylim((0, 25))

# axs[2].plot(epsilon, median_cluster_size)
# axs[2].set_title("mean cluster size")
# axs[2].set_ylim((0, 20));

In [None]:
# print("Words:")
# for item in merged_words_map.items():
#     if len(item[1]) > 1:
#         print(item)

# List only the bigram entries whose value holds more than one element.
print("Bigrams:")
for bigram_entry in filter(lambda pair: len(pair[1]) > 1, merged_bigrams_map.items()):
    print(bigram_entry)

# print("Trigrams:")
# for item in merged_trigrams_map.items():
#     if len(item[1]) > 1:
#         print(item)

In [5]:
"""
Cluster edges
"""
from process_graph.edges_clustering import cluster_and_evaluate_all_sizes

# Read label and embedding of every edge in a single pass over graph.edges,
# then split them into the two parallel lists the clustering helper takes.
edge_features = [(edge.label, edge.embedding) for edge in graph.edges]
labels = [label for label, _ in edge_features]
embeddings = [embedding for _, embedding in edge_features]

# Identical DBSCAN settings under both keys; the comprehension builds a fresh
# dict per key, so the two entries share no state.
clustering_config = {
    size: {
        "model": "DBSCAN",
        "params": {"eps": 0.5, "min_samples": 2},
    }
    for size in (100, 200)
}

edge_maps = cluster_and_evaluate_all_sizes(embeddings, labels, clustering_config)


100%|██████████| 2/2 [00:00<00:00, 15.84it/s]


In [6]:
# cluster_and_evaluate_all_sizes returned a pair; its second element is indexed
# below with the same 100 / 200 keys used in the config that was passed in.
metrics, clusters = edge_maps

# These two maps are consulted by match_new_edge to rename edge labels.
# NOTE(review): the "word" / "bigram" names suggest that 100 and 200 stand for
# single-word and two-word labels — confirm against process_graph.edges_clustering.
edge_map_word = clusters[100]
edge_map_bigram = clusters[200]

In [7]:
import tqdm

clustered_graph = Graph()

# Seed the new graph with every key of the word, bigram and trigram merge maps,
# in that order.
for merge_map in (merged_words_map, merged_bigrams_map, merged_trigrams_map):
    for cluster_label in merge_map.keys():
        clustered_graph.add_vertex(cluster_label)

def match_new_vertice(label: str) -> str:
    """Translate an original vertex label into its clustered replacement.

    The word, bigram and trigram lookup tables are consulted in that order and
    the first one containing ``label`` supplies the result. A label found in
    none of them is returned unchanged.
    """
    for lookup in (words_to_cluster, bigrams_to_cluster, trigrams_to_cluster):
        if label in lookup:
            return lookup[label]
    return label

def match_new_edge(label: str) -> str:
    """Translate an original edge label into its clustered replacement.

    ``edge_map_word`` takes precedence over ``edge_map_bigram``; a label found
    in neither map is returned unchanged.
    """
    candidates = (edge_map_word, edge_map_bigram)
    return next(
        (edge_map[label] for edge_map in candidates if label in edge_map),
        label,
    )

# Remapped (agent_1, agent_2, label) triples that have already been inserted.
added_edges = set()

for edge in tqdm.tqdm(graph.edges):
    # Replace both endpoints and the label by their clustered counterparts.
    source = match_new_vertice(edge.agent_1)
    target = match_new_vertice(edge.agent_2)
    relation = match_new_edge(edge.label)
    remapped = (source, target, relation)

    # Several original edges can collapse onto the same triple; keep one.
    if remapped in added_edges:
        continue
    added_edges.add(remapped)

    # Endpoints that match_new_vertice left unchanged may not be vertices yet.
    for endpoint in (source, target):
        if endpoint not in clustered_graph.vertices:
            clustered_graph.add_vertex(endpoint)

    clustered_graph.add_edge(source, target, relation)

100%|██████████| 248/248 [00:00<00:00, 2925.21it/s]


In [10]:
from directed_graph.visualize_graph import visualize_graph_ngrams

# Hand the clustered graph built above to the project's visualisation helper.
visualize_graph_ngrams(clustered_graph)

In [None]:
# Print the repr of the graph before and after clustering, one per line, for comparison.
print(repr(graph))
print(repr(clustered_graph))

In [None]:
# Number of entries in the 'Cluster Sizes' table recorded under 'embedding_size_100'.
len(metrics['embedding_size_100']['Cluster Sizes'])

In [None]:
# Count the entries of the 'embedding_size_200' cluster-size table whose value exceeds 20.
cluster_sizes_200 = metrics['embedding_size_200']['Cluster Sizes']
v_count = sum(1 for size in cluster_sizes_200.values() if size > 20)

print(v_count)

In [None]:
# Print the str() form of the clustered graph (the earlier cell printed its repr()).
print(clustered_graph)

In [None]:
from collections import defaultdict

def _group_labels_by_cluster(label_to_cluster):
    """Invert a ``label -> cluster`` mapping into ``cluster -> [labels]``.

    Each label appears exactly once, in the iteration order of the input
    mapping. A plain ``dict`` is returned so that looking up a missing cluster
    raises ``KeyError`` instead of silently creating an empty list.
    """
    grouped = defaultdict(list)
    for label, cluster in label_to_cluster.items():
        grouped[cluster].append(label)
    return dict(grouped)


# Each map is inverted in a single pass. The previous version ran every loop
# twice, so each label was listed twice and all list lengths were doubled.
edge_cluster_to_word_map = _group_labels_by_cluster(edge_map_word)
edge_cluster_to_bigram_map = _group_labels_by_cluster(edge_map_bigram)

In [None]:
import json

def print_beautiful_dict(d):
    """Write ``d`` to stdout as JSON, indented by four spaces with keys sorted."""
    rendered = json.dumps(d, sort_keys=True, indent=4)
    print(rendered)

# Dump the cluster -> labels map for the word-level edge clusters.
# NOTE(review): json.dumps accepts only str/int/float/bool/None keys; if the
# cluster ids are NumPy integers this call raises TypeError — confirm key type.
print_beautiful_dict(edge_cluster_to_word_map)
# print_beautiful_dict(edge_cluster_to_bigram_map)

In [None]:
# Preview bigram edge clusters with more than two listed labels, showing at
# most the first ten labels of each.
for k, cluster_labels in edge_cluster_to_bigram_map.items():
    if len(cluster_labels) > 2:
        print(k, cluster_labels[:10])