In [9]:
import tqdm

In [3]:
from directed_graph.graph import Graph, load_graph

# Start from an empty Graph; the triplet-loading cell below adds its vertices and edges.
graph = Graph()

In [4]:
import csv

# Triplet files to load. Every non-blank row must be `agent_1,action,agent_2`.
# Each file is listed once: rows are de-duplicated below, so listing a file
# twice only re-parses it without changing the graph.
paths = [
    'process_text/data/cs.NE.triplets/2310.17250v2.txt',
    # 'process_text/data/cs.NE.triplets/2405.06561v2.txt',
]

# Edges already inserted into `graph`, stored as (agent_1, agent_2, action).
added_edges = set()

for path in paths:
    with open(path, newline='') as csvfile:
        triplets_reader = csv.reader(csvfile)
        for triplet in triplets_reader:
            # csv.reader yields an empty list for a blank line; skip it
            # instead of failing on the unpacking below.
            if not triplet:
                continue
            if len(triplet) != 3:
                # Same exception type as the bare unpacking error, but it now
                # says which file and line are malformed.
                raise ValueError(
                    f"{path}:{triplets_reader.line_num}: expected 3 fields "
                    f"(agent_1, action, agent_2), got {len(triplet)}"
                )

            agent_1, action, agent_2 = triplet
            # Incomplete triplets carry no usable relation.
            if not (agent_1 and action and agent_2):
                continue

            edge = (agent_1, agent_2, action)
            if edge in added_edges:
                continue
            added_edges.add(edge)

            for agent in (agent_1, agent_2):
                if agent not in graph.vertices:
                    # A vertex is keyed by the full concept string and also
                    # receives the first three whitespace-separated tokens.
                    graph.add_vertex(agent, agent.split()[:3])
            graph.add_edge(*edge)

In [5]:
"""
Cluster vertices
"""

from process_graph.squeezing import squeeze

# NOTE(review): (0.4, 1, 1.5) presumably holds one threshold per n-gram size
# (words, bigrams, trigrams) -- confirm against squeeze().
words_result, bigrams_result, trigrams_result = squeeze(
    (0.4, 1, 1.5), graph.vertices.values()
)

# Each result is a pair. Later cells index the first item by an original vertex
# label to get its cluster label, and use the keys of the second item as the
# vertices of the clustered graph.
words_to_cluster, merged_words_map = words_result
bigrams_to_cluster, merged_bigrams_map = bigrams_result
trigrams_to_cluster, merged_trigrams_map = trigrams_result

In [6]:
"""
Cluster edges
"""
from process_graph.edges_clustering import cluster_edges_by_embeddings_dbscan

# edge_map is indexed by an original edge label to obtain the label of the
# cluster it was merged into; labels absent from the map are kept unchanged
# when the clustered graph is built.
edge_map = cluster_edges_by_embeddings_dbscan(graph.edges)

In [11]:
# The clustered graph gets one vertex per key of each merged map
# (words, then bigrams, then trigrams).
clustered_graph = Graph()

for merged_map in (merged_words_map, merged_bigrams_map, merged_trigrams_map):
    for concept in merged_map.keys():
        # Same convention as the raw graph: pass the first three tokens of the label.
        clustered_graph.add_vertex(concept, concept.split()[:3])

def match_new_vertice(label: str) -> str:
    """Return the cluster label that `label` maps to.

    The word, bigram and trigram lookups are consulted in that order and the
    first one containing `label` wins. An empty string is returned when none
    of them contains the label.
    """
    for lookup in (words_to_cluster, bigrams_to_cluster, trigrams_to_cluster):
        if label in lookup:
            return lookup[label]
    return ""

# Clustered edges already inserted, stored as (agent_1, agent_2, label).
added_edges = set()

for edge in tqdm.tqdm(graph.edges):
    new_edge = (
        match_new_vertice(edge.agent_1),
        match_new_vertice(edge.agent_2),
        # Edge labels without a cluster keep their original label.
        edge_map[edge.label] if edge.label in edge_map else edge.label,
    )
    # Different raw edges can map to the same clustered edge; insert it only once.
    if new_edge in added_edges:
        continue

    # Record the edge, otherwise the membership check above can never succeed
    # and every duplicate is added again.
    added_edges.add(new_edge)
    # NOTE(review): match_new_vertice returns "" for a label found in none of
    # the cluster lookups -- confirm that add_edge copes with such an endpoint.
    clustered_graph.add_edge(*new_edge)

100%|██████████| 781/781 [00:01<00:00, 520.45it/s]


In [12]:
# Print the raw graph first, then the clustered one.
print(graph)

print(clustered_graph)

Graph(
	vertices=[Vertex(concept='we', words=['we']), Vertex(concept='usage', words=['usage']), Vertex(concept='u', words=['u']), Vertex(concept='result', words=['result']), Vertex(concept='this paper', words=['this', 'paper']), Vertex(concept='\titlepgskip = 21pt machine learning', words=['\\titlepgskip', '=', '21pt']), Vertex(concept='powerful tool', words=['powerful', 'tool']), Vertex(concept='traditional machine learning algorithm', words=['traditional', 'machine', 'learning']), Vertex(concept='well defined input and output variable', words=['well', 'defined', 'input']), Vertex(concept='which', words=['which']), Vertex(concept='clear guideline', words=['clear', 'guideline']), Vertex(concept='model training', words=['model', 'training']), Vertex(concept='inference', words=['inference']), Vertex(concept='this', words=['this']), Vertex(concept='emergence', words=['emergence']), Vertex(concept='[[[formula]]]', words=['[[[formula]]]']), Vertex(concept='importance', words=['importance'])

In [13]:
from directed_graph.visualize_graph import visualize_graph_ngrams
# NOTE(review): presumably renders the clustered graph; the behaviour is
# defined in directed_graph.visualize_graph -- confirm what it outputs.
visualize_graph_ngrams(clustered_graph)

In [None]:
from collections import defaultdict

# Invert edge_map: cluster label -> list of the original edge labels mapped to it.
labels_by_cluster = defaultdict(list)
for original_label, cluster_label in edge_map.items():
    labels_by_cluster[cluster_label].append(original_label)

# Expose a plain dict so that missing keys raise instead of creating entries.
edge_cluster_to_word_map = dict(labels_by_cluster)


In [26]:
import json

def print_beautiful_dict(d):
    """Print `d` as JSON text, indented by four spaces and with sorted keys."""
    rendered = json.dumps(d, sort_keys=True, indent=4)
    print(rendered)

# Show, for every edge cluster, the original edge labels that were mapped to it.
print_beautiful_dict(edge_cluster_to_word_map)

{
    "affect": [
        "led to",
        "design",
        "covers",
        "select",
        "learns",
        "allows",
        "gained",
        "modify",
        "yields",
        "offers",
        "affect",
        "has of",
        "served",
        "within",
        "across",
        "behind"
    ],
    "among": [
        "plays",
        "treat",
        "cover",
        "learn",
        "merge",
        "offer",
        "poses",
        "yield",
        "holds",
        "among",
        "about",
        "under"
    ],
    "as": [
        "as",
        "at"
    ],
    "between": [
        "rely on",
        "provide",
        "support",
        "lead to",
        "explore",
        "selects",
        "reduces",
        "capture",
        "obscure",
        "enables",
        "enhance",
        "employs",
        "relaxes",
        "measure",
        "maps to",
        "involve",
        "adds to",
        "possess",
        "reached",
        "exhibit",
        "uncover",
 

In [27]:
from directed_graph.graph import save_graph

# The target is a relative path, so the file lands in the current working directory.
# NOTE(review): the .pickle suffix suggests pickle serialization -- if so, only
# load such files back from trusted sources.
save_graph(clustered_graph, "clustered_graph.pickle")