In [121]:
import pandas as pd
import ast
import json

# Source CSV with one row per dictionary entry.
input_file = 'hebrew_dict/hebrew_verbs.csv'

# Run configuration: which binyan to keep and which kind of edge to build.
binyans = ["pa'al", "nif'al", "pi'el", "hif'il", "hitpa'el"]
edge_categories = ["permutation", "semantic", "levenshtein-1"]
binyan = binyans[0]
edge_category = edge_categories[2]

df = pd.read_csv(input_file)

# 'Root' is stored as a Python literal in the CSV; parse it back into an
# object (presumably a list of letters -- it is later joined with ''.join).
df['Root'] = df['Root'].map(ast.literal_eval)

# Keep only verbs of the selected binyan.
df = df[df['Binyan'] == binyan]
df = df[df['Part of speech'] == "Verb"]

In [122]:
def replace_final_form(hebrew_chars):
    """Join ``hebrew_chars`` into one string, replacing final-form letters.

    Each of the five Hebrew word-final letter shapes is swapped for its
    regular shape; every other character passes through unchanged.

    Args:
        hebrew_chars: any iterable of characters (a string or a list of
            single-letter strings).

    Returns:
        The normalised characters concatenated into a single ``str``.
    """
    # Keys are the final (word-ending) shapes, values the regular shapes.
    final_to_regular = {
        "ץ": "צ",
        "ף": "פ",
        "ן": "נ",
        "ם": "מ",
        "ך": "כ"
    }

    normalised = []
    for letter in hebrew_chars:
        normalised.append(final_to_regular.get(letter, letter))
    return ''.join(normalised)

In [123]:
import networkx as nx
from collections import Counter

# Undirected graph with one node per root string.
G = nx.Graph()

# Letter counts per root for the permutation test; nothing fills it in this
# cell (the assignment inside the loop is disabled).
node_counter_dict = {}

for root_letters, word, meaning in zip(df['Root'], df['Word (he)'], df['Meaning']):
    root = ''.join(root_letters)

    # The meaning is stored under "title" and the Hebrew word under "word".
    G.add_node(root, title=meaning, word=word)
    # node_counter_dict[root] = Counter(replace_final_form(root))

In [124]:
import itertools
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm
from Levenshtein import distance

def edge_levenshtein_k(n, m, k):
    """Return True when roots ``n`` and ``m`` are at most ``k`` edits apart.

    Both roots are passed through replace_final_form first, so a final-form
    letter and its regular form count as the same character.
    """
    first = replace_final_form(n)
    second = replace_final_form(m)
    return distance(first, second) <= k

def is_permutation(n, m):
    """Return True when roots ``n`` and ``m`` are made of the same letters.

    The comparison is on letter multisets (``collections.Counter``) taken
    after replace_final_form, so letter order and final-form shapes are
    ignored.

    Counts are read from ``node_counter_dict`` when already present and are
    otherwise computed on demand and cached there. Without this fallback the
    lookup raised ``KeyError`` for every root, because nothing in the
    notebook fills ``node_counter_dict`` ahead of time.
    """
    def letter_counts(root):
        # Lazily fill the shared cache so each root is counted only once.
        if root not in node_counter_dict:
            node_counter_dict[root] = Counter(replace_final_form(root))
        return node_counter_dict[root]

    return letter_counts(n) == letter_counts(m)

def add_perm_edges():
    """Add an edge to ``G`` for every node pair accepted by is_permutation."""
    permuted_pairs = (
        pair
        for pair in itertools.combinations(G.nodes(), 2)
        if is_permutation(*pair)
    )
    G.add_edges_from(permuted_pairs)

# Drop every existing edge before edges are (re)built.
G.remove_edges_from(list(G.edges()))

# Filled by compute_embeddings(): node -> encoded "title" text.
node_embeddings = dict()
# Node count captured once here; used as the progress-bar total.
total_nodes = G.number_of_nodes()

def compute_embeddings():
    """Encode each node's "title" attribute and store it in ``node_embeddings``.

    Nodes without a "title" are encoded as the empty string. Results are
    written into the module-level ``node_embeddings`` dict; nothing is
    returned.
    """
    # Pre-trained sentence encoder, loaded on every call.
    encoder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
    nodes_with_data = G.nodes(data=True)
    for node, attrs in tqdm(nodes_with_data, total=total_nodes, desc="Computing Embeddings"):
        meaning_text = attrs.get("title", "")
        node_embeddings[node] = encoder.encode(meaning_text, convert_to_tensor=True)


In [125]:
# Cache and graph files are named after the selected binyan.
output_fname = binyan

def dump_semantic_cache(edges_similarity):
    """Write node embeddings and pair similarities to the semantic cache file.

    Args:
        edges_similarity: mapping from a node pair to its similarity value.
            Keys may be ``(n, m)`` tuples or already-stringified tuples.

    JSON object keys must be strings, so tuple keys are stored as their
    ``repr``; this is the form load_semantic_cache parses back with
    ``ast.literal_eval``. Passing tuple keys used to make ``json.dump``
    raise ``TypeError``.

    The embeddings are taken from the module-level ``node_embeddings`` and
    converted with ``.tolist()`` so they are JSON-serialisable.
    """
    node_embeddings_json = {node: emb.tolist() for node, emb in node_embeddings.items()}
    edges_similarity_json = {
        (repr(pair) if isinstance(pair, tuple) else pair): value
        for pair, value in edges_similarity.items()
    }

    with open(f"graphs_raw/semantic/semantic_cache/{output_fname}.json", "w") as outfile:
        json.dump(
            {"node_embeddings": node_embeddings_json, "edges_similarity": edges_similarity_json},
            outfile,
        )

In [126]:
# import torch

def load_semantic_cache():
    """Read the semantic cache for ``output_fname`` and return pair similarities.

    Returns:
        dict whose keys are the cached "edges_similarity" keys parsed with
        ``ast.literal_eval`` and whose values are the stored similarities.
    """
    cache_path = f"graphs_raw/semantic/semantic_cache/{output_fname}.json"
    with open(cache_path, "r") as cache_file:
        cached = json.load(cache_file)

    # Both sections must exist in the file (KeyError otherwise). The stored
    # embeddings are looked up but not converted back to tensors or returned.
    _cached_embeddings = cached["node_embeddings"]
    cached_similarities = cached["edges_similarity"]

    edges_similarity = {}
    for pair_text, value in cached_similarities.items():
        edges_similarity[ast.literal_eval(pair_text)] = value
    return edges_similarity


In [None]:
# Similarity a node pair must exceed to receive a semantic edge.
semantic_threshold = 0.7

def similarity_to_weight(similarity, similarity_threshold, min_weight, max_weight):
    """Map a similarity score to an edge weight, rounded to one decimal.

    Scores below ``similarity_threshold`` get ``min_weight``. Scores at or
    above it are interpolated linearly so that the threshold maps to
    ``min_weight`` and a similarity of 1 maps to ``max_weight``.

    Args:
        similarity: similarity score of the pair.
        similarity_threshold: score at which interpolation starts.
        min_weight: weight assigned at (and below) the threshold.
        max_weight: weight assigned to a similarity of 1.

    Returns:
        The weight rounded with ``round(weight, 1)``.
    """
    if similarity >= similarity_threshold:
        headroom = 1 - similarity_threshold
        if headroom <= 0:
            # A threshold of 1 (or more) leaves nothing to interpolate over
            # and used to divide by zero; a score that reaches it is treated
            # as a full match.
            weight = max_weight
        else:
            weight = min_weight + (max_weight - min_weight) * ((similarity - similarity_threshold) / headroom)
    else:
        weight = min_weight
    return round(weight, 1)

def add_semantic_edges():
    """Rebuild the edges of ``G`` from the cached semantic similarities.

    All existing edges are removed first. Every node pair whose cached
    similarity (looked up in either key order, 0 when absent) is strictly
    greater than ``semantic_threshold`` gets an edge whose weight comes from
    similarity_to_weight with bounds 0.4 and 11.
    """
    G.remove_edges_from(list(G.edges))
    node_pairs = list(itertools.combinations(G.nodes(), 2))
    cached_similarity = load_semantic_cache()

    for n, m in tqdm(node_pairs, desc="Adding semantic edges"):
        # The cache may hold the pair as (n, m) or as (m, n).
        score = cached_similarity.get((n, m), cached_similarity.get((m, n), 0))
        if score > semantic_threshold:
            edge_weight = similarity_to_weight(score, semantic_threshold, 0.4, 11)
            G.add_edge(n, m, weight=edge_weight)

def add_levenshtein_edges():
    """Add an edge to ``G`` for every node pair within Levenshtein distance 1.

    Distance is measured by edge_levenshtein_k, i.e. after final-form
    normalisation of both roots. Existing edges are left in place.
    """
    # combinations() is a generator with no length, so tqdm is given the
    # pair count explicitly; otherwise the bar shows no percentage or ETA.
    node_count = G.number_of_nodes()
    pair_count = node_count * (node_count - 1) // 2
    node_pairs = itertools.combinations(G.nodes(), 2)

    for n, m in tqdm(node_pairs, total=pair_count, desc="Adding levenshtein edges"):
        if edge_levenshtein_k(n, m, 1):
            G.add_edge(n, m)

# Build the edges that correspond to the configured edge_category, so the
# graph content always matches the edge_category-based paths it is saved
# under (a fixed call here would ignore that setting).
edge_builders = {
    "permutation": add_perm_edges,
    "semantic": add_semantic_edges,
    "levenshtein-1": add_levenshtein_edges,
}
edge_builders[edge_category]()
# print(nx.get_edge_attributes(G, 'weight'))
print(G)

In [128]:
# Weighted degree of every node (edge attribute 'weight'); used to size nodes.
weighted_degrees = {node: degree for node, degree in G.degree(weight='weight')}

def scale_weights(w, min_weight, max_weight):
    """Turn a weighted degree into a node size within ``[min_weight, max_weight]``.

    Inside the range the value is passed through essentially unchanged (the
    linear formula maps ``min_weight`` to itself and ``max_weight`` to
    itself); values outside are clamped to the nearest bound. Previously
    only the lower bound was enforced, so ``max_weight`` had no effect and
    high-degree nodes could grow without limit.

    Args:
        w: the value to convert (a node's weighted degree).
        min_weight: smallest size returned.
        max_weight: largest size returned.

    Returns:
        The clamped size.
    """
    span = max_weight - min_weight
    if span == 0:
        # Degenerate range: every node gets the same size (this used to
        # divide by zero).
        return min_weight
    scaled_weight = min_weight + span * ((w - min_weight) / span)
    return min(max_weight, max(min_weight, scaled_weight))

# Size of each node: its weighted degree passed through scale_weights with
# bounds 8 and 36, stored on the graph as the "size" attribute.
node_sizes = {}
for node in G.nodes():
    node_sizes[node] = scale_weights(weighted_degrees[node], 8, 36)
nx.set_node_attributes(G, values=node_sizes, name="size")

In [129]:
# Save the graph as GEXF under the folder of the selected edge category.
gexf_path = f"graphs_raw/{edge_category}/{output_fname}.gexf"
nx.write_gexf(G, gexf_path)

In [None]:
from pyvis.network import Network

# Create a new pyvis network
net = Network(notebook=True, bgcolor="#222222",
              font_color="white", height="100vh")

# Keep only nodes that have at least one edge.
nonzero_degree_nodes = [node for node, degree in G.degree if degree != 0]

# NOTE(review): this rebinds G to a networkx subgraph view, which is
# read-only. Cells that add or remove edges on G cannot be re-run after this
# one without rebuilding G first.
G = G.subgraph(nonzero_degree_nodes)
net.from_nx(G)

# False -> static hierarchical layout; True -> physics simulation with the
# default layout (see "physics" and "layout" in the options below).
physics = False

# Physics is configured only through the options dict below. pyvis' solver
# helpers (force_atlas_2based(), repulsion(), ...) are methods: assigning
# values to those attributes merely shadows them, and whatever a call would
# set is replaced by this dict anyway. Solver tuning therefore belongs in
# the "physics" section.
net.options = {
    "nodes": {
        "font": {
            "size": 36,
            "face": "tahoma"
        },
        "shape": "dot",
        "size": 10,
        "color": {
            "border": "#2B7CE9",
            "background": "#666",
            "highlight": {
                "border": "#2B7CE9",
                "background": "#848484"
            }
        },
        "shadow": {
            "enabled": True,
            "size": 15
        },
        # NOTE(review): this is the literal string "meaning", not an
        # attribute lookup. Each node's own hover text comes from the
        # "title" attribute it was given in the graph -- confirm this
        # default is intended.
        "title": "meaning"
    },
    "edges": {
        "color": {
            "color": "#ccc",
            "highlight": "#848484"
        }
    },
    "interaction": {
        "hover": True,
        "navigationButtons": True,
        "zoomView": True,
        # View dragging follows the physics switch (off for the static layout).
        "dragView": physics
    },
    "physics": {
        "enabled": physics
    },
    # Hierarchical layout is applied only when physics is off.
    "layout": ({
        "hierarchical": {
            "enabled": True,
            "levelSeparation": 250,  # Adjust separation between levels
            "nodeSpacing": 300,
            "treeSpacing": 200,
            "direction": "UD",
            "sortMethod": "directed"
        }
    } if not physics else {})
}

# Write the HTML page and show the network
net.show(f"html/graphs/{edge_category}/{output_fname}.html")