In [None]:
# Mount Google Drive at /content/drive; base_dir in the next cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import os

# Root folder of the demo project on the mounted Drive.
# Later cells read their inputs from the "outputs" subfolder of this directory.
base_dir = "/content/drive/MyDrive/graph_ai_studio/country_graph_demo"

print("Base dir:", base_dir)

Base dir: /content/drive/MyDrive/graph_ai_studio/country_graph_demo


In [6]:

# Load the per-country table holding the `country_geo_bg_vector` embeddings.
# The path is assembled with os.path.join (as the edges cell does) rather than
# by concatenating "/" separators by hand.
df_llm = pd.read_parquet(
    os.path.join(base_dir, "outputs", "country_geo_bg_vectors.parquet")
)


In [7]:

# Preview the first rows (columns: country, iso3, country_geo_bg_vector).
df_llm.head()

Unnamed: 0,country,iso3,country_geo_bg_vector
0,Afghanistan,AFG,"[-0.08817123621702194, 0.007525322493165731, -..."
1,Albania,ALB,"[-0.02698056772351265, 0.0006709833978675306, ..."
2,Algeria,DZA,"[-0.04456178843975067, 0.01614188216626644, -0..."
3,American Samoa,ASM,"[0.05785828456282616, -0.04124083369970322, -0..."
4,Andorra,AND,"[0.0006614752346649766, 0.019504521042108536, ..."


In [8]:
# Load the per-country node table holding the `h_vector` embeddings.
# The path is assembled with os.path.join (as the edges cell does) rather than
# by concatenating "/" separators by hand.
df_gnn = pd.read_parquet(
    os.path.join(base_dir, "outputs", "country_nodes_with_h.parquet")
)

In [9]:
# Preview the first two rows (columns: country, iso3, nid, h_vector).
df_gnn.head(2)

Unnamed: 0,country,iso3,nid,h_vector
0,Afghanistan,AFG,0,"[-1.1849541664123535, 1.1546564102172852, -2.5..."
1,Albania,ALB,1,"[-2.417677640914917, 1.6790118217468262, -1.46..."


In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# 1. Align the two embedding tables by iso3.
#    The inner join keeps only codes that appear in both tables.
df = (
    df_llm[["country", "iso3", "country_geo_bg_vector"]]
    .merge(
        df_gnn[["iso3", "h_vector"]],
        on="iso3",
        how="inner",
    )
)

print("Rows after merge:", len(df))

# Surface iso3 codes that occur on more than one merged row: every such row
# takes part in the pair table below, so the same code pair can then appear
# several times in later rankings.
dup_codes = df.loc[df["iso3"].duplicated(keep=False), "iso3"].unique()
if len(dup_codes) > 0:
    print("Warning: iso3 codes appearing more than once after merge:", list(dup_codes))

countries = df["country"].values
iso3s     = df["iso3"].values
n = len(df)

# 2. Stack the per-row vectors into 2-D matrices (one row per merged country row)
X_llm = np.vstack(df["country_geo_bg_vector"].values)  # (N, D_llm)
X_gnn = np.vstack(df["h_vector"].values)               # (N, D_gnn)

# 3. Cosine similarity matrices
sim_llm = cosine_similarity(X_llm)   # (N, N)
sim_gnn = cosine_similarity(X_gnn)   # (N, N)

# 4. Build the pairwise table: one row per unordered pair (i < j).
#    np.triu_indices(n, k=1) lists the strict upper triangle in row-major
#    order, i.e. (0,1), (0,2), ..., (1,2), ... -- the same order a nested
#    `for i / for j in range(i + 1, n)` loop would produce, without creating
#    one Python dict per pair.
idx_i, idx_j = np.triu_indices(n, k=1)
pair_sim_llm = sim_llm[idx_i, idx_j]
pair_sim_gnn = sim_gnn[idx_i, idx_j]

sim_df = pd.DataFrame({
    "country1": countries[idx_i],
    "country2": countries[idx_j],
    "code1": iso3s[idx_i],
    "code2": iso3s[idx_j],
    # cast to Python-float precision (float64) so the column dtype does not
    # depend on the dtype of the stored embeddings
    "sim_llm": pair_sim_llm.astype(float),
    "sim_gnn": pair_sim_gnn.astype(float),
    # difference between the two spaces (GNN minus LLM)
    "sim_diff": (pair_sim_gnn - pair_sim_llm).astype(float),
})
sim_df.head(2)


Rows after merge: 235


Unnamed: 0,country1,country2,code1,code2,sim_llm,sim_gnn,sim_diff
0,Afghanistan,Albania,AFG,ALB,0.39484,0.482277,0.087437
1,Afghanistan,Algeria,AFG,DZA,0.234396,0.055901,-0.178495


In [13]:
# Last two pairs. With 235 merged rows there are 235 * 234 / 2 = 27495 pairs,
# so the final index is expected to be 27494.
sim_df.tail(2)

Unnamed: 0,country1,country2,code1,code2,sim_llm,sim_gnn,sim_diff
27493,Yemen,Zimbabwe,YEM,ZWE,0.240266,0.007795,-0.232471
27494,Zambia,Zimbabwe,ZMB,ZWE,0.721831,0.980216,0.258385


In [16]:
# Independent snapshot of sim_df as it stands when this cell runs.
# NOTE(review): copy_sim_df is not referenced by any other visible cell, and this
# cell's execution count (16) is higher than the next cell's (14) -- confirm it is
# still needed and that the notebook runs top to bottom on a fresh kernel.
copy_sim_df=sim_df.copy()

In [14]:
# Load the border edge list from the project's "outputs" folder.
edges_path = os.path.join(base_dir, "outputs", "country_edges_land_sea.csv")
edges = pd.read_csv(edges_path)

In [15]:
# Peek at the last two edges (columns: node1, node2, how).
edges.tail(2)

Unnamed: 0,node1,node2,how
542,VEN,VIR,sea
543,VGB,VIR,sea


In [17]:
import pandas as pd

# 1. Normalise the edge list and gather every relationship type seen per pair

def _sorted_pair(row):
    """Order-invariant pair key: the row's two endpoint codes as a sorted tuple."""
    return tuple(sorted([row["node1"], row["node2"]]))


def _as_set(values):
    """Collapse one group's relationship labels into a set."""
    return set(values)


# tidy the 'how' label: trim whitespace, lowercase, drop a trailing period
how_normalised = (
    edges["how"]
    .str.strip()
    .str.lower()
    .str.replace(r"\.$", "", regex=True)
)

edges_clean = edges.copy()
edges_clean["how_clean"] = how_normalised

# (A, B) and (B, A) must land in the same group, hence the sorted key
edges_clean["key"] = edges_clean.apply(_sorted_pair, axis=1)

# one row per pair, with the set of all cleaned labels recorded for it
pair_types = (
    edges_clean
    .groupby("key")["how_clean"]
    .agg(_as_set)
    .reset_index(name="how_set")
)

def classify_relation(how_set):
    """Map a collection of relationship labels to a single border category.

    Returns "both" when the collection contains "land" and "sea", "land" or
    "sea" when it contains only that one label, and "none" when it contains
    neither.
    """
    category_by_flags = {
        (True, True): "both",
        (True, False): "land",
        (False, True): "sea",
        (False, False): "none",
    }
    return category_by_flags[("land" in how_set, "sea" in how_set)]

pair_types["border_type"] = pair_types["how_set"].apply(classify_relation)

# mapping: (codeA, codeB) -> "both"/"land"/"sea"/"none", keyed by the sorted code pair
border_map = dict(zip(pair_types["key"], pair_types["border_type"]))

# 2. Apply this mapping to sim_df

# Build the order-invariant keys with a plain zip over the two code columns
# instead of a row-wise DataFrame.apply(axis=1): same keys, but without
# materialising one Series per row of the pair table, and no temporary
# helper column has to be added to sim_df and dropped again.
pair_keys = pd.Series(
    [tuple(sorted(codes)) for codes in zip(sim_df["code1"], sim_df["code2"])],
    index=sim_df.index,
    dtype=object,
)

# Pairs that never appear in the edge list have no entry in border_map and
# are labelled "none". assign() returns a new frame, so the previous sim_df
# object is not mutated.
sim_df = sim_df.assign(border_type=pair_keys.map(border_map).fillna("none"))

sim_df.tail(2)


Unnamed: 0,country1,country2,code1,code2,sim_llm,sim_gnn,sim_diff,border_type
27493,Yemen,Zimbabwe,YEM,ZWE,0.240266,0.007795,-0.232471,none
27494,Zambia,Zimbabwe,ZMB,ZWE,0.721831,0.980216,0.258385,land


In [18]:
# Counts per border type
counts = sim_df["border_type"].value_counts().rename("count")

# Percentages per border type
percents = (sim_df["border_type"]
            .value_counts(normalize=True)
            .mul(100)
            .round(2)
            .rename("percent"))

# Combine into one table
border_stats = pd.concat([counts, percents], axis=1)
border_stats



Unnamed: 0_level_0,count,percent
border_type,Unnamed: 1_level_1,Unnamed: 2_level_1
none,27067,98.44
land,231,0.84
sea,127,0.46
both,70,0.25


In [21]:
# NOTE(review): plain assignment -- borders_sim_df is the same object as sim_df
# here (no copy, no filtering yet). The next cell rebinds it to a filtered copy.
borders_sim_df=sim_df
borders_sim_df.head()

Unnamed: 0,country1,country2,code1,code2,sim_llm,sim_gnn,sim_diff,border_type
0,Afghanistan,Albania,AFG,ALB,0.39484,0.482277,0.087437,none
1,Afghanistan,Algeria,AFG,DZA,0.234396,0.055901,-0.178495,none
2,Afghanistan,American Samoa,AFG,ASM,0.036484,-0.264828,-0.301312,none
3,Afghanistan,Andorra,AFG,AND,0.069757,-0.04383,-0.113587,none
4,Afghanistan,Angola,AFG,AGO,0.084182,-0.396025,-0.480207,none


In [22]:
# Keep only the pairs that share some border ("land", "sea" or "both");
# take a copy so the result is independent of sim_df.
has_border = sim_df["border_type"].ne("none")
borders_sim_df = sim_df.loc[has_border].copy()
len(borders_sim_df)


428

In [29]:
def select_pairs_by_sim(
    borders_df,
    vector_type="llm",   # "llm" or "gnn"
    n=5,
    which="top",
    border_type="both",
):
    """
    Pick the n most or least similar pairs in one embedding space.

    Rows are optionally restricted to a single border_type, ordered by the
    chosen similarity column, and the first n are returned.

    Parameters
    ----------
    borders_df : pd.DataFrame
        Pair table with the columns country1, country2, code1, code2,
        border_type, and the similarity column being ranked
        (sim_llm or sim_gnn).
    vector_type : {"llm", "gnn"}, optional
        Selects the similarity column: "llm" ranks by sim_llm,
        "gnn" ranks by sim_gnn.
    n : int, optional
        Number of rows to return (default 5).
    which : {"top", "bottom"}, optional
        "top" returns the highest similarities, "bottom" the lowest.
    border_type : str or None, optional
        Only rows whose border_type equals this value are kept
        (e.g. "land", "sea", "both"; default "both").
        Pass None to rank all rows without filtering.

    Returns
    -------
    pd.DataFrame
        At most n rows with the columns country1, country2, code1, code2,
        border_type and the chosen similarity column; the original index
        labels are kept.

    Raises
    ------
    ValueError
        If which or vector_type is not one of the accepted values.
    """
    if which != "top" and which != "bottom":
        raise ValueError("which must be 'top' or 'bottom'")

    if vector_type != "llm" and vector_type != "gnn":
        raise ValueError("vector_type must be 'llm' or 'gnn'")
    # both accepted values follow the same "sim_<space>" column naming
    sim_col = f"sim_{vector_type}"

    candidates = borders_df
    if border_type is not None:
        matches_type = candidates["border_type"] == border_type
        candidates = candidates.loc[matches_type]

    # lowest-first ordering is only wanted for the "bottom" selection
    ranked = candidates.sort_values(sim_col, ascending=(which == "bottom"))

    output_cols = ["country1", "country2", "code1", "code2", "border_type", sim_col]
    return ranked[output_cols].head(n)


In [27]:
# Top 5 LLM-similar land borders
top5_land_llm = select_pairs_by_sim(borders_sim_df, vector_type="gnn",
                                    n=5, which="bottom", border_type="land")
top5_land_llm

Unnamed: 0,country1,country2,code1,code2,border_type,sim_gnn
12089,Egypt,West Bank,EGY,PSE,land,0.620919
11934,Egypt,"Gaza, Gaza Strip",EGY,PSE,land,0.620919
19488,Jordan,West Bank,JOR,PSE,land,0.654754
14647,"Gaza, Gaza Strip",Jordan,PSE,JOR,land,0.654754
43,Afghanistan,China,AFG,CHN,land,0.802504


In [31]:
def top_pairs_by_border_type(
    borders_df,
    vector_type="llm",   # "llm" or "gnn"
    n=5,
    which="top",
):
    """
    Stack the n highest (which="top") or lowest (which="bottom") similarity
    pairs of each border type into one table.

    The border types are processed in the fixed order both, land, sea, and
    each one is ranked via select_pairs_by_sim with the given vector_type.
    The code1/code2 columns are removed and a vector_type column recording
    the embedding space is appended. The result carries a fresh 0..k-1 index.
    """
    per_type_frames = [
        select_pairs_by_sim(
            borders_df,
            vector_type=vector_type,
            n=n,
            which=which,
            border_type=bt,
        )
        # the ISO codes are not needed in the combined summary
        .drop(columns=["code1", "code2"])
        # tag every row with the space it was ranked in
        .assign(vector_type=vector_type)
        for bt in ("both", "land", "sea")
    ]

    return pd.concat(per_type_frames, ignore_index=True)


In [43]:
# Top 3 per border type in LLM space
top_llm = top_pairs_by_border_type(borders_sim_df, vector_type="llm", n=4, which="top")
# Top 3 per border type in GNN space
top_gnn = top_pairs_by_border_type(borders_sim_df, vector_type="gnn", n=4, which="top")
# Bottom 3 per border type in LLM space
bottom_llm = top_pairs_by_border_type(borders_sim_df, vector_type="llm", n=4, which="bottom")
# Bottom 3 per border type in GNN space
bottom_gnn = top_pairs_by_border_type(borders_sim_df, vector_type="gnn", n=4, which="bottom")


In [44]:
# Highest sim_llm pairs, grouped by border type (both, land, sea).
top_llm

Unnamed: 0,country1,country2,border_type,sim_llm,vector_type
0,North Korea,South Korea,both,0.987409,llm
1,Saint Martin,Sint Maarten,both,0.947324,llm
2,Croatia,Slovenia,both,0.811101,llm
3,India,Pakistan,both,0.783749,llm
4,Czechia,Slovakia,land,0.858777,llm
5,Kenya,Tanzania,land,0.820144,llm
6,Estonia,Latvia,land,0.815736,llm
7,Croatia,Serbia,land,0.813542,llm
8,Guam,Northern Mariana Islands,sea,0.855141,llm
9,Guernsey,Jersey,sea,0.848164,llm


In [45]:
# Highest sim_gnn pairs, grouped by border type (both, land, sea).
top_gnn

Unnamed: 0,country1,country2,border_type,sim_gnn,vector_type
0,Saint Martin,Sint Maarten,both,0.99959,gnn
1,Croatia,Slovenia,both,0.997822,gnn
2,Costa Rica,Panama,both,0.997599,gnn
3,Algeria,Tunisia,both,0.997013,gnn
4,Bosnia and Herzegovina,Croatia,land,0.998875,gnn
5,Bosnia and Herzegovina,Serbia,land,0.998818,gnn
6,Croatia,Serbia,land,0.99871,gnn
7,Albania,North Macedonia,land,0.998233,gnn
8,Colombia,Nicaragua,sea,0.998526,gnn
9,Niue,Tonga,sea,0.998327,gnn


In [46]:
# Lowest sim_llm pairs, grouped by border type (both, land, sea).
bottom_llm

Unnamed: 0,country1,country2,border_type,sim_llm,vector_type
0,Georgia,Turkey,both,0.329582,llm
1,Syria,Turkey,both,0.389485,llm
2,Romania,Ukraine,both,0.393915,llm
3,Mozambique,South Africa,both,0.424672,llm
4,Mexico,United States,land,0.190988,llm
5,Afghanistan,China,land,0.29861,llm
6,Eswatini,South Africa,land,0.313812,llm
7,India,Nepal,land,0.315023,llm
8,French Southern and Antarctic Lands,Mozambique,sea,0.177823,llm
9,Saint Kitts and Nevis,Venezuela,sea,0.183874,llm


In [47]:
# Lowest sim_gnn pairs, grouped by border type (both, land, sea).
bottom_gnn

Unnamed: 0,country1,country2,border_type,sim_gnn,vector_type
0,Israel,West Bank,both,0.665023,gnn
1,"Gaza, Gaza Strip",Israel,both,0.665023,gnn
2,India,Pakistan,both,0.864399,gnn
3,Egypt,Libya,both,0.88037,gnn
4,Egypt,West Bank,land,0.620919,gnn
5,Egypt,"Gaza, Gaza Strip",land,0.620919,gnn
6,Jordan,West Bank,land,0.654754,gnn
7,"Gaza, Gaza Strip",Jordan,land,0.654754,gnn
8,Canada,Saint Pierre and Miquelon,sea,0.57825,gnn
9,French Southern and Antarctic Lands,Mozambique,sea,0.709989,gnn
