In [33]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
import os

base_dir = "/content/drive/MyDrive/graph_ai_studio/country_graph_demo"

print("Base dir:", base_dir)


Base dir: /content/drive/MyDrive/graph_ai_studio/country_graph_demo


In [35]:
import os

zip_path = base_dir + "/data/text/factbook_json_zip.zip"
extract_dir = base_dir + "/data/text/factbook"  # where all JSONs will live

print("Zip path:", zip_path)
print("Extract dir:", extract_dir)

# Create target folder if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Only unzip if folder is (almost) empty
if not any(os.scandir(extract_dir)):
    print("Extracting zip...")
    !unzip -q "$zip_path" -d "$extract_dir"
    print("Unzip done.")
else:
    print("Factbook folder already has files, skipping unzip.")

print("Sample of files in factbook dir:")
!ls "$extract_dir" | head



Zip path: /content/drive/MyDrive/graph_ai_studio/country_graph_demo/data/text/factbook_json_zip.zip
Extract dir: /content/drive/MyDrive/graph_ai_studio/country_graph_demo/data/text/factbook
Factbook folder already has files, skipping unzip.
Sample of files in factbook dir:
factbook.json


In [36]:
import os
import json
import pandas as pd
from bs4 import BeautifulSoup

factbook_dir = extract_dir  # shortcut

results = []

for root, dirs, files in os.walk(factbook_dir):
    for filename in files:
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(root, filename)

        with open(filepath, 'r', encoding='utf-8') as f:
            country_data = json.load(f)

        # --- Country name logic ---
        short_form = (
            country_data
            .get("Government", {})
            .get("Country name", {})
            .get("conventional short form")
        )
        long_form = (
            country_data
            .get("Government", {})
            .get("Country name", {})
            .get("conventional long form")
        )

        # If dict, extract the 'text' field
        if isinstance(short_form, dict):
            short_form = short_form.get("text")

        if isinstance(long_form, dict):
            long_form = long_form.get("text")

        # Final country name
        if short_form and isinstance(short_form, str) and short_form.lower() != "none":
            country_name = short_form
        elif long_form and isinstance(long_form, str) and long_form.lower() != "none":
            country_name = long_form
        else:
            country_name = country_data.get("name") or "Unknown"

        # Helper to extract and clean HTML â†’ plain text
        def get_clean_text(section, field):
            raw_html = (
                country_data
                .get(section, {})
                .get(field, {})
                .get("text", "")
            )
            return BeautifulSoup(raw_html, 'html.parser').get_text().strip()

        background = get_clean_text("Introduction", "Background")
        location   = get_clean_text("Geography", "Location")
        climate    = get_clean_text("Geography", "Climate")
        terrain    = get_clean_text("Geography", "Terrain")

        geography = " ".join([t for t in [location, climate, terrain] if t])

        # Two text variants (space-separated, no \n\n)
        bg_geo = " ".join(t for t in [background, geography] if t).strip()
        geo_bg = " ".join(t for t in [geography, background] if t).strip()

        # Only keep entries with at least some text
        if not bg_geo and not geo_bg:
            continue

        # New: natural-language variants with country name and quotes
        country_bg_geo = (
            f"Country's background: \"{background}\". "
            f"Country's geography: \"{geography}\""
        ).strip()

        country_geo_bg = (
            f"Country's geography: \"{geography}\". "
            f"Country's background: \"{background}\""
        ).strip()

        results.append({
            "country": country_name,
            "background": background,
            "geography": geography,
            "bg_geo": bg_geo,
            "geo_bg": geo_bg,
            "country_bg_geo": country_bg_geo,
            "country_geo_bg": country_geo_bg,
        })

print(f"Extracted data for {len(results)} countries.")

df_text = pd.DataFrame(results)
df_text.head()


Extracted data for 261 countries.


Unnamed: 0,country,background,geography,bg_geo,geo_bg,country_bg_geo,country_geo_bg
0,Greenland,"Greenland, the world's largest island, is abou...","Northern North America, island between the Arc...","Greenland, the world's largest island, is abou...","Northern North America, island between the Arc...","Country's background: ""Greenland, the world's ...","Country's geography: ""Northern North America, ..."
1,United States,Thirteen of Britain's American colonies broke ...,"North America, bordering both the North Atlant...",Thirteen of Britain's American colonies broke ...,"North America, bordering both the North Atlant...","Country's background: ""Thirteen of Britain's A...","Country's geography: ""North America, bordering..."
2,Saint Pierre and Miquelon,First settled by the French in the early 17th ...,"Northern North America, islands in the North A...",First settled by the French in the early 17th ...,"Northern North America, islands in the North A...","Country's background: ""First settled by the Fr...","Country's geography: ""Northern North America, ..."
3,Canada,A land of vast distances and rich natural reso...,"Northern North America, bordering the North At...",A land of vast distances and rich natural reso...,"Northern North America, bordering the North At...","Country's background: ""A land of vast distance...","Country's geography: ""Northern North America, ..."
4,Clipperton Island,This isolated atoll was named for John CLIPPER...,"Middle America, atoll in the North Pacific Oce...",This isolated atoll was named for John CLIPPER...,"Middle America, atoll in the North Pacific Oce...","Country's background: ""This isolated atoll was...","Country's geography: ""Middle America, atoll in..."


In [37]:
df_text.tail()

Unnamed: 0,country,background,geography,bg_geo,geo_bg,country_bg_geo,country_geo_bg
256,Comoros,For centuries prior to colonization in the 19t...,"Southern Africa, group of islands at the north...",For centuries prior to colonization in the 19t...,"Southern Africa, group of islands at the north...","Country's background: ""For centuries prior to ...","Country's geography: ""Southern Africa, group o..."
257,Somalia,"Between A.D. 800 and 1100, immigrant Muslim Ar...","Eastern Africa, bordering the Gulf of Aden and...","Between A.D. 800 and 1100, immigrant Muslim Ar...","Eastern Africa, bordering the Gulf of Aden and...","Country's background: ""Between A.D. 800 and 11...","Country's geography: ""Eastern Africa, borderin..."
258,Eritrea,Eritrea won independence from Italian colonial...,"Eastern Africa, bordering the Red Sea, between...",Eritrea won independence from Italian colonial...,"Eastern Africa, bordering the Red Sea, between...","Country's background: ""Eritrea won independenc...","Country's geography: ""Eastern Africa, borderin..."
259,Uganda,"An ancient crossroads for various migrations, ...","East-Central Africa, west of Kenya, east of th...","An ancient crossroads for various migrations, ...","East-Central Africa, west of Kenya, east of th...","Country's background: ""An ancient crossroads f...","Country's geography: ""East-Central Africa, wes..."
260,Mauritania,The Amazigh and Bafour people were among the e...,"Western Africa, bordering the North Atlantic O...",The Amazigh and Bafour people were among the e...,"Western Africa, bordering the North Atlantic O...","Country's background: ""The Amazigh and Bafour ...","Country's geography: ""Western Africa, borderin..."


In [38]:


# Drop rows with Unknown country
df_text = df_text[df_text["country"] != "Unknown"].copy()

# Optional: if you suspect true duplicates with identical text
df_text = df_text.drop_duplicates(subset=["country", "bg_geo"])

print("After cleaning:", len(df_text))
df_text["country"].value_counts().head(2)


After cleaning: 254


Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
Greenland,1
United States,1


In [39]:
import geopandas as gpd

land_path = base_dir + "/data/land/ne_10m_admin_0_countries.shp"
land = gpd.read_file(land_path)

print("Land columns:", land.columns.tolist())

land[["ADMIN", "NAME_LONG", "FORMAL_EN", "ISO_A3"]].head(3)


Land columns: ['featurecla', 'scalerank', 'LABELRANK', 'SOVEREIGNT', 'SOV_A3', 'ADM0_DIF', 'LEVEL', 'TYPE', 'TLC', 'ADMIN', 'ADM0_A3', 'GEOU_DIF', 'GEOUNIT', 'GU_A3', 'SU_DIF', 'SUBUNIT', 'SU_A3', 'BRK_DIFF', 'NAME', 'NAME_LONG', 'BRK_A3', 'BRK_NAME', 'BRK_GROUP', 'ABBREV', 'POSTAL', 'FORMAL_EN', 'FORMAL_FR', 'NAME_CIAWF', 'NOTE_ADM0', 'NOTE_BRK', 'NAME_SORT', 'NAME_ALT', 'MAPCOLOR7', 'MAPCOLOR8', 'MAPCOLOR9', 'MAPCOLOR13', 'POP_EST', 'POP_RANK', 'POP_YEAR', 'GDP_MD', 'GDP_YEAR', 'ECONOMY', 'INCOME_GRP', 'FIPS_10', 'ISO_A2', 'ISO_A2_EH', 'ISO_A3', 'ISO_A3_EH', 'ISO_N3', 'ISO_N3_EH', 'UN_A3', 'WB_A2', 'WB_A3', 'WOE_ID', 'WOE_ID_EH', 'WOE_NOTE', 'ADM0_ISO', 'ADM0_DIFF', 'ADM0_TLC', 'ADM0_A3_US', 'ADM0_A3_FR', 'ADM0_A3_RU', 'ADM0_A3_ES', 'ADM0_A3_CN', 'ADM0_A3_TW', 'ADM0_A3_IN', 'ADM0_A3_NP', 'ADM0_A3_PK', 'ADM0_A3_DE', 'ADM0_A3_GB', 'ADM0_A3_BR', 'ADM0_A3_IL', 'ADM0_A3_PS', 'ADM0_A3_SA', 'ADM0_A3_EG', 'ADM0_A3_MA', 'ADM0_A3_PT', 'ADM0_A3_AR', 'ADM0_A3_JP', 'ADM0_A3_KO', 'ADM0_A3_VN', 'AD

Unnamed: 0,ADMIN,NAME_LONG,FORMAL_EN,ISO_A3
0,Indonesia,Indonesia,Republic of Indonesia,IDN
1,Malaysia,Malaysia,Malaysia,MYS
2,Chile,Chile,Republic of Chile,CHL


In [40]:
len(set(land["ISO_A3"]))

237

In [41]:
def normalize_name(s):
    if not isinstance(s, str):
        return None
    return s.strip().lower()

name_iso_rows = []

for _, row in land.iterrows():
    iso = row.get("ISO_A3")
    # Skip invalid ISO codes
    if not isinstance(iso, str) or iso.strip() in ("-99", "", "None", "none"):
        continue

    candidates = [
        row.get("ADMIN"),
        row.get("NAME_LONG"),
        row.get("FORMAL_EN"),
        row.get("SOVEREIGNT"),
        row.get("NAME_EN"),   # may not exist
        row.get("BRK_NAME"),
    ]

    for c in candidates:
        c_norm = normalize_name(c)
        if c_norm:
            name_iso_rows.append({"name_clean": c_norm, "iso3": iso.strip()})

name_iso_df = pd.DataFrame(name_iso_rows).drop_duplicates()
print("Nameâ†’ISO rows:", len(name_iso_df))
name_iso_df.head()


Nameâ†’ISO rows: 508


Unnamed: 0,name_clean,iso3
0,indonesia,IDN
2,republic of indonesia,IDN
6,malaysia,MYS
12,chile,CHL
14,republic of chile,CHL


In [42]:
len(set(name_iso_df['iso3']))

236

In [43]:
set(name_iso_df['iso3']) - set(land["ISO_A3"])

set()

In [44]:
set(land["ISO_A3"])-set(name_iso_df['iso3'])

{'-99'}

In [45]:
name_to_iso_exact = (
    name_iso_df
    .dropna(subset=["name_clean", "iso3"])
    .set_index("name_clean")["iso3"]
    .to_dict()
)

all_ne_names = list(name_to_iso_exact.keys())


In [46]:
import difflib

df_text["country_clean"] = df_text["country"].astype(str).str.strip().str.lower()

def map_country_to_iso3(name_clean, cutoff=0.80):
    if not isinstance(name_clean, str):
        return None

    # exact match
    if name_clean in name_to_iso_exact:
        return name_to_iso_exact[name_clean]

    # fuzzy match
    matches = difflib.get_close_matches(name_clean, all_ne_names, n=1, cutoff=cutoff)
    if matches:
        return name_to_iso_exact[matches[0]]

    return None

df_text["iso3"] = df_text["country_clean"].apply(map_country_to_iso3)


In [47]:
df_text.shape, len(set(df_text["iso3"]))

((254, 9), 220)

In [48]:
df_text.head(2)

Unnamed: 0,country,background,geography,bg_geo,geo_bg,country_bg_geo,country_geo_bg,country_clean,iso3
0,Greenland,"Greenland, the world's largest island, is abou...","Northern North America, island between the Arc...","Greenland, the world's largest island, is abou...","Northern North America, island between the Arc...","Country's background: ""Greenland, the world's ...","Country's geography: ""Northern North America, ...",greenland,GRL
1,United States,Thirteen of Britain's American colonies broke ...,"North America, bordering both the North Atlant...",Thirteen of Britain's American colonies broke ...,"North America, bordering both the North Atlant...","Country's background: ""Thirteen of Britain's A...","Country's geography: ""North America, bordering...",united states,USA


In [49]:
len(set(df_text["iso3"]))

220

In [50]:
manual_overrides = {
    "china": "CHN",
    "people's republic of china": "CHN",

    "cote d'ivoire": "CIV",
    "cÃ´te d'ivoire": "CIV",

    "congo, democratic republic of the": "COD",
    "democratic republic of the congo": "COD",
    "dr congo": "COD",

    "congo, republic of the": "COG",
    "republic of the congo": "COG",

    "denmark": "DNK",

    "dominican republic": "DOM",

    "finland": "FIN",

    "france": "FRA",

    "united kingdom": "GBR",
    "united kingdom of great britain and northern ireland": "GBR",
    "uk": "GBR",
    "great britain": "GBR",

    "israel": "ISR",

    "myanmar": "MMR",
    "burma": "MMR",

    "netherlands": "NLD",

    "norway": "NOR",
    # CÃ´te d'Ivoire
    "cote d'ivoire": "CIV",
    "cÃ´te d'ivoire": "CIV",

    # DR Congo
    "congo, democratic republic of the": "COD",
    "democratic republic of the congo": "COD",
    "dr congo": "COD",

    # Republic of the Congo
    "congo, republic of the": "COG",
    "republic of the congo": "COG",

    # Dominican Republic
    "dominican republic": "DOM",
    "holy see (vatican city)": "VAT",
    "vatican city": "VAT",

    "central african republic": "CAF",
    "united arab emirates": "ARE",
    "holy see (vatican city)": "VAT",  # Note: Correct ISO code is VAT, not VIT
    "c&ocirc;te d'ivoire": "CIV",
    "palestine": "PSE",
    "aland islands": "ALA",
    "Ã¥land islands": "ALA",

    # Australia
    "australia": "AUS",

    # DR Congo
    "congo, democratic republic of the": "COD",
    "democratic republic of the congo": "COD",
    "dr congo": "COD",

    # Republic of the Congo
    "congo, republic of the": "COG",
    "republic of the congo": "COG",

    # Dominican Republic
    "dominican republic": "DOM",

    # New Zealand
    "new zealand": "NZL",

    # Saint Helena, Ascension, and Tristan da Cunha
    "saint helena, ascension, and tristan da cunha": "SHN",
    "saint helena, ascension and tristan da cunha": "SHN",

    # U.S. Minor Outlying Islands
    "us minor outlying islands": "UMI",
    "u.s. minor outlying islands": "UMI",

    # U.S. Virgin Islands
    "virgin islands (u.s.)": "VIR",
    "u.s. virgin islands": "VIR",
    "gaza, gaza strip": "PSE",  # Using PSE for both Gaza and West Bank
    "west bank": "PSE",
}


In [51]:
# (optional) manual_overrides dict here if you use it

def apply_manual_overrides(row):
    cc = row["country_clean"]
    if 'manual_overrides' in globals() and cc in manual_overrides:
        return manual_overrides[cc]
    return row["iso3"]

df_text["iso3"] = df_text.apply(apply_manual_overrides, axis=1)

# Final cleanup: remove missing iso3
df_text = df_text[df_text["iso3"].notna()].copy()
print("Final rows after dropping unmapped or invalid ISO:", len(df_text))
print("Any iso3 == 'None'?:", (df_text["iso3"].astype(str).str.lower() == "none").any())


Final rows after dropping unmapped or invalid ISO: 233
Any iso3 == 'None'?: False


In [52]:
len(set(df_text["iso3"]))

232

In [53]:
set(name_iso_df['iso3']) - set(df_text["iso3"] )

{'ALA', 'COD', 'COG', 'DOM', 'UMI', 'VIR'}

In [54]:
df_text.tail(2)

Unnamed: 0,country,background,geography,bg_geo,geo_bg,country_bg_geo,country_geo_bg,country_clean,iso3
259,Uganda,"An ancient crossroads for various migrations, ...","East-Central Africa, west of Kenya, east of th...","An ancient crossroads for various migrations, ...","East-Central Africa, west of Kenya, east of th...","Country's background: ""An ancient crossroads f...","Country's geography: ""East-Central Africa, wes...",uganda,UGA
260,Mauritania,The Amazigh and Bafour people were among the e...,"Western Africa, bordering the North Atlantic O...",The Amazigh and Bafour people were among the e...,"Western Africa, bordering the North Atlantic O...","Country's background: ""The Amazigh and Bafour ...","Country's geography: ""Western Africa, borderin...",mauritania,MRT


In [55]:
df_sorted = df_text[['country', 'iso3', 'background','geography','bg_geo','geo_bg','country_bg_geo','country_geo_bg']].sort_values(by="country")


In [56]:
df_sorted.tail(2)

Unnamed: 0,country,iso3,background,geography,bg_geo,geo_bg,country_bg_geo,country_geo_bg
220,Zambia,ZMB,Bantu-speaking groups mainly from the Luba and...,"Southern Africa, east of Angola, south of the ...",Bantu-speaking groups mainly from the Luba and...,"Southern Africa, east of Angola, south of the ...","Country's background: ""Bantu-speaking groups m...","Country's geography: ""Southern Africa, east of..."
222,Zimbabwe,ZWE,The hunter-gatherer San people first inhabited...,"Southern Africa, between South Africa and Zamb...",The hunter-gatherer San people first inhabited...,"Southern Africa, between South Africa and Zamb...","Country's background: ""The hunter-gatherer San...","Country's geography: ""Southern Africa, between..."


In [58]:
output_path = base_dir + "/outputs/country_bg_geo_factbook.csv"
df_sorted.to_csv(output_path, index=False)