## Imports

In [1]:
import pandas as pd
import altair as alt
import kagglehub

In [2]:
import os
repo_url = "https://github.com/est-her393/data-visual-final.git"
repo_dir = "data-visual-final"

if os.path.exists(repo_dir) and os.listdir(repo_dir):
    print(f"Directory '{repo_dir}' already exists. Skipping clone.")
else:
    print(f"Cloning '{repo_url}' into '{repo_dir}'...")
    !git clone {repo_url}

print("\nFiles in repo:")
print(os.listdir(repo_dir))

Directory 'data-visual-final' already exists. Skipping clone.

Files in repo:
['owid-co2-data.csv', 'annual-co2-emissions-per-country.csv', 'gain.csv', 'vulnerability.csv', 'co2-fossil-plus-land-use.csv', 'annual-co-emissions-by-region.csv', '.git', 'GPD.csv', 'GINI.csv', 'DVP_v3.ipynb', 'readiness.csv']


## Kaggle datasets for continents and latitude/longitude

In [3]:
# Download the latest version of the country -> continent lookup table.
# kagglehub returns the local directory that holds the dataset files. That
# location depends on the environment (on Colab it resolves to
# /kaggle/input/...), so the CSV path is built from the returned value rather
# than hard-coded.
path = kagglehub.dataset_download("hserdaraltan/countries-by-continent")

print("Path to dataset files:", path)
continent_df = pd.read_csv(os.path.join(path, "Countries by continents.csv"))
continent_df.head()

Using Colab cache for faster access to the 'countries-by-continent' dataset.
Path to dataset files: /kaggle/input/countries-by-continent


Unnamed: 0,Continent,Country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina


In [4]:
# Download the latest version of the per-country facts table (includes
# Latitude / Longitude). The returned directory is environment-specific, so
# the CSV is located relative to it instead of a hard-coded /kaggle/input path.
path = kagglehub.dataset_download("nelgiriyewithana/countries-of-the-world-2023")

print("Path to dataset files:", path)

# Lift Altair's default row limit so larger frames can be embedded in charts.
alt.data_transformers.disable_max_rows()

kaggle_df = pd.read_csv(os.path.join(path, "world-data-2023.csv"))

kaggle_df.head()


Using Colab cache for faster access to the 'countries-of-the-world-2023' dataset.
Path to dataset files: /kaggle/input/countries-of-the-world-2023


Unnamed: 0,Country,Density\n(P/Km2),Abbreviation,Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,Calling Code,Capital/Major City,Co2-Emissions,...,Out of pocket health expenditure,Physicians per thousand,Population,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,Latitude,Longitude
0,Afghanistan,60,AF,58.10%,652230,323000.0,32.49,93.0,Kabul,8672,...,78.40%,0.28,38041754,48.90%,9.30%,71.40%,11.12%,9797273,33.93911,67.709953
1,Albania,105,AL,43.10%,28748,9000.0,11.78,355.0,Tirana,4536,...,56.90%,1.2,2854191,55.70%,18.60%,36.60%,12.33%,1747593,41.153332,20.168331
2,Algeria,18,DZ,17.40%,2381741,317000.0,24.28,213.0,Algiers,150006,...,28.10%,1.72,43053054,41.20%,37.20%,66.10%,11.70%,31510100,28.033886,1.659626
3,Andorra,164,AD,40.00%,468,,7.2,376.0,Andorra la Vella,469,...,36.40%,3.33,77142,,,,,67873,42.506285,1.521801
4,Angola,26,AO,47.50%,1246700,117000.0,40.73,244.0,Luanda,34693,...,33.40%,0.21,31825295,77.50%,9.20%,49.10%,6.89%,21061025,-11.202692,17.873887


# Visualization curation

In [5]:
import pandas as pd
import altair as alt

alt.data_transformers.disable_max_rows()

# ======================================
# 1. Loading data
# ======================================
# Resolve the Kaggle dataset directories through kagglehub rather than
# hard-coding "/kaggle/input/...": that prefix only exists on Colab/Kaggle.
# Datasets that were already fetched are served from kagglehub's local cache.
continent_dir = kagglehub.dataset_download("hserdaraltan/countries-by-continent")
world_dir = kagglehub.dataset_download("nelgiriyewithana/countries-of-the-world-2023")

gini_df = pd.read_csv(os.path.join(repo_dir, "GINI.csv"))
co2_df = pd.read_csv(os.path.join(repo_dir, "owid-co2-data.csv"))
coords = pd.read_csv(os.path.join(continent_dir, "Countries by continents.csv"))
world_df = pd.read_csv(os.path.join(world_dir, "world-data-2023.csv"))

# EXPECTED COLUMNS (the ones the rest of this cell relies on):
# gini_df:  'Country Name', 'Country Code', 'Indicator Name', 'Indicator Code', '1960'..'2024'
# co2_df:   'country', 'year', 'co2', ...
# coords:   'Country', 'Continent', ...
# world_df: 'Country', 'Latitude', 'Longitude', ...

# ======================================
# 2. Normalize country names
# ======================================
def norm(x):
    """Return ``x`` as a join key: stringified, trimmed of surrounding whitespace, lowercased.

    Non-string inputs are converted with ``str`` first, so e.g. ``None``
    becomes ``"none"`` and a float NaN becomes ``"nan"``.
    """
    as_text = str(x)
    trimmed = as_text.strip()
    return trimmed.lower()

# Attach the shared, normalized join key to each source table.
# assign() returns a new frame, so the objects returned by read_csv are not
# modified in place.
gini_df = gini_df.assign(country_norm=gini_df["Country Name"].apply(norm))
co2_df = co2_df.assign(country_norm=co2_df["country"].apply(norm))
coords = coords.assign(country_norm=coords["Country"].apply(norm))
world_df = world_df.assign(country_norm=world_df["Country"].apply(norm))

# ======================================
# 3. Get latest GINI per country (from year columns)
# ======================================
# Year columns are the ones whose header consists only of digits (e.g. "1960").
year_cols = [c for c in gini_df.columns if c.isdigit()]
if not year_cols:
    # Without year columns there is nothing to reshape; stop here rather than
    # silently producing an empty map further down.
    raise ValueError("GINI.csv has no year columns (expected headers such as '1960').")

# Wide -> long: one row per (country, year) pair.
gini_long = gini_df.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code", "country_norm"],
    value_vars=year_cols,
    var_name="year",
    value_name="GINI"
)
# melt() keeps the former column headers as strings; cast to int so the sort
# below is chronological rather than lexicographic and `year` is numeric downstream.
gini_long["year"] = gini_long["year"].astype(int)

# Most recent non-missing GINI observation for each country.
gini_latest = (
    gini_long.sort_values("year", kind="stable")
             .dropna(subset=["GINI"])
             .groupby("country_norm")
             .tail(1)[["country_norm", "Country Name", "GINI", "year"]]
)

# ======================================
# 4. Get latest CO2 per country
# ======================================
# Most recent non-missing CO2 reading per country, built step by step:
# order by year, discard rows without a CO2 value, keep each country's last row.
_co2_by_year = co2_df.sort_values("year")
_co2_observed = _co2_by_year.dropna(subset=["co2"])
_co2_last_rows = _co2_observed.groupby("country_norm").tail(1)

co2_latest = _co2_last_rows[["country_norm", "country", "year", "co2"]].rename(
    columns={"co2": "CO2"}
)

# ======================================
# 5. Prep continent + coordinates
# ======================================
# Both tables are used as right-hand sides of left merges keyed on
# `country_norm`. A key that appears more than once on the right would
# duplicate the matching row in the merged frame, so each lookup is reduced to
# one row per key (first occurrence wins).
# NOTE(review): whether the source files actually repeat a country is not
# visible here; with unique keys this is a no-op.
coords_slim = coords[["country_norm", "Continent"]].drop_duplicates(subset="country_norm")

world_coords = (
    world_df[["country_norm", "Country", "Latitude", "Longitude"]]
    .rename(columns={"Country": "Country_latlon"})
    .drop_duplicates(subset="country_norm")
)

# ======================================
# 6. Merge into ONE master dataframe
# ======================================
# Start from the latest-GINI table and left-join the other sources one at a
# time, so every country with a GINI value is kept even when a lookup misses.
_gini_base = gini_latest.rename(columns={"Country Name": "Country"})
_co2_values = co2_latest[["country_norm", "CO2"]]

df = _gini_base.merge(_co2_values, on="country_norm", how="left")
df = df.merge(world_coords, on="country_norm", how="left")
df = df.merge(coords_slim, on="country_norm", how="left")

# Coordinates must be numeric for the projection; unparseable values become NaN.
for _coord_col in ("Latitude", "Longitude"):
    df[_coord_col] = pd.to_numeric(df[_coord_col], errors="coerce")

# Longitudes expressed on a 0..360 scale are wrapped back into -180..180.
_beyond_180 = df["Longitude"] > 180
df.loc[_beyond_180, "Longitude"] = df.loc[_beyond_180, "Longitude"] - 360

# ======================================
# 6b. Clean final country names + harmonize for topojson
# ======================================
# Display name: the country name with surrounding whitespace removed.
df["Country_final"] = df["Country"].astype(str).str.strip()

# Spellings that differ from the `properties.name` values in the world-atlas
# countries-110m TopoJSON loaded for the choropleth. That file uses Natural
# Earth short names (e.g. "Czechia", "South Korea", "Congo", "eSwatini"), so
# those already-matching names are deliberately NOT rewritten.
# NOTE(review): targets follow the world-atlas@2 name list; spot-check against
# the file if the pinned version changes. Keys absent from the data are ignored.
TOPOJSON_NAME_OVERRIDES = {
    # Spellings handled by the original override list.
    "United States": "United States of America",
    "Democratic Republic of Congo": "Dem. Rep. Congo",
    "Eswatini": "eSwatini",
    "North Macedonia": "Macedonia",
    "Ivory Coast": "Côte d'Ivoire",
    "Cape Verde": "Cabo Verde",
    "Micronesia (country)": "Micronesia",
    # Long-form names that the map abbreviates or spells differently.
    "Democratic Republic of the Congo": "Dem. Rep. Congo",
    "Republic of the Congo": "Congo",
    "Czech Republic": "Czechia",
    "Swaziland": "eSwatini",
    "Cote d'Ivoire": "Côte d'Ivoire",
    "Bosnia and Herzegovina": "Bosnia and Herz.",
    "Central African Republic": "Central African Rep.",
    "Dominican Republic": "Dominican Rep.",
    "Equatorial Guinea": "Eq. Guinea",
    "South Sudan": "S. Sudan",
    "Solomon Islands": "Solomon Is.",
    # World Bank style spellings (the GINI table uses World Bank column names,
    # so presumably its country names follow the same convention - verify).
    "Congo, Dem. Rep.": "Dem. Rep. Congo",
    "Congo, Rep.": "Congo",
    "Korea, Rep.": "South Korea",
    "Korea, Dem. People's Rep.": "North Korea",
    "Russian Federation": "Russia",
    "Egypt, Arab Rep.": "Egypt",
    "Iran, Islamic Rep.": "Iran",
    "Venezuela, RB": "Venezuela",
    "Yemen, Rep.": "Yemen",
    "Syrian Arab Republic": "Syria",
    "Lao PDR": "Laos",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Slovak Republic": "Slovakia",
    "Gambia, The": "Gambia",
    "Bahamas, The": "Bahamas",
    "Turkiye": "Turkey",
    "Viet Nam": "Vietnam",
    "Brunei Darussalam": "Brunei",
}

# Lookup key for the map: the override where one exists, otherwise the display
# name unchanged. map() + fillna() performs a single exact-match pass, so an
# override target can never be rewritten a second time.
df["country_map"] = (
    df["Country_final"].map(TOPOJSON_NAME_OVERRIDES).fillna(df["Country_final"])
)

# CO2 divided by GINI: the value shown in tooltips/labels and used to size the bubbles.
df["CO2_per_GINI"] = df["CO2"].div(df["GINI"])

# Observed GINI range, passed to the color scale as an explicit domain.
_gini_values = df["GINI"]
gini_min, gini_max = float(_gini_values.min()), float(_gini_values.max())

# ======================================
# 7. Choropleth: GINI (yellow→red) + graticule
# ======================================
# Country outlines: the "countries" object of the world-atlas 110m TopoJSON.
world_topo = alt.topo_feature(
    "https://cdn.jsdelivr.net/npm/world-atlas@2/countries-110m.json",
    "countries"
)

# Background grid: unfilled light-gray graticule lines drawn under the countries.
graticule = (
    alt.Chart(alt.graticule())
      .mark_geoshape(
          fill=None,
          stroke="lightgray",
          strokeWidth=0.4
      )
)

# Main choropleth: each shape is joined to `df` by name and colored by GINI.
choro = (
    alt.Chart(world_topo)
      .mark_geoshape(stroke="white", strokeWidth=0.4)
      .transform_lookup(
          # Shape name on the left, harmonized `country_map` on the right.
          lookup="properties.name",
          from_=alt.LookupData(
              df,
              key="country_map",
              fields=["Country_final", "Continent", "GINI", "CO2", "CO2_per_GINI"]
          )
      )
      .encode(
          color=alt.condition(
              # Shapes without a GINI value fall through to the gray fallback below.
              "datum.GINI != null",
              alt.Color(
                  "GINI:Q",
                  title="Income inequality (GINI)",
                  scale=alt.Scale(
                      scheme="yelloworangered",
                      # Pin the scale to the observed range; do not round the ends.
                      domain=[gini_min, gini_max],
                      nice=False
                  ),
                  legend=alt.Legend(
                      orient="bottom",
                      title="Income inequality (GINI)",
                      tickCount=5
                  )
              ),
              alt.value("#4d4d4d")
          ),
          # Titles previously contained mis-encoded text ("COâ‚‚"); they now read "CO₂".
          tooltip=[
              alt.Tooltip("Country_final:N",  title="Country"),
              alt.Tooltip("Continent:N"),
              alt.Tooltip("GINI:Q",          title="GINI",       format=".2f"),
              alt.Tooltip("CO2:Q",           title="CO₂",        format=".2f"),
              alt.Tooltip("CO2_per_GINI:Q",  title="CO₂ / GINI", format=".2f")
          ]
      )
)

# ======================================
# 8. Bubbles + labels for top 12 CO2_per_GINI
# ======================================
# Number of countries highlighted with a bubble (largest CO2_per_GINI first).
TOP_N_BUBBLES = 12

# Only rows that can be positioned (lat/lon) and sized (CO2_per_GINI) qualify.
bubble_df = df.dropna(subset=["Latitude", "Longitude", "CO2_per_GINI"]).copy()
bubble_df = bubble_df.sort_values("CO2_per_GINI", ascending=False).head(TOP_N_BUBBLES)

# Green circles at each country's coordinates, sized by CO2_per_GINI.
# Legend/tooltip titles previously contained mis-encoded text ("COâ‚‚"); fixed to "CO₂".
bubbles = (
    alt.Chart(bubble_df)
      .mark_circle(opacity=0.85)
      .encode(
          longitude="Longitude:Q",
          latitude="Latitude:Q",
          size=alt.Size(
              "CO2_per_GINI:Q",
              title="CO₂ per GINI index",
              scale=alt.Scale(range=[300, 2000]),
              legend=alt.Legend(
                  title="CO₂ per GINI index",
                  orient="right"
              )
          ),
          color=alt.value("#1a9850"),
          tooltip=[
              alt.Tooltip("Country_final:N", title="Country"),
              alt.Tooltip("Continent:N"),
              alt.Tooltip("GINI:Q",         title="GINI",         format=".2f"),
              alt.Tooltip("CO2:Q",          title="CO₂",          format=".2f"),
              alt.Tooltip("CO2_per_GINI:Q", title="CO₂ per GINI", format=".2f")
          ]
      )
)

# The CO2_per_GINI value (one decimal) printed at the center of each bubble.
labels = (
    alt.Chart(bubble_df)
      .mark_text(
          align="center",
          baseline="middle",
          fontSize=9,
          fontWeight="bold",
          opacity=1
      )
      .encode(
          longitude="Longitude:Q",
          latitude="Latitude:Q",
          text=alt.Text("CO2_per_GINI:Q", format=".1f")
      )
)


# ======================================
# 9. Continent labels
# ======================================
# One (name, latitude, longitude) anchor per continent label drawn on the map.
_continent_anchor_rows = [
    ("North America", 50, -100),
    ("South America", -20, -60),
    ("Europe", 55, 15),
    ("Africa", 0, 20),
    ("Asia", 30, 90),
]
continent_centers = pd.DataFrame(
    _continent_anchor_rows,
    columns=["Continent", "Latitude", "Longitude"],
)

# Faint, light-weight text so the names sit behind the data visually.
_faint_text_style = dict(fontSize=12, fontWeight="lighter", opacity=0.35)

continent_labels = alt.Chart(continent_centers).mark_text(**_faint_text_style).encode(
    longitude="Longitude:Q",
    latitude="Latitude:Q",
    text="Continent:N",
)

# ======================================
# 10. Final full-world map
# ======================================
# Stack the layers back to front: grid, country fill, bubbles, bubble values,
# continent names. All layers share one Equal Earth projection.
final_map = (
    graticule + choro + bubbles + labels + continent_labels
).project(
    type="equalEarth"
).properties(
    width=1000,
    height=520,
    # Title previously contained mis-encoded text ("COâ‚‚"); fixed to "CO₂".
    title="Global Inequality vs CO₂ Emissions (Full-World View)"
).configure_view(
    # No border around the plotting area.
    stroke=None
).configure_legend(
    labelFontSize=11,
    titleFontSize=12
)

# Last expression of the cell: renders the chart.
final_map
