In [2]:
import pandas as pd
import os

In [3]:
# Read the common-countries table and collect its "ISO" column into a set;
# the cells below use this set to filter rows with `.isin(...)`.
common_countries = pd.read_csv("../data/common_files/common_countries.csv")
common_iso_codes = {iso_code for iso_code in common_countries["ISO"]}

# Number of distinct ISO codes found.
print(len(common_iso_codes))

71


In [None]:
# Filter every cleaned CSV down to the rows whose ISO code is in
# `common_iso_codes`, then write the result twice: as a CSV and as a JSON
# list of records (one object per row).
data_dir = "../data/clean_data/"
json_output_dir = "../data/analysis_data/json_files"
csv_output_dir = "../data/analysis_data/csv_files"
# Original (misspelled) name kept as an alias in case other cells refer to it.
csv_ouput_dir = csv_output_dir

# to_csv / to_json do not create missing parent directories, so make sure
# both output folders exist before the first write.
os.makedirs(json_output_dir, exist_ok=True)
os.makedirs(csv_output_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the run
# reproducible across machines and file systems.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)

    # Every input is expected to carry an "ISO" column. Fail with the file
    # name so the offending input is obvious (same exception type as before).
    if "ISO" not in df.columns:
        raise KeyError(f"'ISO' column not found in {file_path}")

    df_filtered = df[df["ISO"].isin(common_iso_codes)]
    df_filtered.to_csv(os.path.join(csv_output_dir, file), index=False)

    # Same base name, .json extension.
    json_filename = os.path.splitext(file)[0] + ".json"
    df_filtered.to_json(
        os.path.join(json_output_dir, json_filename),
        orient="records",
        indent=2,
    )

In [26]:
# Build one wide table keyed on "ISO" by outer-merging every filtered CSV.
# Descriptive / bookkeeping columns listed in `columns_to_drop` are removed
# from each file before merging.
data_dir = "../data/analysis_data/csv_files/"
merged_df = pd.DataFrame()
columns_to_drop = {"Rank", "SERIES Description", "TIME_PERIOD", "Year", "Country"}

# os.listdir order is arbitrary; sorting fixes the merge order, and with it
# the column order and which file receives the _x / _y suffix on a name clash.
# NOTE(review): if any file holds several rows per ISO, the outer merge will
# multiply rows — confirm the inputs are one row per country.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(data_dir, file)
    df = pd.read_csv(file_path)
    df = df.drop(columns=list(columns_to_drop), errors="ignore")

    # .copy() detaches the filtered rows from `df`, so later column
    # assignments on merged_df never write into a filtered view
    # (avoids SettingWithCopyWarning when only one file contributes).
    df = df[df["ISO"].isin(common_iso_codes)].copy()

    if merged_df.empty:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on=["ISO"], how="outer")


In [27]:
# "Bad" metrics are those where a HIGHER raw value is worse. They are scaled
# with the flipped formula so that, after normalization, a larger value is
# better for every column.
# NOTE(review): "MMR" is presumably maternal mortality ratio — confirm.
bad_metrics = ["MMR"]


def _min_max_scale(values, invert=False):
    """Min-max scale a numeric Series.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to scale. Missing values stay missing.
    invert : bool, default False
        When True, use (max - x) / (max - min) so the largest raw value
        maps to 0 and the smallest to 1.

    Returns
    -------
    pandas.Series
        Scaled values. A column whose non-missing values are all equal has
        zero range; it is mapped to 0.0 instead of dividing by zero (which
        would turn the whole column into NaN).
    """
    min_val = values.min()
    max_val = values.max()
    span = max_val - min_val

    if pd.notna(span) and span == 0:
        # Constant column: x - min is 0 everywhere (NaN where x is missing).
        return (values - min_val).astype(float)
    if invert:
        return (max_val - values) / span
    return (values - min_val) / span


# Only numeric columns can be scaled; string columns that survived the merge
# are left untouched instead of raising a TypeError.
# NOTE: merged_df is overwritten in place. Re-running this cell on already
# normalized data flips the "bad" metrics back, so re-run the merge cell first.
for col in merged_df.select_dtypes(include="number").columns:
    if col == "ISO":
        continue
    merged_df[col] = _min_max_scale(merged_df[col], invert=col in bad_metrics)


In [28]:
# Re-attach a "Country" column to the merged table, using the ISO -> Country
# pairs found in one of the filtered CSVs, then save the result as JSON.
country_column = pd.read_csv('../data/analysis_data/csv_files/enrollment_parity_clean.csv')
# Keep one row per ISO code: duplicate ISO rows in the lookup would multiply
# the matching rows of merged_df during the merge.
country_column = country_column[['ISO', 'Country']].drop_duplicates(subset='ISO')

# Drop a Country column left over from a previous run of this cell, so that
# re-running it does not produce Country_x / Country_y.
merged_df = merged_df.drop(columns=['Country'], errors='ignore')
merged_df = pd.merge(merged_df, country_column, on='ISO', how='outer')

# Save merged data as JSON (list of records, one object per row)
merged_df.to_json("../data/common_files/merged_data.json", orient="records", indent=2)

In [29]:
import geopandas as gpd

# Source GeoJSON; its "ISO_A3" attribute is the join key used below.
geojson_path = "../data/common_files/countries.geojson"

# Read the features and left-join merged_df onto them in one step. A left
# join keeps every feature from the GeoJSON, including those with no
# matching "ISO" row in merged_df.
gdf = gpd.read_file(geojson_path).merge(
    merged_df,
    how="left",
    left_on="ISO_A3",
    right_on="ISO",
)

# Write geometry plus the joined data columns back out as GeoJSON.
gdf.to_file("../data/common_files/merged_geojson.geojson", driver="GeoJSON")