In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
# Directory layout, all relative to the notebook location
data = Path("..", "data")
plot_path = data.joinpath("plots")
map_path = data.joinpath("graph3_map")
# Inputs/outputs of the map graph live under map_path
stat_path = map_path.joinpath("stats")
geojson_path = map_path.joinpath("geojson")

import json

# Load geojson data

- `custom.geo.json` contains less GeoJSON data than `custom50.json` but stores for each feature/country its name in Japanese (the `name_ja` property), so we need to use it
- However there are a few inconsistencies in the names (`admin` property) between the two files that we first need to fix

In [2]:
# Load custom.geo.json
# The file stores Japanese names (`name_ja`), so the encoding is set explicitly
# instead of relying on the platform default (which is not UTF-8 everywhere).
with open(geojson_path / "custom.geo.json", encoding="utf-8") as f:
    custom = json.load(f)

# One row per feature: admin name, sovereign state and Japanese name
df_custom = pd.DataFrame(
    [
        {
            "country": feature["properties"]["admin"],
            "country_aff": feature["properties"]["sovereignt"],
            "name_ja": feature["properties"]["name_ja"]
        }
        for feature in custom["features"]
    ]
)

In [3]:
# Load custom50.json
# Country names may contain non-ASCII characters (e.g. "Curaçao"), so the
# encoding is set explicitly instead of relying on the platform default.
with open(geojson_path / "custom50.json", encoding="utf-8") as f:
    custom50 = json.load(f)

# One row per feature: admin name and sovereign state
df_custom50 = pd.DataFrame(
    [
        {
            "country": feature["properties"]["admin"],
            "country_aff": feature["properties"]["sovereignt"],
        }
        for feature in custom50["features"]
    ]
)

As mentioned above, `custom50.json` contains more data than `custom.geo.json`:

In [4]:
# Country names present in custom50 but absent from custom
set(df_custom50["country"]).difference(df_custom["country"])

{'Aland',
 'American Samoa',
 'Andorra',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Aruba',
 'Ashmore and Cartier Islands',
 'Bahrain',
 'Barbados',
 'Bermuda',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Cape Verde',
 'Cayman Islands',
 'Comoros',
 'Cook Islands',
 'Curaçao',
 'Czech Republic',
 'Dominica',
 'Faroe Islands',
 'Federated States of Micronesia',
 'French Polynesia',
 'French Southern and Antarctic Lands',
 'Grenada',
 'Guam',
 'Guernsey',
 'Guinea Bissau',
 'Heard Island and McDonald Islands',
 'Hong Kong S.A.R.',
 'Indian Ocean Territories',
 'Isle of Man',
 'Jersey',
 'Kiribati',
 'Liechtenstein',
 'Macao S.A.R',
 'Macedonia',
 'Maldives',
 'Malta',
 'Marshall Islands',
 'Mauritius',
 'Monaco',
 'Montserrat',
 'Nauru',
 'Niue',
 'Norfolk Island',
 'Northern Mariana Islands',
 'Palau',
 'Pitcairn Islands',
 'Republic of Congo',
 'Saint Barthelemy',
 'Saint Helena',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Martin',
 'Saint Pierre a

In [5]:
# Country names present in custom but absent from custom50
set(df_custom["country"]).difference(df_custom50["country"])

{'Czechia',
 'Guinea-Bissau',
 'North Macedonia',
 'Republic of the Congo',
 'eSwatini'}

Those correspond to inconsistencies. We fix them by applying the following mapping:
- Czechia -> Czech Republic
- Guinea-Bissau -> Guinea Bissau
- North Macedonia -> Macedonia
- Republic of the Congo -> Republic of Congo
- eSwatini -> Swaziland

In [6]:
# custom name -> custom50 name, for the countries spelled differently in the two files
custom_to_custom50 = {
    "Czechia": "Czech Republic",
    "Guinea-Bissau": "Guinea Bissau",
    "North Macedonia": "Macedonia",
    "Republic of the Congo": "Republic of Congo",
    "eSwatini": "Swaziland",
}

# Rename the "country" (admin) and "country_aff" (sovereignt) columns of df_custom
# according to the mapping; names without an entry are kept unchanged
for column in ("country", "country_aff"):
    df_custom[column] = df_custom[column].map(
        lambda name: custom_to_custom50.get(name, name)
    )

# Every country of custom must now also exist in custom50
assert set(df_custom["country"]).issubset(set(df_custom50["country"]))

Now we can "join" `custom` to `custom50`: each feature of `custom50` receives a `name_ja` field taken from the `custom` feature that has the same `admin` value (features without a match get an empty string)


In [7]:
# Build the country -> Japanese name lookup once, instead of filtering the whole
# DataFrame for every feature. drop_duplicates keeps the first row per country,
# which matches taking `.values[0]` of the filtered frame.
name_ja_by_country = (
    df_custom.drop_duplicates(subset="country")
    .set_index("country")["name_ja"]
    .to_dict()
)

for feature in custom50["features"]:
    properties = feature["properties"]
    # Features that have no counterpart in custom get an empty Japanese name
    properties["name_ja"] = name_ja_by_country.get(properties["admin"], "")

# custom50 now contains the name_ja field for each feature/country
with open(geojson_path / "custom50_processed.json", "w") as f:
    json.dump(custom50, f)

# Join GeoJSON data with stats data
- There are inconsistencies between the names found by the geomapping and the names in the GeoJSON data that we need to fix
- Some countries are missing in the GeoJSON data so we need to add them

In [8]:
# Per-country user statistics
df_countries = pd.read_csv(stat_path / "country_num_users.csv")

# Export the alphabetically sorted country names (without the index column)
sorted_country_names = df_countries["country"].sort_values()
sorted_country_names.to_csv(map_path / "countries.csv", index=False)

In [9]:
# Countries of the stats data that are missing from custom50
# (either they have no GeoJSON feature or they are named differently)
set(df_countries["country"]).difference(df_custom50["country"])

{'Bahamas',
 'Bouvet Island',
 'Brunei Darussalam',
 'Cabo Verde',
 'Canarias',
 'Christmas Island',
 'Congo',
 'Congo DRC',
 'Curacao',
 "Côte d'Ivoire",
 'Eswatini',
 'French Guiana',
 'North Macedonia',
 'Null Island',
 'Palestinian Territory',
 'Pitcairn',
 'Russian Federation',
 'Réunion',
 'Serbia',
 'Svalbard',
 'Tanzania',
 'US Virgin Islands',
 'United States',
 'Vatican City'}

We fix those corresponding to naming inconsistencies by applying the following mapping:

- Bahamas -> The Bahamas
- Brunei Darussalam -> Brunei
- Congo -> Republic of Congo
- Côte d'Ivoire -> Ivory Coast
- Serbia -> Republic of Serbia
- Tanzania -> United Republic of Tanzania
- United States -> United States of America

- Cabo Verde -> Cape Verde
- Congo DRC -> Democratic Republic of the Congo
- Curacao -> Curaçao
- Eswatini -> Swaziland
- North Macedonia -> Macedonia
- Palestinian Territory -> Palestine
- Pitcairn -> Pitcairn Islands
- Russian Federation -> Russia
- US Virgin Islands -> United States Virgin Islands
- Vatican City -> Vatican



The remaining entries have no feature of their own and need separate GeoJSON data:
- Christmas Island (Australia)
- French Guiana (France)
- Réunion (France)
- Bouvet Island (Norway)
- Svalbard (Norway)
- Canarias (Spain)
- Null Island (None)


In [10]:
# Countries of custom50 that are missing from the stats data
# (either they have no users or they are named differently) - 61
set(df_custom50["country"]).difference(df_countries["country"])

{'Aland',
 'American Samoa',
 'Ashmore and Cartier Islands',
 'British Indian Ocean Territory',
 'Brunei',
 'Burundi',
 'Cape Verde',
 'Central African Republic',
 'Cook Islands',
 'Curaçao',
 'Democratic Republic of the Congo',
 'East Timor',
 'Eritrea',
 'Ethiopia',
 'Falkland Islands',
 'Federated States of Micronesia',
 'French Southern and Antarctic Lands',
 'Gambia',
 'Guinea Bissau',
 'Heard Island and McDonald Islands',
 'Hong Kong S.A.R.',
 'Indian Ocean Territories',
 'Ivory Coast',
 'Japan',
 'Lesotho',
 'Macao S.A.R',
 'Macedonia',
 'Malawi',
 'Montserrat',
 'Nauru',
 'Niger',
 'Niue',
 'Norfolk Island',
 'Northern Cyprus',
 'Palau',
 'Palestine',
 'Pitcairn Islands',
 'Republic of Congo',
 'Republic of Serbia',
 'Russia',
 'Rwanda',
 'Saint Barthelemy',
 'Saint Helena',
 'Saint Martin',
 'Sao Tome and Principe',
 'Siachen Glacier',
 'Sint Maarten',
 'Somaliland',
 'South Sudan',
 'Swaziland',
 'Tajikistan',
 'The Bahamas',
 'Tonga',
 'Turkmenistan',
 'United Republic of Ta

In [11]:
# Country name used in the stats data -> country name used in the GeoJSON data
country_name_map = dict(
    [
        ("Bahamas", "The Bahamas"),
        ("Brunei Darussalam", "Brunei"),
        ("Cabo Verde", "Cape Verde"),
        ("Congo", "Republic of Congo"),
        ("Congo DRC", "Democratic Republic of the Congo"),
        ("Curacao", "Curaçao"),
        ("Côte d'Ivoire", "Ivory Coast"),
        ("Eswatini", "Swaziland"),
        ("North Macedonia", "Macedonia"),
        ("Palestinian Territory", "Palestine"),
        ("Pitcairn", "Pitcairn Islands"),
        ("Russian Federation", "Russia"),
        ("Serbia", "Republic of Serbia"),
        ("Tanzania", "United Republic of Tanzania"),
        ("US Virgin Islands", "United States Virgin Islands"),
        ("United States", "United States of America"),
        ("Vatican City", "Vatican"),
    ]
)

The entries that need separate GeoJSON data are:
- Christmas Island (Australia)
- French Guiana (France)
- Réunion (France)
- Bouvet Island (Norway)
- Svalbard (Norway)
- Canarias (Spain)
- Null Island: As this place is located in the middle of the Atlantic Ocean, we discard it from the dataset

# Australia
- Christmas Island: The coordinates are stored in the "Indian Ocean Territories" feature in the GeoJSON data

In [12]:
# Locate the "Indian Ocean Territories" feature and remember its position in custom50
indian_ocean_matches = [
    (position, candidate)
    for position, candidate in enumerate(custom50["features"])
    if candidate["properties"]["admin"] == "Indian Ocean Territories"
]
indian_ocean_feature_index, indian_ocean_feature = indian_ocean_matches[0]
indian_ocean_geojson = indian_ocean_feature["geometry"]["coordinates"]
# Initial number of coordinate entries, checked again in the tests - 3
init_len_indian_ocean_geojson = len(indian_ocean_geojson)

In [13]:
# Take the entry at index 2 out of the Indian Ocean Territories coordinates;
# it is the one used for Christmas Island
christmas_geojson = indian_ocean_geojson.pop(2)

# Add a new feature for Christmas Island
christmas_feature = {
    "type": "Feature",
    "properties": {
        "admin": "Christmas Island",
        "sovereignt": "Australia",
        "name_ja": "クリスマス島",
    },
    "geometry": {"type": "Polygon", "coordinates": christmas_geojson},
}
custom50["features"].append(christmas_feature)

# Store the shortened coordinates back in the Indian Ocean Territories feature
custom50["features"][indian_ocean_feature_index]["geometry"]["coordinates"] = indian_ocean_geojson


# Norway
- Bouvet Island: We manually add the coordinates of Bouvet Island to the GeoJSON data (TODO: cite the source of the coordinates)
- Svalbard: The coordinates are stored in the Norway feature in the GeoJSON data

In [14]:
# Locate the "Norway" feature and remember its position in custom50
norway_matches = [
    (position, candidate)
    for position, candidate in enumerate(custom50["features"])
    if candidate["properties"]["admin"] == "Norway"
]
norway_feature_index, norway_feature = norway_matches[0]
norway_geojson = norway_feature["geometry"]["coordinates"]
# Initial number of coordinate entries, checked again in the tests
init_len_norway_geojson = len(norway_geojson)

In [16]:
# The entries from index 23 onwards are used for Svalbard, the first 23 stay with Norway
svalbard_geojson = norway_geojson[23:]
norway_geojson = norway_geojson[:23]

# Add a new feature for Svalbard
svalbard_feature = {
    "type": "Feature",
    "properties": {
        "admin": "Svalbard",
        "sovereignt": "Norway",
        "name_ja": "スヴァールバル諸島",
    },
    "geometry": {"type": "MultiPolygon", "coordinates": svalbard_geojson},
}
custom50["features"].append(svalbard_feature)

# Remove Svalbard from Norway
custom50["features"][norway_feature_index]["geometry"]["coordinates"] = norway_geojson

In [17]:
# Manually entered outline of Bouvet Island: a single ring of 20 coordinate
# pairs whose last pair repeats the first one
bouvet_geojson = [
    [
        [3.3573532104492, -54.382956707841295],
        [3.34911346435547, -54.38355652073028],
        [3.34877014160156, -54.3883547082951],
        [3.33606719970703, -54.39255266225887],
        [3.30448150634766, -54.39515118044875],
        [3.2862854003906, -54.3917530850907],
        [3.2842254638672, -54.396950058174404],
        [3.27701568603516, -54.40814130293444],
        [3.2842254638672, -54.431712859468426],
        [3.29246520996094, -54.439300814458164],
        [3.3123779296875, -54.45387373222012],
        [3.32988739013666, -54.456069201735104],
        [3.35460662841797, -54.450879720396834],
        [3.40919494628906, -54.450280891757465],
        [3.4270477294922, -54.44489103986841],
        [3.43906402587885, -54.42052804806503],
        [3.43013763427734, -54.40374510665786],
        [3.39443206787104, -54.39015388401024],
        [3.36387634277344, -54.38795488075697],
        [3.3573532104492, -54.382956707841295],
    ]
]

# Add a new feature for Bouvet Island
bouvet_feature = {
    "type": "Feature",
    "properties": {
        "admin": "Bouvet Island",
        "sovereignt": "Norway",
        "name_ja": "ブーベ島",
    },
    "geometry": {"type": "Polygon", "coordinates": bouvet_geojson},
}
custom50["features"].append(bouvet_feature)


# France

- Réunion
- French Guiana

Both coordinates are stored in the France feature in the GeoJSON data

In [18]:
# Locate the feature whose "admin" is "France" and remember its position in custom50
france_matches = [
    (position, candidate)
    for position, candidate in enumerate(custom50["features"])
    if candidate["properties"]["admin"] == "France"
]
france_feature_index, france_feature = france_matches[0]
france_geojson = france_feature["geometry"]["coordinates"]
# Initial number of coordinate entries, checked again in the tests
init_len_france_geojson = len(france_geojson)

In [19]:
# Entries 0 and 2 of the France coordinates are used for Réunion and French Guiana
reunion_geojson, french_guiana_geojson = france_geojson[0], france_geojson[2]

In [20]:
# Add one new feature in custom50 for Réunion, then one for French Guiana
custom50["features"].extend(
    {
        "type": "Feature",
        "properties": {
            "admin": admin,
            "sovereignt": "France",
            "name_ja": name_ja,
        },
        "geometry": {"type": "Polygon", "coordinates": coordinates},
    }
    for admin, name_ja, coordinates in (
        ("Réunion", "レユニオン", reunion_geojson),
        ("French Guiana", "フランス領ギアナ", french_guiana_geojson),
    )
)

In [21]:
# Drop entries 2 and 0 from the France coordinates.
# The higher index goes first so that index 0 still refers to the same entry.
del france_geojson[2]
del france_geojson[0]
custom50["features"][france_feature_index]["geometry"]["coordinates"] = france_geojson

In [22]:
# Among all the arrays of coordinates present in the geojson, search for the ones whose second coordinate value is between -55 and -54


# Spain

- Canarias: The coordinates are stored in the Spain feature in the GeoJSON data

In [23]:
# Locate the "Spain" feature and remember its position in custom50
spain_matches = [
    (position, candidate)
    for position, candidate in enumerate(custom50["features"])
    if candidate["properties"]["admin"] == "Spain"
]
spain_feature_index, spain_feature = spain_matches[0]
spain_geojson = spain_feature["geometry"]["coordinates"]
# Initial number of coordinate entries, checked again in the tests
init_len_spain_geojson = len(spain_geojson)

In [24]:
# The first 7 entries are used for Canarias, the rest stay with Spain
canarias_geojson = spain_geojson[:7]
spain_geojson = spain_geojson[7:]

# Add a new feature in custom50 for Canarias
canarias_feature = {
    "type": "Feature",
    "properties": {
        "admin": "Canarias",
        "sovereignt": "Spain",
        "name_ja": "カナリア諸島",
    },
    "geometry": {"type": "MultiPolygon", "coordinates": canarias_geojson},
}
custom50["features"].append(canarias_feature)

# Remove Canarias from Spain
custom50["features"][spain_feature_index]["geometry"]["coordinates"] = spain_geojson

# Store the processed GeoJSON data

In [25]:
# Write custom50, including the features added above, to custom50_processed.json
processed_geojson_file = geojson_path / "custom50_processed.json"
with processed_geojson_file.open("w") as out_file:
    json.dump(custom50, out_file)

# Tests

In [26]:
# Reload the processed file so that the checks below run on what was written to disk
processed_geojson_file = geojson_path / "custom50_processed.json"
with processed_geojson_file.open("r") as in_file:
    custom50 = json.load(in_file)

In [27]:
# Indian Ocean Territories must have lost exactly one entry (Christmas Island)
indian_ocean_candidates = [
    candidate
    for candidate in custom50["features"]
    if candidate["properties"]["admin"] == "Indian Ocean Territories"
]
indian_ocean_feature = indian_ocean_candidates[0]
indian_ocean_geojson = indian_ocean_feature["geometry"]["coordinates"]
assert len(indian_ocean_geojson) + 1 == init_len_indian_ocean_geojson

In [28]:
# Norway must have lost exactly 9 entries (moved to Svalbard)
norway_candidates = [
    candidate
    for candidate in custom50["features"]
    if candidate["properties"]["admin"] == "Norway"
]
norway_feature = norway_candidates[0]
norway_geojson = norway_feature["geometry"]["coordinates"]
assert len(norway_geojson) + 9 == init_len_norway_geojson

In [29]:
# France must have lost exactly 2 entries (Réunion and French Guiana)
france_candidates = [
    candidate
    for candidate in custom50["features"]
    if candidate["properties"]["admin"] == "France"
]
france_feature = france_candidates[0]
france_geojson = france_feature["geometry"]["coordinates"]
assert len(france_geojson) + 2 == init_len_france_geojson

In [30]:
# Spain must have lost exactly 7 entries (moved to Canarias)
spain_candidates = [
    candidate
    for candidate in custom50["features"]
    if candidate["properties"]["admin"] == "Spain"
]
spain_feature = spain_candidates[0]
spain_geojson = spain_feature["geometry"]["coordinates"]
assert len(spain_geojson) + 7 == init_len_spain_geojson