In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
data = Path("../data")
plot_path = data / "plots"
map_path = data / "graph3_map"
geojson_path = map_path / "geojson"

# Users

302,673 unique users, 55,280 unique location strings

In [2]:
users = pd.read_csv(data / "UserList.csv")
print("Number of unique users: {:,}".format(users["user_id"].nunique()))
print("Number of unique locations: {:,}".format(users["location"].nunique()))
users

Number of unique users: 302,673
Number of unique locations: 55,280


Unnamed: 0,username,user_id,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,access_rank,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes
0,karthiga,2255153,3,49,1,0,0,55.31,Female,"Chennai, India",1990-04-29,,2013-03-03,2014-02-04 01:32:00,7.43,0.0,3391.0
1,RedvelvetDaisuki,1897606,61,396,39,0,206,118.07,Female,Manila,1995-01-01,,2012-12-13,1900-05-13 02:47:00,6.78,80.0,7094.0
2,Damonashu,37326,45,195,27,25,59,83.70,Male,"Detroit,Michigan",1991-08-01,,2008-02-13,1900-03-24 12:48:00,6.15,6.0,4936.0
3,bskai,228342,25,414,2,5,11,167.16,Male,"Nayarit, Mexico",1990-12-14,,2009-08-31,2014-05-12 16:35:00,8.27,1.0,10081.0
4,shuzzable,2347781,36,72,16,2,25,35.48,,,,,2013-03-25,2015-09-09 21:54:00,9.06,7.0,2154.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302670,ScruffyPuffy,3119025,0,27,0,0,0,7.92,,,,,2013-09-06,2014-10-10 09:04:00,0.00,0.0,477.0
302671,Torasori,3975907,22,239,0,4,176,86.88,Male,"Latvia, Riga",1998-11-18,,2014-07-30,2018-05-24 21:34:46,8.98,47.0,5313.0
302672,onpc,1268417,5,169,2,5,24,38.36,Male,,,,2012-04-23,2016-12-28 14:35:00,7.72,0.0,2280.0
302673,HMicca,1289601,11,73,2,2,16,119.97,Female,"Birmingham, England",1995-08-12,,2012-05-05,2012-11-15 08:10:00,8.89,11.0,7049.0


# Prepare data for country EDA

##¬†Remove the rows with null location and strip all locations

- 302,673 users -> 156,774 users (-48%)
- 55,280 locations -> 53,762 locations (-3%)

In [3]:
# Keep users with non-null location
users = users[users["location"].notnull()]

# strip all locations
users.loc[:, "location"] = users.loc[:, "location"].str.strip()

print("Number of unique users: {:,}".format(users["user_id"].nunique()))
print("Number of unique locations: {:,}".format(users["location"].nunique()))

Number of unique users: 156,774
Number of unique locations: 53,762


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users.loc[:, "location"] = users.loc[:, "location"].str.strip()


## Map location strings to country names

In [4]:
from functools import partial
from geopy.geocoders import Nominatim
from deep_translator import GoogleTranslator

import country_converter as coco

coco.logging.getLogger().setLevel(coco.logging.ERROR)

translator = GoogleTranslator(source="auto", target="en")
geolocator = Nominatim(user_agent="GoogleV3")
geocode = partial(geolocator.geocode, language="en")


def findCountry(location):
    if location.isdigit():
        print(location, "(is digit)")
        return ""

    try:
        result = geocode(location, timeout=5)
    except:
        print(location, "(geocode)")
        return ""
    if result is not None:
        country = result.address.split(",")[-1].strip()
        return country

    country = coco.convert(names=location, to="name_short")
    if country != "not found":
        return country

    translated = translator.translate(location)
    if translated is None:
        print(location, "(translation not found)")
        return ""
    if translated == location:
        print(location, "(not found)")
        return ""

    try:
        result = geocode(translated, timeout=5)
    except:
        print(location, "->", translated, "(geocode)")
        return ""
    if result is not None:
        country = result.address.split(",")[-1].strip()
        return country

    country = coco.convert(names=translated, to="name_short")
    if country != "not found":
        return country
    print(location, "->", translated, "(not found)")
    return ""

## Cleaning locations

- Fix some locations who were mapped to an incorrect country

##¬†Merge with users dataset


In [5]:
df_ltc_clean = pd.read_csv(map_path / "location_to_country_clean.csv")
#¬†Transform df_ltc_clean to a dictionary with key "location" and value "country"
dict_ltc_clean = df_ltc_clean.set_index("location").to_dict()["country"]
dict_ltc_clean

{'Afghan': 'Afghanistan',
 'Albania': 'Albania',
 'Albania / Tirana': 'Albania',
 'Albania, Tirana': 'Albania',
 'Albania,Tirana': 'Albania',
 'Durres': 'Albania',
 'Durres, Albania': 'Albania',
 'Kavaje, Albania': 'Albania',
 'Tirana , Albania': 'Albania',
 'Tirana, Albania': 'Albania',
 'Vlore, Albania': 'Albania',
 'tirana,Albania': 'Albania',
 'tirane': 'Albania',
 'ALG': 'Algeria',
 'ALGERIA/BMR': 'Algeria',
 'ANNABA': 'Algeria',
 'Alger,Algeria': 'Algeria',
 'Algeria': 'Algeria',
 'Algeria ,Chlef': 'Algeria',
 'Algeria, Alger': 'Algeria',
 'Algeria, Algiers': 'Algeria',
 'Algeria, Chlef': 'Algeria',
 'Algeria,Barika': 'Algeria',
 'Algeria,Setif': 'Algeria',
 'Algeria-mascara': 'Algeria',
 'Algeria/Tebessa/Cheria': 'Algeria',
 'Algeria_Batna_Bou Zoran': 'Algeria',
 'Algiers': 'Algeria',
 'Algiers, Algeria': 'Algeria',
 'Alg√©rie,Jijel': 'Algeria',
 'Annaba': 'Algeria',
 'Annaba, Algeria': 'Algeria',
 'Annaba, algeria': 'Algeria',
 'Batna': 'Algeria',
 'Batna,Algeria': 'Algeria',
 

In [6]:
users["country"] = users["location"].map(dict_ltc_clean)
users

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users["country"] = users["location"].map(dict_ltc_clean)


Unnamed: 0,username,user_id,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,access_rank,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes,country
0,karthiga,2255153,3,49,1,0,0,55.31,Female,"Chennai, India",1990-04-29,,2013-03-03,2014-02-04 01:32:00,7.43,0.0,3391.0,India
1,RedvelvetDaisuki,1897606,61,396,39,0,206,118.07,Female,Manila,1995-01-01,,2012-12-13,1900-05-13 02:47:00,6.78,80.0,7094.0,Philippines
2,Damonashu,37326,45,195,27,25,59,83.70,Male,"Detroit,Michigan",1991-08-01,,2008-02-13,1900-03-24 12:48:00,6.15,6.0,4936.0,United States
3,bskai,228342,25,414,2,5,11,167.16,Male,"Nayarit, Mexico",1990-12-14,,2009-08-31,2014-05-12 16:35:00,8.27,1.0,10081.0,Mexico
5,terune_uzumaki,327311,5,5,0,0,0,15.20,Female,"Malaysia, Kuantan",1998-08-24,,2010-05-10,2012-10-18 19:06:00,9.70,6.0,920.0,Malaysia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302658,Queenjdlols,3495411,4,123,6,4,49,22.55,Female,Head in the clouds body on the ground,,,2014-01-16,1900-04-29 00:19:00,6.77,3.0,1375.0,
302662,TheClockworkGuy,4273353,5,77,8,12,61,48.03,Male,Yharnam,,,2014-11-23,1900-05-22 02:49:00,6.85,0.0,2826.0,United States
302668,ichinitan,4531649,3,24,5,0,246,8.72,,"Alexandria, Virginia",2000-05-12,,2015-04-01,2017-10-12 14:22:00,9.46,0.0,528.0,United States
302671,Torasori,3975907,22,239,0,4,176,86.88,Male,"Latvia, Riga",1998-11-18,,2014-07-30,2018-05-24 21:34:46,8.98,47.0,5313.0,Latvia


##¬†Cleaning countries

### Remove users with undefined country

- 156,774 users -> 136,275 users (-13%)

In [7]:
num_unique_users = users["user_id"].nunique()

# Users whose country is undefined (country is "" or is null)
num_undefined = users[users["country"].isnull() | (users["country"] == "")]["user_id"].nunique()
print("Number of users whose country is undefined: {:,} ({:.2f}%)".format(num_undefined, 100 * num_undefined / num_unique_users))

users = users[users["country"].notnull() & (users["country"] != "")]
print("Number of users whose country is defined: {:,}".format(users["user_id"].nunique()))

Number of users whose country is undefined: 20,499 (13.08%)
Number of users whose country is defined: 136,275


### Remove users with ambiguous country

- 136,275 users -> 135,917 users (-0.26%)

In [8]:
num_unique_users = users["user_id"].nunique()

# Users whose country is undefined (country is "" or is null)
num_ambiguous = users[users["country"] == "ambiguous"]["user_id"].nunique()
print("Number of users whose country is ambiguous: {:,} ({:.2f}%)".format(num_ambiguous, 100 * num_ambiguous / num_unique_users))

users = users[users["country"] != "ambiguous"]
print("Number of users whose country is defined: {:,}".format(users["user_id"].nunique()))

Number of users whose country is ambiguous: 358 (0.26%)
Number of users whose country is defined: 135,917


### Duplicate the rows of the users who are located in several countries

In [9]:
# for all the users who have a "country" column of the form "['', '']", remove the [] and ''. For instance, "['Finland', 'Sweden']" becomes "Finland, Sweden"
num_unique_users = users["user_id"].nunique()

users.loc[:, "country"] = (
    users.loc[:, "country"]
    .str.replace(r"^\[\'", "")
    .str.replace(r"\'\]$", "")
    .str.replace(r"\'\,\s\'", ", ")
)

# for the users who have a "country" column of the form "country1, country2, ..., countryn", duplicate the user n-1 times and assign the country1, country2, ..., countryn to the new users
users_multiple_countries = users[users["country"].str.contains(",")]
users_multiple_countries.loc[:, "country"] = users_multiple_countries.loc[:, "country"].str.split(", ")
users_multiple_countries = users_multiple_countries.explode("country")

# merge the results with the original dataframe
users = pd.concat(
    [users[~users["country"].str.contains(",")], users_multiple_countries], axis=0
)

# Check that the number of unique users is the same after the operation
assert users["user_id"].nunique() == num_unique_users
users

  .str.replace(r"^\[\'", "")
  .str.replace(r"\'\]$", "")
  .str.replace(r"\'\,\s\'", ", ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_multiple_countries.loc[:, "country"] = users_multiple_countries.loc[:, "country"].str.split(", ")


Unnamed: 0,username,user_id,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,access_rank,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes,country
0,karthiga,2255153,3,49,1,0,0,55.31,Female,"Chennai, India",1990-04-29,,2013-03-03,2014-02-04 01:32:00,7.43,0.0,3391.0,India
1,RedvelvetDaisuki,1897606,61,396,39,0,206,118.07,Female,Manila,1995-01-01,,2012-12-13,1900-05-13 02:47:00,6.78,80.0,7094.0,Philippines
2,Damonashu,37326,45,195,27,25,59,83.70,Male,"Detroit,Michigan",1991-08-01,,2008-02-13,1900-03-24 12:48:00,6.15,6.0,4936.0,United States
3,bskai,228342,25,414,2,5,11,167.16,Male,"Nayarit, Mexico",1990-12-14,,2009-08-31,2014-05-12 16:35:00,8.27,1.0,10081.0,Mexico
5,terune_uzumaki,327311,5,5,0,0,0,15.20,Female,"Malaysia, Kuantan",1998-08-24,,2010-05-10,2012-10-18 19:06:00,9.70,6.0,920.0,Malaysia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286854,AdamNyandere,4539831,11,43,7,1,25,47.37,Male,Singapore / Malaysia,1984-11-18,,2015-04-05,2018-05-23 07:28:00,9.07,60.0,3967.0,Singapore
289896,Roxas_the_key,242758,4,63,19,0,2,42.19,Female,"Germany,Greece",1991-09-11,,2009-10-05,2012-09-27 12:01:00,8.63,0.0,2521.0,Germany
289896,Roxas_the_key,242758,4,63,19,0,2,42.19,Female,"Germany,Greece",1991-09-11,,2009-10-05,2012-09-27 12:01:00,8.63,0.0,2521.0,Greece
297997,Tachibana-chan,5078714,17,105,2,4,37,24.65,Female,India/Saudi Arabia,2004-01-25,,2015-12-27,1900-05-02 03:48:00,7.32,0.0,1530.0,India


### Remove users in Japan

- 135,917 users -> 133,795 users (-1.56%)

In [10]:
num_unique_users = users["user_id"].nunique()

# Users whose country is undefined (country is "" or is null)
num_users_japan = users[users["country"] == "Japan"]["user_id"].nunique()
print("Number of users whose country is Japan: {:,} ({:.2f}%)".format(num_users_japan, 100 * num_users_japan / num_unique_users))

users = users[users["country"] != "Japan"]
print("Number of users whose country is Japan: {:,}".format(users["user_id"].nunique()))

Number of users whose country is Japan: 2,134 (1.57%)
Number of users whose country is Japan: 133,795


##¬†Comparison with the GeoJSON data

In [11]:
import json

with open(geojson_path / "custom50_ja.json", "r") as f:
    custom50 = json.load(f)

countries_geojson = pd.DataFrame(
    [
        {
            "country": feature["properties"]["admin"],
        }
        for feature in custom50["features"]
    ]
)["country"].values

print(len(countries_geojson), "countries")
countries_geojson

241 countries


array(['Aruba', 'Antigua and Barbuda', 'Anguilla', 'Saint Barthelemy',
       'The Bahamas', 'Bermuda', 'Belize', 'Costa Rica', 'Barbados',
       'Cuba', 'Cura√ßao', 'Cayman Islands', 'Dominica',
       'Dominican Republic', 'Grenada', 'Guatemala', 'Honduras', 'Canada',
       'Greenland', 'Haiti', 'Jamaica', 'Saint Kitts and Nevis',
       'Saint Lucia', 'Saint Martin', 'Nicaragua', 'Montserrat', 'Panama',
       'Puerto Rico', 'Mexico', 'Saint Pierre and Miquelon',
       'El Salvador', 'Sint Maarten', 'Turks and Caicos Islands',
       'Trinidad and Tobago', 'Saint Vincent and the Grenadines',
       'British Virgin Islands', 'United States Virgin Islands',
       'Argentina', 'Bolivia', 'United States of America', 'Colombia',
       'Brazil', 'Ecuador', 'Falkland Islands', 'Chile', 'Guyana', 'Peru',
       'Suriname', 'Uruguay', 'Paraguay', 'United Arab Emirates',
       'Venezuela', 'Afghanistan', 'Armenia', 'Azerbaijan', 'Bangladesh',
       'Bahrain', 'Brunei', 'Bhutan', 'North

In [12]:
countries_db = users["country"].unique()
print(len(countries_db), "countries")
countries_db

204 countries


array(['India', 'Philippines', 'United States', 'Mexico', 'Malaysia',
       'Netherlands', 'Poland', 'Sweden', 'United Kingdom', 'Hungary',
       'Australia', 'Canada', 'Brazil', 'Austria', 'Germany', 'Iran',
       'Romania', 'Russian Federation', 'Bulgaria', 'Argentina', 'Egypt',
       'Portugal', 'Venezuela', 'France', 'Saudi Arabia', 'Ukraine',
       'Georgia', 'Indonesia', 'Vietnam', 'Montenegro', 'China', 'Israel',
       'Tunisia', 'Spain', 'Lithuania', 'Thailand', 'Belgium', 'Turkey',
       'Singapore', 'Norway', 'United Arab Emirates', 'Pakistan',
       'Puerto Rico', 'Finland', 'New Zealand', 'Italy', 'Serbia',
       'Ireland', 'Latvia', 'Croatia', 'Guatemala',
       'Bosnia and Herzegovina', 'Switzerland', 'Greece', 'Paraguay',
       'Albania', 'Chile', 'Bangladesh', 'Morocco', 'Colombia',
       'Brunei Darussalam', 'Uzbekistan', 'South Africa', 'Denmark',
       'Taiwan', 'Oman', 'El Salvador', 'Kuwait', 'Uruguay', 'Estonia',
       'Lebanon', 'Qatar', 'Maldives',

###¬†Countries that are in our dataset  but not in the GeoJSON data 

In [13]:
# The countries that are in countries_db but not in countries_geojson
set(countries_db) - set(countries_geojson)

{'Bahamas',
 'Bouvet Island',
 'Brunei Darussalam',
 'Cabo Verde',
 'Canarias',
 'Christmas Island',
 'Congo',
 'Congo DRC',
 'Curacao',
 "C√¥te d'Ivoire",
 'Eswatini',
 'French Guiana',
 'North Macedonia',
 'Null Island',
 'Palestinian Territory',
 'Pitcairn',
 'Russian Federation',
 'R√©union',
 'Serbia',
 'Svalbard',
 'Tanzania',
 'US Virgin Islands',
 'United States',
 'Vatican City'}

They divide in two categories:

- The countries who have an name that is inconsistent with the one in the GeoJSON data -> we need to rename them
    - "Bahamas": "The Bahamas",
    - "Brunei Darussalam": "Brunei",
    - "Cabo Verde": "Cape Verde",
    - "Congo": "Republic of Congo",
    - "Congo DRC": "Democratic Republic of the Congo",
    - "C√¥te d'Ivoire": "Ivory Coast",
    - "Curacao": "Cura√ßao",
    - "Eswatini": "Swaziland",
    - "North Macedonia": "Macedonia",
    - "Palestinian Territory": "Palestine",
    - "Pitcairn": "Pitcairn Islands",
    - "Russian Federation": "Russia",
    - "Serbia": "Republic of Serbia",
    - "Tanzania": "United Republic of Tanzania",
    - "US Virgin Islands": "United States Virgin Islands",
    - "United States": "United States of America",
    - "Vatican City": "Vatican"

- The countries that are missing in the GeoJSON data so we need to add them -> we need to find the coordinates of their boundaries and add them to the GeoJSON data
    - Christmas Island (Australia)
    - French Guiana (France)
    - R√©union (France)
    - Bouvet Island (Norway)
    - Svalbard (Norway)
    - Canarias (Spain)
    - Null Island: As this place is located in the middle of the Atlantic Ocean, we discard it from the dataset

### Rename inconsistent countries

In [14]:
db_to_geojson = {
    "Bahamas": "The Bahamas",
    "Brunei Darussalam": "Brunei",
    "Cabo Verde": "Cape Verde",
    "Congo": "Republic of Congo",
    "Congo DRC": "Democratic Republic of the Congo",
    "Curacao": "Cura√ßao",
    "C√¥te d'Ivoire": "Ivory Coast",
    "Eswatini": "Swaziland",
    "North Macedonia": "Macedonia",
    "Palestinian Territory": "Palestine",
    "Pitcairn": "Pitcairn Islands",
    "Russian Federation": "Russia",
    "Serbia": "Republic of Serbia",
    "Tanzania": "United Republic of Tanzania",
    "US Virgin Islands": "United States Virgin Islands",
    "United States": "United States of America",
    "Vatican City": "Vatican"
}

In [15]:
#¬†Map the countries in user
users["country"] = users["country"].replace(db_to_geojson)
users

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users["country"] = users["country"].replace(db_to_geojson)


Unnamed: 0,username,user_id,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,access_rank,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes,country
0,karthiga,2255153,3,49,1,0,0,55.31,Female,"Chennai, India",1990-04-29,,2013-03-03,2014-02-04 01:32:00,7.43,0.0,3391.0,India
1,RedvelvetDaisuki,1897606,61,396,39,0,206,118.07,Female,Manila,1995-01-01,,2012-12-13,1900-05-13 02:47:00,6.78,80.0,7094.0,Philippines
2,Damonashu,37326,45,195,27,25,59,83.70,Male,"Detroit,Michigan",1991-08-01,,2008-02-13,1900-03-24 12:48:00,6.15,6.0,4936.0,United States of America
3,bskai,228342,25,414,2,5,11,167.16,Male,"Nayarit, Mexico",1990-12-14,,2009-08-31,2014-05-12 16:35:00,8.27,1.0,10081.0,Mexico
5,terune_uzumaki,327311,5,5,0,0,0,15.20,Female,"Malaysia, Kuantan",1998-08-24,,2010-05-10,2012-10-18 19:06:00,9.70,6.0,920.0,Malaysia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286854,AdamNyandere,4539831,11,43,7,1,25,47.37,Male,Singapore / Malaysia,1984-11-18,,2015-04-05,2018-05-23 07:28:00,9.07,60.0,3967.0,Singapore
289896,Roxas_the_key,242758,4,63,19,0,2,42.19,Female,"Germany,Greece",1991-09-11,,2009-10-05,2012-09-27 12:01:00,8.63,0.0,2521.0,Germany
289896,Roxas_the_key,242758,4,63,19,0,2,42.19,Female,"Germany,Greece",1991-09-11,,2009-10-05,2012-09-27 12:01:00,8.63,0.0,2521.0,Greece
297997,Tachibana-chan,5078714,17,105,2,4,37,24.65,Female,India/Saudi Arabia,2004-01-25,,2015-12-27,1900-05-02 03:48:00,7.32,0.0,1530.0,India


In [16]:
countries_db = users["country"].unique()
print(len(countries_db), "countries")
countries_db

204 countries


array(['India', 'Philippines', 'United States of America', 'Mexico',
       'Malaysia', 'Netherlands', 'Poland', 'Sweden', 'United Kingdom',
       'Hungary', 'Australia', 'Canada', 'Brazil', 'Austria', 'Germany',
       'Iran', 'Romania', 'Russia', 'Bulgaria', 'Argentina', 'Egypt',
       'Portugal', 'Venezuela', 'France', 'Saudi Arabia', 'Ukraine',
       'Georgia', 'Indonesia', 'Vietnam', 'Montenegro', 'China', 'Israel',
       'Tunisia', 'Spain', 'Lithuania', 'Thailand', 'Belgium', 'Turkey',
       'Singapore', 'Norway', 'United Arab Emirates', 'Pakistan',
       'Puerto Rico', 'Finland', 'New Zealand', 'Italy',
       'Republic of Serbia', 'Ireland', 'Latvia', 'Croatia', 'Guatemala',
       'Bosnia and Herzegovina', 'Switzerland', 'Greece', 'Paraguay',
       'Albania', 'Chile', 'Bangladesh', 'Morocco', 'Colombia', 'Brunei',
       'Uzbekistan', 'South Africa', 'Denmark', 'Taiwan', 'Oman',
       'El Salvador', 'Kuwait', 'Uruguay', 'Estonia', 'Lebanon', 'Qatar',
       'Maldives',

We retrieve the countries that need separate GeoJSON data

In [17]:
countries_db_mapped = users["country"].unique()
assert len(countries_db_mapped) == len(countries_db)
# The countries that are in countries_db but not in countries_geojson
set(countries_db_mapped) - set(countries_geojson)

{'Bouvet Island',
 'Canarias',
 'Christmas Island',
 'French Guiana',
 'Null Island',
 'R√©union',
 'Svalbard'}

Save the user processed data

In [18]:
users.to_csv(map_path / "user_country.csv", index=False)

### Add missing GeoJSON data

####¬†Australia
- Christmas Island: The coordinates are stored in the "Indian Ocean Territories" feature in the GeoJSON data

In [19]:
#¬†Same as above but for Indian Ocean Territories
indian_ocean_feature_index, indian_ocean_feature = [(i, feature) for i, feature in enumerate(custom50["features"]) if feature["properties"]["admin"] == "Indian Ocean Territories"][0]
indian_ocean_geojson = indian_ocean_feature["geometry"]["coordinates"]
init_len_indian_ocean_geojson = len(indian_ocean_geojson) #¬†3

In [20]:
christmas_geojson = indian_ocean_geojson[2]
indian_ocean_geojson.pop(2)

# Add a new feature for Christmas Island
custom50["features"].append({
    "type": "Feature",
    "properties": {
        "admin": "Christmas Island",
        "sovereignt": "Australia",
        "name_sort": "Christmas Island",
        "name_ja": "„ÇØ„É™„Çπ„Éû„ÇπÂ≥∂"
    },
    "geometry": {
        "type": "Polygon",
        "coordinates": christmas_geojson
    }
})

#¬†Remove Christmas Island from Indian Ocean Territories
custom50["features"][indian_ocean_feature_index]["geometry"]["coordinates"] = indian_ocean_geojson


####¬†Norway
- Bouvet Island: We add manually the coordinates of Bouvet Island in the GeoJSON data, from [SITE](???)
- Svalbard: The coordinates are stored in the Norway feature in the GeoJSON data

In [21]:
norway_feature_index, norway_feature = [(i, feature) for i, feature in enumerate(custom50["features"]) if feature["properties"]["admin"] == "Norway"][0]
norway_geojson = norway_feature["geometry"]["coordinates"]
init_len_norway_geojson = len(norway_geojson)

In [22]:
norway_geojson, svalbard_geojson = norway_geojson[:23], norway_geojson[23:]

#¬†Add a new feature for Svalbard
custom50["features"].append({
    "type": "Feature",
    "properties": {
        "admin": "Svalbard",
        "sovereignt": "Norway",
        "name_sort": "Svalbard",
        "name_ja": "„Çπ„É¥„Ç°„Éº„É´„Éê„É´Ë´∏Â≥∂"
    },
    "geometry": {
        "type": "MultiPolygon",
        "coordinates": svalbard_geojson
    }
})

#¬†Remove Svalbard from Norway
custom50["features"][norway_feature_index]["geometry"]["coordinates"] = norway_geojson

In [23]:
bouvet_geojson = [
    [
        [
            3.3573532104492,
            -54.382956707841295
        ],
        [
            3.34911346435547,
            -54.38355652073028
        ],
        [
            3.34877014160156,
            -54.3883547082951
        ],
        [
            3.33606719970703,
            -54.39255266225887
        ],
        [
            3.30448150634766,
            -54.39515118044875
        ],
        [
            3.2862854003906,
            -54.3917530850907
        ],
        [
            3.2842254638672,
            -54.396950058174404
        ],
        [
            3.27701568603516,
            -54.40814130293444
        ],
        [
            3.2842254638672,
            -54.431712859468426
        ],
        [
            3.29246520996094,
            -54.439300814458164
        ],
        [
            3.3123779296875,
            -54.45387373222012
        ],
        [
            3.32988739013666,
            -54.456069201735104
        ],
        [
            3.35460662841797,
            -54.450879720396834
        ],
        [
            3.40919494628906,
            -54.450280891757465
        ],
        [
            3.4270477294922,
            -54.44489103986841
        ],
        [
            3.43906402587885,
            -54.42052804806503
        ],
        [
            3.43013763427734,
            -54.40374510665786
        ],
        [
            3.39443206787104,
            -54.39015388401024
        ],
        [
            3.36387634277344,
            -54.38795488075697
        ],
        [
            3.3573532104492,
            -54.382956707841295
        ]
    ]
]

#¬†Add a new feature for Bouvet Island
custom50["features"].append({
    "type": "Feature",
    "properties": {
        "admin": "Bouvet Island",
        "sovereignt": "Norway",
        "name_sort": "Bouvet Island",
        "name_ja": "„Éñ„Éº„ÉôÂ≥∂"
    },
    "geometry": {
        "type": "Polygon",
        "coordinates": bouvet_geojson
    }
})


#### France

- R√©union
- French Guiana

Both coordinates are stored in the France feature in the GeoJSON data

In [24]:
# Select the feature whose "admin" is "France"
france_feature_index, france_feature = [(i, feature) for i, feature in enumerate(custom50["features"]) if feature["properties"]["admin"] == "France"][0]
france_geojson = france_feature["geometry"]["coordinates"]
init_len_france_geojson = len(france_geojson)

In [25]:
reunion_geojson = france_geojson[0]
french_guiana_geojson = france_geojson[2]

In [26]:
#¬†Add a new feature in custom50 for Reunion
custom50["features"].append({
    "type": "Feature",
    "properties": {
        "admin": "R√©union",
        "sovereignt": "France",
        "name_sort": "R√©union",
        "name_ja": "„É¨„É¶„Éã„Ç™„É≥"
    },
    "geometry": {
        "type": "Polygon",
        "coordinates": reunion_geojson
    }
})


# Add a new feature in custom50 for French Guiana
custom50["features"].append({
    "type": "Feature",
    "properties": {
        "admin": "French Guiana",
        "sovereignt": "France",
        "name_sort": "French Guiana",
        "name_ja": "„Éï„É©„É≥„ÇπÈ†ò„ÇÆ„Ç¢„Éä"
    },
    "geometry": {
        "type": "Polygon",
        "coordinates": french_guiana_geojson
    }
})

In [27]:
# Remove the elements at index 0 and 2 of the coordinates of the France feature
france_geojson.pop(2)
france_geojson.pop(0)
custom50["features"][france_feature_index]["geometry"]["coordinates"] = france_geojson

In [28]:
#¬†Among all the arrays of coordinates present in the geojson, search for the ones whose second coordinate value is between -55 and -54


####¬†Spain

- Canarias: The coordinates are stored in the Spain feature in the GeoJSON data

In [29]:
# Get the index and the feature for Spain
spain_feature_index, spain_feature = [(i, feature) for i, feature in enumerate(custom50["features"]) if feature["properties"]["admin"] == "Spain"][0]
# spain_feature = [feature for feature in custom50["features"] if feature["properties"]["admin"] == "Spain"][0]
spain_geojson = spain_feature["geometry"]["coordinates"]
init_len_spain_geojson = len(spain_geojson)

In [30]:
canarias_geojson, spain_geojson = spain_geojson[:7], spain_geojson[7:]

#¬†Add a new feature in custom50 for Canarias
custom50["features"].append({
    "type": "Feature",
    "properties": {
        "admin": "Canarias",
        "sovereignt": "Spain",
        "name_sort": "Canarias",
        "name_ja": "„Ç´„Éä„É™„Ç¢Ë´∏Â≥∂"
    },
    "geometry": {
        "type": "MultiPolygon",
        "coordinates": canarias_geojson
    }
})

custom50["features"][spain_feature_index]["geometry"]["coordinates"] = spain_geojson

Save the processed GeoJSON data

In [31]:
with open(geojson_path / "custom50_processed.json", "w") as f:
    json.dump(custom50, f)

Tests

In [32]:
with open(geojson_path / "custom50_processed.json", "r") as f:
    custom50 = json.load(f)

In [33]:
indian_ocean_feature = [feature for feature in custom50["features"] if feature["properties"]["admin"] == "Indian Ocean Territories"][0]
indian_ocean_geojson = indian_ocean_feature["geometry"]["coordinates"]
assert len(indian_ocean_geojson) == init_len_indian_ocean_geojson - 1

In [34]:
norway_feature = [feature for feature in custom50["features"] if feature["properties"]["admin"] == "Norway"][0]
norway_geojson = norway_feature["geometry"]["coordinates"]
assert len(norway_geojson) == init_len_norway_geojson - 9

In [35]:
france_feature = [feature for feature in custom50["features"] if feature["properties"]["admin"] == "France"][0]
france_geojson = france_feature["geometry"]["coordinates"]
assert len(france_geojson) == init_len_france_geojson - 2

In [36]:
spain_feature = [feature for feature in custom50["features"] if feature["properties"]["admin"] == "Spain"][0]
spain_geojson = spain_feature["geometry"]["coordinates"]
assert len(spain_geojson) == init_len_spain_geojson - 7