# Analysis of Country Tags and Mapping to Country Codes

Goal: have a standardized association of products with countries through country codes.

Data source is the OpenFoodFacts `country_tags` field. The values are country names in several languages.

Correct mapping is complicated due to several issues:
- wrong lc prefix for country name
- wrong country name, e.g. russia for russian federation

In [1]:
import pandas as pd
import gettext
import unicodedata
from pycountry import countries, LOCALES_DIR

In [2]:
# Load the file
file_path = "../data/countries_frequency.csv"
df = pd.read_csv(file_path)

# Display the first few rows to inspect the structure of the data
df.head()

Unnamed: 0,country,frequency
0,en:france,1109929
1,en:united-states,712983
2,en:spain,333584
3,en:germany,319896
4,en:italy,250943


## Using pycountry

In [3]:
def remove_prefix(name):
    return name.split(":", 1)[-1]  # Remove the prefix (e.g., "en:", "fr:")

def replace_dash(name):
    return name.replace("-", " ")

def get_country_info(name):
    try:
        country = countries.lookup(name)
        country_code = country.alpha_2
        #flag = chr(127397 + ord(country_code[0])) + chr(127397 + ord(country_code[1]))
        flag = country.flag
        return country_code, flag
    except LookupError:
        return None, None

country_map = {
    "russia": "Russian Federation",
    "turkey": "Türkiye",
    "turkiye": "Türkiye",
    "brunei": "Brunei Darussalam",
    "the bahamas": "Bahamas",
    "reunion": "Réunion",
    "france la reunion": "Réunion",
    "deutschland": "Germany",
    "frankreich": "France",
    "schweiz": "Switzerland",
    "vereinigte staaten von amerika": "United States",
    "francia": "France",
    "belgique": "Belgium",
    "etats unis": "United States",
    "estados unidos": "United States",
    "cote d ivoire": "Côte d'Ivoire",
    "palestinian territories": "Palestine, State of",
    "democratic republic of the congo": "Congo, The Democratic Republic of the",
    "ישראל": "Israel",
}

def map_country_names(name):
    if name in country_map:
        return country_map[name]
    return name

#df.country.apply(remove_prefix).apply(replace_dash)

result = df.country.apply(remove_prefix).apply(replace_dash).apply(map_country_names).apply(get_country_info)
# df[["country_code", "emoji_flag"]] =


result = pd.DataFrame.from_records(result, columns=["code", "flag"])

countries_df = df.merge(result, left_index=True, right_index=True)
countries_df

Unnamed: 0,country,frequency,code,flag
0,en:france,1109929,FR,🇫🇷
1,en:united-states,712983,US,🇺🇸
2,en:spain,333584,ES,🇪🇸
3,en:germany,319896,DE,🇩🇪
4,en:italy,250943,IT,🇮🇹
...,...,...,...,...
672,ar:المغرب-🇲🇦,1,,
673,en:meksyk,1,,
674,en:ประเทศฝรั่งเศส,1,,
675,en:dkmdmd,1,,


In [4]:
countries.get(alpha_2='RU')

Country(alpha_2='RU', alpha_3='RUS', flag='🇷🇺', name='Russian Federation', numeric='643')

In [5]:
countries.get(alpha_2='TR')

Country(alpha_2='TR', alpha_3='TUR', flag='🇹🇷', name='Türkiye', numeric='792', official_name='Republic of Türkiye')

In [6]:
countries.get(alpha_2='RE')

Country(alpha_2='RE', alpha_3='REU', flag='🇷🇪', name='Réunion', numeric='638')

In [7]:
countries.get(alpha_2='AX')

Country(alpha_2='AX', alpha_3='ALA', flag='🇦🇽', name='Åland Islands', numeric='248')

In [8]:
countries.get(alpha_2='CI')

Country(alpha_2='CI', alpha_3='CIV', flag='🇨🇮', name="Côte d'Ivoire", numeric='384', official_name="Republic of Côte d'Ivoire")

In [9]:
countries.get(alpha_2='PS')

Country(alpha_2='PS', alpha_3='PSE', flag='🇵🇸', name='Palestine, State of', numeric='275', official_name='the State of Palestine')

In [10]:
countries.get(alpha_2='CD')

Country(alpha_2='CD', alpha_3='COD', flag='🇨🇩', name='Congo, The Democratic Republic of the', numeric='180')

In [11]:
countries.get(alpha_2='BN')

Country(alpha_2='BN', alpha_3='BRN', flag='🇧🇳', name='Brunei Darussalam', numeric='096')

In [12]:
countries.get(alpha_2='BS')

Country(alpha_2='BS', alpha_3='BHS', flag='🇧🇸', name='Bahamas', numeric='044', official_name='Commonwealth of the Bahamas')

In [13]:
countries.get(alpha_2='BQ')

Country(alpha_2='BQ', alpha_3='BES', flag='🇧🇶', name='Bonaire, Sint Eustatius and Saba', numeric='535', official_name='Bonaire, Sint Eustatius and Saba')

In [14]:
countries.get(alpha_2='CC')

Country(alpha_2='CC', alpha_3='CCK', flag='🇨🇨', name='Cocos (Keeling) Islands', numeric='166')

In [15]:
countries.get(alpha_2='CV')

Country(alpha_2='CV', alpha_3='CPV', flag='🇨🇻', name='Cabo Verde', numeric='132', official_name='Republic of Cabo Verde')

In [16]:
countries.get(alpha_2='CW')

Country(alpha_2='CW', alpha_3='CUW', flag='🇨🇼', name='Curaçao', numeric='531', official_name='Curaçao')

In [17]:
countries.get(alpha_2='MO')

Country(alpha_2='MO', alpha_3='MAC', flag='🇲🇴', name='Macao', numeric='446', official_name='Macao Special Administrative Region of China')

In [18]:
countries.get(alpha_2='CV')

Country(alpha_2='CV', alpha_3='CPV', flag='🇨🇻', name='Cabo Verde', numeric='132', official_name='Republic of Cabo Verde')

In [19]:
countries.get(alpha_2='MF')

Country(alpha_2='MF', alpha_3='MAF', flag='🇲🇫', name='Saint Martin (French part)', numeric='663')

In [20]:
countries.get(alpha_2='ST')

Country(alpha_2='ST', alpha_3='STP', flag='🇸🇹', name='Sao Tome and Principe', numeric='678', official_name='Democratic Republic of Sao Tome and Principe')

In [21]:
countries.get(alpha_2='SX')

Country(alpha_2='SX', alpha_3='SXM', flag='🇸🇽', name='Sint Maarten (Dutch part)', numeric='534', official_name='Sint Maarten (Dutch part)')

In [22]:
countries.get(alpha_2='UM')

Country(alpha_2='UM', alpha_3='UMI', flag='🇺🇲', name='United States Minor Outlying Islands', numeric='581')

In [23]:
countries.get(alpha_2='VA')

Country(alpha_2='VA', alpha_3='VAT', flag='🇻🇦', name='Holy See (Vatican City State)', numeric='336')

In [24]:
countries.get(alpha_2='VG')

Country(alpha_2='VG', alpha_3='VGB', flag='🇻🇬', name='Virgin Islands, British', numeric='092', official_name='British Virgin Islands')

In [25]:
countries.get(alpha_2='XK')

In [26]:
countries_df[countries_df.code.isna()]

Unnamed: 0,country,frequency,code,flag
10,en:world,70577,,
106,en:yugoslavia,627,,
130,en:european-union,309,,
151,en:scotland,175,,
161,en:en,133,,
...,...,...,...,...
672,ar:المغرب-🇲🇦,1,,
673,en:meksyk,1,,
674,en:ประเทศฝรั่งเศส,1,,
675,en:dkmdmd,1,,


In [27]:
countries.search_fuzzy("palestin")

[Country(alpha_2='PS', alpha_3='PSE', flag='🇵🇸', name='Palestine, State of', numeric='275', official_name='the State of Palestine')]

In [28]:
countries.lookup("Palestine, State of")

Country(alpha_2='PS', alpha_3='PSE', flag='🇵🇸', name='Palestine, State of', numeric='275', official_name='the State of Palestine')

In [29]:
def lookup_country(name):
    try:
        # Attempt a direct lookup
        country = countries.lookup(name)
        return country.name
    except LookupError:
        # If direct lookup fails, check alternative names
        for country in countries:
            if name.lower() in [alt.lower() for alt in country.name.split(",")]:
                return country.name
            if hasattr(country, "official_name") and name.lower() == country.official_name.lower():
                return country.name
        return None

lookup_country("Frankreich")

---

## Using Gettext for ISO3166-1 translations

In [30]:
trans = gettext.translation('iso3166-1', LOCALES_DIR, languages=['fr'])

In [31]:
trans.install()

In [32]:
_("Netherlands")

'Pays-Bas'

In [33]:
target_languages = ['en', 'de', 'fr', 'es']

country_map = {}

# Function to normalize country names (lowercase, remove accents)
def normalize(name):
    return unicodedata.normalize('NFKD', name).encode('ASCII', 'ignore').decode('utf-8').lower()

# Iterate over countries and store in map
for country in countries:
    for lang in target_languages:
        if lang == 'en':
            normalized_name = normalize(country.name)
            country_map[normalized_name] = (lang, country.alpha_2, country.flag)
        try:
            translation = gettext.translation('iso3166-1', LOCALES_DIR, languages=[lang])
            translated_name = translation.gettext(country.name)
            normalized_name = normalize(translated_name)
            country_map[normalized_name] = (lang, country.alpha_2, country.flag)
        except FileNotFoundError:
            continue


In [34]:
codes = pd.DataFrame.from_dict(country_map, orient="index", columns=['lang', 'code', 'flag'])
codes

Unnamed: 0,lang,code,flag
aruba,es,AW,🇦🇼
afghanistan,fr,AF,🇦🇫
afganistan,es,AF,🇦🇫
angola,es,AO,🇦🇴
anguilla,fr,AI,🇦🇮
...,...,...,...
sambia,de,ZM,🇿🇲
zambie,fr,ZM,🇿🇲
zimbabwe,fr,ZW,🇿🇼
simbabwe,de,ZW,🇿🇼


In [35]:
codes.lang.value_counts()

lang
es    249
fr    191
de    159
en    110
Name: count, dtype: int64

In [36]:
codes[codes.code == 'AW']

Unnamed: 0,lang,code,flag
aruba,es,AW,🇦🇼


In [37]:
codes[codes.index.str.startswith('russi')]

Unnamed: 0,lang,code,flag
russian federation,en,RU,🇷🇺
russische foderation,de,RU,🇷🇺
"russie, federation de",fr,RU,🇷🇺


In [38]:
def get_info(name):
    if name == '':
        (None, None, None)
    for c in codes.index:
        if c.startswith(name):
            return country_map.get(c)
    return (None, None, None)
    #return country_map.get(name, (None, None, None))
    
norm = df.country.apply(remove_prefix).apply(replace_dash).apply(normalize).apply(get_info)
norm

0            (fr, FR, 🇫🇷)
1            (de, UM, 🇺🇲)
2            (en, ES, 🇪🇸)
3            (en, DE, 🇩🇪)
4            (en, IT, 🇮🇹)
              ...        
672    (None, None, None)
673    (None, None, None)
674          (es, AW, 🇦🇼)
675    (None, None, None)
676          (es, AW, 🇦🇼)
Name: country, Length: 677, dtype: object

In [39]:
result = pd.DataFrame.from_records(norm, columns=['lang', 'code', 'flag'])
result = df.merge(result, left_index=True, right_index=True)
result

Unnamed: 0,country,frequency,lang,code,flag
0,en:france,1109929,fr,FR,🇫🇷
1,en:united-states,712983,de,UM,🇺🇲
2,en:spain,333584,en,ES,🇪🇸
3,en:germany,319896,en,DE,🇩🇪
4,en:italy,250943,en,IT,🇮🇹
...,...,...,...,...,...
672,ar:المغرب-🇲🇦,1,,,
673,en:meksyk,1,,,
674,en:ประเทศฝรั่งเศส,1,es,AW,🇦🇼
675,en:dkmdmd,1,,,


In [40]:
result[result.code.isna()]#.head(20)

Unnamed: 0,country,frequency,lang,code,flag
10,en:world,70577,,,
23,en:czech-republic,13124,,,
52,en:turkey,3525,,,
70,en:south-korea,1652,,,
87,en:cote-d-ivoire,1080,,,
...,...,...,...,...,...
669,en:ranska,1,,,
671,en:francie,1,,,
672,ar:المغرب-🇲🇦,1,,,
673,en:meksyk,1,,,


---

In [41]:
long_list = pd.concat([df.country.apply(remove_prefix).apply(replace_dash), df.frequency], axis=1)
long_list = long_list[long_list.frequency > 1]
long_list.drop(columns=['frequency']).to_csv("country-names.txt", index=False)
long_list

Unnamed: 0,country,frequency
0,france,1109929
1,united states,712983
2,spain,333584
3,germany,319896
4,italy,250943
...,...,...
463,zambia english,2
464,london,2
465,švicarska,2
466,middle east,2


In [42]:
long_list.frequency[long_list.frequency > 1].sum()

3908881

In [43]:
long_list.frequency[long_list.frequency == 1].sum()

0

In [44]:
pd.read_csv("country-names-translated.txt", sep=' - ', engine='python')

Unnamed: 0,FR,France
0,US,United States
1,ES,Spain
2,DE,Germany
3,IT,Italy
4,GB,United Kingdom
...,...,...
352,SG,Singapore (English)
353,SE,Sweden (Swedish)
354,KR,South Korea
355,SA,Saudi Arabia
