# Data used for the interactive map
Cleaning the openfoodfacts dataset as well as calculating scores for manufactuing and raw resources for each country

In [1]:
import pandas as pd
import json
import pycountry

In [2]:
ROOT_PATH = '../..'

In [4]:
# Define the columns we're interested in
usecols = ['origins', 'manufacturing_places', 'countries']

# Specify the data types for these columns
dtypes = {col: str for col in usecols}

# Load the data, entire openfoodfacts dataset
df_raw = pd.read_csv(f'{ROOT_PATH}/datasets/openfoodfacts.csv', delimiter='\t', usecols=usecols, dtype=str)

In [6]:
def country_name(country_code):
    """Convert country code to country name using pycountry library"""
    try:
        country_code = country_code[2:] #remove the 'en:' prefix
        return pycountry.countries.get(alpha_2=country_code).name
    except AttributeError:
        return None

def clean_country_names(country, not_found_dict):
    """Clean the country name fields"""
    if pd.isnull(country):
        return None

    country = country.strip().lower()
        
    countries_found = []
    
    other_accepted = ['bolivia', 'brasil', 'vietnam', 'czech republic']
    
    # Split the string into words and check each word
    for word in country.split(','):
        # Check if word is a country code
        country_name_from_code = country_name(word)
        if country_name_from_code is not None:
            countries_found.append(country_name_from_code)
        # Check if word is a country name
        elif pycountry.countries.get(name=word.capitalize()) is not None:
            countries_found.append(word.capitalize())
            
        elif word.lower() in other_accepted:
            countries_found.append(word.capitalize())
            
        # not found
        else:
            not_found_dict[word] = not_found_dict.get(word, 0) + 1

    # assums first country is the main country
    if len(countries_found) != 0:
        return countries_found[0]

    return None

def clean_country_field(field, not_found_dict):
    if pd.isnull(field):
        return None
    
    # Split on comma
    parts = field.split(',')
    
    # Clean each part separately
    cleaned_parts = [clean_country_names(part.strip(), not_found_dict) for part in parts]
    
    # Remove None parts
    cleaned_parts = [part for part in cleaned_parts if part is not None]
    
    # Join cleaned parts back together with comma
    cleaned_field = ', '.join(cleaned_parts)
    
    return cleaned_field

In [16]:
map1 = {
    'turkey': ['turquie'],
    'thailand': ['thaïlande'],
    'united states': ['états-unis', 'en:united states'],
    'peru': ['pérou', 'perú'],
    'czech republic': ['czech republic', 'česko'],
    'poland': ['pologne'],
    'algeria': ['algérie'],
    'croatia': ['hrvatska'],
    'ivory coast': ['ivory coast'],
    'germany': ['alemania'],
    'united kingdom': ['en:united kingdom'],
    'reunion': ['la réunion'],
    'netherlands': ['niederlande'],
    'spain': ['andalucía', 'comunidad valenciana', 'navarra', 'castilla-la mancha', 'castilla y león'],
    'mexico': ['estado de méxico', 'en:mexico', 'mexique'],
    'australia': ['en:australia'],
    'argentina': ['en:argentina', 'buenos aires', 'argentine'],
    'romania': ['en:romania'],
    'denmark': ['danmark'],
    'brazil': ['brésil'],
    'austria': ['en:austria'],
    'ireland': ['irlande'],
    'taiwan': ['taiwan'],
    'france': ['polynésie française'],
    'japan': ['japon'],
}

map2 = {
    'france': ['france', 'en:fr', 'en:france', 'france,europe','belgique,france', 'bretagne', 'bretagne', 'francia', 'frankreich', 'fr', 'normandie', 'pays de la loire', 'vendée', 'provence', 'french', 'paris', 'francais', 'francaise', 'parisien', 'parisienne', 'french republic', 'finistère'],
    'spain': ['españa','en:es', 'en:spain','espagne','spain', 'españa', 'espagne', 'españa', 'spanien'],
    'germany': ['deutschland','germany','allemagne','en:de', 'deutschland', 'deutschland', 'en:germany'],
    'united kingdom': ['en:uk','united kingdom','en:gb','uk','uk', 'royaume-uni'],
    'belgium': ['belgique','en:be','belgique,france','belgium','belgique,france', 'belgique'],
    'italy': ['italia','en:it', 'en:italy','italie', 'italien', 'italia', 'italie', 'conserve italia s.p.a.'],
    'switzerland': ['suisse','switzerland', 'suisse', 'schweiz', 'en:switzerland'],
    'netherlands': ['netherlands', 'holland', 'pays-bas', 'nederland'],
    'denmark': ['denmark','dänemark'],
    'portugal': ['portugal','en:portugal'],
    'greece': ['greece','en:gr','grèce','en:greece'],
    'sweden': ['sweden','en:se','en:sweden', 'sverige'],
    'norway': ['norway','en:no','en:norway'],
    'croatia': ['croatia','en:hr','en:croatia'],
    'albania': ['albania','en:al','en:albania'],
    'canada': ['canada','en:ca','en:canada', 'québec', 'brossard québec'],
    'mexico': ['méxico', 'maxico', 'ciudad de méxico'],
    'poland': ['polska', 'polen', 'polska'],
    'austria': ['österreich'],
    'bolivia': ['bolivia'],
    'tunisia': ['tunisie'],
    'united states': ['estados unidos', 'united states', 'usa'],
    'finland': ['suomi'],
    'romania': ['românia'],
    'morocco' : ['maroc'],
    'Ivory Coast': ['côte d\'ivoire', "C\u00f4te d'ivoire"],
    'Hungary': ['magyarország'],
    'China': ['chine'],
    'Russian Federation': ['россия', 'russia'],
    'Peru': ['perú'],
    'Slovakia': ['slovensko'],
    'czech republic': ['česká republika'],
}

for k,v in map1.items():
    if k in map2:
        map1[k] = map2[k] + v
        
for k,v in map2.items():
    if k not in map1:
        map1[k] = v

In [17]:
df = df_raw.dropna(subset=['countries', 'manufacturing_places'], how='any')

# Convert the country names to a standard format
df = df.applymap(lambda s: s.strip().lower() if isinstance(s, str) else s)


def apply_alias_mapping(alias_mapping, s):
    if isinstance(s, str):
        # Split the string into individual values
        values = s.split(',')
        # Apply the mapping to each value
        mapped_values = [alias_mapping.get(
            value.strip().lower(), value) for value in values]
        # Join the mapped values back together
        return ','.join(mapped_values)
    else:
        return s


# Define a mapping from country aliases to the standard country name
# Note: All country names and aliases are now in lowercase
country_mapping = map1

# Flatten the mapping to make it easy to apply
alias_mapping = {alias.lower(): country for country,
                 aliases in country_mapping.items() for alias in aliases}

# Apply the mapping to each column
df = df.applymap(lambda x: apply_alias_mapping(alias_mapping, x))


# Clean the origin fields
not_found_dict = {}
df['origins'] = df['origins'].apply(
    lambda x: clean_country_field(x, not_found_dict))
df['manufacturing_places'] = df['manufacturing_places'].apply(
    lambda x: clean_country_field(x, not_found_dict))
df['countries'] = df['countries'].apply(
    lambda x: clean_country_field(x, not_found_dict))

# Drop the rows where at least one element is missing.
df = df.dropna(how='any')

# post remapping
country_mapping_post = {
    'Usa': ['united states'],
    'England': ['united kingdom'],
    'Russia': ['russian federation'],
    'Brazil': ['brasil']
}
alias_mapping_post = {alias.lower(): country for country,
                 aliases in country_mapping_post.items() for alias in aliases}
df = df.applymap(lambda x: apply_alias_mapping(alias_mapping_post, x))

# sort not_found_dict by value and print out top 10 not found words
sorted_not_found_dict = sorted(
    not_found_dict.items(), key=lambda kv: kv[1], reverse=True)
print(sorted_not_found_dict[:100])
total_misses = 0
for v in sorted_not_found_dict:
    total_misses += v[1]
print(total_misses)

[('union européenne', 2841), ('', 630), ('european union', 569), ('alsace', 548), ('eu', 510), ('ue', 503), ('rhône-alpes', 488), ('belgien', 478), ('morbihan', 449), ('en:belgium', 440), ('espanha', 418), ('reunion', 392), ('србија', 384), ('německo', 371), ('savoie', 370), ('centre-val de loire', 354), ('loiret', 353), ('nord-pas-de-calais', 347), ('sarthe', 345), ('loire-atlantique', 342), ('nord', 328), ('quiberon', 323), ('europäische union', 319), ('frankrijk', 310), ('basse-normandie', 309), ('angleterre', 308), ('calvados', 307), ('europe', 306), ('frança', 305), ('ivory coast', 301), ('ille-et-vilaine', 294), ('en:ch', 294), ('formec biffi spa', 293), ('norvège', 278), ('méjannes-lès-alès (30)', 277), ('belgië', 275), ('sud-ouest', 274), ('agricultura ue', 273), ('българия', 272), ('bélgica', 270), ('autriche', 269), ('germania', 268), ('isère', 267), ('francie', 266), ('bourgogne', 264), ('la paz', 262), ('aveyron', 256), ('union européenne et non union européenne', 255), ('c

In [131]:
df_raw[df_raw['countries'] == 'Peru']

Unnamed: 0,origins,manufacturing_places,countries
48027,,,Peru
505576,,,Peru
611199,,,Peru
728337,,,Peru
824135,,,Peru
2151164,Perú,Perú,Peru
2178104,"Lima,Peru","Lima,Peru",Peru
2206985,Perú,"Lima,Perú",Peru
2222230,,,Peru
2222584,,,Peru


output format:
[
  {
    "bolivia": {
      "score": 0.2,
      "manufacturing": {
        "France": {
          "score": 0.2,
          "origin" {
            "Norway": 0.1
          }
        },
        "Germany": 0.12,
      }
    },
    ""
  }
]

In [18]:
coutry_count = {}

for index, row in df.iterrows():
  cosumptions = row['countries']
  origins = row['origins']
  manufacturings = row['manufacturing_places']
  
  for cosumption in cosumptions.split(','):
    cosumption = cosumption.strip()
    
    if cosumption == "":
      continue
    
    if cosumption not in coutry_count:
      coutry_count[cosumption] = {}
      coutry_count[cosumption]["score"] = 1
      coutry_count[cosumption]["manufacturing"] = {}
    else:
      coutry_count[cosumption]["score"] += 1
      
    for manufacturing in manufacturings.split(','):
      manufacturing = manufacturing.strip()
      
      if manufacturing == "":
        continue
        
      if manufacturing not in coutry_count[cosumption]["manufacturing"]:
        coutry_count[cosumption]["manufacturing"][manufacturing] = {}
        coutry_count[cosumption]["manufacturing"][manufacturing]["score"] = 1
        coutry_count[cosumption]["manufacturing"][manufacturing]["origin"] = {}
      else:
        coutry_count[cosumption]["manufacturing"][manufacturing]["score"] += 1
        
      for origin in origins.split(','):
        origin = origin.strip()
        if origin == "":
          continue
        
        if origin not in coutry_count[cosumption]["manufacturing"][manufacturing]["origin"]:
          coutry_count[cosumption]["manufacturing"][manufacturing]["origin"][origin] = 1
        else:
          coutry_count[cosumption]["manufacturing"][manufacturing]["origin"][origin] += 1
        

In [19]:
# normalize value
consume_score = 0

for consume in coutry_count.values():
  consume_score += consume["score"]
  
  manufacture_score = 0
  for manufacturing in consume["manufacturing"].values():
    manufacture_score += manufacturing["score"]
    
    origin_score = 0
    for origin in manufacturing["origin"].values():
      origin_score += origin
    for k, origin in manufacturing["origin"].items():
      manufacturing["origin"][k] /= origin_score
  
  for manufacturing in consume["manufacturing"].values():
    manufacturing["score"] /= manufacture_score
    
for consume in coutry_count.values():
  consume["score"] /= consume_score


In [20]:

# remove country if score is less than 0.01
th = 0.01

for ck in list(coutry_count.keys()):
    consume = coutry_count[ck]
    
    for mk in list(consume["manufacturing"].keys()):
        manufacturing = consume["manufacturing"][mk]
        for ok in list(manufacturing["origin"].keys()):
            origin = manufacturing["origin"][ok]
            if origin < th:
                del coutry_count[ck]["manufacturing"][mk]["origin"][ok]
                
        if manufacturing["score"] < th or not manufacturing["origin"]:
            del coutry_count[ck]["manufacturing"][mk]
            continue
          
    if not consume["manufacturing"]:
        del coutry_count[ck]
        


In [21]:
# Write the data to a JSON file
with open(f'./country_count.json', 'w') as f:
    json.dump(coutry_count, f, indent=4)