# Unifying country names

### Trading matrix data:
Country names and codes are in `data\FAOSTAT\clean_data\area_country_codes.csv` (the cells below load the corrected copy, `fao_country_codes_corrected.csv`).

### country_to_continent data:
I generated this data with ChatGPT, so it is incomplete: "Venezuela", for instance, is missing.

### Geopandas countries:
I downloaded the geopandas countries data from https://www.naturalearthdata.com/downloads/10m-cultural-vectors/10m-admin-0-countries-2/


**I will convert everything to lower case and try to match every name to the geopandas names.**


In [1]:
import csv
from geopy.distance import great_circle
import pandas as pd
import numpy as np
import geopandas as gpd

In [2]:
# Load the three country tables that are reconciled in this notebook.
# Paths use forward slashes: Windows accepts them, whereas backslash-separated
# paths only resolve on Windows (elsewhere "\" is an ordinary filename character).
# NOTE(review): the FAO file is decoded as ISO-8859-1, yet "Côte d'Ivoire" shows up
# as mojibake in the outputs below while "Réunion" reads fine -- the CSV looks
# mixed-encoded; confirm before changing the codec.
fao_countries = pd.read_csv("../data/FAOSTAT/clean_data/fao_country_codes_corrected.csv", encoding="ISO-8859-1")
countries_df = pd.read_csv("../data/FAOSTAT/clean_data/food_trading/country_to_continent.csv")  # initial CSV generated with ChatGPT
geopandas_countries = pd.read_csv("../data/FAOSTAT/clean_data/geopandas_countries.csv")

In [3]:
# Keep the three tables in one list so later cells can loop over all of them.
datasets = [fao_countries, countries_df, geopandas_countries]

# Preview the first rows of each table, separated by a blank line.
# The table text is formatted into the string itself: passing '\n' and the frame
# as two print() arguments inserts a separator space that pushes the header row
# one column out of line with the data rows.
for table in datasets:
    print(f"\n{table.head()}")


    Area Code         Area
0          1      Armenia
1          2  Afghanistan
2          3      Albania
3          4      Algeria
4          7       Angola

   Continent      Country           Capital   Latitude  Longitude
0      Asia  Afghanistan             Kabul  34.526000  69.181000
1    Europe      Albania            Tirana  41.327500  19.818900
2    Africa      Algeria           Algiers  36.737232   3.086472
3    Europe      Andorra  Andorra la Vella  42.506300   1.521800
4    Africa       Angola            Luanda  -8.839988  13.289437

        continent                   country
0        oceania                      fiji
1         africa                  tanzania
2         africa                 w. sahara
3  north america                    canada
4  north america  united states of america


In [4]:
def get_categorical(df):
  """
  Return the names of the object-dtype columns of a DataFrame.

  Args:
      df (pandas.DataFrame): The DataFrame whose columns are inspected

  Returns:
      list: Column names, in column order, whose dtype kind is 'O'
  """
  # dtype kind 'O' is the code for Python-object columns (typically text)
  return [name for name in df.columns if df[name].dtype.kind == 'O']

In [5]:
# Lower-case every object (text) column in all tables, in place.
# Missing values are kept as NaN: a blanket astype(str) would turn them into the
# literal string 'nan', which isnull()/notnull() can no longer detect.
for table in datasets:
    # get_categorical() is evaluated once per table rather than once per column
    for col in get_categorical(table):
        is_missing = table[col].isna()
        table[col] = table[col].astype(str).str.lower().mask(is_missing)


## I will focus on keeping the country codes from fao_countries, but only for countries present in geopandas, which is the cleaner list

In [6]:
# Rows with an area code of 1000 or above are regions, not countries; they are excluded from the cleaning
is_country_code = fao_countries['Area Code'].lt(1000)
fao_countries = fao_countries[is_country_code]

In [7]:
# Alphabetically sorted FAO area names
fao_list = sorted(fao_countries['Area'].tolist())

In [8]:
# Alphabetically sorted country names from the country_to_continent table
mycountries_list = sorted(countries_df['Country'].tolist())

In [9]:
# Alphabetically sorted country names from the geopandas table
geopandas_list = sorted(geopandas_countries['country'].tolist())

In [10]:
# Print how many names each source contributes, one count per line, with a
# blank line after the first two counts.
print(len(fao_list), len(mycountries_list), len(geopandas_list), sep='\n\n')

# Counts noted by hand on an earlier run (FAO / country_to_continent / geopandas):
#220
#203
#177
# NOTE(review): the stored output shows 225 for the second list, so the 203 above
# is stale -- presumably from before the CSV was completed by hand; confirm.

220
225
177


In [11]:
# List the FAO area names that have no exact match in the country_to_continent names
unmatched_names = (name for name in fao_list if name not in mycountries_list)
for name in unmatched_names:
    print(name)

cã´te d'ivoire
johnston island
midway island
svalbard and jan mayen islands
yugoslav sfr


In [12]:
# List the FAO area names that have no exact match in the geopandas names
unmatched_names = (name for name in fao_list if name not in geopandas_list)
for name in unmatched_names:
    print(name)

antigua and barbuda
bahrain
barbados
bouvet island
cabo verde
canton and enderbury islands
comoros
cook islands
czechoslovakia
cã´te d'ivoire
dominica
faroe islands
french polynesia
grenada
guadeloupe
heard and mcdonald islands
johnston island
kiribati
maldives
malta
marshall islands
martinique
mauritius
micronesia (federated states of)
midway island
monaco
nauru
niue
réunion
saint kitts and nevis
saint lucia
saint vincent and the grenadines
samoa
sao tome and principe
serbia and montenegro
seychelles
singapore
south georgia and the south sandwich islands
svalbard and jan mayen islands
tokelau
tonga
tuvalu
ussr
wake island
yugoslav sfr


# Complete the countries dataframe (countries_df) to calculate distances

In [13]:
# Outer join on the name columns so rows present in only one of the two tables are kept
merged_df = countries_df.merge(
    fao_countries,
    how='outer',
    left_on='Country',
    right_on='Area',
)


In [14]:
# FAO areas that found no partner in countries_df: 'Area' is filled, 'Country' is not
no_country = merged_df['Country'].isna()
has_area = merged_df['Area'].notna()
missing = merged_df[no_country & has_area]

In [15]:
# Display the FAO areas that have no matching row in countries_df.
# NOTE(review): the original note read "after completing" -- presumably the state
# after the CSV was completed by hand; the stored output still lists five rows, confirm.
missing

Unnamed: 0,Continent,Country,Capital,Latitude,Longitude,Area Code,Area
51,,,,,,107.0,cã´te d'ivoire
104,,,,,,111.0,johnston island
133,,,,,,139.0,midway island
202,,,,,,260.0,svalbard and jan mayen islands
235,,,,,,248.0,yugoslav sfr


**I did not save anything here because I modified the CSV directly.**

# Add continents to FAO country codes 

In [21]:
# Preview the first five rows of the (lower-cased) country_to_continent table
countries_df.head(n=5)

Unnamed: 0,Continent,Country,Capital,Latitude,Longitude
0,asia,afghanistan,kabul,34.526,69.181
1,europe,albania,tirana,41.3275,19.8189
2,africa,algeria,algiers,36.737232,3.086472
3,europe,andorra,andorra la vella,42.5063,1.5218
4,africa,angola,luanda,-8.839988,13.289437


In [22]:
# Preview the first five rows of the (lower-cased, region-free) FAO table
fao_countries.head(n=5)

Unnamed: 0,Area Code,Area
0,1,armenia
1,2,afghanistan
2,3,albania
3,4,algeria
4,7,angola


In [23]:
# Left join: keep every FAO area and attach continent/capital/coordinates where the names match
merged_df = fao_countries.merge(
    countries_df,
    how='left',
    left_on='Area',
    right_on='Country',
)

In [27]:
# Collapse the two name columns into one before saving.
# 'Area' and 'Country' are the merge keys, so they are equal on matched rows. On a
# FAO area with no match 'Country' is NaN, and dropping 'Area' outright would leave
# that row with nothing but its numeric code -- so the FAO name is copied across first.
# The guard and the non-inplace drop keep this cell safe to re-run: an in-place
# drop raises KeyError the second time because 'Area' is already gone.
if 'Area' in merged_df.columns:
    merged_df['Country'] = merged_df['Country'].fillna(merged_df['Area'])
    merged_df = merged_df.drop(columns=['Area'])

In [29]:
# Save the FAO codes with their continent/capital/coordinates.
# Forward slashes are used because Windows accepts them, whereas a
# backslash-separated path only resolves on Windows.
merged_df.to_csv("../data/FAOSTAT/clean_data/food_trading/country_to_continent_codes.csv", index=False)