# Get FAOSTAT Country Data
---

In [1]:
# Load dependencies
import pandas as pd

## Load Datasets
---

In [2]:
# List available datasets
QCL_df = pd.read_csv('data/faostat/QCL_raw.csv')
RL_df = pd.read_csv('data/faostat/RL_raw.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/faostat/QCL_raw.csv'

## Analyze QCL Country Data
---

In [3]:
# get years with data for each country retrieved
# 0. convert years to strings
QCL_df.Year = QCL_df.Year.astype(str)
# 1. get string of years using groupby and sum
QCLayr_df = pd.DataFrame(QCL_df.groupby('Area').sum()['Year'])
# 2. loop through rows and create boolean mask for available data by area and year
for i in QCLayr_df.index:
    for j in [QCLayr_df.Year[i][n:n+4] for n in range(0, len(QCLayr_df.Year[i]), 4)]:
        QCLayr_df.loc[i,j] = 1
# 3. replace NaN with 0
QCLayr_df.fillna(0, inplace=True)
# 4. remove string column
QCLayr_df.drop(columns=['Year'],inplace=True)
QCLayr_df

Unnamed: 0_level_0,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Albania,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Algeria,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Angola,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Antigua and Barbuda,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Viet Nam,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Yemen,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Yugoslav SFR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zambia,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# check for countries with no data for any given year
QCLayr_df.loc[(QCLayr_df!=1).any(axis=1)].index

Index(['Armenia', 'Azerbaijan', 'Belarus', 'Belgium', 'Belgium-Luxembourg',
       'Bosnia and Herzegovina', 'Croatia', 'Czechia', 'Czechoslovakia',
       'Eritrea', 'Estonia', 'Ethiopia', 'Ethiopia PDR', 'French Guiana',
       'Georgia', 'Guadeloupe', 'Kazakhstan', 'Kyrgyzstan', 'Latvia',
       'Lithuania', 'Luxembourg', 'Martinique',
       'Micronesia (Federated States of)', 'Montenegro', 'Nauru',
       'North Macedonia', 'Republic of Moldova', 'Russian Federation',
       'Réunion', 'Serbia', 'Serbia and Montenegro', 'Slovakia', 'Slovenia',
       'South Sudan', 'Sudan', 'Sudan (former)', 'Tajikistan', 'Turkmenistan',
       'USSR', 'Ukraine', 'Uzbekistan', 'Yugoslav SFR'],
      dtype='object', name='Area')

## Analyze RL Country Data
---

In [17]:
# get years with data for each country retrieved
# 0. convert years to strings
RL_df.Year = RL_df.Year.astype(str)
# 1. get string of years using groupby and sum
RLayr_df = pd.DataFrame(RL_df.groupby('Area').sum()['Year'])
# 2. loop through rows and create boolean mask for available data by area and year
for i in RLayr_df.index:
    for j in [RLayr_df.Year[i][n:n+4] for n in range(0, len(RLayr_df.Year[i]), 4)]:
        RLayr_df.loc[i,j] = 1
# 3. replace NaN with 0
RLayr_df.fillna(0, inplace=True)
# 4. remove string column
RLayr_df.drop(columns=['Year'],inplace=True)
RLayr_df

Unnamed: 0_level_0,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Albania,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Algeria,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
American Samoa,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Andorra,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Western Sahara,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Yemen,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Yugoslav SFR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zambia,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
RLayr_df.index

Index(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia',
       ...
       'Uzbekistan', 'Vanuatu', 'Venezuela (Bolivarian Republic of)',
       'Viet Nam', 'Wallis and Futuna Islands', 'Western Sahara', 'Yemen',
       'Yugoslav SFR', 'Zambia', 'Zimbabwe'],
      dtype='object', name='Area', length=247)

In [19]:
# check for countries with no data for any given year
RLayr_df.loc[(RLayr_df!=1).any(axis=1)].index

Index(['Anguilla', 'Armenia', 'Azerbaijan', 'Belarus', 'Belgium',
       'Belgium-Luxembourg', 'Bonaire, Sint Eustatius and Saba',
       'Bosnia and Herzegovina', 'Croatia', 'Curaçao', 'Czechia',
       'Czechoslovakia', 'Eritrea', 'Estonia', 'Ethiopia', 'Ethiopia PDR',
       'Georgia', 'Kazakhstan', 'Kyrgyzstan', 'Latvia', 'Lithuania',
       'Luxembourg', 'Marshall Islands', 'Micronesia (Federated States of)',
       'Montenegro', 'Netherlands Antilles (former)', 'North Macedonia',
       'Northern Mariana Islands', 'Pacific Islands Trust Territory', 'Palau',
       'Republic of Moldova', 'Russian Federation', 'Saint Barthélemy',
       'Saint Martin (French part)', 'Serbia', 'Serbia and Montenegro',
       'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'South Sudan',
       'Sudan', 'Sudan (former)', 'Tajikistan', 'Turkmenistan', 'USSR',
       'Ukraine', 'Uzbekistan', 'Yugoslav SFR'],
      dtype='object', name='Area')

## Analyze Historical Country Boundary Changes
---

In [8]:
# create dictionary for changing political boundaries through data time frame
history={'Saint Kitts and Nevis':['Anguilla'], #1979/1980
         'Pacific Islands Trust Territory':['Marshall Islands','Palau','Northern Mariana Islands','Micronesia (Federated States of)'], #1990/1991
         'USSR':['Armenia','Azerbaijan','Belarus','Estonia','Georgia','Kazakhstan','Kyrgyzstan','Latvia','Lithuania',
         'Republic of Moldova','Russian Federation','Tajikistan','Turkmenistan','Ukraine','Uzbekistan'], #1991/1992
         'Yugoslav SFR':['Croatia','Slovenia','North Macedonia','Serbia and Montenegro','Bosnia and Herzegovina'], #1991/1992
         'Czechoslovakia':['Czechia','Slovakia'], #1992/1993
         'Ethiopia PDR':['Ethiopia','Eritrea'], #1992/1993
         'Belgium-Luxembourg':['Belgium','Luxembourg'], #1999/2000
         'Serbia and Montenegro':['Serbia','Montenegro'], #2005/2006
         'Guadeloupe':['Saint Barthélemy','Saint Martin (French part)'], #2010/2011
         'Netherlands Antilles (former)':['Curaçao','Sint Maarten (Dutch part)','Bonaire, Sint Eustatius and Saba'], #2010/2011
         'Sudan (former)':['South Sudan','Sudan']} #2010/2011
history_countries = [*history.keys()]
for x in history.values(): history_countries.extend(x)

In [9]:
# check remaining 
QCLayr_df.loc[(QCLayr_df!=1).any(axis=1) & ~QCLayr_df.index.isin(history_countries)].index

Index(['French Guiana', 'Martinique', 'Nauru', 'Réunion'], dtype='object', name='Area')

In [10]:
# check remaining 
RLayr_df.loc[(RLayr_df!=1).any(axis=1) & ~RLayr_df.index.isin(history_countries)].index

Index([], dtype='object', name='Area')

In [11]:
history_df = pd.DataFrame({'Former':history.keys(),'Succession':history.values(),
                           'Exists':[1,0,0,0,0,0,0,0,1,0,0]})
history_df

Unnamed: 0,Former,Succession,Exists
0,Saint Kitts and Nevis,[Anguilla],1
1,Pacific Islands Trust Territory,"[Marshall Islands, Palau, Northern Mariana Isl...",0
2,USSR,"[Armenia, Azerbaijan, Belarus, Estonia, Georgi...",0
3,Yugoslav SFR,"[Croatia, Slovenia, North Macedonia, Serbia an...",0
4,Czechoslovakia,"[Czechia, Slovakia]",0
5,Ethiopia PDR,"[Ethiopia, Eritrea]",0
6,Belgium-Luxembourg,"[Belgium, Luxembourg]",0
7,Serbia and Montenegro,"[Serbia, Montenegro]",0
8,Guadeloupe,"[Saint Barthélemy, Saint Martin (French part)]",1
9,Netherlands Antilles (former),"[Curaçao, Sint Maarten (Dutch part), Bonaire, ...",0


In [12]:
history_df.to_csv('data/faostat/history.csv',index=False)

## Examine Unique Countries
---

In [13]:
countries = set(QCL_df.Area.unique()).intersection(set(RL_df.Area.unique()))\
    .difference(set([row['Former'] for i,row in history_df.iterrows() if row['Exists']==0]))

In [15]:
QCL_df.loc[QCL_df.Area.isin(['China','China, Hong Kong SAR', 'China, mainland','China, Taiwan Province of'])\
    & (QCL_df.Year=='2022') & (QCL_df.Item=='Rice')]

Unnamed: 0,Domain Code,Domain,Area Code,Area,Element Code,Element,Item Code,Item,Year Code,Year,Unit,Value
8595,QCL,Crops and livestock products,351,China,5312,Area harvested,27,Rice,2022,2022,ha,29689000
8781,QCL,Crops and livestock products,96,"China, Hong Kong SAR",5312,Area harvested,27,Rice,2022,2022,ha,0
9153,QCL,Crops and livestock products,41,"China, mainland",5312,Area harvested,27,Rice,2022,2022,ha,29450000
9514,QCL,Crops and livestock products,214,"China, Taiwan Province of",5312,Area harvested,27,Rice,2022,2022,ha,239000


In [23]:
countries.remove('China')
len(countries)

KeyError: 'China'

In [22]:
pd.Series(list(countries)).to_csv('data/faostat/countries_list.csv',index=False)