# Clean FAOSTAT Data
---

In [1]:
# Load dependencies
import pandas as pd

## Load Datasets
---

In [43]:
# List available datasets
QCL_df = pd.read_csv('data/faostat/QCL_init.csv')
RL_df = pd.read_csv('data/faostat/RL_init.csv')
hist_df = pd.read_csv('data/faostat/history.csv')

In [44]:
hist_df.columns

Index(['Former', 'Succession', 'Exists'], dtype='object')

## Clean QCL 'Area': Project historic data onto current countries
---

In [4]:
# add historic data to existing countries
for i,row in hist_df.iterrows():
    if row['Former'] in QCL_df.Area.unique():
        for n in row['Succession']:
            if n in QCL_df.Area.unique():
                QCL_df.loc[n,'Value'] += QCL_df.loc[row['Former'],'Value']
        # remove countries that no longer exist
        if not row['Exists']: QCL_df.drop(QCL_df.index[QCL_df.Area==row['Former']], inplace = True)

In [25]:
# remove 'China' ie. china (total) from data
QCL_df.drop(QCL_df.index[QCL_df.Area=='China'], inplace = True)

## Clean QCL 'Item': Merge 'Maize (corn)' and 'Green corn (maize)'
---

In [8]:
QCL_df.columns

Index(['Area', 'Item', 'Year', 'Unit', 'Value'], dtype='object')

In [9]:
QCL_df.Item.unique()

array(['Maize (corn)', 'Rice', 'Cattle and Buffaloes', 'Soya beans',
       'Cocoa beans', 'Coffee, green', 'Oil palm fruit',
       'Green corn (maize)'], dtype=object)

In [10]:
QCL_df.count()

Area     47868
Item     47868
Year     47868
Unit     47868
Value    47868
dtype: int64

In [11]:
QCL_corn = QCL_df.loc[QCL_df.Item.isin(['Maize (corn)','Green corn (maize)'])]
QCL_corn.count()

Area     12035
Item     12035
Year     12035
Unit     12035
Value    12035
dtype: int64

In [12]:
QCL_else = QCL_df.loc[~QCL_df.Item.isin(['Maize (corn)','Green corn (maize)'])]
QCL_else.count()

Area     35833
Item     35833
Year     35833
Unit     35833
Value    35833
dtype: int64

In [13]:
QCL_corn.loc[:,'AreaYear'] = [x+str(y) for x,y in zip(QCL_corn.Area,QCL_corn.Year)]
QCL_corn.tail(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  QCL_corn.loc[:,'AreaYear'] = [x+str(y) for x,y in zip(QCL_corn.Area,QCL_corn.Year)]


Unnamed: 0,Area,Item,Year,Unit,Value,AreaYear
48816,Zimbabwe,Maize (corn),2022,ha,1000000,Zimbabwe2022


In [14]:
QCL_corns = QCL_corn.groupby('AreaYear').sum()
QCL_corns.tail(1)

Unnamed: 0_level_0,Area,Item,Year,Unit,Value
AreaYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Zimbabwe2022,ZimbabweZimbabwe,Green corn (maize)Maize (corn),4044,haha,1001819


In [15]:
QCL_corna = QCL_corns.loc[QCL_corns.Unit=='ha']
QCL_corna.loc[:,'Item'] = 'Green corn/Maize'
QCL_corna = QCL_corna.reset_index()[['Area','Item','Year','Unit','Value']]

In [16]:
QCL_cornb = QCL_corns.loc[QCL_corns.Unit=='haha']
QCL_cornb.loc[:,'Area'] = [s[0:len(s)-4] for s in QCL_cornb.index]
QCL_cornb.loc[:,'Year'] = [int(s[len(s)-4:]) for s in QCL_cornb.index]
QCL_cornb.loc[:,'Unit'] = 'ha'
QCL_cornb.loc[:,'Item'] = 'Green corn/Maize'
QCL_cornb = QCL_cornb.reset_index()[['Area','Item','Year','Unit','Value']]

In [17]:
QCL_clean = pd.concat([QCL_else,QCL_corna,QCL_cornb])
QCL_clean.reset_index(inplace=True)
QCL_clean.tail(1)

Unnamed: 0,index,Area,Item,Year,Unit,Value
45433,2433,Zimbabwe,Green corn/Maize,2022,ha,1001819


## Export QCL to csv
---

In [18]:
QCL_clean.to_csv('data/faostat/QCL_clean.csv')

## Clean RL 'Area': Project historic data onto current countries
---

In [45]:
for i,row in hist_df.iterrows():
    if row['Former'] in RL_df.Area.unique():
        for n in row['Succession']:
            if n in RL_df.Area.unique():
                RL_df.loc[n,'Value'] += RL_df.loc[row['Former'],'Value']
        if not row['Exists']: RL_df.drop(RL_df.index[RL_df.Area==row['Former']], inplace = True)

In [46]:
# remove 'China' ie. china (total) from data
RL_df.drop(RL_df.index[RL_df.Area=='China'], inplace = True)

## Clean RL 'Area': 
---

In [48]:
RL_df.tail(1)

Unnamed: 0,Area,Item,Year,Unit,Value
75131,Zimbabwe,Primary Forest,2017,1000 ha,801.0


In [49]:
RL_df.Item.unique()

array(['Land area', 'Cropland', 'Permanent meadows and pastures',
       'Forest land', 'Naturally regenerating forest', 'Planted Forest',
       'Other land', 'Primary Forest', 'Farm buildings and Farmyards',
       'Land used for aquaculture'], dtype=object)

In [54]:
RL_df.loc[:,'AreaYear'] = [x+str(y) for x,y in zip(RL_df.Area,RL_df.Year)]
RL_df.tail(1)

Unnamed: 0,Area,Item,Year,Unit,Value,AreaYear
75131,Zimbabwe,Primary Forest,2017,1000 ha,801.0,Zimbabwe2017


In [73]:
RL_dense = RL_df.loc[RL_df.Item.isin(['Land area','Cropland','Permanent meadows and pastures','Forest land',
                                      'Other land', 'Farm buildings and Farmyards'])]\
    [['Item','Value','AreaYear']].groupby('AreaYear').agg(list)
RL_dense

Unnamed: 0_level_0,Item,Value
AreaYear,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan1961,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7750.0, 30000.0]"
Afghanistan1962,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7800.0, 30000.0]"
Afghanistan1963,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7850.0, 30000.0]"
Afghanistan1964,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7905.0, 30000.0]"
Afghanistan1965,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7910.0, 30000.0]"
...,...,...
Zimbabwe2018,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 3367.6304, 12100.0, 17536.72, 5680.6..."
Zimbabwe2019,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 3187.6768, 12100.0, 17490.65, 5906.6..."
Zimbabwe2020,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 3278.8634, 12100.0, 17444.58, 5861.5..."
Zimbabwe2021,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 3136.4378, 12100.0, 17398.51, 6050.0..."


In [74]:
RL_dense['Remainder'] = [ x[0] - sum(x[1:len(x)]) for x in RL_dense['Value']]

In [75]:
RL_dense

Unnamed: 0_level_0,Item,Value,Remainder
AreaYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan1961,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7750.0, 30000.0]",27473.0
Afghanistan1962,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7800.0, 30000.0]",27423.0
Afghanistan1963,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7850.0, 30000.0]",27373.0
Afghanistan1964,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7905.0, 30000.0]",27318.0
Afghanistan1965,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7910.0, 30000.0]",27313.0
...,...,...,...
Zimbabwe2018,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 3367.6304, 12100.0, 17536.72, 5680.6...",0.0
Zimbabwe2019,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 3187.6768, 12100.0, 17490.65, 5906.6...",0.0
Zimbabwe2020,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 3278.8634, 12100.0, 17444.58, 5861.5...",0.0
Zimbabwe2021,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 3136.4378, 12100.0, 17398.51, 6050.0...",0.0


In [76]:
RL_dense.loc[RL_dense.Remainder>1]

Unnamed: 0_level_0,Item,Value,Remainder
AreaYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan1961,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7750.0, 30000.0]",27473.0000
Afghanistan1962,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7800.0, 30000.0]",27423.0000
Afghanistan1963,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7850.0, 30000.0]",27373.0000
Afghanistan1964,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7905.0, 30000.0]",27318.0000
Afghanistan1965,"[Land area, Cropland, Permanent meadows and pa...","[65223.0, 7910.0, 30000.0]",27313.0000
...,...,...,...
Zimbabwe1985,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 2575.8283, 9835.0]",26274.1717
Zimbabwe1986,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 2693.394, 9870.0]",26121.6060
Zimbabwe1987,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 2594.6825, 9900.0]",26190.3175
Zimbabwe1988,"[Land area, Cropland, Permanent meadows and pa...","[38685.0, 2688.4391, 9950.0]",26046.5609


## Export RL Data
---

## Inspect RL Data
---