In [190]:
import pandas as pd
import numpy as np
import time

# `SYB65_145_202209_Land.csv`

In [191]:
# Load the raw land-use table and replace its header with short snake_case names.
raw_columns = [
    'region_country_area', 'land', 'year', 'metric_type',
    'value', 'footnotes', 'source',
]
data = pd.read_csv('data/raw/SYB65_145_202209_Land.csv')
data.columns = raw_columns

# Discard the first parsed row (presumably the file's own column-label row -- confirm against the raw CSV).
data = data.drop(index=data.index[0])

print(f"Shape is: {data.shape}")
data.head()

Shape is: (6285, 7)


Unnamed: 0,region_country_area,land,year,metric_type,value,footnotes,source
1,1,"Total, all countries or areas",2005,Land area (thousand hectares),13020109.0,,Food and Agriculture Organization of the Unite...
2,1,"Total, all countries or areas",2005,Arable land (thousand hectares),1366253.0,,Food and Agriculture Organization of the Unite...
3,1,"Total, all countries or areas",2005,Permanent crops (thousand hectares),145136.0,,Food and Agriculture Organization of the Unite...
4,1,"Total, all countries or areas",2005,Forest cover (thousand hectares),4132183.0,,Food and Agriculture Organization of the Unite...
5,1,"Total, all countries or areas",2005,Arable land (% of total land area),10.5,,Food and Agriculture Organization of the Unite...


In [192]:
# Remove the columns that are not needed downstream, leaving land, year, metric_type and value.
unused_columns = ['region_country_area', 'footnotes', 'source']
data = data.drop(columns=unused_columns)

print(f"Shape is: {data.shape}")
data.head()

Shape is: (6285, 4)


Unnamed: 0,land,year,metric_type,value
1,"Total, all countries or areas",2005,Land area (thousand hectares),13020109.0
2,"Total, all countries or areas",2005,Arable land (thousand hectares),1366253.0
3,"Total, all countries or areas",2005,Permanent crops (thousand hectares),145136.0
4,"Total, all countries or areas",2005,Forest cover (thousand hectares),4132183.0
5,"Total, all countries or areas",2005,Arable land (% of total land area),10.5


In [193]:
# `value` holds text; strip the ',' characters first so the column can be cast to float.
value_without_commas = data['value'].str.replace(',', '')
data['value'] = value_without_commas.astype(float)
data.head()

Unnamed: 0,land,year,metric_type,value
1,"Total, all countries or areas",2005,Land area (thousand hectares),13020109.0
2,"Total, all countries or areas",2005,Arable land (thousand hectares),1366253.0
3,"Total, all countries or areas",2005,Permanent crops (thousand hectares),145136.0
4,"Total, all countries or areas",2005,Forest cover (thousand hectares),4132183.0
5,"Total, all countries or areas",2005,Arable land (% of total land area),10.5


# Add `country_code` found in other files

In [194]:
# Reference table carrying both an `Entity` name and a `Code`; used below to look up country codes.
reference_path = 'data/cleaned/forest-area-km.csv'
data2 = pd.read_csv(reference_path)

print(f"Shape is: {data2.shape}")
data2.head()

Shape is: (7846, 4)


Unnamed: 0,Entity,Code,Year,Forest area
0,Afghanistan,AFG,1990,1208440.0
1,Afghanistan,AFG,1991,1208440.0
2,Afghanistan,AFG,1992,1208440.0
3,Afghanistan,AFG,1993,1208440.0
4,Afghanistan,AFG,1994,1208440.0


In [195]:
# Keep only the rows whose `Code` is present.
has_code = data2['Code'].notna()
data2 = data2[has_code]

In [196]:
unique_entities = data2['Entity'].unique()
unique_country_codes = data2['Code'].unique()

# Take each (name, code) pair from an actual row of `data2` instead of zipping the
# two independent `.unique()` arrays: those only line up when names and codes are
# strictly one-to-one, and any repeated or shared code would silently shift or
# truncate every later pair.
# One pair is kept per entity (the first code seen) so the later left merge on the
# country name cannot duplicate rows.
entity_code_pairs = data2[['Entity', 'Code']].drop_duplicates(subset='Entity')

country_with_code_list = list(entity_code_pairs.itertuples(index=False, name=None))
len(country_with_code_list)

224

In [197]:
# Distinct `land` labels in the land-use table (this includes aggregate rows such as the world total).
unique_lands = pd.unique(data['land'])
len(unique_lands)

279

In [198]:
# Turn the (name, code) tuples into a two-column lookup frame.
lookup_columns = ["country_name", "country_code"]
countries_df = pd.DataFrame(data=country_with_code_list, columns=lookup_columns)
countries_df.head()

Unnamed: 0,country_name,country_code
0,Afghanistan,AFG
1,Albania,ALB
2,Algeria,DZA
3,American Samoa,ASM
4,Andorra,AND


In [199]:
# Left-join the lookup on the land name: every land-use row is kept, and
# `country_code` is NaN wherever `land` has no exact match in `country_name`.
# The join key from the lookup side is dropped again right away.
data = data.merge(
    countries_df,
    how="left",
    left_on="land",
    right_on="country_name",
).drop(columns='country_name')

data.head()

Unnamed: 0,land,year,metric_type,value,country_code
0,"Total, all countries or areas",2005,Land area (thousand hectares),13020109.0,
1,"Total, all countries or areas",2005,Arable land (thousand hectares),1366253.0,
2,"Total, all countries or areas",2005,Permanent crops (thousand hectares),145136.0,
3,"Total, all countries or areas",2005,Forest cover (thousand hectares),4132183.0,
4,"Total, all countries or areas",2005,Arable land (% of total land area),10.5,


In [200]:
# Hand-maintained codes for specific `land` labels, kept as one name -> code mapping
# so a name and its code cannot drift apart.
# Presumably these are labels spelled differently in the lookup file -- confirm.
# NOTE(review): both Chinese SARs are mapped to 'CHN' and 'Polynesia' is mapped to
# 'PYF' -- confirm these assignments are intended.
manual_code_by_land = {
    'Viet Nam': 'VNM',
    'Venezuela (Boliv. Rep. of)': 'VEN',
    'Bolivia (Plurin. State of)': 'BOL',
    'Brunei Darussalam': 'BRN',
    'Cabo Verde': 'CPV',
    'China, Hong Kong SAR': 'CHN',
    'China, Macao SAR': 'CHN',
    "Dem. People's Rep. Korea": 'PRK',
    'Dem. Rep. of the Congo': 'COD',
    'Falkland Islands (Malvinas)': 'FLK',
    'French Guiana': 'GUF',
    'Iran (Islamic Republic of)': 'IRN',
    'Polynesia': 'PYF',
    'Bonaire, St. Eustatius & Saba': 'BES',
    'Côte d’Ivoire': 'CIV',
    'Russian Federation': 'RUS',
    'United Rep. of Tanzania': 'TZA',
    'United States of America': 'USA',
}
countries = list(manual_code_by_land.keys())
codes = list(manual_code_by_land.values())

manual_countries_df = pd.DataFrame({'country': countries, 'code': codes})

# Left-join the manual table on the land name. Matches land in a separate `code`
# column next to `country_code`; the join key `country` is dropped again.
data = data.merge(
    manual_countries_df,
    how="left",
    left_on="land",
    right_on="country",
).drop(columns='country')

data.head()

Unnamed: 0,land,year,metric_type,value,country_code,code
0,"Total, all countries or areas",2005,Land area (thousand hectares),13020109.0,,
1,"Total, all countries or areas",2005,Arable land (thousand hectares),1366253.0,,
2,"Total, all countries or areas",2005,Permanent crops (thousand hectares),145136.0,,
3,"Total, all countries or areas",2005,Forest cover (thousand hectares),4132183.0,,
4,"Total, all countries or areas",2005,Arable land (% of total land area),10.5,,


In [202]:
# Fill the gaps left by the automatic lookup with the manually assigned codes,
# then remove the helper column.
# The filled column is assigned back instead of calling `fillna(..., inplace=True)`
# on `data['country_code']`: that call operates on an intermediate Series, so recent
# pandas warns about it and, with Copy-on-Write enabled, leaves `data` unchanged.
data['country_code'] = data['country_code'].fillna(data['code'])
data = data.drop(columns='code')

In [203]:
# Write the cleaned table next to the other cleaned files, without the row index.
cleaned_path = 'data/cleaned/SYB65_145_202209_Land.csv'
data.to_csv(cleaned_path, index=False)