# Discovery

By now, world population is in constant increase. The more the people, the more the food we need that translate in an increase of use of natural resources.


But which are the product we produce that exploit most resources? Which countries contribute to the exploitation of natural resources?

**Goal**: Discovering which are the top 10 products that use more water, need more land and emit more gas. Then discovering which countries are the most producer of each category


# Data Selection

In [1]:
import pandas as pd
import numpy as np
import sidetable 
from dataprep.clean import clean_country
from dataprep.clean import clean_df
from dataprep.clean import validate_lat_long
from dataprep.clean import clean_headers
import country_converter as coco
import plotly 
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go 
from raceplotly.plots import barplot
from plotly.subplots import make_subplots

In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.options.display.max_colwidth = 100
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:20,.2f}'.format)

In [3]:
fao_df = pd.read_csv('FAO.csv', encoding='latin-1')
food_pr_df = pd.read_csv('Food_Production.csv')
pop_df = pd.read_csv('population_total_long.csv')

# Data Cleaning

### FAO Data

In [4]:
# Checking what the dataset looks like
fao_df.head()

Unnamed: 0,Area Abbreviation,Area Code,Area,Item Code,Item,Element Code,Element,Unit,latitude,longitude,Y1961,Y1962,Y1963,Y1964,Y1965,Y1966,Y1967,Y1968,Y1969,Y1970,Y1971,Y1972,Y1973,Y1974,Y1975,Y1976,Y1977,Y1978,Y1979,Y1980,Y1981,Y1982,Y1983,Y1984,Y1985,Y1986,Y1987,Y1988,Y1989,Y1990,Y1991,Y1992,Y1993,Y1994,Y1995,Y1996,Y1997,Y1998,Y1999,Y2000,Y2001,Y2002,Y2003,Y2004,Y2005,Y2006,Y2007,Y2008,Y2009,Y2010,Y2011,Y2012,Y2013
0,AFG,2,Afghanistan,2511,Wheat and products,5142,Food,1000 tonnes,33.94,67.71,1928.0,1904.0,1666.0,1950.0,2001.0,1808.0,2053.0,2045.0,2154.0,1819.0,1963.0,2215.0,2310.0,2335.0,2434.0,2512.0,2282.0,2454.0,2443.0,2129.0,2133.0,2068.0,1994.0,1851.0,1791.0,1683.0,2194.0,1801.0,1754.0,1640.0,1539.0,1582.0,1840.0,1855.0,1853.0,2177.0,2343.0,2407.0,2463.0,2600.0,2668.0,2776.0,3095.0,3249.0,3486.0,3704.0,4164.0,4252.0,4538.0,4605.0,4711.0,4810,4895
1,AFG,2,Afghanistan,2805,Rice (Milled Equivalent),5142,Food,1000 tonnes,33.94,67.71,183.0,183.0,182.0,220.0,220.0,195.0,231.0,235.0,238.0,213.0,205.0,233.0,246.0,246.0,255.0,263.0,235.0,254.0,270.0,259.0,248.0,217.0,217.0,197.0,186.0,200.0,193.0,202.0,191.0,199.0,197.0,249.0,218.0,260.0,319.0,254.0,326.0,347.0,270.0,372.0,411.0,448.0,460.0,419.0,445.0,546.0,455.0,490.0,415.0,442.0,476.0,425,422
2,AFG,2,Afghanistan,2513,Barley and products,5521,Feed,1000 tonnes,33.94,67.71,76.0,76.0,76.0,76.0,76.0,75.0,71.0,72.0,73.0,74.0,71.0,70.0,72.0,76.0,77.0,80.0,60.0,65.0,64.0,64.0,60.0,55.0,53.0,51.0,48.0,46.0,46.0,47.0,46.0,43.0,43.0,40.0,50.0,46.0,41.0,44.0,50.0,48.0,43.0,26.0,29.0,70.0,48.0,58.0,236.0,262.0,263.0,230.0,379.0,315.0,203.0,367,360
3,AFG,2,Afghanistan,2513,Barley and products,5142,Food,1000 tonnes,33.94,67.71,237.0,237.0,237.0,238.0,238.0,237.0,225.0,227.0,230.0,234.0,223.0,219.0,225.0,240.0,244.0,255.0,185.0,203.0,198.0,202.0,189.0,174.0,167.0,160.0,151.0,145.0,145.0,148.0,145.0,135.0,132.0,120.0,155.0,143.0,125.0,138.0,159.0,154.0,141.0,84.0,83.0,122.0,144.0,185.0,43.0,44.0,48.0,62.0,55.0,60.0,72.0,78,89
4,AFG,2,Afghanistan,2514,Maize and products,5521,Feed,1000 tonnes,33.94,67.71,210.0,210.0,214.0,216.0,216.0,216.0,235.0,232.0,236.0,200.0,201.0,216.0,228.0,231.0,234.0,240.0,228.0,234.0,228.0,226.0,210.0,199.0,192.0,182.0,173.0,170.0,154.0,148.0,137.0,144.0,126.0,90.0,141.0,150.0,159.0,108.0,90.0,99.0,72.0,35.0,48.0,89.0,63.0,120.0,208.0,233.0,249.0,247.0,195.0,178.0,191.0,200,200


In [5]:
# Checking the shape
fao_df.shape

(21477, 63)

In [6]:
# Checking columns dtypes
fao_dtypes, clean_fao = clean_df(fao_df, clean_header=False, standardize_missing_values='ignore', downcast_memory=False)
fao_dtypes

Data Type Detection Report:
	These data types are supported by DataPrep to clean: ['country', 'coordinate', 'address']


Unnamed: 0,semantic_data_type,atomic_data_type
Area Abbreviation,country,string
Area Code,integer,integer
Area,country,string
Item Code,integer,integer
Item,string,string
Element Code,integer,integer
Element,string,string
Unit,address,string
latitude,coordinate,floating
longitude,coordinate,floating


In [7]:
# Changing *Y2012* and *Y2013* data type from int to float
clean_fao[['Y2012', 'Y2013']] = clean_fao[['Y2012', 'Y2013']].astype(float)

# Checking column names and eventually fixing them
clean_fao.columns

Index(['Area Abbreviation', 'Area Code', 'Area', 'Item Code', 'Item',
       'Element Code', 'Element', 'Unit', 'latitude', 'longitude', 'Y1961',
       'Y1962', 'Y1963', 'Y1964', 'Y1965', 'Y1966', 'Y1967', 'Y1968', 'Y1969',
       'Y1970', 'Y1971', 'Y1972', 'Y1973', 'Y1974', 'Y1975', 'Y1976', 'Y1977',
       'Y1978', 'Y1979', 'Y1980', 'Y1981', 'Y1982', 'Y1983', 'Y1984', 'Y1985',
       'Y1986', 'Y1987', 'Y1988', 'Y1989', 'Y1990', 'Y1991', 'Y1992', 'Y1993',
       'Y1994', 'Y1995', 'Y1996', 'Y1997', 'Y1998', 'Y1999', 'Y2000', 'Y2001',
       'Y2002', 'Y2003', 'Y2004', 'Y2005', 'Y2006', 'Y2007', 'Y2008', 'Y2009',
       'Y2010', 'Y2011', 'Y2012', 'Y2013'],
      dtype='object')

In [8]:
#renaming columns in snake case and the years columns without the 'Y'
clean_fao = clean_headers(clean_fao, case='snake', replace={'Y':''})
clean_fao.columns

Column Headers Cleaning Report:
	61 values cleaned (96.83%)


Index(['area_abbreviation', 'area_code', 'area', 'item_code', 'item',
       'element_code', 'element', 'unit', 'latitude', 'longitude', '1961',
       '1962', '1963', '1964', '1965', '1966', '1967', '1968', '1969', '1970',
       '1971', '1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979',
       '1980', '1981', '1982', '1983', '1984', '1985', '1986', '1987', '1988',
       '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997',
       '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2011', '2012', '2013'],
      dtype='object')

In [9]:
# variable that store the number of rows before removing duplicates
before = clean_fao.shape[0]
print(f'Number of rows before dropping duplicates: {before:>6}')

# removing duplicates if there any
clean_fao = clean_fao.drop_duplicates(keep='first')

# variables that stores the number of rows after removing duplicates
after = clean_fao.shape[0]
print(f'Number of rows after dropping duplicates: {clean_fao.shape[0] :>7}')

# printing the result
if before == after:
    print('No duplicates were found')
else:
    print(f'{before - after} duplicates were found and removed')

Number of rows before dropping duplicates:  21477
Number of rows after dropping duplicates:   21477
No duplicates were found


#### Checking qualitative columns values

In [10]:
years = clean_fao.columns[clean_fao.columns.get_loc('1961'):]
clean_fao.stb.counts(exclude=['number'])

Unnamed: 0,count,unique,most_freq,most_freq_count,least_freq,least_freq_count
unit,21477,1,1000 tonnes,21477,1000 tonnes,21477
element,21477,2,Food,17528,Feed,3949
item,21477,115,Milk - Excluding Butter,558,"Meat, Aquatic Mammals",3
area_abbreviation,21477,169,CHN,541,LSO,75
area,21477,174,Spain,150,Lesotho,75


In [11]:
to_check = ['area', 'area_abbreviation', 'item']

for col in to_check:
    print(f'{col}\n{sorted(clean_fao[col].unique())}', end=f'\n{"-"*200}\n')

area
['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China, Hong Kong SAR', 'China, Macao SAR', 'China, Taiwan Province of', 'China, mainland', 'Colombia', 'Congo', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czechia', "Côte d'Ivoire", "Democratic People's Republic of Korea", 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Icel

#### Fixing Area column values

After a close look, some of the Area names are not correct. let's correct them: <br>
With the function clean_country *Taiwan*, *Macau* and *Hong Kong* would fall under *China*, let's remove it from those country before applying the function

In [12]:
clean_fao['area'] = clean_fao['area'].apply(lambda x: 'Taiwan' if 'Taiwan' in x else x)
clean_fao['area'] = clean_fao['area'].apply(lambda x: 'Hong Kong' if 'Hong Kong' in x else x)
clean_fao['area'] = clean_fao['area'].apply(lambda x: 'Macau' if 'Macao' in x else x)

Cleaning country names with clean_country

In [13]:
clean_fao = clean_country(clean_fao, 'area', input_format=('name', 'official'), output_format='name', fuzzy_dist=2, inplace=True, errors='raise')

  0%|          | 0/9 [00:00<?, ?it/s]

Country Cleaning Report:
	2559 values cleaned (11.92%)
Result contains 21477 (100.0%) values in the correct format and 0 null values (0.0%)


 *Area Abbreviation* and the *Area* count do not match. Let's investigate why

In [14]:
# checking if there are more than 1 country with the same area code
a_code_unique = clean_fao.groupby('area_abbreviation')['area_clean'].nunique()
a_code_unique[a_code_unique > 1]

area_abbreviation
AZE    2
CHN    4
THA    2
Name: area_clean, dtype: int64

As suspected, 3 *Area Abbreviation* are associated with more than 1 country. Let's find out which countries fall under the same code

In [15]:
cond = clean_fao['area_abbreviation'].isin(['AZE', 'THA', 'CHN'])
clean_fao.loc[cond].groupby('area_abbreviation')['area_clean'].unique()

area_abbreviation
AZE                [Azerbaijan, Bahamas]
CHN    [Hong Kong, Macau, China, Taiwan]
THA                [Thailand, Macedonia]
Name: area_clean, dtype: object

The result of the investigation evidenciate that *Bahamas*, *Macedonia*, *Taiwan*, *Macau* and *Hong Kong* fall under the wrong *Area abbreviation* code.

Fixing the wrong Area Abbreviation values

In [16]:
cond = clean_fao['area_clean'] == 'Bahamas'
clean_fao.loc[cond, 'area_abbreviation'] = 'BHS'

cond = clean_fao['area_clean'] == 'Hong Kong'
clean_fao.loc[cond, 'area_abbreviation'] = 'HKG'

cond = clean_fao['area_clean'] == 'Macau'
clean_fao.loc[cond, 'area_abbreviation'] = 'MAC'

cond = clean_fao['area_clean'] == 'Taiwan'
clean_fao.loc[cond, 'area_abbreviation'] = 'TWN'

cond = clean_fao['area_clean'] == 'Macedonia'
clean_fao.loc[cond, 'area_abbreviation'] = 'MKD'

#### Dropping unecessary columns

In [17]:
clean_fao.drop(columns=['area_code', 'item_code', 'element_code', 'unit'], inplace=True)

#### Checking latitude and longitude values

Let's check if latitude and longitude contains some non valide values. 

In [18]:
validate_lat_long(clean_fao["latitude"], lat_long=False, lat=True).all()

True

In [19]:
validate_lat_long(clean_fao["longitude"], lat_long=False, lon=True).all()

True

Latitude ranges from -90 to 90,  longitude ranges from -180 to 180

In [20]:
clean_fao[['longitude', 'latitude']].agg([min, max])

Unnamed: 0,longitude,latitude
min,-172.1,-40.9
max,179.41,64.96


Both max and min of latitude and longitude fall under the accepted range

#### Checking if there are some invalid values in the *Years* columns

Let's check if are there any negative numbers in production

In [21]:
cond = clean_fao.loc[:,'1961':'2013'].agg([min])
cond.T[cond.T['min'] < 0]

Unnamed: 0,min
2012,-169.0
2013,-246.0


Now let's investigate which country has a negative amount of production for which *Item* and *Element* (food or feed)

In [22]:
clean_fao.loc[clean_fao[['2013', '2012']].idxmin().unique(), ['area_clean', 'item', 'element']]

Unnamed: 0,area_clean,item,element
10082,Japan,Oats,Food


Negative number in production must be an error of input, just drop the entire row

In [23]:
clean_fao.drop(labels=10082, inplace=True)

#### Checking for missing values

In [24]:
clean_fao.stb.missing(clip_0=True, style=True)

Unnamed: 0,missing,total,percent
1961,3539,21476,16.48%
1976,3539,21476,16.48%
1989,3539,21476,16.48%
1988,3539,21476,16.48%
1987,3539,21476,16.48%
1962,3539,21476,16.48%
1985,3539,21476,16.48%
1984,3539,21476,16.48%
1983,3539,21476,16.48%
1982,3539,21476,16.48%


Let's create a dataframe containing only the years variables

In [25]:
year_df = clean_fao[years]

First let's drop all the rows that contain all NaN values, if there are any

In [26]:
clean_fao = clean_fao.dropna(how='all')

Since the only NaN values in the dataframe are in the numeric columns, let's fill all the remaining NaN values with 0

In [27]:
clean_fao.fillna(0, inplace=True)

Lastly we  create a list of index of the rows of the years dataframe that has only 0 values

In [28]:
idx_to_drop = year_df.loc[(year_df == 0).all(axis=1)].index

Finally droping those rows

In [29]:
clean_fao = clean_fao.drop(labels=idx_to_drop)

Updating the years dataset with the NaN value dropped

In [30]:
year_df = clean_fao[years]

### Food Production Data

In [31]:
food_pr_df.head()

Unnamed: 0,Food product,Land use change,Animal Feed,Farm,Processing,Transport,Packging,Retail,Total_emissions,Eutrophying emissions per 1000kcal (gPO₄eq per 1000kcal),Eutrophying emissions per kilogram (gPO₄eq per kilogram),Eutrophying emissions per 100g protein (gPO₄eq per 100 grams protein),Freshwater withdrawals per 1000kcal (liters per 1000kcal),Freshwater withdrawals per 100g protein (liters per 100g protein),Freshwater withdrawals per kilogram (liters per kilogram),Greenhouse gas emissions per 1000kcal (kgCO₂eq per 1000kcal),Greenhouse gas emissions per 100g protein (kgCO₂eq per 100g protein),Land use per 1000kcal (m² per 1000kcal),Land use per kilogram (m² per kilogram),Land use per 100g protein (m² per 100g protein),Scarcity-weighted water use per kilogram (liters per kilogram),Scarcity-weighted water use per 100g protein (liters per 100g protein),Scarcity-weighted water use per 1000kcal (liters per 1000 kilocalories)
0,Wheat & Rye (Bread),0.1,0.0,0.8,0.2,0.1,0.1,0.1,1.4,,,,,,,,,,,,,,
1,Maize (Meal),0.3,0.0,0.5,0.1,0.1,0.1,0.0,1.1,,,,,,,,,,,,,,
2,Barley (Beer),0.0,0.0,0.2,0.1,0.0,0.5,0.3,1.1,,,,,,,,,,,,,,
3,Oatmeal,0.0,0.0,1.4,0.0,0.1,0.1,0.0,1.6,4.28,11.23,8.64,183.91,371.08,482.4,0.95,1.91,2.9,7.6,5.85,18786.2,14450.92,7162.1
4,Rice,0.0,0.0,3.6,0.1,0.1,0.1,0.1,4.0,9.51,35.07,49.39,609.98,3166.76,2248.4,1.21,6.27,0.76,2.8,3.94,49576.3,69825.77,13449.89


In [32]:
food_pr_df.describe()

Unnamed: 0,Land use change,Animal Feed,Farm,Processing,Transport,Packging,Retail,Total_emissions,Eutrophying emissions per 1000kcal (gPO₄eq per 1000kcal),Eutrophying emissions per kilogram (gPO₄eq per kilogram),Eutrophying emissions per 100g protein (gPO₄eq per 100 grams protein),Freshwater withdrawals per 1000kcal (liters per 1000kcal),Freshwater withdrawals per 100g protein (liters per 100g protein),Freshwater withdrawals per kilogram (liters per kilogram),Greenhouse gas emissions per 1000kcal (kgCO₂eq per 1000kcal),Greenhouse gas emissions per 100g protein (kgCO₂eq per 100g protein),Land use per 1000kcal (m² per 1000kcal),Land use per kilogram (m² per kilogram),Land use per 100g protein (m² per 100g protein),Scarcity-weighted water use per kilogram (liters per kilogram),Scarcity-weighted water use per 100g protein (liters per 100g protein),Scarcity-weighted water use per 1000kcal (liters per 1000 kilocalories)
count,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,33.0,38.0,27.0,30.0,26.0,38.0,33.0,27.0,33.0,38.0,27.0,38.0,26.0,30.0
mean,1.26,0.45,3.47,0.25,0.2,0.27,0.07,5.97,27.18,46.14,52.77,504.19,1437.98,932.61,5.63,13.52,12.42,29.26,29.11,36607.43,59196.44,17380.58
std,3.36,0.92,7.08,0.37,0.16,0.34,0.11,10.5,46.45,82.81,52.03,539.13,1441.98,1297.0,10.61,19.43,28.35,78.49,49.31,56891.28,89928.19,16232.08
min,-2.1,0.0,0.1,0.0,0.0,0.0,0.0,0.2,0.71,0.69,3.38,0.72,32.38,0.0,0.07,0.26,0.27,0.33,3.0,0.0,421.25,4.1
25%,0.0,0.0,0.35,0.0,0.1,0.1,0.0,0.85,4.21,3.75,17.86,106.93,373.57,105.5,0.63,4.03,1.31,1.11,5.09,3325.07,11018.4,2969.12
50%,0.2,0.0,0.8,0.1,0.1,0.1,0.0,1.6,7.0,11.46,37.33,338.06,1083.33,417.1,1.35,6.5,2.98,6.87,7.94,14533.05,20917.21,12605.26
75%,0.8,0.0,2.2,0.3,0.2,0.3,0.15,6.0,26.32,45.84,55.3,694.81,1832.39,1340.38,5.34,14.98,6.61,14.92,23.0,35960.18,70651.72,28056.47
max,16.3,2.9,39.4,1.3,0.8,1.6,0.3,59.6,197.36,365.29,185.05,2062.18,6003.33,5605.2,50.95,93.3,119.49,369.81,184.81,229889.8,431620.0,49735.88


In [33]:
food_pr_df.shape

(43, 23)

In [34]:
food_pr_df.columns

Index(['Food product', 'Land use change', 'Animal Feed', 'Farm', 'Processing',
       'Transport', 'Packging', 'Retail', 'Total_emissions',
       'Eutrophying emissions per 1000kcal (gPO₄eq per 1000kcal)',
       'Eutrophying emissions per kilogram (gPO₄eq per kilogram)',
       'Eutrophying emissions per 100g protein (gPO₄eq per 100 grams protein)',
       'Freshwater withdrawals per 1000kcal (liters per 1000kcal)',
       'Freshwater withdrawals per 100g protein (liters per 100g protein)',
       'Freshwater withdrawals per kilogram (liters per kilogram)',
       'Greenhouse gas emissions per 1000kcal (kgCO₂eq per 1000kcal)',
       'Greenhouse gas emissions per 100g protein (kgCO₂eq per 100g protein)',
       'Land use per 1000kcal (m² per 1000kcal)',
       'Land use per kilogram (m² per kilogram)',
       'Land use per 100g protein (m² per 100g protein)',
       'Scarcity-weighted water use per kilogram (liters per kilogram)',
       'Scarcity-weighted water use per 100g protein 

In [35]:
food_pr_df = clean_headers(food_pr_df, case='snake')
food_pr_df.columns

Column Headers Cleaning Report:
	23 values cleaned (100.0%)


Index(['food_product', 'land_use_change', 'animal_feed', 'farm', 'processing',
       'transport', 'packging', 'retail', 'total_emissions',
       'eutrophying_emissions_per_1000kcal_g_p_oeq_per_1000kcal',
       'eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram',
       'eutrophying_emissions_per_100g_protein_g_p_oeq_per_100_grams_protein',
       'freshwater_withdrawals_per_1000kcal_liters_per_1000kcal',
       'freshwater_withdrawals_per_100g_protein_liters_per_100g_protein',
       'freshwater_withdrawals_per_kilogram_liters_per_kilogram',
       'greenhouse_gas_emissions_per_1000kcal_kg_c_oeq_per_1000kcal',
       'greenhouse_gas_emissions_per_100g_protein_kg_c_oeq_per_100g_protein',
       'land_use_per_1000kcal_m_per_1000kcal',
       'land_use_per_kilogram_m_per_kilogram',
       'land_use_per_100g_protein_m_per_100g_protein',
       'scarcity_weighted_water_use_per_kilogram_liters_per_kilogram',
       'scarcity_weighted_water_use_per_100g_protein_liters_per_100g_protei

Dropping unecessary columns

In [36]:
to_drop = [
       'eutrophying_emissions_per_1000kcal_g_p_oeq_per_1000kcal',
       'eutrophying_emissions_per_100g_protein_g_p_oeq_per_100_grams_protein',
       'freshwater_withdrawals_per_1000kcal_liters_per_1000kcal',
       'freshwater_withdrawals_per_100g_protein_liters_per_100g_protein',
       'greenhouse_gas_emissions_per_1000kcal_kg_c_oeq_per_1000kcal',
       'greenhouse_gas_emissions_per_100g_protein_kg_c_oeq_per_100g_protein',
       'land_use_per_1000kcal_m_per_1000kcal',
       'land_use_per_100g_protein_m_per_100g_protein',
       'scarcity_weighted_water_use_per_100g_protein_liters_per_100g_protein',
       'scarcity_weighted_water_use_per_1000kcal_liters_per_1000_kilocalories'
       ]
food_pr_df.drop(to_drop, axis=1, inplace=True)



Removing missing values

In [37]:
food_pr_df.dropna(how='any', inplace=True)

In [38]:
#checking the negative values for the land_use_change columns as seen in df.describe()
cond = food_pr_df['land_use_change'] < 0
food_pr_df.loc[cond, :]

Unnamed: 0,food_product,land_use_change,animal_feed,farm,processing,transport,packging,retail,total_emissions,eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram,freshwater_withdrawals_per_kilogram_liters_per_kilogram,land_use_per_kilogram_m_per_kilogram,scarcity_weighted_water_use_per_kilogram_liters_per_kilogram
11,Nuts,-2.1,0.0,2.1,0.0,0.1,0.1,0.0,0.2,19.15,4133.8,12.96,229889.8
19,Olive Oil,-0.4,0.0,4.3,0.7,0.5,0.9,0.0,6.0,37.26,2141.8,26.31,177480.2
25,Citrus Fruit,-0.1,0.0,0.3,0.0,0.1,0.0,0.0,0.3,2.24,82.7,0.86,4662.7
29,Wine,-0.1,0.0,0.6,0.1,0.1,0.7,0.0,1.4,4.57,78.9,1.78,1149.3


# Population

In [39]:
pop_df.head()

Unnamed: 0,Country Name,Year,Count
0,Aruba,1960,54211
1,Afghanistan,1960,8996973
2,Angola,1960,5454933
3,Albania,1960,1608800
4,Andorra,1960,13411


In [40]:
pop_df = clean_headers(pop_df, case='snake')
pop_df

Column Headers Cleaning Report:
	3 values cleaned (100.0%)


Unnamed: 0,country_name,year,count
0,Aruba,1960,54211
1,Afghanistan,1960,8996973
2,Angola,1960,5454933
3,Albania,1960,1608800
4,Andorra,1960,13411
...,...,...,...
12590,Kosovo,2017,1830700
12591,"Yemen, Rep.",2017,27834821
12592,South Africa,2017,57000451
12593,Zambia,2017,16853688


In [41]:
cond = pop_df['country_name'].isin(['Channel Islands', 'Caribbean small states', 'Pacific island small states'])
index_to_drop = pop_df.loc[cond].index.to_list()
pop_df = pop_df.drop(labels=index_to_drop)

In [42]:
pop_df.isna().sum()

country_name    0
year            0
count           0
dtype: int64

# Data Exploration/Data Transformation

# Land use change

In [43]:
fig = px.histogram(food_pr_df, x=food_pr_df['land_use_per_kilogram_m_per_kilogram'],
                   marginal="box",
                   hover_data=food_pr_df.columns)
fig.show()

In [44]:
#filtering for 75 percentile items
cond = food_pr_df['land_use_per_kilogram_m_per_kilogram'] > np.percentile(food_pr_df['land_use_per_kilogram_m_per_kilogram'], 75)
top_land_df = food_pr_df.loc[cond, ['food_product', 'land_use_per_kilogram_m_per_kilogram']].sort_values(by='land_use_per_kilogram_m_per_kilogram', ascending=False)

#list of top item
top_land = food_pr_df.sort_values(by='land_use_per_kilogram_m_per_kilogram', ascending=False)[:10]['food_product'].to_list()

fig = px.bar(top_land_df, x='land_use_per_kilogram_m_per_kilogram', y='food_product', color='land_use_per_kilogram_m_per_kilogram', color_continuous_scale='brwnyl', text_auto=True)

fig.update_layout(
    coloraxis_showscale=False,

    title = 'Top 10 food items for land use change (m² per KgCO₂)',
    title_x = 0.5,
    title_y = 0.95,
    title_xanchor='center',
    title_yanchor='top',

    yaxis = dict(title=None,
                categoryorder='total ascending'
                ),
                
    xaxis = dict(
                title='Land usage (m² per kg of product)'
            )
)    

fig.show()

# Eutrophy

In [45]:
fig = px.histogram(food_pr_df, x=food_pr_df['eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram'],
                   marginal="box",
                   hover_data=food_pr_df.columns)
fig.show()

75% of the product produce less than 50 gPO₄eq per kilogram

Since we want to know the top product that contributes to the Eutrophyng emissions per kilogram, we first find a threshold above which focussing the analysys.
The threshold is set above the 3rd percentile: 

In [46]:
# Filtering
cond = food_pr_df['eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram'] > np.percentile(food_pr_df['eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram'], 75)
top_eutrophy_df = food_pr_df.loc[cond, ['food_product', 'eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram']].sort_values(by='eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram', ascending=False)

# List of top items
top_eutr = food_pr_df.sort_values(by='eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram', ascending=False)[:10]['food_product'].to_list()

# Graph representation
fig = px.bar(top_eutrophy_df, x='eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram', y='food_product', color='eutrophying_emissions_per_kilogram_g_p_oeq_per_kilogram', color_continuous_scale='peach', text_auto=True)
fig.update_layout(
    coloraxis_showscale=False,

    title = 'Top 10 food items for Eutrophy (gPO₄eq per Kg of product)',
    title_x = 0.5,
    title_y = 0.95,
    title_xanchor='center',
    title_yanchor='top',

    yaxis = dict(title=None,
                categoryorder='total ascending'
                ),
                
    xaxis = dict(
                title='gPO₄eq per Kg'
            )
)    

fig.show()

# Freshwater withdrawals

In [47]:
fig = px.histogram(food_pr_df, x=food_pr_df['freshwater_withdrawals_per_kilogram_liters_per_kilogram'],
                   marginal="box",
                   hover_data=food_pr_df.columns)
fig.show()

In [48]:
# Filtering
cond = food_pr_df['freshwater_withdrawals_per_kilogram_liters_per_kilogram'] > np.percentile(food_pr_df['freshwater_withdrawals_per_kilogram_liters_per_kilogram'], 75)
top_water_df = food_pr_df.loc[cond, ['food_product', 'freshwater_withdrawals_per_kilogram_liters_per_kilogram']].sort_values(by='freshwater_withdrawals_per_kilogram_liters_per_kilogram', ascending=False)

# List of top itmes
top_water = food_pr_df.sort_values(by='freshwater_withdrawals_per_kilogram_liters_per_kilogram', ascending=False)[:10]['food_product'].to_list()

# Graph representation
fig = px.bar(top_water_df, x='freshwater_withdrawals_per_kilogram_liters_per_kilogram', y='food_product', color='freshwater_withdrawals_per_kilogram_liters_per_kilogram', color_continuous_scale='darkmint', text_auto=True)

fig.update_layout(
    coloraxis_showscale=False,

    title = 'Top 10 food items for freshwater withdrawals (l per Kg)',
    title_x = 0.5,
    title_y = 0.95,
    title_xanchor='center',
    title_yanchor='top',

    yaxis = dict(title=None,
                categoryorder='total ascending'
                ),
                
    xaxis = dict(
                title='liters per Kilogram'
            )
)    


fig.show()

# Scarcity-weighted water use

In [49]:
fig = px.histogram(food_pr_df, x=food_pr_df['scarcity_weighted_water_use_per_kilogram_liters_per_kilogram'],
                   marginal="box",
                   hover_data=food_pr_df.columns)
fig.show()

In [50]:
# Filtering
cond = food_pr_df['scarcity_weighted_water_use_per_kilogram_liters_per_kilogram'] > np.percentile(food_pr_df['scarcity_weighted_water_use_per_kilogram_liters_per_kilogram'], 75)
top_scarc_df = food_pr_df.loc[cond, ['food_product', 'scarcity_weighted_water_use_per_kilogram_liters_per_kilogram']].sort_values(by='scarcity_weighted_water_use_per_kilogram_liters_per_kilogram', ascending=False)

# List of top items
top_scarc = food_pr_df.sort_values(by='scarcity_weighted_water_use_per_kilogram_liters_per_kilogram', ascending=False)[:10]['food_product'].to_list()

# Graph representation
fig = px.bar(top_scarc_df, x='scarcity_weighted_water_use_per_kilogram_liters_per_kilogram', y='food_product', color='scarcity_weighted_water_use_per_kilogram_liters_per_kilogram', color_continuous_scale='solar_r', text_auto='0.f')
fig.update_layout(
    coloraxis_showscale=False,

    title = 'Top 10 food items for scarcity weighted water use',
    title_x = 0.5,
    title_y = 0.95,
    title_xanchor='center',
    title_yanchor='top',

    yaxis = dict(title=None,
                categoryorder='total ascending'
                ),
                
    xaxis = dict(
                title='liters per Kilogram'
            )
)    
fig.show()

# Green House emissions

In [51]:
fig = px.histogram(food_pr_df, x=food_pr_df['total_emissions'],
                   marginal="violin",
                   hover_data=food_pr_df.columns)
fig.show()

In [52]:
# Filtering
cond = food_pr_df['total_emissions'] > np.percentile(food_pr_df['total_emissions'], 75)
top_emiss_df = food_pr_df.loc[cond, ['food_product', 'total_emissions']].sort_values(by=('total_emissions'), ascending=False)

# List of top items
top_emiss = food_pr_df.sort_values(by='total_emissions', ascending=False)[:10]['food_product'].to_list()

# Graph representation
fig = px.bar(top_emiss_df, x='total_emissions',y='food_product', orientation='h', color='total_emissions', color_continuous_scale='purp', text_auto='0.f')

fig.update_layout(
    coloraxis_showscale=False,

    title = 'Top 10 food items for green house emissions (KgCO₂ per kg)',
    title_x = 0.5,
    title_y = 0.95,
    title_xanchor='center',
    title_yanchor='top',

    yaxis = dict(title=None,
                categoryorder='total ascending'
                ),
                
    xaxis = dict(
                title='KgCO₂ per kg'
            )
)    
fig.show()

In [53]:
# Filtering
cond=food_pr_df['food_product'].isin(top_emiss)
df = food_pr_df.loc[cond, 'food_product':'total_emissions'].sort_values(by='total_emissions', ascending=False)
df = df.drop(columns='total_emissions')
df = df.set_index('food_product')
df = df.div(df.sum(axis=1), axis=0)*100
df = df.reset_index()

# Colors
colors = ["#b30000", "#7c1158", "#4421af", "#1a53ff", "#0d88e6", "#00b7c7", "#5ad45a", "#8be04e", "#ebdc78"]

# Graph representation of the total emissions contribution per category (%)
fig = px.bar(
    df, 
    y=['farm', 'land_use_change', 'animal_feed',  'processing', 'transport', 'packging', 'retail'], 
    x='food_product', 
    title="CO2 emissions",
    color_discrete_sequence=colors,
    text_auto='.0f',
    barmode='group',
    orientation='v',
    template='none'  
    )
fig.update_traces(
    # texttemplate="%{text:.2s}", 
    textposition='outside'
    # width=0.05
    )

fig.update_layout(
    width=1500,
    title = 'Top 10 food items for total emissions contribution per category (%)',     
    title_x = 0.5,     
    title_y = 0.9,     
    title_xanchor='center',     
    title_yanchor='top',
    
    yaxis = dict(
        title='Emissions (kgCO₂ per kg of product)',
        range=[0,100]),
    
    xaxis = dict(
        title=None),
    uniformtext_minsize=8, 
    uniformtext_mode='hide')
    
fig.show()

# FAO

Creating a column with the relative continent for each state

In [54]:
converter = coco.CountryConverter()

# creating a list for unique values of countries ISO3 and a list for the continent associated to each country 
area_code_lst = clean_fao['area_abbreviation'].unique()
continent = converter.convert(names = area_code_lst, src='ISO3', to='continent')

#joining those list in a dictionary
dic = {key:value for key, value in zip(area_code_lst, continent)}

# creating a column in the dataframe 
clean_fao['continent'] = clean_fao['area_abbreviation'].apply(lambda x: dic.get(x))

# Calculating the total production for item from 1961 to 2012
clean_fao['total_production'] = clean_fao[years].sum(axis=1)

# Calculating the mean of production from the first year of production of the item
clean_fao['mean'] = year_df.apply(np.trim_zeros, axis=1, args='f').mean(axis=1)

#melting the dataframe from wide to long
long_fao = clean_fao.melt(
    id_vars=['area_abbreviation', 'area_clean', 'item', 'element', 'latitude', 'longitude', 'continent'],
    value_vars=[str(n) for n in range(1961, 2014)],
    var_name='year',
    value_name='production',
)

long_fao.head()

Unnamed: 0,area_abbreviation,area_clean,item,element,latitude,longitude,continent,year,production
0,AFG,Afghanistan,Wheat and products,Food,33.94,67.71,Asia,1961,1928.0
1,AFG,Afghanistan,Rice (Milled Equivalent),Food,33.94,67.71,Asia,1961,183.0
2,AFG,Afghanistan,Barley and products,Feed,33.94,67.71,Asia,1961,76.0
3,AFG,Afghanistan,Barley and products,Food,33.94,67.71,Asia,1961,237.0
4,AFG,Afghanistan,Maize and products,Feed,33.94,67.71,Asia,1961,210.0


In [55]:
# variable that store the number of rows before removing duplicates
before = clean_fao.shape[0]
print(f'Number of rows before dropping duplicates: {before:>6}')

# removing duplicates if there any
clean_fao = clean_fao.drop_duplicates(keep='first')

# variables that stores the number of rows after removing duplicates
after = clean_fao.shape[0]
print(f'Number of rows after dropping duplicates: {clean_fao.shape[0] :>7}')

# printing the result
if before == after:
    print('No duplicates were found')
else:
    print(f'{before - after} duplicates were found and removed')

Number of rows before dropping duplicates:  18509
Number of rows after dropping duplicates:   18056
453 duplicates were found and removed


In [56]:
# Graph representation of the world total production per year by Food and Feed
feed_food = long_fao.groupby(by=['element', 'year'])['production'].sum().reset_index()

fig = px.line(feed_food, x='year', y='production', color='element')
fig.show()

In [57]:
# Filtering
cond = feed_food['year'].isin(['1961','2013'])
pct_change50 = feed_food.loc[cond ,['element','year', 'production']]
pct_change50

# Graph representation of the production percentage increase from 1961 to 2013, of Food and Feed)
fig = px.bar(pct_change50, x='year', y='production', color='element', barmode='group', text='production', facet_col='element', facet_col_wrap=2, base='year')
fig.update_layout(
    yaxis={'categoryorder':'total descending'})
fig.show()

In [58]:
# Yeatly production by continent
prod_by_cont_df = long_fao.groupby(['continent', 'year'])['production'].sum().reset_index()

colors={'Africa':'#d7191c', 'Asia':'#fdae61', 'Europe':'#2c7bb6', 'America':'#abd9e9', 'Oceania':'#ffffbf'}

prod_by_cont_df['color']=prod_by_cont_df['continent'].map(colors)

# Graph representation
my_raceplot = barplot(prod_by_cont_df,  item_column='continent', value_column='production', time_column='year', top_entries=10, item_color='color')
fig=my_raceplot.plot(item_label = 'Continents', value_label = 'Production (Tonnes)', frame_duration = 150, date_format='%Y', orientation='horizontal')

#Add chart title, format the chart, etc.
fig.update_layout(
      title='Continents by production',
      title_x=0.5,
      title_y=0.9,
      title_xanchor='center',     
      title_yanchor='top')


In [59]:
# Create a filter for 'Food' related elements
filter_item = long_fao['element'] == 'Food'

# Create a filter for '2013' and '1961' production year
filter_year1 = long_fao['year'] == '1961'
filter_year2 = long_fao['year'] == '2013'

# Create a dataframe subset with conditions imposed by filters and with columns concerning food, countries and production values
food_1961 = long_fao.loc[(filter_item) & (filter_year1), ['item', 'year','area_clean', 'production']]
food_2013 = long_fao.loc[(filter_item) & (filter_year2), ['item','year', 'area_clean', 'production']]

# Sort values in decreasing order and select the first 15 elements
food_1961 = food_1961.sort_values(by='production', ascending=False).reset_index(drop=True).head(15)
food_2013 = food_2013.sort_values(by='production', ascending=False).reset_index(drop=True).head(15)

feod = pd.concat([food_1961, food_2013])

fig = px.scatter(
    feod, 
    x='item',
    y='production',

    size='production',
    size_max=80,
    
    color='area_clean',
    color_discrete_map={
        'China': '#B266FF',
        'United States': '#f4a261',
        'India': '#264653'},

    hover_name='area_clean',
    hover_data=dict(area_clean = False),

    labels=dict(
        item='Item',
        production='Production',
        year='year'),

    facet_col='year'
    )

fig.update_layout(
    title = 'Top food elements produced in 1961 and 2013',
    title_x = 0.5,
    title_y = 0.96,
    title_xanchor='center',
    title_yanchor='top',
    legend_title_text='Countries',
    legend_title_side = 'top',
    showlegend= True,
    legend_itemclick='toggle',

    yaxis = dict(title='Production (1000 tons)'),
    xaxis = dict(title=None),
    xaxis2=dict(title=None),
    
    height=600)


fig.show()

In [60]:
# Creating the dataframe
table_df = pd.DataFrame({
    'Eutrophy':top_eutr,
    'Freshwater Withdrawals':top_water,
    'Scarcity-weighted water use':top_scarc,
    'Green house emissions':top_emiss,
    'Land use':top_land,
    'Production 1961 - 2013':feod['item'].unique()})

# Table representation of the data
fig = go.Figure(data=[go.Table(
    
    header=dict(values=list(table_df.columns),
                line_color='darkslategray',
                fill_color='lightblue',
                align='center',
                font_size=17,
                height=40),

    cells=dict(values=[table_df['Eutrophy'], table_df['Freshwater Withdrawals'], table_df['Scarcity-weighted water use'], table_df['Green house emissions'], table_df['Land use'], table_df['Production 1961 - 2013']],
               line_color='darkslategray',
               fill_color='white',
               align='center',
               font_size=15,
               height=30,
               font_color=[
                    ['orange' if x == 'Cheese' else 'red' if x == "Beef (dairy herd)" else       
                    "red" if x == "Beef (beef herd)" else "red" if x == "Pig Meat" else "red" if x == "Lamb & Mutton" else "red" if x == "Poultry Meat" else 'darkslategray' 
                    for x in table_df.Eutrophy],
                    ['orange' if x == 'Cheese' else 'red' if x == "Beef (dairy herd)" else       
                    "red" if x == "Beef (beef herd)" else "red" if x == "Pig Meat" else "red" if x == "Lamb & Mutton" else "red" if x == "Poultry Meat" else 'darkslategray' 
                    for x in table_df['Freshwater Withdrawals']],
                    ['orange' if x == 'Cheese' else 'red' if x == "Beef (dairy herd)" else       
                    "red" if x == "Beef (beef herd)" else "red" if x == "Pig Meat" else "red" if x == "Lamb & Mutton" else "red" if x == "Poultry Meat" else 'green' if x == 'Rice' 
                    else 'darkslategray' for x in table_df['Scarcity-weighted water use']],
                    ['orange' if x == 'Cheese' else 'red' if x == "Beef (dairy herd)" else       
                    "red" if x == "Beef (beef herd)" else "red" if x == "Pig Meat" else "red" if x == "Lamb & Mutton" else "red" if x == "Poultry Meat" else 'darkslategray' 
                    for x in table_df['Green house emissions']],
                    ['orange' if x == 'Cheese' else 'red' if x == "Beef (dairy herd)" else       
                    "red" if x == "Beef (beef herd)" else "red" if x == "Pig Meat" else "red" if x == "Lamb & Mutton" else "pink" if x == "Poultry Meat" else 'darkslategray' 
                    for x in table_df['Land use']],
                    ["orange" if x == "Milk - Excluding Butter" else       
                    "green" if x == "Rice (Milled Equivalent)" else "red" if x == "Meat"else 'darkslategray' for x in table_df['Production 1961 - 2013']]
                ]
            )
        )
])

fig.update_layout(height=700)
fig.show()

## Population

In [61]:
pop_df['year'].astype('str', copy=False)
pop_df.describe()

Unnamed: 0,year,count
count,12421.0,12421.0
mean,1988.58,24074744.66
std,16.74,100911863.91
min,1960.0,3893.0
25%,1974.0,484749.0
50%,1989.0,4114826.0
75%,2003.0,13092852.0
max,2017.0,1386395000.0


In [62]:
pop_df

Unnamed: 0,country_name,year,count
0,Aruba,1960,54211
1,Afghanistan,1960,8996973
2,Angola,1960,5454933
3,Albania,1960,1608800
4,Andorra,1960,13411
...,...,...,...
12590,Kosovo,2017,1830700
12591,"Yemen, Rep.",2017,27834821
12592,South Africa,2017,57000451
12593,Zambia,2017,16853688


In [63]:
cc = coco.CountryConverter()

lst = pop_df['country_name'].unique() #list of unique countries of the dataframe
pop_iso3 = cc.convert(names=lst, to='ISO3', not_found=np.NaN) #converting the countries in iso3
pop_df['iso_3'] = pop_df['country_name'].map({n:m for n, m in zip(lst, pop_iso3)}) #creating the iso3 column

#setting dataframe years from 1961 to 2013
pop_df['year'].astype(int, copy=False)
cond = (pop_df['year'] >= 1961) & (pop_df['year'] <= 2013)
choro_df = pop_df.loc[cond]

#transforming the population from continuos to cathegorical 
bins=[0, 10000000, 50000000, 100000000, 200000000, 500000000, 1000000000, 1500000000] # Assigning bins
labels=['0 to 10 Millions', '10 to 50 Millions', '50 to 100 Millions', '100 to 200 Millions', '200 to 500 Millinons', '500 Millions to 1 Billion', '> 1 Billion'] # Assigning labels

#setting the cathegorical column from continuos values
choro_df['pop_range'] = pd.cut(pop_df['count'], bins=bins, labels=labels)
choro_df['pop-range'] = choro_df['pop_range'].astype(str)
    
# Graph representation 
fig = px.choropleth(
    choro_df, locations="iso_3",
    color="pop_range", 
    hover_name="country_name", 
    scope='world',
    animation_frame='year',
    color_discrete_sequence=px.colors.sequential.Plasma_r
 
)

# Additional traces settings                    
fig.update_traces(
    marker=dict(
        line=dict(
            color='#cfcfbe',
            width=1
        )
    )
)

# #Add chart title, format the chart, etc.
fig.update_layout(
    title_text='Countries population by year (1961-2013)',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        showlakes=False,
        projection_type='equirectangular',
        coastlinecolor='#cfcfbe',
        coastlinewidth=0.5,
        visible=True,
        resolution=110
    ),
    dragmode=False,
    height=900,
    annotations = [{
        'x':0.05,
        'y':0.15,
        'xref':'paper',
        'yref':'paper',
        'text':'Source: <a href="https://data.worldbank.org/indicator/SP.POP.TOTL?most_recent_year_desc=true">Wolrdbank.org</a>',
        'showarrow':False
    }]
)

fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 100
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 5

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [64]:
# Setting back the year type
pop_df['year'].astype(str, copy=False)

# Percentage of population change over the last 50 years
temp = pop_df.groupby('year')['count'].sum().reset_index()
cond = temp['year'].isin(['1961','2013'])
pop_pct_change_50 = temp.loc[cond, ['year', 'count']].reset_index(drop=True)

# Percentage of food production change over the last 50 years
cond1 = (feed_food['year'].isin(['1961','2013'])) & (feed_food['element'] == 'Food')
food_pct_change_50 = feed_food.loc[cond1 ,['year','production']].reset_index(drop=True)

# Merging those 2 dataframes
pop_food_pct_change_50 = pd.merge(pop_pct_change_50, food_pct_change_50, on='year', how='left')


fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Production change", "Population change"))

fig.add_trace(
    go.Bar(
        x=pop_food_pct_change_50['year'],
        y=pop_food_pct_change_50['production'],
    ),
    row=1, col=1
)

fig.add_trace(
    go.Bar(
        x=pop_food_pct_change_50['year'],
        y=pop_food_pct_change_50['count']
    ),
    row=1, col=2
)

fig.show()

In [65]:
# Calculating the delta of production and population from 1961 to 2013
count_delta = (pop_food_pct_change_50.iloc[1]['count'] - pop_food_pct_change_50.iloc[0]['count']) / pop_food_pct_change_50.iloc[0]['count'] * 100
count_delta = round(count_delta, 2)

prod_delta =  (pop_food_pct_change_50.iloc[1]['production'] - pop_food_pct_change_50.iloc[0]['production']) / pop_food_pct_change_50.iloc[0]['production'] * 100
prod_delta = round(prod_delta, 2)

# Calculating the amount of food produced per person in 1961 and in 2013
prod_per_pop_61 = pop_food_pct_change_50.iloc[0]['production'] / pop_food_pct_change_50.iloc[0]['count']
prod_per_pop_13 = pop_food_pct_change_50.iloc[1]['production'] / pop_food_pct_change_50.iloc[1]['count']

print(f'production per person in 1961: {round((prod_per_pop_61*1000), 2)} Tonnes/person, production per person in 2013 {round((prod_per_pop_13*1000), 2)} Tonnes/person, percentage increase in production: {prod_delta}, percentage increase in population: {count_delta}')


IndexError: single positional indexer is out-of-bounds

# Data Visualization

In [None]:
# FOOD PRODUCTION
# eutrophia (histo e bar)
# water (histo e bar)
# scarc (histo e bar)
# land (histo e bar)
# gas emission (hist, bar total e stack bar)
# FAO: 
# - feed food line chart, 
# - feed food bar percentage 1961 vs 2013, 
# - racebar continent produzione per anno
# # TODO: 
# - top elementi prodotti nel 1961 e nel 2013 per il food
# - evidenziare se esistono delle similitudini tra quei prodotti e quelli inquinanti
# POPOLAZIONE
# - popolazione per nazione per anno (plether)
# - bar percentage popolazione e comparare con feed e food percentage incremento 


In [None]:
# IMPORTANTE:
# CAPIRE PERCHé IL GRAFICO DEL MONDO NON FUNZIONI
# FINIRE DI SISTEMARE I GRAFICI A BARRE DELLE 5 RAGIONI DI MORTE NEL MONDO

#### fonti e aknowledgment

world population: https://data.worldbank.org/indicator/SP.POP.TOTL?most_recent_year_desc=true