In [521]:
import pickle
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
import numpy as np

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly
import math

## Introduction

Load the merged Dataframe

In [522]:
# Merged crops/trades table (one row per area, crop, year and traded item).
# NOTE: unpickling can execute arbitrary code - only load pickle files produced by this project.
df = pd.read_pickle("data/final_df.pkl")

Load the extra csv file to be able to regroup the countries by region

In [523]:
# Country-group membership table. Column names are normalised to snake_case
# (lower case, spaces replaced by underscores).
# NOTE(review): this path starts with 'Data/' whereas every other load uses 'data/';
# confirm the directory name, the two differ on case-sensitive file systems.
df_country = pd.read_csv('Data/GroupsCountry.csv')
df_country.columns = [name.lower().replace(' ', '_') for name in df_country.columns]

Create a dictionary that maps each country group to the countries it contains. Each country group has a unique id, e.g. 5000 for World, 5100 for Africa, 5500 for Oceania, ...

In [524]:
# Map every country-group code to the array of distinct country codes that belong to it.
# The loop previously iterated over `list_group`, a name that is never defined in this
# notebook (it only worked with leftover kernel state); `keys` is the intended sequence.
keys = df_country['country_group_code'].unique()
dicts = {
    group_code: df_country.loc[
        df_country['country_group_code'] == group_code, 'country_code'
    ].unique()
    for group_code in keys
}

In [525]:
# Key 5000 is the "World" group: keep only the rows whose area_code is one of the
# country codes listed for that group.
df_world = df.loc[df['area_code'].isin(dicts.get(5000))]

In [526]:
# Preview the first rows of the world-level table.
df_world.head()

Unnamed: 0,area_code,area_crops,item_crops,year,item_trades,area_harvested,production,yield,export_q,export_v,import_q,import_v,hs12_code,parent_group,child_group,parent_description,child_description,ISO3 Code
0,1,Armenia,Wheat,1992,Wheat,65500.0,141483.0,21600.0,,,400000.0,60000.0,100111,10,1001,Cereals,Wheat and meslin,ARM
1,1,Armenia,Wheat,1993,Wheat,97900.0,217900.0,22257.0,,,408000.0,59000.0,100111,10,1001,Cereals,Wheat and meslin,ARM
2,1,Armenia,Wheat,1993,"Flour, wheat",97900.0,217900.0,22257.0,,,46000.0,9400.0,100111,10,1001,Cereals,Wheat and meslin,ARM
3,1,Armenia,Wheat,1994,Wheat,85697.0,152900.0,17842.0,,,327000.0,52000.0,100111,10,1001,Cereals,Wheat and meslin,ARM
4,1,Armenia,Wheat,1994,"Flour, wheat",85697.0,152900.0,17842.0,,,55000.0,14700.0,100111,10,1001,Cereals,Wheat and meslin,ARM


In [527]:
# Dataset used to analyse the production.
# The same production figure is repeated on every traded-item row of a given
# (area, crop, year), so only the first row of each such triple is kept
# ('first' is the default for `keep`).
df_world_unique = df_world.drop_duplicates(
    subset=['area_crops', 'item_crops', 'year'],
)

In [528]:
# Yearly totals per parent group (the categories are listed below).
df_groups = (
    df_world_unique
    .groupby(['parent_group', 'year'])
    .sum()
    .reset_index()
)

# Wide layout: one row per year, one production column per parent group.
df_groups_norm = (
    df_groups
    .pivot(index='year', columns='parent_group', values='production')
    .reset_index()
)

List of all the main categories : 

- 02 : Meat and edible meat offal
- 04 : Dairy produce; birds' eggs; natural honey; edible products of animal origin, not elsewhere specified or included
- 05 : Animal originated products; not elsewhere specified or included
- 07 : Vegetables and certain roots and tubers; edible
- 08 : Fruit and nuts, edible; peel of citrus fruit or melons  
- 09 : Coffee, tea, mate and spices
- 10 : Cereals
- 11 : Products of the milling industry; malt, starches, inulin, wheat gluten
- 12 : Oil seeds and oleaginous fruits; miscellaneous grains, seeds and fruit, industrial or medicinal plants; straw and fodder
- 14 : Vegetable plaiting materials; vegetable products not elsewhere specified or included
- 15 : Animal or vegetable fats and oils and their cleavage products; prepared animal fats; animal or vegetable waxes
- 16 : Meat, fish or crustaceans, molluscs or other aquatic invertebrates; preparations thereof
- 17 : Sugars and sugar confectionery
- 18 : Cocoa and cocoa preparations
- 19 : Preparations of cereals, flour, starch or milk; pastrycooks' products
- 20 : Preparations of vegetables, fruit, nuts or other parts of plants
- 21 : Miscellaneous edible preparations
- 22 : Beverages, spirits and vinegar 
- 23 : Food industries, residues and wastes thereof; prepared animal fodder
- 24 : Tobacco and manufactured tobacco substitutes
- 38 : Chemical products n.e.c.
- 40 : Rubber and articles thereof
- 41 : Raw hides and skins (other than furskins) and leather
- 50 : Silk
- 51 : Wool, fine or coarse animal hair; horsehair yarn and woven fabric
- 52 : Cotton
- 53 : Vegetable textile fibres; paper yarn and woven fabrics of paper yarn

In [529]:
# World population table, ordered chronologically and re-indexed from 0
# so that position 0 is the earliest year.
WorldPopulation = (
    pd.read_excel('data/WorldPopulation.xlsx')
    .sort_values(by='Year')
    .reset_index(drop=True)
)

**Plot the evolution of the production for each group of crops and the world population:**

In [530]:
# Wide production table used for plotting: one row per year, one column per parent group.
df_groups_px = (
    df_groups
    .pivot(index='year', columns='parent_group', values='production')
    .reset_index()
)

In [531]:
# (parent group code, legend label) pairs for the crop categories that are plotted.
# The two public lists are derived from these pairs so that position i of one
# always corresponds to position i of the other.
_plotted_groups = [
    ('07', 'Vegetables'),
    ('08', 'Fruit and nuts'),
    ('09', 'Coffee, tea, mate'),
    ('10', 'Cereals'),
    ('12', 'Oil seeds'),
    ('18', 'Cocoa'),
    ('24', 'Tobacco'),
    ('40', 'Rubber'),
    ('53', 'Vegetable textile fibres'),
]
parent_group_legend = [label for _, label in _plotted_groups]
list_parent_group = [code for code, _ in _plotted_groups]

In [532]:
# Production per crop category (primary y-axis) together with the world population
# (secondary y-axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Walk the group codes and their labels together so that every legend entry names the
# column it actually plots. Picking the column by position in
# df_groups.parent_group.unique() only matches the labels when the data contains exactly
# the groups of list_parent_group, in that order.
for group_code, group_label in zip(list_parent_group, parent_group_legend):
    fig.add_trace(
        go.Scatter(
            x=df_groups_px.year,
            y=df_groups_px.loc[:, group_code],
            name=group_label),
        secondary_y=False
    )

fig.add_trace(
    go.Scatter(
        x=WorldPopulation.Year,
        y=WorldPopulation.WorldPopulation,
        name="World Population"),
    secondary_y=True)

fig.update_layout(
    title="Production of Crops per Category",
    xaxis_title="Year",
    legend=go.layout.Legend(
        font=dict(
            size=10)
))

# Both y-axes are given the same numeric range [0, 8e9]; the axis titles are set here
# only (a y-axis title set in update_layout would be overwritten by these calls).
fig.update_yaxes(title_text="Production [tonnes]", range=[0, 8e9], secondary_y=False)
fig.update_yaxes(title_text="World Population [people]", range=[0, 8e9], secondary_y=True)

fig.show()

In [533]:
# Save the figure in html format for the datastory.
# The call returns the name of the written file (displayed as the cell output).
plotly.offline.plot(fig,filename='AllProds.html')

'AllProds.html'

**Plot the indexed evolution (first year = 100) of the production for each group of crops and of the world population:**

In [534]:
# Indexed production: every plotted group is divided by its own value in the
# first row (index label 0) of the wide production table.
df_groups_px_norm = df_groups_norm[list_parent_group].div(
    df_groups_norm[list_parent_group].loc[0],
    axis='columns',
)

In [535]:
# Indexed world population: each value divided by the value at index label 0
# (the earliest year, since the table was sorted by Year and re-indexed).
WorldPopulation_norm = (
    WorldPopulation['WorldPopulation'] / WorldPopulation['WorldPopulation'].loc[0]
)

In [536]:
# Indexed production (first year = 100) of every plotted crop category, together with
# the indexed world population.
fig = go.Figure()

# Codes and labels are iterated together so each legend entry names the column it plots.
for group_code, group_label in zip(list_parent_group, parent_group_legend):
    fig.add_trace(go.Scatter(
        x=df_groups_px.year,
        y=100*(df_groups_px_norm.loc[:, group_code]),
        mode='lines',
        showlegend=True,
        name=group_label,
    ))

# The population trace gets its own label (it used to reuse the last crop label through
# the leaked loop variable) and is plotted against its own years, so x and y always have
# the same length.
fig.add_trace(go.Scatter(
        x=WorldPopulation.Year,
        y=100*WorldPopulation_norm,
        mode='lines',
        showlegend=True,
        name="World Population",
    ))

fig.update_layout(
    title="Indexed Production of Crops per Category and world population",
    xaxis_title="Year",
    yaxis_title="Indexed production or world population[%]",
    legend=go.layout.Legend(
        font=dict(
            size=10)
))
fig.show()

In [537]:
# Save the figure as a standalone html file for the data story.
# The call returns the name of the written file (displayed as the cell output).
plotly.offline.plot(fig,filename='IndexedProds.html')

'IndexedProds.html'

**Scatter plot for each crop :**

In [538]:
# 2016 snapshots of the world data.
# All trade rows are kept for the value-per-quantity (VPQ) computation ...
df_year = df_world.loc[df_world['year'] == 2016]

# ... while the de-duplicated rows are used for the production totals.
df_year_unique = df_world_unique.loc[df_world_unique['year'] == 2016]

In [539]:
# Per-crop 2016 totals used for the VPQ:
# import value-per-quantity = summed import value / summed import quantity.
df_crops_prod = (
    df_year
    .groupby(['parent_group', 'parent_description', 'item_crops'])
    .sum()
    .reset_index()
    .assign(import_vpq=lambda crops: crops.import_v / crops.import_q)
)

In [540]:
# Per-crop 2016 totals computed on the de-duplicated rows; only the production
# column of this frame is meaningful downstream.
df_crops_prod_unique = (
    df_year_unique
    .groupby(['parent_group', 'parent_description', 'item_crops'])
    .sum()
    .reset_index()
)

In [541]:
# Crops singled out in the data story.
list_crops = [
    'Avocados',
    'Coffee, green',
    'Tobacco, unmanufactured',
    'Wheat',
]

In [542]:
# Scatter of the 2016 production against the import value-per-quantity (VPQ):
# one marker per crop, one trace (colour) per parent group.

# Production and VPQ live in two separately aggregated frames. Join them on their
# grouping keys so that each marker pairs the production and the VPQ of the same crop,
# instead of relying on both frames listing the crops in the same order.
crop_keys = ['parent_group', 'parent_description', 'item_crops']
df_crops_scatter = df_crops_prod_unique[crop_keys + ['production']].merge(
    df_crops_prod[crop_keys + ['import_vpq']],
    on=crop_keys,
)

fig = go.Figure()
# Codes and labels are iterated together so each legend entry names the group it plots.
for group_code, group_label in zip(list_parent_group, parent_group_legend):
    group_rows = df_crops_scatter[df_crops_scatter.parent_group == group_code]
    fig.add_trace(go.Scatter(
        x=group_rows.production,
        y=group_rows.import_vpq,
        mode='markers',
        text=group_rows.item_crops,
        name=group_label
    ))

# Transform the axis in log
fig.update_xaxes(type="log")
fig.update_yaxes(type="log")

# Annotate the crops of the data story. The coordinates are read from the plotted data
# (rather than typed in by hand) so the labels stay on their markers if the data change.
# On a log axis plotly expects annotation coordinates as log10 of the data value.
for crop in list_crops:
    crop_rows = df_crops_scatter[df_crops_scatter.item_crops == crop]
    if crop_rows.empty:
        continue  # crop absent from the 2016 data: nothing to point at
    crop_production = crop_rows.production.iloc[0]
    crop_vpq = crop_rows.import_vpq.iloc[0]
    if not (crop_production > 0 and crop_vpq > 0):
        continue  # NaN or non-positive values cannot be placed on a log axis
    fig.add_annotation(
        go.layout.Annotation(
                x=math.log10(crop_production),
                y=math.log10(crop_vpq),
                text=crop)
    )

# The VPQ unit matches the one used by the other VPQ figures of this notebook.
fig.update_layout(
    title="Worldwide Production in 2016 ",
    xaxis_title="Production [tonnes]",
    yaxis_title="VPQ [1000$/tonne]",
    legend=go.layout.Legend(
        font=dict(
            size=10)
))
fig.show()

In [543]:
# Save the figure as a standalone html file for the data story.
# The call returns the name of the written file (displayed as the cell output).
plotly.offline.plot(fig,filename='WorldwideProds.html')

'WorldwideProds.html'

## Data Story Wheat

In [544]:
# Load the wheat-specific dataset.
# NOTE: unpickling can execute arbitrary code - only load pickle files produced by this project.
df_wheat = pd.read_pickle("data/wheat_df.pkl")

In [545]:
# Wheat production table: rows without a production value or a country are dropped,
# then a single row is kept per (country, year).
df_wheat_production = (
    df_wheat
    .dropna(subset=['production', 'Country'])
    .drop_duplicates(subset=['Country', 'year'])
)

In [546]:
# Yearly wheat production summed over all countries.
# Sanity check: the 2016 value should equal the wheat production shown in the 2016
# scatter plot, which confirms that this dataset is consistent with the merged one.
# The yearly totals are computed once and reused for both axes.
wheat_production_by_year = df_wheat_production.groupby(['year']).sum().reset_index()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=wheat_production_by_year.year,
    y=wheat_production_by_year.production,
    mode='lines',
    showlegend=True,
    name="Wheat Production",
    ))
fig.update_layout(
    title="Wheat Production per Year",
    xaxis_title="Year",
    yaxis_title="Production [tonnes]")
# Without this call the figure was built but never displayed.
fig.show()

**Time series plot of the import and export value per quantity (VPQ):**

In [547]:
# Yearly wheat trade totals (rows without a country are ignored) and the resulting
# value-per-quantity ratios: summed value divided by summed quantity, per year.
df_wheat_trades_vpq = (
    df_wheat
    .dropna(subset=['Country'])
    .groupby(['year'])
    .sum()
    .reset_index()
    .assign(
        import_vpq=lambda trades: trades.import_v / trades.import_q,
        export_vpq=lambda trades: trades.export_v / trades.export_q,
    )
)
df_wheat_trades_vpq.head()

Unnamed: 0,year,area_code,area_harvested,production,yield,export_q,export_v,import_q,import_v,import_vpq,export_vpq
0,1961.0,109028.0,816837800.0,889428900.0,5190800.0,45766402.0,3007811.0,45537308.0,3354361.0,0.073662,0.065721
1,1962.0,109028.0,830218072.0,1001277000.0,5542520.0,44480428.0,3019104.0,41321775.0,3155407.0,0.076362,0.067875
2,1963.0,109028.0,824520960.0,933357900.0,5345416.0,49214725.0,3305830.0,49545739.0,3700308.0,0.074685,0.067172
3,1964.0,109028.0,866156708.0,1075167000.0,5745152.0,58831852.0,3967881.0,55352964.0,4270788.0,0.077156,0.067444
4,1965.0,109028.0,867919048.0,1054584000.0,5936720.0,56153716.0,3550790.0,56172880.0,4108793.0,0.073145,0.063233


In [549]:
# Import and export VPQ of wheat over time, one line per flow direction.
fig = go.Figure()
for vpq_column, trace_label in (('import_vpq', "Import VPQ"), ('export_vpq', "Export VPQ")):
    fig.add_trace(
        go.Scatter(
            x=df_wheat_trades_vpq['year'],
            y=df_wheat_trades_vpq[vpq_column],
            mode='lines',
            showlegend=True,
            name=trace_label,
        )
    )

# Figure and axis titles
fig.update_layout(
    title="Import and Export Value per Quantity for the Wheat",
    xaxis_title="Year",
    yaxis_title="VPQ [1000$/tonne]",
)
fig.show()

In [550]:
# Save the figure as a standalone html file for the data story.
# The call returns the name of the written file (displayed as the cell output).
plotly.offline.plot(fig,filename='ImportExportVPQ.html')

'ImportExportVPQ.html'

**Time series plot of the import and export VPQ and the oil price:**

In [551]:
# Yearly crude oil price [$ per barrel].
# Source: https://fr.statista.com/statistiques/564926/prix-annuel-du-petrole-de-l-opep-1960/
# NOTE(review): presumably one value per row of df_wheat_trades_vpq (whose years start in
# 1961) - confirm that both have the same length before plotting them together.
#
# The last entry used to be 0, a placeholder for a year without a price. A price of 0 is
# not a real observation: it draws a crash to zero in the plot and can enter the Pearson
# correlation as a genuine data point. NaN leaves a gap in the plot and is excluded by
# pandas' Series.corr.
oil_price = [
    1.57, 1.52, 1.5, 1.45, 1.42, 1.36, 1.33, 1.32, 1.27, 1.21,
    1.7, 1.82, 2.7, 11, 10.43, 11.6, 12.5, 12.79, 29.19, 35.52,
    34, 32.38, 29.04, 28.2, 27.01, 13.53, 17.73, 14.24, 17.31, 22.26,
    18.62, 18.44, 16.33, 15.53, 16.86, 20.29, 18.86, 12.28, 17.44, 27.6,
    23.12, 24.36, 28.1, 36.05, 50.59, 61, 69.04, 94.1, 60.86, 77.38,
    107.46, 109.45, 105.87, 96.29, 49.49, 40.68, 52.52, np.nan,
]

In [552]:
# Wheat VPQ (primary y-axis) against the crude oil price (secondary y-axis).
fig = make_subplots(specs=[[{"secondary_y": True}]])

# (y values, legend label, drawn on the secondary axis?) - in drawing order
vpq_oil_traces = [
    (df_wheat_trades_vpq.export_vpq, "Export VPQ", False),
    (df_wheat_trades_vpq.import_vpq, "Import VPQ", False),
    (oil_price, "Oil Price", True),
]
for trace_values, trace_label, on_secondary_axis in vpq_oil_traces:
    fig.add_trace(
        go.Scatter(x=df_wheat_trades_vpq.year, y=trace_values, name=trace_label),
        secondary_y=on_secondary_axis,
    )

fig.update_layout(title="Import and Export VPQ and Crude oil price per baril")

# Axis titles: shared x-axis, then one title per y-axis
fig.update_xaxes(title_text="Year")
fig.update_yaxes(title_text="VPQ [1000$/tonne]", secondary_y=False)
fig.update_yaxes(title_text="Oil price [$/baril]", secondary_y=True)

fig.show()

In [553]:
# Save the figure as a standalone html file for the data story.
# The call returns the name of the written file (displayed as the cell output).
plotly.offline.plot(fig,filename='ImportExportVPQOil.html')

'ImportExportVPQOil.html'

**Pearson correlation coefficient between the oil price and the VPQ**

In [554]:
# Pearson correlation between the oil price and the export VPQ.
# Both series carry a 0..n-1 integer index, so they are paired row by row.
pd.Series(oil_price).corr(df_wheat_trades_vpq['export_vpq'], method='pearson')

0.9086332872456037

In [555]:
# Pearson correlation between the oil price and the import VPQ.
# Both series carry a 0..n-1 integer index, so they are paired row by row.
pd.Series(oil_price).corr(df_wheat_trades_vpq['import_vpq'], method='pearson')

0.9085604354736667

**Horizontal bar plot displaying the biggest importers and exporters (the biggest actors in the wheat market)**

In [556]:
# List the distinct traded items derived from wheat.
df_wheat['item_trades'].unique()

array(['Bran, wheat', 'Bulgur', 'Flour, wheat', 'Wheat'], dtype=object)

In [557]:
# Wheat trades in 2016, summed per country and traded item.
df_wheat_trades = (
    df_wheat.loc[df_wheat['year'] == 2016]
    .groupby(['Country', 'item_trades'])
    .sum()
    .reset_index()
)

In [558]:
# Preview of the 2016 wheat trades: one row per (country, traded item).
df_wheat_trades.head()

Unnamed: 0,Country,item_trades,area_code,year,area_harvested,production,yield,export_q,export_v,import_q,import_v
0,Afghanistan,"Bran, wheat",2.0,2016.0,2300210.0,4555110.0,19803.0,89.0,43.0,236.0,25.0
1,Afghanistan,Bulgur,2.0,2016.0,2300210.0,4555110.0,19803.0,0.0,0.0,0.0,0.0
2,Afghanistan,"Flour, wheat",2.0,2016.0,2300210.0,4555110.0,19803.0,0.0,0.0,2056484.0,563561.0
3,Afghanistan,Wheat,2.0,2016.0,2300210.0,4555110.0,19803.0,0.0,0.0,258473.0,44738.0
4,Albania,"Bran, wheat",3.0,2016.0,70512.0,275000.0,39000.0,0.0,0.0,25010.0,3244.0


In [559]:
# The 50 biggest "players": countries ranked by their total traded quantity
# (imports + exports, all wheat items together).
df_big_players = (
    df_wheat_trades
    .groupby(['Country'])
    .sum()
    .reset_index()
    .assign(tot_fluxes=lambda totals: totals.import_q + totals.export_q)
    .sort_values(by='tot_fluxes', ascending=False)
    .head(50)
)

In [560]:
# Restrict the 2016 wheat trades to the countries of interest (the biggest players).
df_wheat_trades = df_wheat_trades.loc[
    df_wheat_trades['Country'].isin(df_big_players.Country.values)
]

In [561]:
# One row per country; two-level columns: (import_q | export_q, traded item).
df_import_export = df_wheat_trades.pivot(
    index='Country',
    columns='item_trades',
    values=['import_q', 'export_q'],
)

In [562]:
# Flatten the two-level column index into "<flow> <item>" strings ...
df_import_export.columns = [
    ' '.join(column_levels).strip() for column_levels in df_import_export.columns.values
]

# ... then rename the flattened columns according to our nomenclature.
wheat_column_names = {
    'import_q Bran, wheat': 'import_q_bran',
    'import_q Bulgur': 'import_q_bulgur',
    'import_q Flour, wheat': 'import_q_flour_wheat',
    'import_q Wheat': 'import_q_wheat',
    'export_q Bran, wheat': 'export_q_bran',
    'export_q Bulgur': 'export_q_bulgur',
    'export_q Flour, wheat': 'export_q_flour_wheat',
    'export_q Wheat': 'export_q_wheat',
}
df_import_export = df_import_export.rename(columns=wheat_column_names)

In [563]:
# Exports are out-flows: flip their sign so they extend to the left of zero in the bar plot.
# NOTE: this cell is not idempotent - running it twice flips the sign back.
for export_column in ('export_q_flour_wheat', 'export_q_wheat',
                      'export_q_bulgur', 'export_q_bran'):
    df_import_export.loc[:, export_column] = -1 * df_import_export.loc[:, export_column]

# Turn the Country index back into a regular column.
df_import_export = df_import_export.reset_index()

In [564]:
# Horizontal bars per country: one bar segment per (flow, wheat item).
# (legend label, column) pairs, in legend order
wheat_bars = [
    ('Import Q Bran', 'import_q_bran'),
    ('Import Q Bulgur', 'import_q_bulgur'),
    ('Import Q Flour Wheat', 'import_q_flour_wheat'),
    ('Import Q Wheat', 'import_q_wheat'),
    ('Export Q Flour Wheat', 'export_q_flour_wheat'),
    ('Export Q Wheat', 'export_q_wheat'),
    ('Export Q Bulgur', 'export_q_bulgur'),
    ('Export Q bran', 'export_q_bran'),
]
fig = go.Figure(data=[
    go.Bar(name=bar_label, y=df_import_export['Country'],
           x=df_import_export[bar_column], orientation='h')
    for bar_label, bar_column in wheat_bars
])
# 'relative' bar mode stacks positive values to the right of zero and negative ones to the left
fig.update_layout(
    barmode='relative',
    xaxis_title="Export/Import Quantities [tonnes] (Negative values represents exportation)",
)
fig.show()

In [565]:
# Save the figure as a standalone html file for the data story.
# The call returns the name of the written file (displayed as the cell output).
plotly.offline.plot(fig,filename='ImportExportWheat.html')

'ImportExportWheat.html'

## Coffee

In [566]:
# Load the coffee-specific dataset and drop the rows that have no country.
# NOTE: unpickling can execute arbitrary code - only load pickle files produced by this project.
df_coffee = (
    pd.read_pickle("data/coffee_df.pkl")
    .dropna(subset=['Country'])
)

In [567]:
# List the distinct traded items derived from coffee.
df_coffee['item_trades'].unique()

array(['Coffee, extracts', 'Coffee, green', 'Coffee, husks and skins',
       'Coffee, roasted', 'Coffee, substitutes containing coffee'],
      dtype=object)

In [568]:
# Coffee trades in 2016, summed per country and traded item.
df_coffee_trades = (
    df_coffee.loc[df_coffee['year'] == 2016]
    .groupby(['Country', 'item_trades'])
    .sum()
    .reset_index()
)

In [569]:
# The 50 biggest "players": countries ranked by their total traded quantity
# (imports + exports, all coffee items together).
df_big_players = (
    df_coffee_trades
    .groupby(['Country'])
    .sum()
    .reset_index()
    .assign(tot_fluxes=lambda totals: totals.import_q + totals.export_q)
    .sort_values(by='tot_fluxes', ascending=False)
    .head(50)
)

In [570]:
# Restrict the 2016 coffee trades to the countries of interest (the biggest players).
df_coffee_trades = df_coffee_trades.loc[
    df_coffee_trades['Country'].isin(df_big_players.Country.values)
]

In [571]:
# One row per country; two-level columns: (import_q | export_q, traded item).
df_import_export = df_coffee_trades.pivot(
    index='Country',
    columns='item_trades',
    values=['import_q', 'export_q'],
)

In [572]:
# Flatten the two-level column index into "<flow> <item>" strings ...
df_import_export.columns = [
    ' '.join(column_levels).strip() for column_levels in df_import_export.columns.values
]

# ... then rename the flattened columns according to our nomenclature.
coffee_column_names = {
    'import_q Coffee, extracts': 'import_q_coffee_extracts',
    'import_q Coffee, green': 'import_q_coffee_green',
    'import_q Coffee, husks and skins': 'import_q_coffee_husks_and_skins',
    'import_q Coffee, roasted': 'import_q_coffee_roasted',
    'import_q Coffee, substitutes containing coffee': 'import_q_coffee_substitutes_containing_coffee',
    'export_q Coffee, extracts': 'export_q_coffee_extracts',
    'export_q Coffee, green': 'export_q_coffee_green',
    'export_q Coffee, husks and skins': 'export_q_coffee_husks_and_skins',
    'export_q Coffee, roasted': 'export_q_coffee_roasted',
    'export_q Coffee, substitutes containing coffee': 'export_q_coffee_substitutes_containing_coffee',
}
df_import_export = df_import_export.rename(columns=coffee_column_names)

In [573]:
# Exports are out-flows: flip their sign so they extend to the left of zero in the bar plot.
# NOTE: this cell is not idempotent - running it twice flips the sign back.
for export_column in ('export_q_coffee_extracts',
                      'export_q_coffee_green',
                      'export_q_coffee_husks_and_skins',
                      'export_q_coffee_roasted',
                      'export_q_coffee_substitutes_containing_coffee'):
    df_import_export.loc[:, export_column] = -1 * df_import_export.loc[:, export_column]

# Turn the Country index back into a regular column.
df_import_export = df_import_export.reset_index()

In [574]:
# Horizontal bars per country: one bar segment per (flow, coffee item).
# (legend label, column) pairs, in legend order
coffee_bars = [
    ('Import Q Coffee, extracts', 'import_q_coffee_extracts'),
    ('Import Q Coffee, green', 'import_q_coffee_green'),
    ('Import Q Coffee, husks and skins', 'import_q_coffee_husks_and_skins'),
    ('Import Q Coffee, roasted', 'import_q_coffee_roasted'),
    ('Import Q Coffee, substitutes containing coffee', 'import_q_coffee_substitutes_containing_coffee'),
    ('Export Q Coffee, extracts', 'export_q_coffee_extracts'),
    ('Export Q Coffee, green', 'export_q_coffee_green'),
    ('Export Q Coffee, husks and skins', 'export_q_coffee_husks_and_skins'),
    ('Export Q Coffee, roasted', 'export_q_coffee_roasted'),
    ('Export Q Coffee, substitutes containing coffee', 'export_q_coffee_substitutes_containing_coffee'),
]
fig = go.Figure(data=[
    go.Bar(name=bar_label, y=df_import_export['Country'],
           x=df_import_export[bar_column], orientation='h')
    for bar_label, bar_column in coffee_bars
])
# 'relative' bar mode stacks positive values to the right of zero and negative ones to the left
fig.update_layout(
    barmode='relative',
    xaxis_title="Export/Import Quantities [tonnes] (Negative values represents exportation)",
)
fig.show()

In [575]:
# Save the figure as a standalone html file for the data story.
# The call returns the name of the written file (displayed as the cell output).
plotly.offline.plot(fig,filename='ImportExportCoffee.html')

'ImportExportCoffee.html'