<a href="https://colab.research.google.com/github/emoceanographer/WWF/blob/master/FAO_commodity_shifts_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [243]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd
from scipy import stats
import numpy as np


In [0]:
def unique(long_list):
  unique_list = []
  for element in long_list: 
    if element not in unique_list:
      unique_list.append(element)
  return unique_list

def max_change(value,year):
  max_value = max(value)
  max_index = value.index(max_value)
  return([max_value, year[max_index]])

def change_calc(value,year_offset):
  """Calculates the difference between elements 'year_offset' apart within a list"""
  change_value = []
  for i in range(year_offset,len(value)): 
    change_value.append(value[i] - value[i-year_offset])
  return change_value

def fv_mean(listA):
  output = []
  listA = list(listA)
  for item in listA:
    if len(item)>0:
      output.append(item[0])
  if len(output)>0:
    mean_output = stat.mean(output)
  else:
    mean_output = output
  return mean_output

In [0]:
def country_all_analysis(country_df, domain):
  """Country data frame is expecting an FAO data set for a country; 'domain'
  should be the "element" we are interested in"""
  
  if country_df['Element'].str.contains(domain).any():
    data = country_df[country_df['Element'].str.contains(domain)]

    slope,t,t,t,t = stats.linregress(data['Year'], data['Value'])
    value = list(data['Value'])

    change_1yr = change_calc(value,1)
    max_1yr = max_change(change_1yr, list(data['Year'][1:]))
    change_5yr = change_calc(value,5) 
    max_5yr = max_change(change_5yr, list(data['Year'][5:]))

    output = {'Slope'+'_'+domain: slope, 'Delta1yr'+'_'+domain: max_1yr, 'Delta5yr'+'_'+domain: max_5yr }
  else:
    output = {'Slope'+'_'+domain: [], 'Delta1yr'+'_'+domain: [], 'Delta5yr'+'_'+domain: [] }
  return output

In [0]:
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
rainbow = cm.get_cmap('rainbow',12)

def plot_color(x,y,t,x_label,y_label,title):
  """Plots x vs y with colors set by t; labels also entered as inputs"""
  fig = plt.figure(figsize = (8,8))
  ax = fig.add_subplot(1,1,1) 
  ax.set_xlabel(x_label, fontsize = 15)
  ax.set_ylabel(y_label, fontsize = 15) # grab units from sheet
  ax.set_title(title, fontsize = 20) 
  # grab the country name and commodity from the data
  
  #rainbow = cm.get_cmap('rainbow',len(t))
  #for i in range(0,len(x)): # goes through each element
   # ax.scatter(x[i], y[i], c = rainbow[i])
  plt.scatter(x, y, c = t, cmap = 'rainbow')
  plt.show()
  
def plot_simple(x,y,x_label, y_label, title): 
  fig = plt.figure(figsize = (4,4))
  ax = fig.add_subplot(1,1,1) 
  ax.set_xlabel(x_label, fontsize = 10)
  ax.set_ylabel(y_label, fontsize = 10) # grab units from sheet
  ax.set_title(title, fontsize = 10) 
  
  plt.scatter(x,y)
  plt.show()
  
# plots, if desired
def ctry_plot(country,df,df_gdp):
  country_ID = df['Country'].str.contains(country)
  country_data = df[country_ID] # this seems to grab only the "true" values from the above
  commodity_data = country_data[country_data['Item'].str.contains(commodity1)]

  country_ID = df_gdp['Area'].str.contains(country) # grabs same country from GDP data
  country_data = df_gdp[country_ID]
  gdp_data = country_data[country_data['Item'].str.contains(commodity2)]

  list1 = [] # commodity change
  list2 = [] # GDP change
  list3 = [] # year
  for year in unique(list(commodity_data['Year'])):

    if gdp_data['Year'].astype(str).str.contains(str(year)).any(): # if that year exists
      yr_data1 = commodity_data[commodity_data['Year'].astype(str).str.contains(str(year))]
      yr_data2 = gdp_data[gdp_data['Year'].astype(str).str.contains(str(year))]
      list1.append(yr_data1['Value'].values[0])
      list2.append(yr_data2['Value'].values[0])
      list3.append(year)

  if list1:
    slope,intercept,r_value,p_value,t = stats.linregress(list2,list1)

    plot_color(list2,list1,list3,'GDP','protein supply (g/capita/day)',country)
    plot_simple(list3,list1, 'time', 'protein supply',country)
    plot_simple(list3,list2, 'time', 'GDP',country)

In [0]:
def slopes_counter(slopes,pvals):
  """Treats NaN entries as nonexistant; proportion is from those with data"""
  length = sum(1 for x in slopes if not np.isnan(x))
  prop_pos = sum(1 for x in slopes if x > 0) / length # gets proportion
  counter = 0
  for i in range(0,len(slopes)):
    if slopes[i] > 0 and pvals[i]<0.05:
      counter = counter + 1

  prop_pos_sig = counter / length
  return prop_pos, prop_pos_sig

In [0]:
def country_slope_calcs(commodity, GDPtype):
  """for a particular commodity as defined in FAO's data, goes through each country
  and regresses the GDP (either full or per capita) versus the protein supply quantity 
  of that commodity """
  
  country_dict = {}
  country_dict = {}

  commodity1 = commodity
  commodity2 = GDPtype

  for country in country_list: 
    country_ID = df['Country'].str.contains(country)
    country_data = df[country_ID] # this seems to grab only the "true" values from the above
    commodity_data = country_data[country_data['Item'].str.contains(commodity1)]

    country_ID = df_gdp['Area'].str.contains(country) # grabs same country from GDP data
    country_data = df_gdp[country_ID]
    gdp_data = country_data[country_data['Item'].str.contains(commodity2)]

    list1 = [] # commodity change
    list2 = [] # GDP change
    list3 = [] # year
    for year in unique(list(commodity_data['Year'])):

      if gdp_data['Year'].astype(str).str.contains(str(year)).any(): # if that year exists
        yr_data1 = commodity_data[commodity_data['Year'].astype(str).str.contains(str(year))]
        yr_data2 = gdp_data[gdp_data['Year'].astype(str).str.contains(str(year))]
        list1.append(yr_data1['Value'].values[0])
        list2.append(yr_data2['Value'].values[0])
        list3.append(year)

    if list1:
      slope,intercept,r_value,p_value,t = stats.linregress(list2,list1)
      country_dict[country] = {'Slope': slope, 'Intercept': intercept, 'r^2': r_value, 'p': p_value}

    else:
      country_dict[country] = {'Slope': float('nan'), 'Intercept': float('nan'), 'r^2': float('nan'), 'p': float('nan')}
  return country_dict

In [0]:
# Choose a data source
path = "gdrive/My Drive/Colab Notebooks/FAOSTAT_data_6-18-2019.csv" # milk, poultry
path_meattype = "gdrive/My Drive/Colab Notebooks/FAOSTAT_data_7-1-2019.csv" # meat by type
path_meatall = "gdrive/My Drive/Colab Notebooks/FAOSTAT_data_meat.csv" # meat total
path_gdp = "gdrive/My Drive/Colab Notebooks/FAOSTAT_data_gdp.csv" # fao gdp data over time
path_countrycats = "gdrive/My Drive/Colab Notebooks/country_classes.csv"

In [0]:
# Run all meat version

# Creates a data frame from the data in 'path' and extracts the countries of interest
df = pd.read_csv(path)
df_gdp = pd.read_csv(path_gdp) # loads gdp data
country_list = unique(list(df['Country']))

# Run all meat version

In [0]:
# Creates a data frame from the data in 'path' and extracts the countries of interest
df = pd.read_csv(path_meatall)
df_gdp = pd.read_csv(path_gdp) # loads gdp data
country_list = unique(list(df['Country']))

In [276]:
# Runs the analysis and saves the meat version
commodity = 'Meat'
country_dict = country_slope_calcs('Meat', 'Gross Domestic Product') # capitalization matters!
countrydf = pd.DataFrame.from_dict(country_dict, orient = 'index')
savepath = 'gdrive/My Drive/Colab Notebooks/' + commodity + '_GDP_slopes.csv' # hopefully will always save 
# to the correct commodity
countrydf.to_csv(savepath)

  del sys.path[0]


In [0]:
# plots by country 
ctry_plot('Australia',df,df_gdp)

In [279]:
# Let's run some stats on these GDP / consumption relationships!
# Proportion of countries with positive slope:

slopes = countrydf.Slope.astype(float) # makes numbers not strings
pvals = countrydf.p.astype(float)
[pp,sig] = slopes_counter(slopes,pvals)

#prop_pos = sum(1 for x in slopes if x > 0) / len(slopes) # gets proportion

#counter = 0
#for i in range(0,len(slopes)):
 # if slopes[i] > 0 and pvals[i]<0.05:
  #  counter = counter+1

#prop_pos_sig = counter / len(slopes)
print(commodity)
print('Proportion positive:', pp)
print('Proportion positive and significant:', sig)

Meat
Proportion positive: 0.7595628415300546
Proportion positive and significant: 0.6885245901639344


# Run by commodity version

In [0]:
# Creates a data frame from the data in 'path' and extracts the countries of interest
df = pd.read_csv(path_meattype)
df_gdp = pd.read_csv(path_gdp) # loads gdp data
country_list = unique(list(df['Country']))

In [285]:
# Runs the analysis for each commodity
commodity_list = unique(list(df['Item']))

for commodity in commodity_list:
  country_dict = country_slope_calcs(commodity, 'Gross Domestic Product') # capitalization matters!
  countrydf = pd.DataFrame.from_dict(country_dict, orient = 'index')
  savepath = 'gdrive/My Drive/Colab Notebooks/' + commodity + '_GDP_slopes.csv' # hopefully will always save 
    # to the correct commodity
  countrydf.to_csv(savepath)

  del sys.path[0]


In [0]:
## Group countries in regions and calculate aggregate statistics
categorydf = pd.read_csv(path_countrycats)

In [0]:
meat_path = 'gdrive/My Drive/Colab Notebooks/meat_totalgdp_slopes.csv'
bovine_path = 'gdrive/My Drive/Colab Notebooks/bovine_totalgdp_slopes.csv'
mutton_path = 'gdrive/My Drive/Colab Notebooks/muttongoat_totalgdp_slopes.csv'
poultry_path = 'gdrive/My Drive/Colab Notebooks/poultry_totalgdp_slopes.csv'
pig_path = 'gdrive/My Drive/Colab Notebooks/pig_totalgdp_slopes.csv'

In [0]:
income_groups = unique(categorydf['Income group'])
income_groups.remove('x') # removes a spurious 'x' category

In [0]:
import csv
path_list = {'meat': meat_path, 'bovine': bovine_path, 'mutton': mutton_path,
            'poultry': poultry_path, 'pig': pig_path}
income_dict = {}

for group in income_groups: 
  ctrydf = categorydf[categorydf['Income group'].str.contains(group)]
  ctry_list = list(ctrydf.Economy)
  
  data_temp = [] # temporary storage for data
  dict_temp = {}
  for element in path_list:
    with open(path_list[element]) as csvfile:
      csv_reader = csv.reader(csvfile, delimiter=',')
      header = next(csv_reader)
      header[0] = 'Country' # hard-coded!
      for row in csv_reader: 
        if row[0] in ctry_list:
          data_temp.append(row) # add in that data
    groupdf = pd.DataFrame.from_records(data_temp, columns = header)
    slopes = list(groupdf['Slope'])
    slopes = [float(x) for x in slopes if x]
  
    pvals = list(groupdf['p'])
    pvals = [float(x) for x in pvals if x]

    #prop_pos = sum(1 for x in slopes if x > 0) / len(slopes)
    [prop_pos,prop_pos_sign] = slopes_counter(slopes,pvals)
  
    dict_temp.update({element: {'pp': prop_pos, 'sig': prop_pos_sign}})
  income_dict[group] = dict_temp
        
        
    

In [0]:
income_gdpmeat = pd.DataFrame.from_dict(income_dict)
income_gdpmeat.to_csv('gdrive/My Drive/Colab Notebooks/trends_by_income.csv')


In [262]:
income_dict

{'High income': {'bovine': {'pp': 0.5212765957446809,
   'sig': 0.5106382978723404},
  'meat': {'pp': 0.723404255319149, 'sig': 0.7021276595744681},
  'mutton': {'pp': 0.524822695035461, 'sig': 0.48226950354609927},
  'pig': {'pp': 0.6681034482758621, 'sig': 0.6077586206896551},
  'poultry': {'pp': 0.6436170212765957, 'sig': 0.6063829787234043}},
 'Low income': {'bovine': {'pp': 0.6739130434782609,
   'sig': 0.5217391304347826},
  'meat': {'pp': 0.8260869565217391, 'sig': 0.6521739130434783},
  'mutton': {'pp': 0.6666666666666666, 'sig': 0.5362318840579711},
  'pig': {'pp': 0.6842105263157895, 'sig': 0.5701754385964912},
  'poultry': {'pp': 0.6956521739130435, 'sig': 0.5652173913043478}},
 'Lower middle income': {'bovine': {'pp': 0.6060606060606061,
   'sig': 0.4696969696969697},
  'meat': {'pp': 0.7575757575757576, 'sig': 0.6060606060606061},
  'mutton': {'pp': 0.5858585858585859, 'sig': 0.45454545454545453},
  'pig': {'pp': 0.6441717791411042, 'sig': 0.5337423312883436},
  'poultry':