# Prepare coffee data

In [None]:
import xarray as xr
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

In [None]:
import functions as fn

In [None]:
# Plot styling comes from the project-local `functions` module; it is applied
# further down through `plt.rc_context(plt_params)`.
plt_params = fn.get_plot_params()

# Growing calendar

In [None]:
# Load the per-country coffee growing calendar (the `FAO` and `species`
# columns are the ones used in the cells below).
growing_calendar = pd.read_csv('/g/data/xv83/dr6273/work/projects/coffee/data/coffee_country_growing_calendar.csv')

In [None]:
# Preview the first rows of the growing calendar.
growing_calendar.head()

### Get country info from ERA5 mask to be consistent

In [None]:
# Open the ERA5 country-mask file and keep only its `country_mask` variable.
era_countries_mask = xr.open_dataset(
    '/g/data/xv83/dr6273/work/projects/coffee/data/era5_country_mask.nc'
)['country_mask']

### Add country code and abbreviation to `growing_calendar`

In [None]:
# Sorted, de-duplicated entries of the `FAO` column, displayed as a quick check.
countries = np.unique(growing_calendar['FAO'])
countries

In [None]:
# Look up the ERA5 mask's region code and abbreviation for every row of the
# growing calendar, matching on the `FAO` country name.
# Re-indexing the mask by name does not depend on the row, so do it once
# instead of twice per row, and select each row's country only once.
mask_by_name = era_countries_mask.swap_dims({'abbrevs': 'names'})

codes = []
abbrevs = []
for country in growing_calendar['FAO']:
    country_entry = mask_by_name.sel(names=country)
    codes.append(country_entry.region.values.astype('int16'))
    abbrevs.append(country_entry.abbrevs.values)

In [None]:
# Attach the looked-up region codes and abbreviations as two new columns.
growing_calendar = growing_calendar.assign(region=codes, abbrevs=abbrevs)

In [None]:
# Index labels of the rows belonging to each coffee species.
arabica_indices = growing_calendar.index[(growing_calendar['species'] == 'Arabica').to_numpy()]
robusta_indices = growing_calendar.index[(growing_calendar['species'] == 'Robusta').to_numpy()]

In [None]:
# Reorder the rows so that all Arabica entries come first, followed by all
# Robusta entries (rows of any other species are dropped).
# `arabica_indices` / `robusta_indices` hold index *labels*, so select with
# `.loc`; positional `.iloc` only returned the same rows while the index was
# the default 0..n-1 range.
growing_calendar = growing_calendar.loc[np.concatenate([arabica_indices.values, robusta_indices.values])]

In [None]:
# Save the reordered calendar, now including the `region` and `abbrevs`
# columns. `index` is left at its default, so the row index is written too.
growing_calendar.to_csv('/g/data/xv83/dr6273/work/projects/coffee/data/coffee_country_growing_calendar_extended.csv')

# Yield

In [None]:
# Load the Arabica yield table; the first CSV column is used as the row index.
arabica_yield = pd.read_csv('/g/data/xv83/dr6273/work/projects/coffee/data/arabica_yield.csv', index_col=0)

In [None]:
# Arabica rows are not in year order - sort by country, then by year, and
# renumber the rows 0..n-1 afterwards.
arabica_yield = (
    arabica_yield
    .sort_values(['Country', 'Year.x'])
    .reset_index(drop=True)
)

### No Arabica/Robusta split is available for Brazil, so treat all Brazilian yield as Arabica

In [None]:
# Relabel every 'Brazil' row as 'Brazil South'.
# NOTE(review): presumably 'Brazil South' is the name used in the ERA5 country
# mask for the later name lookup - confirm against the mask's `names`.
arabica_yield.loc[(arabica_yield['Country'] == 'Brazil'), 'Country'] = 'Brazil South'

In [None]:
# Look up the ERA5 mask's region code and abbreviation for every Arabica row,
# matching on the country name.
# Re-indexing the mask by name does not depend on the row, so do it once
# instead of twice per row, and select each row's country only once.
mask_by_name = era_countries_mask.swap_dims({'abbrevs': 'names'})

codes = []
abbrevs = []
for country in arabica_yield['Country']:
    country_entry = mask_by_name.sel(names=country)
    codes.append(country_entry.region.values.astype('int16'))
    abbrevs.append(country_entry.abbrevs.values)

In [None]:
# Attach the looked-up region codes and abbreviations as two new columns.
arabica_yield = arabica_yield.assign(region=codes, abbrevs=abbrevs)

In [None]:
# Preview the first rows of the Arabica table, including the new columns.
arabica_yield.head()

In [None]:
# Load the Robusta yield table.
# NOTE(review): unlike the Arabica read, no `index_col` is given here - confirm
# the Robusta CSV does not carry a saved index as its first column.
robusta_yield = pd.read_csv('/g/data/xv83/dr6273/work/projects/coffee/data/robusta_yield.csv')

In [None]:
# Robusta rows are not in year order - sort by country, then by year.
robusta_yield = robusta_yield.sort_values(['Country', 'Year.x'])

In [None]:
# Rename 'Viet Nam' to 'Vietnam'.
# NOTE(review): presumably so the name matches the ERA5 country mask used in
# the lookup below - confirm against the mask's `names`.
robusta_yield.loc[(robusta_yield['Country'] == 'Viet Nam'), 'Country'] = 'Vietnam'

In [None]:
# Renumber the rows 0..n-1 so the index follows the sorted order.
robusta_yield.index = range(len(robusta_yield.index))

In [None]:
# Look up the ERA5 mask's region code and abbreviation for every Robusta row,
# matching on the country name.
# Re-indexing the mask by name does not depend on the row, so do it once
# instead of twice per row, and select each row's country only once.
mask_by_name = era_countries_mask.swap_dims({'abbrevs': 'names'})

codes = []
abbrevs = []
for country in robusta_yield['Country']:
    country_entry = mask_by_name.sel(names=country)
    codes.append(country_entry.region.values.astype('int16'))
    abbrevs.append(country_entry.abbrevs.values)

In [None]:
# Attach the looked-up region codes and abbreviations as two new columns.
robusta_yield = robusta_yield.assign(region=codes, abbrevs=abbrevs)

In [None]:
# Preview the first rows of the Robusta table, including the new columns.
robusta_yield.head()

### Convert to xarray Dataset

In [None]:
# First and last year covered by either yield table
fy = np.minimum(arabica_yield['Year.x'].min(), robusta_yield['Year.x'].min())
ly = np.maximum(arabica_yield['Year.x'].max(), robusta_yield['Year.x'].max())
print(fy, ly)

In [None]:
# Total number of rows needed: distinct Arabica countries plus distinct
# Robusta countries (a country present in both tables is counted twice).
n_arabica = np.unique(arabica_yield['Country']).size
n_robusta = np.unique(robusta_yield['Country']).size
n_countries = n_arabica + n_robusta
n_countries

In [None]:
# Coordinates for the yield array: country abbreviations (Arabica first, then
# Robusta), one timestamp per year, and the matching region codes.
abbrevs = np.concatenate(
    [np.unique(arabica_yield['abbrevs']), np.unique(robusta_yield['abbrevs'])]
).astype('U4')
years = pd.date_range(str(fy), str(ly), freq='1YS')
codes = [
    era_countries_mask.sel(abbrevs=code_abbrev).region.values.astype('int16')
    for code_abbrev in abbrevs
]

In [None]:
# Build a (country, year) array of yields: one row per country (Arabica
# countries first, then Robusta, each in sorted name order) and one column per
# calendar year from `fy` to `ly` inclusive. Years without data stay NaN.
#
# Each value is written to the column of its own year (`Year.x - fy`) rather
# than as one contiguous run from the country's first year, so a gap in a
# country's record no longer shifts later values onto the wrong years, and a
# series starting in `ly` is handled. A single running row counter replaces
# the Robusta offset that depended on the leaked Arabica loop variable.
#
# NOTE(review): rows follow sorted country *names* within each species, while
# the `abbrevs` coordinate is built from sorted *abbreviations* - confirm the
# two orders agree for the countries in use.
yield_arr = np.full((n_countries, len(range(fy, ly + 1))), np.nan)

row = 0
for species_yield in (arabica_yield, robusta_yield):
    for country in np.unique(species_yield['Country']):
        c_data = species_yield.loc[species_yield['Country'] == country]
        year_cols = c_data['Year.x'].to_numpy() - fy
        yield_arr[row, year_cols] = c_data['yld_ha'].to_numpy()
        row += 1

In [None]:
# Wrap the raw array with labelled `abbrevs` (country) and `time` (year) axes.
yield_da = xr.DataArray(
    data=yield_arr,
    coords={'abbrevs': abbrevs, 'time': years},
    dims=('abbrevs', 'time'),
)

In [None]:
# Save the yields to NetCDF as a Dataset with a single variable named 'yield'.
yield_da.to_dataset(name='yield').to_netcdf('/g/data/xv83/dr6273/work/projects/coffee/data/coffee_yield.nc')

# Plot country yields

In [None]:
# Display names for the plot panels. The plotting loop uses `names[i]` for the
# i-th entry of `yield_da.abbrevs`, so this list must stay in that same order.
names = ['Brazil', 'Colombia', 'Ethiopia', 'Guatemala', 'Honduras', 'Mexico', 'Nicaragua',
         'Peru', 'India', 'Indonesia', 'Uganda', 'Vietnam']

In [None]:
# One small time-series panel per country on a 3x4 grid.
with plt.rc_context(plt_params):
    fig, ax = plt.subplots(3, 4, figsize=(6.9, 4))
    panels = ax.flatten()

    for i, abbrev in enumerate(yield_da.abbrevs.values):
        panel = panels[i]
        yield_da.sel(abbrevs=abbrev).plot(ax=panel, c='k', lw=1)

        # Country name inside the panel instead of a title
        panel.text(0.05, 0.85, names[i], transform=panel.transAxes)
        panel.set_title('')

        # y label on the left-hand column only (panels 0, 4, 8)
        panel.set_ylabel(r'Yield [t ha$^{-1}$]' if i in (0, 4, 8) else '')

        # x label on the bottom row only (panels 8 onwards)
        panel.set_xlabel('Year' if i >= 8 else '')

    plt.tight_layout()

#     plt.savefig('./figures/yield.pdf', format='pdf', dpi=400, bbox_inches='tight')