In [None]:
import sys
import pandas as pd
import os
import numpy as np
import numpy.ma as ma
import xarray as xr
# Requires `python setup.py develop` to have been run in the cablab-core and gridtools parent directories.
from cablab import Cube
from cablab import Cube
from cablab.cube import CubeDataAccess
from datetime import datetime
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import warnings
warnings.filterwarnings('ignore')

__author__ = "gunbra32"

# data sources: 
# worldbank WDI: http://databank.worldbank.org/data/download/WDI_csv.zip
# country codes: https://github.com/datasets/country-codes/tree/master/data
# Country Code Map: https://cran.r-project.org/web/packages/rworldmap/index.html

def read_merge_worldbank(path):
    """Read every World Bank ``*_Data.csv`` table below *path* and merge them.

    Only files located inside sub-directories of *path* (at any depth) are
    considered; files lying directly in *path* itself are not read.  Each
    file is decoded as cp1252 and sorted by "Country Code".

    Parameters
    ----------
    path : str
        Root directory that is searched for ``*_Data.csv`` files.

    Returns
    -------
    merged_df : pandas.DataFrame
        Row-wise concatenation of all tables with a fresh integer index, in
        which the World Bank country codes "ROM" and "ZAR" are replaced by
        "ROU" and "COD".
    data : dict
        The individual (sorted, otherwise unmodified) tables keyed by the
        file name up to its first ".".  Files that share this key overwrite
        each other.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no matching file was found.
    """
    data = dict()
    for root, sub_dirs, _ in os.walk(path):
        for sub_dir in sub_dirs:
            dir_path = os.path.join(root, sub_dir)
            for file_name in os.listdir(dir_path):
                if '_Data.csv' not in file_name:
                    continue
                print("Reading %s %s" % (dir_path, file_name))
                table = pd.read_csv(os.path.join(dir_path, file_name), encoding='cp1252')
                data[file_name.split(".")[0]] = table.sort_values(["Country Code"])

    # ignore_index=True already yields a clean 0..n-1 index.
    merged_df = pd.concat(list(data.values()), ignore_index=True)

    # Fix worldbank country codes to match ISO 3166 - error prone, check
    # frequently.  The column is assigned back explicitly: an in-place
    # replace on the selected column does not reliably write through to the
    # frame (it is a no-op under pandas copy-on-write).
    merged_df["Country Code"] = merged_df["Country Code"].replace({"ROM": "ROU", "ZAR": "COD"})
    return merged_df, data

def retrieve_dimensions(df, country_col= "Country Code", indicator_col = "Indicator Code", dates = None, mintime = 2001):
    """Collect the axis labels (time, indicators, countries) of a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one column per year plus a country and an indicator column.
    country_col, indicator_col : str
        Names of the columns holding the country and the indicator codes.
    dates : iterable, optional
        Years to use for the time axis.  When omitted, the years are guessed
        from the column names: every all-digit string label strictly greater
        than *mintime* and smaller than 2150 is taken.
    mintime : int
        Exclusive lower bound for years guessed from the column names.

    Returns
    -------
    dict
        "time" (year column labels as strings), "itime" (the same years as
        ints), "indicators" (sorted unique indicator codes) and "countries"
        (unique country codes in order of first appearance).
    """
    dims = dict()
    if dates is None:
        print("Retrieving dates from Column names - this is dangerous! Check result for consistency! ")
        # Non-string labels are skipped instead of crashing on .isdigit().
        dims["time"] = [
            col for col in df.columns
            if isinstance(col, str) and col.isdigit() and mintime < int(col) < 2150
        ]
    else:
        # Explicit dates were previously ignored, leaving the time keys unset.
        dims["time"] = [str(date) for date in dates]
    dims["itime"] = [int(year) for year in dims["time"]]

    dims["indicators"] = df[indicator_col].sort_values().unique()
    dims["countries"] = df[country_col].unique()
    return dims
    
def cube_worldbank(df, data_dims, country_col= "Country Code", indicator_col = "Indicator Code"):
    """Fill a (country, indicator, year) array from the merged World Bank table.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged table with one row per country/indicator and one column per year.
    data_dims : dict
        Axis labels; "countries", "indicators", "time" and "itime" are used.
    country_col, indicator_col : str
        Names of the columns holding the country and the indicator codes.

    Returns
    -------
    xarray.DataArray
        NaN-initialised array from ``create_DataArray`` in which every
        country/indicator pair present in *df* holds its yearly values.
        Several rows for the same pair are averaged, ignoring NaN.
    """
    print("Starting Country Cube generation...")
    # The keys must be a list: pandas interprets a tuple as one single key.
    groups = df.groupby([country_col, indicator_col])

    ds = create_DataArray(data_dims)
    time_cols = data_dims["time"]
    for name, grp in groups:
        # name is the (country, indicator) pair and addresses one year vector.
        ds.loc[name] = np.array(grp[time_cols].mean(axis=0, skipna=True))

    return ds

def create_DataArray(data_dims):
    """Return a NaN-filled DataArray spanning countries x indicators x years.

    *data_dims* must provide "countries", "indicators", "time" (only its
    length is used) and "itime" (the coordinate values of the "Year" axis).
    """
    countries = data_dims["countries"]
    indicators = data_dims["indicators"]
    shape = (len(countries), len(indicators), len(data_dims["time"]))
    return xr.DataArray(
        np.full(shape, np.nan),
        coords=[countries, indicators, data_dims["itime"]],
        dims=["Country_Codes", "Indicator_Codes", "Year"],
    )

def cube_ESDC(ESDC, data_dims,cdict):
    """Aggregate the ESDC variables to annual country means.

    For every variable whose name does not contain "_mask", the data are
    averaged per calendar year and then, for each country of the cube's
    "country_mask" layer, over all grid cells of that country (NaN ignored).

    Parameters
    ----------
    ESDC : object
        Opened cube exposing ``data.dataset(name)`` and
        ``data.variable_names`` (a cablab ``Cube`` in this notebook).
    data_dims : dict
        Axis labels; "countries", "time" and "itime" are read.  Its
        "indicators" entry is overwritten with the cube's variable names,
        so pass a copy if the original must stay intact.
    cdict : dict
        Maps numeric country codes of the mask to the country codes used in
        ``data_dims["countries"]``.

    Returns
    -------
    xarray.DataArray
        Array from ``create_DataArray``; cells without data stay NaN.
    """
    country_mask = ESDC.data.dataset("country_mask")["country_mask"][0].values
    country_numbers = np.unique(country_mask[~np.isnan(country_mask)])
    variable_names = ESDC.data.variable_names
    data_dims["indicators"] = variable_names
    ds = create_DataArray(data_dims)

    # Resolve each mask value to its country code and grid cells once;
    # neither depends on the variable or the year.
    known_countries = ds["Country_Codes"].values
    cells_by_country = []
    for cc in country_numbers:
        iso_code = numeric_to_ISO_code(cc, cdict)
        if iso_code is None or iso_code not in known_countries:
            continue
        cells_by_country.append((iso_code, np.nonzero(country_mask == cc)))

    for var in variable_names:
        if "_mask" in var:
            continue
        print(var)
        annual_av = ESDC.data.dataset(var).groupby("time.year").mean(dim="time")
        for year in annual_av["year"].values:
            # Years outside the time axis of the target array are skipped.
            if str(year) not in data_dims["time"]:
                continue
            values = annual_av[var].sel(year=year).values
            for iso_code, cells in cells_by_country:
                selected = values[cells]
                # Compare by value; "is not 0" tested object identity.
                if selected.size != 0:
                    ds.loc[dict(Country_Codes=iso_code, Indicator_Codes=var, Year=year)] = np.nanmean(selected)
    return ds

def numeric_to_ISO_code(cc,cdict):
    """Look up *cc* in *cdict*; return the mapped code or None if absent."""
    return cdict.get(cc, None)

def read_ESDC_countrycodes(path_to_csv, col = 'ISO3166-1-Alpha-3'):
    """Build a lookup from numeric ISO 3166-1 codes to another code column.

    Parameters
    ----------
    path_to_csv : str
        Country-code table; must contain the column 'ISO3166-1-numeric'.
    col : str
        Name of the column providing the dictionary values.

    Returns
    -------
    dict
        Maps each 'ISO3166-1-numeric' entry to the value of *col* in its row.

    Raises
    ------
    KeyError
        If 'ISO3166-1-numeric' or *col* is not a column of the table.
    """
    # Read the file that was passed in, not the notebook-global path.
    df = pd.read_csv(path_to_csv)
    return df.set_index('ISO3166-1-numeric')[col].to_dict()


In [None]:
# Input locations used by the cells below:
#   path_WB - directory tree searched for World Bank "*_Data.csv" files
#   ESDC    - the data cube, opened through cablab
#   pp      - CSV table with the country codes
path_WB = '/home/jovyan/work/datacube/worldbank-development-indicators-20161014'
ESDC = Cube.open("/home/jovyan/work/datacube/cablab-datacube-0.2.3/low-res")
pp = "/home/jovyan/work/datacube/country-cube-0.1.0/country-codes.csv"

In [None]:
# Merge the World Bank tables, derive the axis labels from the merged table
# and fill the (country, indicator, year) array.
merged_df, data = read_merge_worldbank(path_WB)
data_dims = retrieve_dimensions(merged_df)
cube = cube_worldbank(merged_df, data_dims)
print("Done cubing the Worldbank data.")

In [None]:
print("Start country-cubing the ESDC")

cdict = read_ESDC_countrycodes(pp)
# cube_ESDC overwrites the "indicators" entry of the dict it receives, so it
# gets a copy and data_dims keeps the World Bank indicators.
dedes = cube_ESDC(ESDC,data_dims.copy(),cdict)
# Append the ESDC variables to the World Bank indicators along the indicator axis.
FullCube = xr.concat([cube,dedes],'Indicator_Codes')

print("Done country-cubing the ESDC")

In [None]:
# Split the array along "Indicator_Codes" into a dataset and write it to
# country_cube.nc in the current working directory.
FullCube.to_dataset(dim="Indicator_Codes").to_netcdf(path="country_cube.nc")