# Linear dimensionality reduction with 
# Principal Component Analysis on the ESDC
## by Max Planck Institute for Biogeochemistry
## M. Mahecha & F. Gans 

In [None]:
using ESDL
using ESDLPlots
plotlyjs()

In this study we investigate the redundancy of the different variables in each pixel. To do so, we compute a linear dimensionality reduction (PCA) and check how many dimensions are needed to explain 90% of the variance of a cube that originally contained 7 variables. First we check out the variables from the cube and add some processors, because we want to do a global study.

## Access ESDC

In [None]:
# Open the data cube with default arguments and keep the handle in `c`;
# the later getCubeData call reads from it.
# NOTE(review): which cube/location the no-argument constructor resolves to
# is decided by ESDL, not by this notebook — confirm before running.
c = Cube()

## Define variables for analysis

In [None]:
# Names of the seven cube variables that enter the PCA.
vars = [
    "gross_primary_productivity",
    "latent_energy",
    "root_moisture",
    "terrestrial_ecosystem_respiration",
    "burnt_area",
    "black_sky_albedo",
    "net_ecosystem_exchange",
];
# Select those variables from the cube `c`, restricted to the "Africa" region.
cdata = getCubeData(c, variable = vars, region = "Africa");

## Gap-filling, needed to perform PCA

In [None]:
# Produce the two cubes used by the rest of the notebook: `cube_filled`
# (gap-filled data) and `cubeanom` (anomalies derived from it).
# NOTE(review): @loadOrGenerate presumably reloads the cubes saved under the
# names "DimRed_Anomalies" / "DimRed_Filled" when they exist and only runs the
# block otherwise — confirm against the ESDL documentation.
@loadOrGenerate cubeanom=>"DimRed_Anomalies" cube_filled=>"DimRed_Filled" begin
# Fill the gaps in the selected data; PCA needs gap-free input.
# presumably "MSC" stands for mean seasonal cycle — verify in the ESDL docs.
@time cube_filled     = gapFillMSC(cdata);
# Then compute the anomalies from the gap-filled cube via removeMSC.
@time cubeanom        = removeMSC(cube_filled)
end

## Perform PCA

In [None]:
@everywhere using MultivariateStats
# sufficient_dimensions(xout, xin, expl_var = 0.95)
#
# Count how many principal components are needed so that their cumulative
# variance exceeds the fraction `expl_var` of the total variance of `xin`.
#
# Arguments
#   xout     - output array; only `xout[1]` is written.
#   xin      - 2-d array with observations in rows and variables in columns
#              (the callers in this notebook pass Time x Variable slices).
#   expl_var - target fraction of explained variance, default 0.95.
#
# Result written to `xout[1]`
#   - the number of leading components whose cumulative share of the total
#     variance is larger than `expl_var`;
#   - the number of available components if that share is never exceeded
#     (e.g. `expl_var >= 1`);
#   - NaN if `xin` contains a NaN or has no variance at all.
@everywhere function sufficient_dimensions(xout::AbstractArray{T}, xin::AbstractArray{T},
                                           expl_var::Real = 0.95) where T
    # PCA cannot work with gaps: mark the whole series as missing instead.
    any(isnan, xin) && return xout[1] = NaN

    # Standardise every column (the reductions run along dimension 1) so that
    # all variables enter the PCA with the same weight.
    means = mean(xin, 1)
    stds  = std(xin, 1)
    # A column without spread cannot be scaled; it is replaced by a constant
    # and therefore contributes no variance.
    z = broadcast((y, m, s) -> s > 0.0 ? (y - m) / s : one(y), xin, means, stds)

    # `fit` expects variables in rows and observations in columns, hence the
    # transpose. pratio = 1.0 keeps every component, so the cumulative curve
    # is not truncated before it can reach the requested threshold.
    pca = fit(PCA, z', pratio = 1.0, method = :svd)

    # Relate the component variances to the total variance of the data, not
    # only to the variance of the components that were retained.
    total = tvar(pca)
    total > 0 || return xout[1] = NaN

    explained = cumsum(principalvars(pca)) / total
    idx = findfirst(explained .> expl_var)
    # A miss is reported as 0 or as `nothing`, depending on the Julia version.
    found = idx !== nothing && idx > 0
    xout[1] = found ? idx : length(explained)
end

In [None]:
# First we do the analysis on the original (gap-filled) cube.
# sufficient_dimensions is applied through mapCube with 0.90 as its extra
# argument, i.e. the explained-variance target. Each input slice spans the
# "Time" and "Variable" axes; the output declares no axes of its own.
# NOTE(review): miss=ESDL.NaNMissing() presumably means missing values are
# passed in and written out as NaN — confirm against the ESDL documentation.
# NOTE(review): @loadOrGenerate presumably reuses a result stored as
# "DimRed_Quality_filled" when one exists — confirm.
@loadOrGenerate qualitypca=>"DimRed_Quality_filled" begin
@time qualitypca=mapCube(sufficient_dimensions,cube_filled,0.90,
    indims=InDims("Time","Variable",miss=ESDL.NaNMissing()),
    outdims=OutDims(miss=ESDL.NaNMissing()));
end

# Result
## Complexity of the multivariate time series including the seasonal cycle
## How many variables are needed to explain 90% of the variance in the data?

In [None]:
# Map of the number of components needed per pixel (series including the
# seasonal cycle).
# NOTE(review): dmin/dmax presumably set the limits of the colour scale; seven
# variables are requested above, so a value of 7 would lie beyond dmax=6 —
# confirm that this is intended.
plotMAP(qualitypca,dmin=2,dmax=6)

And on the anomalies only:

In [None]:
# Same analysis on the anomaly cube: sufficient_dimensions is applied through
# mapCube with 0.90 as the explained-variance target, on slices that span the
# "Time" and "Variable" axes.
# NOTE(review): miss=ESDL.NaNMissing() presumably means missing values are
# passed in and written out as NaN — confirm against the ESDL documentation.
# NOTE(review): @loadOrGenerate presumably reuses a result stored as
# "DimRed_quality_anom" when one exists — confirm.
@loadOrGenerate qualitypcaanom=>"DimRed_quality_anom" begin
qualitypcaanom=mapCube(sufficient_dimensions,cubeanom,0.90,
    indims=InDims("Time","Variable",miss=ESDL.NaNMissing()),
    outdims=OutDims(miss=ESDL.NaNMissing()));
end

## Complexity of the multivariate time series without the seasonal cycle

In [None]:
# Map of the number of components needed per pixel for the anomalies, drawn
# with the same dmin/dmax as the map for the gap-filled cube.
# NOTE(review): dmin/dmax presumably set the limits of the colour scale —
# confirm against the ESDLPlots documentation.
plotMAP(qualitypcaanom,dmin=2,dmax=6)