In [None]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import os
from sklearn.linear_model import LinearRegression

## IV. Distribution analysis for light-absorbing particles

This is the final step in preprocessing MERRA-2 data for a new region, and it only needs to be done once. The idea is to compare a single bin of each species (size bin #3 for dust, and the hydrophilic bin for black and organic carbon) to the total for that species (the sum of all size bins for dust, and the sum of the hydrophilic and hydrophobic bins for black and organic carbon). The resulting ratios, which translate bin → total, are entered into PEBSI as `ratio_BC2_BCtot`, etc. in inputs.py.

For now, this document is not automated (sorry), but the process should be quite simple. The basic instructions are as follows.

1. **Download ALL black carbon, organic carbon, and dust variables** for some amount of time (at least 1 year) and for some region of interest. 
2. **Aggregate the data** with minor edits to notebooks 1 and 2 (you basically just need to add the new variable names, such as OCDP001, to `dataset_variables`).
3. **Sum bins for dry and wet deposition** separately. For black and organic carbon, that means summing hydrophilic and hydrophobic bins, and for dust, that means summing all the size bins.
4. **Perform a linear regression** of the timeseries where X is the bin of interest (002 for BC/OC and 003 for dust) and Y is the sum of all bins.
5. **Pull out the slopes from this regression.** The slopes will probably be somewhere between 1 and 5, but you can do a sanity check by plotting (bin of interest × slope) against (sum of bins). Manually enter these slopes into PEBSI under the corresponding "ratio_(bin of interest)_tot" inputs (e.g. "ratio_BC2_BCtot") in inputs.py.

In [None]:
# OCDP (organic carbon, bins 001 and 002): regress the sum of both bins (y)
# on bin 002 alone (x). The fitted slope is the bin -> total ratio that gets
# entered manually into PEBSI's inputs.py.

# Open each file in a context manager so the NetCDF handle is released as soon
# as the single grid-point series has been loaded into memory; otherwise the
# files stay open (and locked on Windows) for the life of the kernel.
with xr.open_dataarray('D:/MERRA-2/OCDP001/MERRA2_OCDP001_60_-150.nc') as da:
    ds1 = da.sel(lat=63.5,lon=-145.625).load()
with xr.open_dataarray('D:/MERRA-2/OCDP002/MERRA2_OCDP002_60_-150.nc') as da:
    ds2 = da.sel(lat=63.5,lon=-145.625).load()

# scikit-learn expects a 2-D feature matrix, hence the (n_samples, 1) reshape.
x = ds2.values.flatten().reshape(-1,1)
y = ds1.values.flatten() + ds2.values.flatten()

model = LinearRegression()
model.fit(x,y)
r2 = model.score(x,y)
# Slope (the ratio of interest) and intercept of the fit.
print(model.coef_,model.intercept_)

# Sanity-check plot: the data against the fitted line.
fig,ax = plt.subplots()
ax.scatter(x,y)
ax.plot(x,model.predict(x),label=f'R$^2$={r2:.2f}',c='r')
# Title the figure so it can be told apart from the other variables' plots,
# which share the same axis labels.
ax.set_title('OCDP: sum of bins vs. bin 002')
ax.set_xlabel('Bin 2')
ax.set_ylabel('Bin 1 + Bin 2')
ax.legend()
plt.show()

In [None]:
# OCWT (organic carbon, bins 001 and 002): regress the sum of both bins (y)
# on bin 002 alone (x). The fitted slope is the bin -> total ratio that gets
# entered manually into PEBSI's inputs.py.

# Open each file in a context manager so the NetCDF handle is released as soon
# as the single grid-point series has been loaded into memory; otherwise the
# files stay open (and locked on Windows) for the life of the kernel.
with xr.open_dataarray('D:/MERRA-2/OCWT001/MERRA2_OCWT001_60_-150.nc') as da:
    ds1 = da.sel(lat=63.5,lon=-145.625).load()
with xr.open_dataarray('D:/MERRA-2/OCWT002/MERRA2_OCWT002_60_-150.nc') as da:
    ds2 = da.sel(lat=63.5,lon=-145.625).load()

# scikit-learn expects a 2-D feature matrix, hence the (n_samples, 1) reshape.
x = ds2.values.flatten().reshape(-1,1)
y = ds1.values.flatten() + ds2.values.flatten()

model = LinearRegression()
model.fit(x,y)
r2 = model.score(x,y)
# Slope (the ratio of interest) and intercept of the fit.
print(model.coef_,model.intercept_)

# Sanity-check plot: the data against the fitted line.
fig,ax = plt.subplots()
ax.scatter(x,y)
ax.plot(x,model.predict(x),label=f'R$^2$={r2:.2f}',c='r')
# Title the figure so it can be told apart from the other variables' plots,
# which share the same axis labels.
ax.set_title('OCWT: sum of bins vs. bin 002')
ax.set_xlabel('Bin 2')
ax.set_ylabel('Bin 1 + Bin 2')
ax.legend()
plt.show()