# Linear Regression Retrievals

In [None]:
import datetime as dt
from numbers import Number

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge

from plots import retrieval_template, statistical_eval
from db_tools import read_csv_profiles
from optimal_estimation import rgrid, z_hatpro, z_top

%matplotlib inline

plt.rcParams["font.family"] = "DejaVu Sans"

## Data Preparation

In [None]:
# Temperature profiles for the training and test sets
# (presumably radiosonde climatology, judging by the "rasoclim" file names).
T_train = read_csv_profiles("../data/unified/training/T_rasoclim.csv")
T_test = read_csv_profiles("../data/unified/test/T_rasoclim.csv")

# The humidity files store ln(q); exponentiate so that q itself is used below.
q_train = np.exp(read_csv_profiles("../data/unified/training/lnq_rasoclim.csv"))
q_test = np.exp(read_csv_profiles("../data/unified/test/lnq_rasoclim.csv"))

# Brightness temperatures from the TB_mwrtm files; these provide the
# predictor columns for all regressions in this notebook.
TBmwrtm_train = read_csv_profiles("../data/unified/training/TB_mwrtm.csv")
TBmwrtm_test = read_csv_profiles("../data/unified/test/TB_mwrtm.csv")

# Only the training set of the TB_igmk brightness temperatures is loaded.
# NOTE(review): TBigmk_train does not appear to be used in the visible part
# of this notebook - confirm before removing.
TBigmk_train = read_csv_profiles("../data/unified/training/TB_igmk.csv")

# The "cloudy" column is used further down as a row mask (negated with ~)
# to select clear-sky profiles.
cloudy_train = read_csv_profiles("../data/unified/training/cloudy_raso.csv")["cloudy"]
cloudy_test = read_csv_profiles("../data/unified/test/cloudy_raso.csv")["cloudy"]

# Presumably surface pressure (by file name); used as a predictor below.
psfc_train = read_csv_profiles("../data/unified/training/psfc.csv")
psfc_test = read_csv_profiles("../data/unified/test/psfc.csv")

# The "z=612m" column of the profiles serves as the surface value. It is
# renamed and wrapped in a one-column frame so that it can be concatenated
# with the other predictors.
Tsfc_train = T_train["z=612m"].rename("Tsfc").to_frame()
Tsfc_test = T_test["z=612m"].rename("Tsfc").to_frame()

qsfc_train = q_train["z=612m"].rename("qsfc").to_frame()
qsfc_test = q_test["z=612m"].rename("qsfc").to_frame()

# COSMO-7 mean profiles from the priors directory, in two variants
# ("+00+06" and "+24+30"; presumably forecast lead-time windows - confirm).
T_cosmo0006 = read_csv_profiles("../data/unified/priors/T_cosmo7+00+06_mean.csv")
T_cosmo2430 = read_csv_profiles("../data/unified/priors/T_cosmo7+24+30_mean.csv")

# As above, the humidity files hold ln(q) and are exponentiated on load.
q_cosmo0006 = np.exp(read_csv_profiles("../data/unified/priors/lnq_cosmo7+00+06_mean.csv"))
q_cosmo2430 = np.exp(read_csv_profiles("../data/unified/priors/lnq_cosmo7+24+30_mean.csv"))

In [None]:
# Split the brightness temperature columns into channel groups. Characters
# 3-7 of each column name are parsed as an integer and compared against
# 40000 (presumably the channel frequency in MHz - confirm against the
# column naming of TB_mwrtm.csv).
kband_all = [name for name in TBmwrtm_train.columns if "TB" in name and int(name[3:8]) < 40000]
vband_all = [name for name in TBmwrtm_train.columns if "TB" in name and int(name[3:8]) > 40000]
# Subset of the upper-band columns whose name ends in "_00.0"
# (labelled "zenith only" in the plots further down).
vband_zen = [name for name in vband_all if name.endswith("_00.0")]

In [None]:
def join(*dfs, noise=None):
    """Concatenate data frames column-wise, optionally adding Gaussian noise.

    Parameters
    ----------
    *dfs : pandas.DataFrame
        Frames to concatenate along axis 1.
    noise : None, number or iterable of numbers, optional
        Standard deviation of zero-mean Gaussian noise that is added to each
        frame before concatenation. A single number is applied to every
        frame; an iterable must contain exactly one value per frame.
        ``None`` (the default) adds no noise.

    Returns
    -------
    pandas.DataFrame
        The (possibly perturbed) frames joined side by side.

    Raises
    ------
    ValueError
        If ``noise`` is an iterable whose length differs from the number of
        frames. Without this check ``zip`` would silently drop the frames
        that have no matching noise value.
    """
    if noise is not None:
        if isinstance(noise, Number):
            noise = [noise] * len(dfs)
        else:
            noise = list(noise)
        if len(noise) != len(dfs):
            raise ValueError(
                "got {} noise value(s) for {} data frame(s)".format(len(noise), len(dfs))
            )
        # The noise is drawn from numpy's global random state.
        dfs = [df + np.random.normal(0., scale=n, size=df.shape) for df, n in zip(dfs, noise)]
    return pd.concat(dfs, axis=1)

To detect overfitting and make the synthetic retrievals more 'realistic', Gaussian noise is added to the test predictors (brightness temperatures and surface values); the training predictors are left noise-free.

In [None]:
# Standard deviations of the Gaussian noise applied to the test predictors:
# brightness temperatures, surface humidity, surface temperature and pressure.
# NOTE(review): units are presumably K, kg/kg, K and hPa - confirm.
TBnoise = 0.5
qnoise = 0.0005
Tnoise = 0.5
pnoise = 0.2

# Predictor sets. Training predictors are joined without noise; test
# predictors are perturbed, with the noise list matched positionally to the
# frames passed to join().
# NOTE(review): no random seed is set in the visible part of the notebook,
# so the noisy test sets (and the statistics below) vary between runs.

# Humidity retrieval: lower-band channels + surface humidity + pressure.
TBkqp_train = join(TBmwrtm_train[kband_all], qsfc_train, psfc_train)
TBkqp_test = join(TBmwrtm_test[kband_all], qsfc_test, psfc_test, noise=[TBnoise, qnoise, pnoise])

# Temperature retrieval: all upper-band channels + surface temperature + pressure.
TBvTp_train = join(TBmwrtm_train[vband_all], Tsfc_train, psfc_train)
TBvTp_test = join(TBmwrtm_test[vband_all], Tsfc_test, psfc_test, noise=[TBnoise, Tnoise, pnoise])

# Temperature retrieval restricted to the "_00.0" upper-band channels.
TBvzTp_train = join(TBmwrtm_train[vband_zen], Tsfc_train, psfc_train)
TBvzTp_test = join(TBmwrtm_test[vband_zen], Tsfc_test, psfc_test, noise=[TBnoise, Tnoise, pnoise])

## Miscellaneous

In [None]:
class Model:
    """Ridge regression retrieval from predictor columns to target columns.

    The model is fitted on construction and applied by calling the instance
    with a frame of predictors.

    Parameters
    ----------
    training_predictors : pandas.DataFrame
        Predictors, one row per sample.
    training_targets : pandas.DataFrame
        Targets, one row per sample.
    alpha : float
        Regularization strength passed on to ``Ridge``.
    """

    def __init__(self, training_predictors, training_targets, alpha):
        self.lm = Ridge(alpha=alpha)
        # Fit on the bare arrays: __call__ predicts from bare arrays as well,
        # and scikit-learn warns when a model fitted with feature names is
        # later given input without them. Column names are tracked and
        # checked explicitly instead.
        self.lm.fit(training_predictors.values, training_targets.values)
        self.predictor_cols = list(training_predictors.columns)
        self.target_cols = list(training_targets.columns)

    def __call__(self, test_predictors):
        """Predict the targets for ``test_predictors``.

        Parameters
        ----------
        test_predictors : pandas.DataFrame
            Must have exactly the columns used for training, in the same order.

        Returns
        -------
        pandas.DataFrame
            Predictions indexed like ``test_predictors`` with the target
            columns seen during training.

        Raises
        ------
        ValueError
            If the columns differ from the training predictors.
        """
        test_cols = list(test_predictors.columns)
        # Explicit check rather than assert so that it survives ``python -O``.
        if test_cols != self.predictor_cols:
            raise ValueError(
                "predictor columns do not match the training predictors: "
                "expected {}, got {}".format(self.predictor_cols, test_cols)
            )
        prediction = self.lm.predict(test_predictors.values)
        return pd.DataFrame(prediction, index=test_predictors.index, columns=self.target_cols)

## Choice of Regularization Parameter

In [None]:
# Compare ridge models fitted with different regularization strengths on the
# noisy test predictors: temperature on the left axis, humidity on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
alphas = [50, 200, 500, 1000, 4000]

# Temperature: one model per alpha, all upper-band channels + Tsfc + psfc.
regTs = [Model(TBvTp_train, T_train, alpha=alpha) for alpha in alphas]
statistical_eval(ax1, T_test, *[m(TBvTp_test) for m in regTs], labels=["α = {}".format(alpha) for alpha in alphas])
ax1.legend(loc="upper right")
ax1.set_ylim(0, 7)
ax1.set_xlim(-0.4, 2.5)
ax1.grid()

# Humidity: one model per alpha, lower-band channels + qsfc + psfc.
regqs = [Model(TBkqp_train, q_train, alpha=alpha) for alpha in alphas]
statistical_eval(ax2, q_test, *[m(TBkqp_test) for m in regqs], labels=["α = {}".format(alpha) for alpha in alphas])
ax2.legend(loc="upper right")
ax2.set_ylim(0, 7)
ax2.set_xlim(-0.00015, 0.0012)
ax2.grid()

It seems that a regularization parameter of α = 500 is a good choice for both the temperature and the humidity retrieval; it is used for all retrievals below.

## Default Retrievals

In [None]:
# Default retrievals with alpha = 500: temperature from all upper-band
# channels, temperature from the "_00.0" channels only, and humidity from the
# lower-band channels.
regT_all = Model(TBvTp_train, T_train, alpha=500)
regT_zen = Model(TBvzTp_train, T_train, alpha=500)
regq = Model(TBkqp_train, q_train, alpha=500)

# retrieval_template is a project helper that returns the figure and four
# axes (two for temperature, two for humidity).
# NOTE(review): the entries of Tlims/qlims look like x/y limits per axis -
# confirm against plots.retrieval_template.
fig, (axT1, axT2, axq1, axq2) = retrieval_template([8, 6],
        Tlims=[(-0.5, 4), (0, 12), (-0.3, 1.5), (0, 2.5)],
        qlims=[(-0.15, 1), (0, 12), (-0.15, 1), (0, 2.5)],                                                  
        )

# The same temperature evaluation is drawn on both temperature axes.
for ax in [axT1, axT2]:
    statistical_eval(ax, T_test,
         regT_zen(TBvzTp_test),
         regT_all(TBvTp_test),
         labels=["zenith only", "elevation scan"])
# Ticks every 0.2 from -0.2 to 1.4 on the second temperature axis.
axT2.set_xticks([0.2*i for i in range(-1, 8)])
    
# Humidity is scaled by 1000 before plotting (presumably kg/kg -> g/kg).
for ax in [axq1, axq2]:
    statistical_eval(ax, q_test*1000,
         regq(TBkqp_test)*1000,
         labels=["zenith only"])
    ax.set_ylabel("")

axT1.legend(loc="upper right", fontsize=11)
axq1.legend(loc="upper right", fontsize=11)
fig.tight_layout()

## Clear Sky Only Retrievals

In [None]:
# Models trained on clear-sky profiles only (rows where cloudy_train is
# False) versus models trained on all profiles. Both are evaluated on the
# clear-sky rows of the test predictors.
regT_clear = Model(TBvTp_train.loc[~cloudy_train,:], T_train.loc[~cloudy_train,:], alpha=500)
regT_all = Model(TBvTp_train, T_train, alpha=500)
regq_clear = Model(TBkqp_train.loc[~cloudy_train,:], q_train.loc[~cloudy_train,:], alpha=500)
regq_all = Model(TBkqp_train, q_train, alpha=500)

fig, (axT1, axT2, axq1, axq2) = retrieval_template([8, 6],
        Tlims=[(-0.5, 4), (0, 12), (-0.3, 1.5), (0, 2.5)],
        qlims=[(-0.15, 1), (0, 12), (-0.15, 1), (0, 2.5)],                                                  
        )

# NOTE(review): the reference (T_test / q_test) is passed unfiltered while
# the predictions cover clear-sky rows only. This relies on statistical_eval
# aligning reference and predictions by index - confirm.
for ax in [axT1, axT2]:
    statistical_eval(ax, T_test,
         regT_all(TBvTp_test.loc[~cloudy_test,:]),
         regT_clear(TBvTp_test.loc[~cloudy_test,:]),
         labels=["all sky training", "clear sky training"])
# Ticks every 0.2 from -0.2 to 1.4 on the second temperature axis.
axT2.set_xticks([0.2*i for i in range(-1, 8)])
    
# Humidity is scaled by 1000 before plotting (presumably kg/kg -> g/kg).
for ax in [axq1, axq2]:
    statistical_eval(ax, q_test*1000,
         regq_all(TBkqp_test.loc[~cloudy_test,:])*1000,
         regq_clear(TBkqp_test.loc[~cloudy_test,:])*1000,
         labels=["all sky training", "clear sky training"])
    ax.set_ylabel("")

axT1.legend(loc="upper right", fontsize=11)
axq1.legend(loc="upper right", fontsize=11)
fig.tight_layout()

## Retrievals with COSMO-7 data as additional predictors

In [None]:
def cosmo_data(predictors, cosmo):
    """Join predictors with COSMO columns and split the complete rows.

    Rows containing any missing value after the column-wise join are
    discarded. Every third remaining row (positions 0, 3, 6, ...) forms the
    test set; all rows whose index label is not in the test set form the
    training set.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    complete = join(predictors, cosmo).dropna()
    held_out = complete.iloc[::3]
    remaining = complete.drop(index=held_out.index)
    return remaining, held_out

In [None]:
# Build train/test splits from the noisy *test* predictors augmented with a
# subset of the COSMO-7 columns.
# Temperature: every 4th COSMO column starting at column position 20.
T0006_train, T0006_test = cosmo_data(TBvTp_test, T_cosmo0006.iloc[:,20::4])
T2430_train, T2430_test = cosmo_data(TBvTp_test, T_cosmo2430.iloc[:,20::4])

# Humidity: every 4th COSMO column among the first 20 column positions.
# NOTE(review): temperature and humidity use different column ranges
# (20::4 vs :20:4) - confirm that this asymmetry is intended.
q0006_train, q0006_test = cosmo_data(TBkqp_test, q_cosmo0006.iloc[:,:20:4])
q2430_train, q2430_test = cosmo_data(TBkqp_test, q_cosmo2430.iloc[:,:20:4])

In [None]:
# Baseline models trained on the regular training set, and COSMO-augmented
# models trained on the training part of the split built from the test
# predictors. Targets for the latter are looked up in T_test / q_test by the
# index of the rows that ended up in the respective training split.
regT_def = Model(TBvTp_train, T_train, alpha=500)
regT0006 = Model(T0006_train, T_test.loc[T0006_train.index,:], alpha=500)
regT2430 = Model(T2430_train, T_test.loc[T2430_train.index,:], alpha=500)

regq_def = Model(TBkqp_train, q_train, alpha=500)
regq0006 = Model(q0006_train, q_test.loc[q0006_train.index,:], alpha=500)
regq2430 = Model(q2430_train, q_test.loc[q2430_train.index,:], alpha=500)

fig, (axT, axq) = plt.subplots(1, 2, figsize=[8, 4.5])

# NOTE(review): three labels/colors are supplied but only two predictions,
# because the 2430 variant is commented out - check how statistical_eval
# handles the surplus entries.
# NOTE(review): the default model is evaluated on all test rows, the COSMO
# model only on the held-out rows of its split, so the two curves are not
# based on the same sample.
statistical_eval(axT, T_test,
         regT_def(TBvTp_test),
         regT0006(T0006_test),
         #regT2430(T2430_test),
         labels=["default", "COSMO-7", "2430"],
         colors=["#1f78b4", "#000000", "#666666"])
    
# Humidity is scaled by 1000 to match the g/kg axis label set below.
statistical_eval(axq, q_test*1000,
         regq_def(TBkqp_test)*1000,
         regq0006(q0006_test)*1000,
         #regq2430(q2430_test)*1000,
         labels=["default", "COSMO-7", "2430"],
         colors=["#1f78b4", "#000000", "#666666"])

axT.set_xlim(-0.5, 4)
axT.set_ylim(0, 12)
axT.set_xlabel("temperature [K]")
axq.set_xlim(-0.15, 1)
axq.set_ylim(0, 12)
axq.set_xlabel("total water content [g/kg]")
axq.set_ylabel("")
axT.legend(loc="upper right", fontsize=11)
axq.legend(loc="upper right", fontsize=11)
fig.tight_layout()