### Introduction
This notebook contains a regression projection for summary statistic dimensionality reduction prior to approximate Bayesian computation (see https://rss.onlinelibrary.wiley.com/doi/10.1111/j.1467-9868.2011.01010.x).

### Imports
All imports occur here:

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from time import time

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn import pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso

from joblib import dump, load

### Read in summary statistics and parameters (priors)

In [None]:
# Load the summary statistics (CSV) and the parameter table (feather).
sum_stats = pd.read_csv("../output/summary_stats.csv", index_col=False)
prior = pd.read_feather("../output/prior.feather")

# Both tables must list the same random seeds in the same row order.
seeds_match = sum_stats["random_seed"] == prior["random_seed"]
assert np.all(seeds_match)

# Keep the seeds separately and remove the seed column from both tables.
seeds = sum_stats.pop("random_seed")
prior = prior.drop(columns="random_seed")

### Split into test set and train set

In [None]:
# The last 2000 rows are held out as pseudo-observed datasets (PODs) and act as
# the test set; every earlier row belongs to the training set.
train_rows = slice(None, -2000)
test_rows = slice(-2000, None)

train_prior, test_prior = prior.iloc[train_rows], prior.iloc[test_rows]
train_sum_stats, test_sum_stats = sum_stats.iloc[train_rows], sum_stats.iloc[test_rows]

# Save each split to CSV without writing the row index.
for split_frame, split_path in (
    (train_prior, "../output/train_prior.csv"),
    (train_sum_stats, "../output/train_sum_stats.csv"),
    (test_prior, "../output/test_prior.csv"),
    (test_sum_stats, "../output/test_sum_stats.csv"),
):
    split_frame.to_csv(split_path, index=False)

### Pseudo-observed datasets based analysis

In [None]:
# Regression models to compare, keyed by the name used for result columns and
# output files: ordinary least squares, a lasso fitted on standardised inputs,
# and a random forest.
model_dict = {
    "linear_regression": LinearRegression(),
    "lasso_regression": pipeline.make_pipeline(
        StandardScaler(),
        Lasso(max_iter=50000, alpha=0.5),
    ),
    "random_forest": RandomForestRegressor(n_estimators=150, n_jobs=-1),
}

# Column names of the parameter table and of the summary statistics table.
parameters = list(prior)
summary_stat_names = list(sum_stats)

# Per-model containers for predictions: {model name: {parameter: predictions}}.
train_sum_stats_pred = {model_name: {} for model_name in model_dict}
test_sum_stats_pred = {model_name: {} for model_name in model_dict}

# One row per parameter, one "<model>_RMSE" column per model.
rmse_results = pd.DataFrame(
    columns=[model_name + "_RMSE" for model_name in model_dict],
    index=list(prior),
)

# One row per summary statistic, one column per parameter.
linear_regression_coefficients = pd.DataFrame(columns=parameters, index=summary_stat_names)
lasso_regression_coefficients = pd.DataFrame(columns=parameters, index=summary_stat_names)
random_forest_importances = pd.DataFrame(columns=parameters, index=summary_stat_names)

In [None]:
%%time

# For every parameter, fit each model on the training summary statistics,
# predict on the train and test splits, and record the test RMSE together with
# the model's coefficients or feature importances.
for param in parameters:  # Loop through wildcat model parameters
    print("Parameter: {}".format(param))
    for name, model in model_dict.items():

        # Train the model on the training set. The same model object is refit
        # for every parameter, so everything needed from the fit is extracted
        # below before the next iteration overwrites it.
        print(" - Training {} ...".format(name))
        start_time = time()
        estimator = model.fit(train_sum_stats, train_prior[param])
        # Nothing is written to disk here, so the message only reports timing.
        print(" - Training completed in {:.2f} s".format(time() - start_time))

        # Predict on the train and test set
        train_pred = estimator.predict(train_sum_stats)
        train_sum_stats_pred[name][param] = train_pred

        test_pred = estimator.predict(test_sum_stats)
        test_sum_stats_pred[name][param] = test_pred

        # Calculate RMSE using test set. The square root is taken explicitly
        # because the `squared=False` keyword of mean_squared_error is
        # deprecated and no longer accepted by recent scikit-learn releases.
        rmse_results.loc[param, name + "_RMSE"] = np.sqrt(
            mean_squared_error(test_prior[param], test_pred)
        )

        # Record importances and coefficients
        if name == "linear_regression":
            linear_regression_coefficients[param] = estimator.coef_
        elif name == "lasso_regression":
            # The lasso sits inside a pipeline; look the step up by name.
            lasso_regression_coefficients[param] = estimator["lasso"].coef_
        elif name == "random_forest":
            random_forest_importances[param] = estimator.feature_importances_
        

### Write out all the results

In [None]:
# Write the projected summary statistics, one CSV per model and split.
# Training-set files are written first, then test-set files.
for predictions_by_model, path_template in (
    (train_sum_stats_pred, "../output/projection/{}_train_sum_stats_projection.csv"),
    (test_sum_stats_pred, "../output/projection/{}_test_sum_stats_projection.csv"),
):
    for model, results in predictions_by_model.items():
        filename = path_template.format(model)
        df = pd.DataFrame(results)
        df.to_csv(filename, index=False)

# Test-set RMSE table, indexed by parameter.
rmse_results.to_csv("../output/projection/rmse_results.csv", index_label="parameter")

# Coefficient and importance tables, indexed by summary statistic.
for table, table_path in (
    (linear_regression_coefficients, "../output/projection/linear_regression_coefficients.csv"),
    (lasso_regression_coefficients, "../output/projection/lasso_regression_coefficients.csv"),
    (random_forest_importances, "../output/projection/random_forest_importances.csv"),
):
    table.to_csv(table_path, index_label="summary_stats")

### Plot importances for each parameter

In [None]:
# Draw one horizontal bar chart per parameter showing the random forest
# feature importance of every summary statistic, largest first, and save it
# as a PNG.
sns.set(style="whitegrid")  # Same theme for every figure, so set it once.
for param in parameters:
    importances = random_forest_importances[param].sort_values(ascending=False)
    fig, ax = plt.subplots(figsize=(10, 14))
    sns.barplot(x=importances, y=importances.index, color="0.2", orient="h", ax=ax)
    ax.set_xlabel("Importances for predicting {}".format(param))
    ax.set_ylabel("Summary statistic")
    # Widen the left margin so the summary statistic labels fit.
    fig.subplots_adjust(left=0.3)
    fig.savefig("../plots/importances/importance_{}.png".format(param))
    # Close explicitly so figures do not accumulate across iterations.
    plt.close(fig)


The rest of these results will be plotted in R (abc.R).