In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import folktables
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import itertools
from tqdm import tqdm
import seaborn as sns
from folk_tables_utils import *


In [None]:
# Notebook-wide matplotlib styling: 12pt tick and legend text, serif (Palatino) font.
plt.rcParams.update({
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'font.family': 'serif',
    'font.serif': 'Palatino',
})

In [None]:
# Feature columns requested from folktables. The ORDER matters: later cells
# build the per-dataset feature subsets by slicing prefixes off this list.
all_folk_cols = [
    'AGEP', 'SCHL', 'MAR', 'SEX', 'DIS', 'MIG', 'RELP',
    'RAC1P', 'PUMA', 'CIT', 'OCCP', 'JWTR', 'POWPUMA', 'POVPIP',
]

# Column used as the prediction target.
target = 'JWMNP'

# Columns that are one-hot encoded with pd.get_dummies. The list is always
# intersected with the columns actually present, so 'ESP' (not a member of
# all_folk_cols) has no effect here.
dummy_cols = ['MAR', 'SEX', 'DIS', 'ESP', 'MIG', 'RAC1P', 'CIT', 'JWTR']

def travel_time_filter(data):
    """
    Row filter for the travel-time (JWMNP) prediction task.

    Keeps only the rows of ``data`` where AGEP > 16, PWGTP >= 1, ESR == 1 and
    JWMNP >= 0. A row whose comparison is False (which includes NaN values)
    is dropped. ``data`` itself is left untouched; a filtered frame is returned.
    """
    keep = (
        (data['AGEP'] > 16)
        & (data['PWGTP'] >= 1)
        & (data['ESR'] == 1)
        & (data['JWMNP'] >= 0)
    )
    return data[keep]
# Task wiring consumed by the cells below.
preprocess = travel_time_filter
target_transform = None

# Model classes used for the per-dataset submodels, the imputation baselines
# and the aggregated model.
SubmodelClass = LinearRegressionModel
ImputedModelClass = ImputedLinearRegressionModel
AggregatorClass = MyModelAggregator

# Prediction problem handed to folktables: `all_folk_cols` as features,
# `target` as label, grouped by 'SEX', rows selected by `preprocess`.
ACSIncomeNew = folktables.BasicProblem(
    features=all_folk_cols,
    target=target,
    target_transform=target_transform,
    group='SEX',
    preprocess=preprocess,
    # Replace NaNs with -1. `nan` has to be passed by keyword: the second
    # positional parameter of np.nan_to_num is `copy`, so the previous
    # `np.nan_to_num(x, -1)` silently filled NaNs with 0.0 instead of -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)

In [None]:
def test_make_projection_permutation_matrix():
    """Check the projection/permutation helpers on 2018 Alabama ACS person data.

    Builds the one-hot encoded column index of the full feature frame and of
    the frame restricted to the first two entries of ``all_folk_cols``, then
    asserts that

    * ``make_projection_permutation_matrix`` returns two blocks with the
      expected shapes whose vertical stack is a 0/1 matrix with exactly one
      1 in every row and every column, and
    * indexing the identity with ``inverse_perm`` of the concatenated outputs
      of ``make_projection_permutation`` gives a matrix whose product with
      that stack is the identity.

    Prints a confirmation message when every assertion holds.
    """
    source = folktables.ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
    raw = source.get_data(states=["AL"], download=True)
    features, label, group = ACSIncomeNew.df_to_pandas(raw)

    def one_hot(frame):
        # encode only the categorical columns that are present in `frame`
        categorical = list(set(frame.columns) & set(dummy_cols))
        return pd.get_dummies(frame, columns=categorical, drop_first=True)

    full_cols = one_hot(features).columns
    sub_cols = one_hot(features[all_folk_cols[:2]]).columns
    n_full, n_sub = len(full_cols), len(sub_cols)

    proj_plus, proj_minus = make_projection_permutation_matrix(full_cols, sub_cols)
    proj = np.vstack([proj_plus, proj_minus])
    assert proj_plus.shape == (n_sub, n_full)
    assert proj_minus.shape == (n_full - n_sub, n_full)
    # every column and every row sums to one, and all entries are 0 or 1
    assert np.all(np.sum(proj, axis=0) == 1)
    assert np.all(np.sum(proj, axis=1) == 1)
    assert np.all(np.logical_or(proj == 0, proj == 1))

    a, b = make_projection_permutation(full_cols, sub_cols)
    inverse = np.eye(n_full)[inverse_perm(a + b)]
    assert np.all(inverse @ proj == np.eye(n_full))

    print("make_projection_permutation_matrix passed!")

test_make_projection_permutation_matrix()
    

# Make Dataset objects

### Folktables

In [None]:
# One dataset per state, all taken from the 2018 survey.
num_datasets = 5
year_list = ["2018" for _ in range(num_datasets)]
states_list = [[state] for state in ("CA", "NY", "TX", "FL", "IL")]

# Dataset k observes all_folk_cols minus its first `offset` entries:
# the first dataset sees every column, the last one drops the first six.
dataset_folk_cols_list = [all_folk_cols] + [all_folk_cols[offset:] for offset in (1, 2, 4, 6)]

In [None]:
# Per-dataset containers, filled by the construction loop below.
# "og_*" lists hold the datasets built from the full feature frame; the
# un-prefixed lists hold the datasets restricted to each dataset's columns.
og_dataset_list = []
dataset_list = []
og_val_dataset_list = []
val_dataset_list = []
og_test_dataset_list = []
test_dataset_list = []

def get_dummy_columns(dummy_cols, folk_cols):
    """Return the entries of ``folk_cols`` that also appear in ``dummy_cols``.

    The result is a list without duplicates that follows the order of
    ``folk_cols``. A plain ``list(set(a) & set(b))`` iterates in an order that
    can differ from one Python process to the next (string hashing is
    randomised), which would make the one-hot column layout differ between
    kernel sessions.

    Args:
        dummy_cols: iterable of column names that should be one-hot encoded.
        folk_cols: iterable of column names actually present.

    Returns:
        list of column names, ordered as in ``folk_cols``.
    """
    wanted = set(dummy_cols)
    # dict.fromkeys keeps first-seen order while dropping repeats
    return list(dict.fromkeys(col for col in folk_cols if col in wanted))

def normalize_data(df, mean=None, std=None):
    """Standardize ``df`` as ``(df - mean) / (std + 1e-7)``.

    ``mean`` and ``std`` default to ``df.mean()`` and ``df.std()``; pass the
    training statistics explicitly to normalize validation/test data with
    them. The 1e-7 term guards against division by a zero standard deviation.

    Returns:
        (normalized, mean, std)
    """
    if mean is None:
        mean = df.mean()
    if std is None:
        std = df.std()
    return (df - mean) / (std + 1e-7), mean, std


def _make_dataset(name, features, labels):
    """Wrap one feature/label split in a FolktablesDataset."""
    return FolktablesDataset(name=name, features=features, labels=labels,
                             columns=features.columns, transforms=PandasToNumpyTransform())


def _standardize_splits(train_ds, val_ds, test_ds, include_labels):
    """Normalize the three splits in place using statistics of ``train_ds``.

    Features are always normalized; labels only when ``include_labels`` is
    true. Returns ``(feature_mean, feature_std, label_mean, label_std)`` with
    the label entries set to None when labels were left untouched.
    """
    train_ds.features, feat_mean, feat_std = normalize_data(train_ds.features)
    val_ds.features, _, _ = normalize_data(val_ds.features, feat_mean, feat_std)
    test_ds.features, _, _ = normalize_data(test_ds.features, feat_mean, feat_std)

    lab_mean = lab_std = None
    if include_labels:
        train_ds.labels, lab_mean, lab_std = normalize_data(train_ds.labels)
        val_ds.labels, _, _ = normalize_data(val_ds.labels, lab_mean, lab_std)
        test_ds.labels, _, _ = normalize_data(test_ds.labels, lab_mean, lab_std)
    return feat_mean, feat_std, lab_mean, lab_std


# Labels are log-transformed and standardized unless the submodel class is
# LogisticRegressionModel.
transform_labels = SubmodelClass != LogisticRegressionModel

for idx in range(num_datasets):
    year = year_list[idx]
    states = states_list[idx]
    folk_cols = dataset_folk_cols_list[idx]

    data_source = folktables.ACSDataSource(survey_year=year, horizon='1-Year', survey='person')
    acs_data = data_source.get_data(states=states, download=True)
    features, label, group = ACSIncomeNew.df_to_pandas(acs_data)
    label = label * 1.0

    # `subfeatures` keeps only this dataset's columns; `features` keeps all of
    # them. Both are one-hot encoded on their categorical columns.
    subfeatures = pd.get_dummies(features[folk_cols], columns=get_dummy_columns(dummy_cols, folk_cols), drop_first=True)
    features = pd.get_dummies(features, columns=get_dummy_columns(dummy_cols, all_folk_cols), drop_first=True)

    # Same test_size / random_state for both frames, so the row splits match
    # (verified through the label assertions below).
    features_train, features_test, label_train, label_test = train_test_split(features, label, test_size=0.2, random_state=42)
    features_train, features_val, label_train, label_val = train_test_split(features_train, label_train, test_size=0.2, random_state=42)

    subfeatures_train, subfeatures_test, sublabel_train, sublabel_test = train_test_split(subfeatures, label, test_size=0.2, random_state=42)
    subfeatures_train, subfeatures_val, sublabel_train, sublabel_val = train_test_split(subfeatures_train, sublabel_train, test_size=0.2, random_state=42)

    if transform_labels:
        label_train, label_val, label_test = (np.log(part) for part in (label_train, label_val, label_test))
        sublabel_train, sublabel_val, sublabel_test = (np.log(part) for part in (sublabel_train, sublabel_val, sublabel_test))

    assert np.allclose(label_train, sublabel_train), idx
    assert np.allclose(label_val, sublabel_val)
    assert np.allclose(label_test, sublabel_test)

    og_dataset = _make_dataset(year, features_train, label_train)
    dataset = _make_dataset(year, subfeatures_train, label_train)

    og_val_dataset = _make_dataset(year, features_val, label_val)
    val_dataset = _make_dataset(year, subfeatures_val, label_val)

    og_test_dataset = _make_dataset(year, features_test, label_test)
    test_dataset = _make_dataset(year, subfeatures_test, label_test)

    # center data using training data statistics
    (og_features_train_mean, og_features_train_std,
     label_train_mean, label_train_std) = _standardize_splits(
        og_dataset, og_val_dataset, og_test_dataset, transform_labels)
    (features_train_mean, features_train_std,
     label_train_mean, label_train_std) = _standardize_splits(
        dataset, val_dataset, test_dataset, transform_labels)

    og_dataset_list.append(og_dataset)
    dataset_list.append(dataset)
    og_val_dataset_list.append(og_val_dataset)
    val_dataset_list.append(val_dataset)
    og_test_dataset_list.append(og_test_dataset)
    test_dataset_list.append(test_dataset)

# Pool the full-feature training sets of all datasets into one dataset.
# NOTE(review): if FolktablesDataset implements an in-place __iadd__, this
# also grows og_dataset_list[0] because both names refer to the same object
# - confirm against the class definition.
all_og_dataset = og_dataset_list[0]
for og_dataset in og_dataset_list[1:]:
    all_og_dataset += og_dataset

In [None]:
# Number of examples in each full-feature training dataset, one per line.
print(*(len(og_ds) for og_ds in og_dataset_list), sep="\n")

In [None]:
# Number of (one-hot encoded) columns observed by each dataset, one per line.
print(*(len(ds.columns) for ds in dataset_list), sep="\n")

# Visualize Covariance Matrix

In [None]:
# Eigen-decomposition of the covariance matrix of the pooled training data,
# followed by its condition number (largest / smallest eigenvalue).
cov_matrix, _, _ = compute_covariance_matrix(all_og_dataset)
# eigh is the routine for symmetric matrices: it always returns real
# eigenvalues in ascending order, whereas np.linalg.eig may return complex
# values with round-off imaginary parts.
# NOTE(review): assumes cov_matrix is symmetric, as a covariance matrix
# should be - confirm against compute_covariance_matrix.
eigvals, eigvecs = np.linalg.eigh(cov_matrix)
print(np.max(eigvals) / np.min(eigvals))

In [None]:
# Heatmap of the absolute values of the covariance matrix entries.
ax = sns.heatmap(np.abs(cov_matrix))
ax.set_xlabel('Feature Index', fontsize=20)
ax.set_ylabel('Feature Index', fontsize=20)
ax.set_title('Covariance Matrix Heatmap', fontsize=20)
ax.figure.tight_layout()
# plt.savefig("gaussian-plots/folktables-covariance-matrix-heatmap.pdf")

# Train Base Models

In [None]:
# Dataset used for the "Naive-Local" baselines: the last one in dataset_list.
itds = dataset_list[-1]


def gaussian_imse_fn(x, y):
    """Placeholder metric: ignores both arguments and always returns -1.0."""
    return -1.0

In [None]:
# Experiment grid: samples drawn per dataset (n) x repeated trials.
n_list = [200, 300, 400, 500, 600, 700, 800, 900, 1000,
          2000, 3000, 4000, 5000, 6000, 7000, 8000, 10000]
trial_list = np.arange(80)
results_list = []
result_cols = ['n', 'dataset_idx', 'num_columns', 'mse', 'abse', 'bool_error', 'trial', 'gaussian_imse']
models_list = [SubmodelClass() for _ in range(num_datasets)]

NUM_VAL_SAMPLES = 10000   # num_samples for the per-submodel validation loss
NUM_TEST_SAMPLES = 6000   # num_samples for every test-set metric

# Every method is scored on the full-feature test split of the last dataset.
tds = og_test_dataset_list[-1]


def _score_on_test(model, mse_metric=None, imperfect=False, with_gaussian_imse=True):
    """Evaluate ``model`` on ``tds`` and return (mse, abse, bool_error, gaussian_imse).

    The metrics are computed with ``compute_error`` in the order
    gaussian_imse, mse, abse, bool_error (the order the notebook has always
    used, kept in case ``compute_error`` draws random samples).

    Args:
        model: fitted model passed to ``compute_error``.
        mse_metric: metric for the ``mse`` entry; defaults to ``hacky_cov_mse_fn``.
        imperfect: forwarded unchanged to every ``compute_error`` call.
        with_gaussian_imse: when False, no gaussian_imse is computed and the
            entry is the constant -1.
    """
    if mse_metric is None:
        mse_metric = hacky_cov_mse_fn
    if with_gaussian_imse:
        gaussian_imse = compute_error(tds, model=model, metric=gaussian_imse_fn, num_samples=NUM_TEST_SAMPLES, imperfect=imperfect)
    else:
        gaussian_imse = -1
    mse = compute_error(tds, model=model, metric=mse_metric, num_samples=NUM_TEST_SAMPLES, imperfect=imperfect)
    abse = compute_error(tds, model=model, metric=abse_fn, num_samples=NUM_TEST_SAMPLES, imperfect=imperfect)
    bool_error = compute_error(tds, model=model, metric=boolerr_fn, num_samples=NUM_TEST_SAMPLES, imperfect=imperfect)
    return mse, abse, bool_error, gaussian_imse


def _record(n, trial, method, num_columns, scores):
    """Append one row, laid out as in ``result_cols``, to ``results_list``."""
    mse, abse, bool_error, gaussian_imse = scores
    results_list.append((n, method, num_columns, mse, abse, bool_error, trial, gaussian_imse))


for n, trial in tqdm(itertools.product(n_list, trial_list), total=len(n_list) * len(trial_list)):
    # cov_matrix comes from the covariance cell above.
    imputed_baseline = ImputedModelClass(cov_matrix, og_dataset_list[-1].columns)
    rw_imputed_baseline = ImputedModelClass(cov_matrix, og_dataset_list[-1].columns)

    # --- per-dataset submodels; the same samples also feed both imputation baselines ---
    losses_list = []
    for i in range(num_datasets):
        ds = dataset_list[i]
        ds.shuffle(seed=trial)

        X, y, columns = ds.get_n_samples_numpy(n)
        imputed_baseline.impute(X, y, columns)
        models_list[i].fit(X, y, columns)

        # validation loss of submodel i on its own validation split; its
        # inverse is the weight given to this dataset in the reweighted baseline
        val_mse = compute_error(val_dataset_list[i], model=models_list[i], metric=mse_fn, num_samples=NUM_VAL_SAMPLES, imperfect=True)
        losses_list.append(val_mse)
        rw_imputed_baseline.impute(X, y, columns, reweight=1 / val_mse)

        _record(n, trial, str(i), str(len(columns)), _score_on_test(models_list[i]))

    # --- aggregated model built from the fitted submodels ---
    agg_model = AggregatorClass()
    losses_list = np.array(losses_list)
    agg_model.fit(all_columns=og_dataset_list[-1].columns, model_list=models_list, cov_matrix=cov_matrix, losses_list=losses_list)
    _record(n, trial, "agg", "NA", _score_on_test(agg_model))

    # n full-feature samples of the last dataset, used to fit the naive aggregator below
    myds = og_dataset_list[-1]
    myds.shuffle(seed=trial)
    myfeatures, mylabels, mycolumns = myds.get_n_samples_numpy(n)

    # --- naive aggregator: scored as constructed, then again after fitting ---
    # These two rows use mse_fn with imperfect=True and a constant -1 for gaussian_imse.
    naive_agg_model = NaiveAggregator(all_columns=og_dataset_list[-1].columns, model_list=models_list)
    _record(n, trial, "naive_agg", "NA",
            _score_on_test(naive_agg_model, mse_metric=mse_fn, imperfect=True, with_gaussian_imse=False))

    naive_agg_model.fit(myfeatures, mylabels, X_columns=mycolumns)
    _record(n, trial, "opt_naive_agg", "NA",
            _score_on_test(naive_agg_model, mse_metric=mse_fn, imperfect=True, with_gaussian_imse=False))

    # --- imputation baselines (plain and validation-loss reweighted) ---
    imputed_baseline.fit()
    _record(n, trial, "imputed_baseline", "NA", _score_on_test(imputed_baseline))

    rw_imputed_baseline.fit()
    _record(n, trial, "rw_imputed_baseline", "NA", _score_on_test(rw_imputed_baseline))

    # --- local baselines on itds (HACK to get the right number of columns) ---
    myds = itds
    myds.shuffle(seed=trial)
    myfeatures, mylabels, mycolumns = myds.get_n_samples_numpy(n)

    # "lb": a fresh submodel fit on n samples
    baseline_model = SubmodelClass()
    baseline_model.fit(myfeatures, mylabels, mycolumns)
    _record(n, trial, "lb_baseline", "NA", _score_on_test(baseline_model))

    # "ub": a fresh submodel fit on num_datasets * n samples of the same dataset
    myfeatures, mylabels, mycolumns = myds.get_n_samples_numpy(num_datasets * n)
    baseline_model = SubmodelClass()
    baseline_model.fit(myfeatures, mylabels, mycolumns)
    _record(n, trial, "ub_baseline", "NA", _score_on_test(baseline_model))


In [None]:
# Tabulate the collected rows; `og_results_df` is the untouched master copy
# and `results_df` the working frame that later cells filter and relabel.
og_results_df = pd.DataFrame(results_list, columns=result_cols)
results_df = og_results_df.copy()

In [None]:
# OR load previously saved results from file. When the file exists it
# replaces whatever `og_results_df` currently holds (as before).
RESULTS_CSV = "folktables-results-05-12-23-useILfortest-fixeditds.csv"
try:
    og_results_df = pd.read_csv(RESULTS_CSV)
except FileNotFoundError:
    # No saved results on disk: keep the results computed in this session
    # instead of aborting a Run-All; re-raise if there is nothing to fall back on.
    if "og_results_df" not in globals():
        raise
    print(f"{RESULTS_CSV} not found; using the results computed in this session.")

In [None]:
# Internal method identifiers -> display names used in the figures.
method_name_remap = {
    "agg": "Collab",
    "rw_imputed_baseline": "RW-Imputation",
    "imputed_baseline": "Imputation",
    "naive_agg": "Naive-Collab",
    "opt_naive_agg": "Optimized-Naive-Collab",
    "lb_baseline": "Naive-Local",
    "ub_baseline": f"Naive-Local ({num_datasets}x data)"
}

# Keep only the rows of the named methods (this drops the per-dataset
# submodel rows) and relabel them. The explicit .copy() makes the filtered
# frame independent, so the column assignment below does not write into a
# slice of another frame (SettingWithCopyWarning).
is_named_method = og_results_df["dataset_idx"].isin(list(method_name_remap))
results_df = og_results_df.loc[is_named_method].copy()
results_df['dataset_idx'] = results_df['dataset_idx'].map(method_name_remap)

unique_dataset_idx = results_df['dataset_idx'].unique()

# One fixed colour and marker per method, shared by all figures below.
# zip() stops at the shorter input, so surplus marker symbols are ignored.
marker_symbols = ['o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X']
palette = dict(zip(unique_dataset_idx, sns.color_palette(n_colors=len(unique_dataset_idx))))
markers = dict(zip(unique_dataset_idx, marker_symbols))


### Folktables

In [None]:
# Large-n figure: all methods, restricted to 2000 <= n <= 8000.
plot_methods_list = ["Collab", "RW-Imputation", "Imputation", "Naive-Collab", "Optimized-Naive-Collab", "Naive-Local", f"Naive-Local ({num_datasets}x data)"]
in_figure = (
    results_df['dataset_idx'].isin(plot_methods_list)
    & (results_df['n'] >= 2000)
    & (results_df['n'] <= 8000)
)
plotdf = results_df[in_figure]

ax = sns.lineplot(data=plotdf, x="n", y="mse", hue="dataset_idx", style="dataset_idx",
                  palette=palette, markers=markers, markersize=7)
# rebuild the legend from the plotted handles and labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels)
ax.set_ylabel("Mean Sq. Pred. Err.", fontsize=20)
ax.set_xlabel("number of samples (n)", fontsize=20)
ax.set_title("Illinois Full Feature Prediction Error (Large n)", fontsize=20)
ax.figure.tight_layout()

# fixed y-range for this figure
ax.set_ylim(0.85, 0.95)

# plt.savefig("gaussian-plots/folktables-full-prediction-states-large-n-v3.pdf")



In [None]:
# Small-n figure: every method except "Naive-Local", restricted to 400 <= n < 1000.
plot_methods_list = ["Collab", "RW-Imputation", "Imputation", "Naive-Collab", "Optimized-Naive-Collab",  f"Naive-Local ({num_datasets}x data)"]
in_figure = (
    results_df['dataset_idx'].isin(plot_methods_list)
    & (results_df['n'] < 1000)
    & (results_df['n'] >= 400)
)
plotdf = results_df[in_figure]

ax = sns.lineplot(data=plotdf, x="n", y="mse", hue="dataset_idx", style="dataset_idx",
                  palette=palette, markers=markers, markersize=7)
# rebuild the legend from the plotted handles and labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels)
ax.set_ylabel("Mean Sq. Pred. Err.", fontsize=20)
ax.set_xlabel("number of samples (n)", fontsize=20)
ax.set_title("Illinois Full Feature Prediction Error (Small n)", fontsize=20)
ax.figure.tight_layout()

# plt.savefig("gaussian-plots/folktables-full-prediction-states-small-n-v4.pdf")



In [None]:
# Small-n figure without the imputation baselines: the collaborative methods
# and the "Naive-Local (Nx data)" baseline, restricted to 400 <= n < 1000.
plot_methods_list = ["Collab","Naive-Collab", "Optimized-Naive-Collab",  f"Naive-Local ({num_datasets}x data)"]
in_figure = (
    results_df['dataset_idx'].isin(plot_methods_list)
    & (results_df['n'] < 1000)
    & (results_df['n'] >= 400)
)
plotdf = results_df[in_figure]

ax = sns.lineplot(data=plotdf, x="n", y="mse", hue="dataset_idx", style="dataset_idx",
                  palette=palette, markers=markers, markersize=7)
# rebuild the legend from the plotted handles and labels
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels)
ax.set_ylabel("Mean Sq. Pred. Err.", fontsize=20)
ax.set_xlabel("number of samples (n)", fontsize=20)
ax.set_title("Illinois Full Feature Prediction Error (Small n)", fontsize=20)
ax.figure.tight_layout()

# plt.savefig("gaussian-plots/folktables-full-prediction-states-small-n-v4.pdf")
# plt.savefig("gaussian-plots/folktables-full-prediction-states-small-n-v5.pdf")

