In [1]:
import numpy as np
# import seaborn as sns
# sns.set(style="darkgrid")
from scipy.stats import poisson
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

In [2]:
from pathlib import Path
import pickle
from pif_dsbmm_dpf.citation_real.process_real import label_propagation

seed = 42
datadir = Path('/scratch/fitzgeraldj/data/caus_inf_data/')
data_model_str = f"real_seed{seed}"
data_model_path = datadir / f"{data_model_str}.pkl"
with open(data_model_path, "rb") as f:
    data_model = pickle.load(f)

Y = data_model.Y
old_aus = [np.flatnonzero(Y[-2].sum(axis=1)),np.flatnonzero(Y[-1].sum(axis=1))]
Y_heldout = data_model.Y_heldout
full_A_end = data_model.full_A_end
test_aus = data_model.test_aus

dsbmm_datadir = datadir / "dsbmm_data"
deg_corr = True
directed = True




In [46]:
def predict(params, A, Y_p, model):
    # only do for heldout aus, so final timestep
    gamma = params["Gamma_hat"][:, -1, :]
    alpha = params["Alpha_hat"][:, -1, :]
    z = params["Z_hat"][:, -1, :]
    variant = model.split(".")[-1]
    print(f"Model {model.split('.')[0]}; variant {variant}")
    if "dsbmm_dpf" in model or "network_pref_only" in model:
        dsbmm_res_str = f"{data_model_str}_{'dc' if deg_corr else 'ndc'}_{'dir' if directed else 'undir'}_{'meta' if variant=='z-theta-joint' else 'nometa'}"
        with open(dsbmm_datadir / f"{dsbmm_res_str}_subs.pkl", "rb") as f:
            _, Z_trans, block_probs = pickle.load(f)
        full_node_probs = label_propagation(
            test_aus,
            old_aus,
            full_A_end,
            params["Z_hat"][:, -2:, :].copy(),
            Z_trans,
            block_probs[..., -1],
            deg_corr=deg_corr,
        )
        z = full_node_probs[:, -1, :]
        if "dsbmm_dpf" in model:
            # need to expand to match form of rho
            z = np.pad(z, ((0, 0), (0, alpha.shape[-1])))

    # may want to do similar for w
    w = params["W_hat"][:, -1, :]
    beta = params["Beta_hat"][:, -1]

    rate = (beta * A).dot(Y_p)

    if model == "network_pref_only":
        rate += z.dot(gamma.T)
    elif model == "topic_only":
        rate += alpha.dot(w.T)
    elif "dsbmm_dpf" in model:
        rate += z.dot(gamma.T) + alpha.dot(w.T)
    try:
        return rate.toarray() + 1e-10
    except AttributeError:
        return rate + 1e-10


def get_ll(predicted, truth, restrict_users=None):
    if restrict_users is not None:
        predicted = predicted[restrict_users, :]
        # truth = truth[restrict_users,:] already restricted now
    return poisson.logpmf(truth.toarray(), predicted).sum(axis=1).mean()


def get_classification_metrics(pred, truth, restrict_users=None):
    if restrict_users is not None:
        pred = pred[restrict_users, :]
        # truth = truth[restrict_users,:] again already restricted
    return roc_auc_score(truth.toarray().flatten(), pred.flatten())


def get_influence_rates(params, A, Y_p):
    # again only want final timestep
    beta = params["Beta_hat"][:, -1]
    rate = (beta * A).dot(Y_p)
    mean_inf_rate = rate.mean(axis=1)
    return mean_inf_rate


In [47]:
## Filter aus that publish at least 1 paper in the held-out period
aus_to_predict = (Y_heldout.sum(axis=1) > 0)
assert len(test_aus) == len(aus_to_predict)
print("Num aus that publish at least one paper in the held-out data:", aus_to_predict.sum())
aus_to_predict = test_aus

Num aus that publish at least one paper in the held-out data: 1235


### Load results; print average influence and heldout prediction results.

In [63]:
out = Path('/scratch/fitzgeraldj/data/caus_inf_data/real_results/')
b = 'Beta_hat'
clean_names = {
            'unadjusted.main':'Unadjusted',
            #   'spf':'mSPF',
              'network_pref_only.main':'Network-Only',
              'topic_only.main':'Topic-Only',
              'dsbmm_dpf.z-theta-joint':'Ours',
              }

methods = ['unadjusted.main', 
          #  'network_pref_only.main', 
           'topic_only.main',
           'dsbmm_dpf.z-theta-joint',
           ]
results = {m:np.load((out / (m + "_pres_subs_ewcnone_model_fitted_params")) / 'all_params.npz') for m in methods}

preds = [predict(results[m], full_A_end, Y[-1], m) for m in methods]
hol = {m:get_ll(pred, 
                Y_heldout, 
                restrict_users=aus_to_predict) for m,pred in zip(methods,preds)}
auc = {m:get_classification_metrics(pred, 
                                    Y_heldout, 
                                    restrict_users=aus_to_predict) for m,pred in zip(methods,preds)}

data = [[clean_names[m], results[m][b].mean(), hol[m], auc[m]] for m in methods]

df = pd.DataFrame(data, columns=['Method', 'Average Estimated Influence', 'HOL', 'AUC'])
df

Model unadjusted; variant main
Model topic_only; variant main
Model dsbmm_dpf; variant z-theta-joint


Label prop.: 100%|██████████| 1235/1235 [00:33<00:00, 37.36it/s] 


Unnamed: 0,Method,Average Estimated Influence,HOL,AUC
0,Unadjusted,0.417052,-16.90877,0.887669
1,Topic-Only,0.42017,-16.932607,0.887687
2,Ours,0.413281,-20.286492,0.926881


In [65]:
out = df.copy()
out.columns = ["","Av. $\\beta$", "HOL", "AUC"]
out.iloc[:,1:] = np.round(out.iloc[:,1:],2)
out = out.astype(str)
out["HOL"][out["HOL"] == out["HOL"].min(axis=0)] = '\textbf{' + out["HOL"][out["HOL"] == out["HOL"].min(axis=0)] + '}'
out["AUC"][out["AUC"] == out["AUC"].max(axis=0)] = '\textbf{' + out["AUC"][out["AUC"] == out["AUC"].max(axis=0)] + '}'
out.to_latex('./results/real_results.tex',escape=False)

out

  out.to_latex('./results/real_results.tex',escape=False)


Unnamed: 0,Unnamed: 1,Av. $\beta$,HOL,AUC
0,Unadjusted,0.42,\textbf{-16.91},0.89
1,Topic-Only,0.42,-16.93,0.89
2,Ours,0.41,-20.29,\textbf{0.93}
