In [37]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportion_confint
# Load the `income` column from each file. `usecols` makes pandas parse only
# that column instead of the whole table, since nothing else is used here.
# golden_label: `income` values from the labeled file.
# pseudo_label: `income` values from the unlabeled-prediction file
#   (presumably model-predicted labels, judging by the file name -- confirm).
golden_label = pd.read_csv("data/total_labeled.csv", usecols=['income'])['income']
pseudo_label = pd.read_csv("data/total_unlabeled_pred.csv", usecols=['income'])['income']
# Stack both label sets into a single series. ignore_index=True gives the
# result a fresh 0..n-1 index; without it the row labels of the two inputs
# would be duplicated, making any label-based lookup ambiguous.
concated_label = pd.concat([golden_label, pseudo_label], axis=0, ignore_index=True)

In [38]:
# Sum of the labels (the number of successes when labels are 0/1) and the
# sample size for each label set.
golden_success, pseudo_success = np.sum(golden_label), np.sum(pseudo_label)
n_golden, n_pseudo = len(golden_label), len(pseudo_label)

# Report the four summary numbers, one per line.
for caption, value in (
    ("golden success: ", golden_success),
    ("pseudo success: ", pseudo_success),
    ("golden size: ", n_golden),
    ("pseudo size: ", n_pseudo),
):
    print(caption, value)

golden success:  2412
pseudo success:  7568
golden size:  10000
pseudo size:  38842


In [39]:
# Wilson score confidence intervals for the success proportion of each label
# set (golden, pseudo, and the two concatenated). alpha=0.1 is the
# significance level handed to proportion_confint, i.e. a 90% interval.
wilson_opts = {"alpha": 0.1, "method": 'wilson'}

golden_ci_low, golden_ci_high = proportion_confint(
    count=golden_success, nobs=n_golden, **wilson_opts
)
pseudo_ci_low, pseudo_ci_high = proportion_confint(
    count=pseudo_success, nobs=n_pseudo, **wilson_opts
)
concated_ci_low, concated_ci_high = proportion_confint(
    count=np.sum(concated_label), nobs=len(concated_label), **wilson_opts
)

# Interval widths, kept in locals so the report lines below stay short.
golden_width = golden_ci_high - golden_ci_low
pseudo_width = pseudo_ci_high - pseudo_ci_low
concated_width = concated_ci_high - concated_ci_low

# Bounds and widths are printed with three decimals.
print("golden ci: ", format(golden_ci_low, ".3f"), format(golden_ci_high, ".3f"))
print("pseudo ci: ", format(pseudo_ci_low, ".3f"), format(pseudo_ci_high, ".3f"))
print("golden ci width: ", format(golden_width, ".3f"))
print("pseudo ci width: ", format(pseudo_width, ".3f"))
print("concated ci: ", format(concated_ci_low, ".3f"), format(concated_ci_high, ".3f"))
print("concated ci width: ", format(concated_width, ".3f"))

golden ci:  0.234 0.248
pseudo ci:  0.192 0.198
golden ci width:  0.014
pseudo ci width:  0.007
concated ci:  0.201 0.207
concated ci width:  0.006


In [15]:
import joblib
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model = joblib.load("xgb_model.pkl")

# Re-read the full labeled table (features + `income` target).
total_labeled = pd.read_csv("data/total_labeled.csv")

# Feature matrix: every column except the target. `drop` already returns a new
# frame, so no explicit copy is needed to leave `total_labeled` untouched.
X_total = total_labeled.drop(columns=['income'])

# Class predictions, plus column 1 of predict_proba (presumably the
# positive-class probability -- confirm against the model's class order).
income_pred = model.predict(X_total)
income_pred_proba = model.predict_proba(X_total)[:, 1]
# Alias for the original, misspelled variable name in case other cells use it.
imcome_pred_proba = income_pred_proba

# Attach both prediction columns to the labeled table.
total_labeled['income_pred'] = income_pred
total_labeled['income_pred_proba'] = income_pred_proba


In [16]:
# Preview the first five rows of the labeled table with its prediction columns.
total_labeled.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,income_pred,income_pred_proba
0,18,4,423024,11,9,4,8,1,4,1,0,0,20,39,0,0,0.002953
1,17,4,178953,2,8,4,12,3,4,0,0,0,20,39,0,0,0.003176
2,25,2,348986,11,9,4,6,2,2,1,0,0,40,39,0,0,0.007416
3,20,4,218215,15,10,4,12,3,4,0,0,0,30,39,0,0,0.002002
4,47,4,244025,11,9,4,7,4,0,1,0,0,56,33,0,0,0.033039


In [17]:
# Persist the labeled table together with its prediction columns; the row
# index is not written to the file.
total_labeled.to_csv(path_or_buf="data/total_labeled_pred.csv", index=False)

In [35]:
from ppi_py import ppi_mean_ci
# Prediction-powered confidence interval for the mean of `income`, combining
# the labeled rows (true label + model prediction) with the predictions on the
# unlabeled rows.
#
# All three inputs are passed as (n, 1) column vectors. The original call mixed
# 2-D labels with 1-D predictions; keeping the shapes identical avoids any
# reliance on how the library reconciles them.
# NOTE(review): PPI validity assumes the model was not fit on the labeled rows
# used here -- confirm how xgb_model.pkl was trained.
Y_labeled = golden_label.to_numpy().reshape(-1, 1)
Yhat_labeled = np.asarray(income_pred).reshape(-1, 1)
Yhat_unlabeled = pseudo_label.to_numpy().reshape(-1, 1)

# Labels and predictions must describe the same rows, one prediction per label.
assert Y_labeled.shape == Yhat_labeled.shape, (
    "golden_label and income_pred must have the same number of rows"
)

ppi_ci = ppi_mean_ci(
    Y_labeled,
    Yhat_labeled,
    Yhat_unlabeled,
    alpha=0.1
)

# ppi_ci is indexed as (lower, upper); element 0 of each is the single
# coordinate being estimated.
print("ppi ci: ", f"{ppi_ci[0][0]:.3f}", f"{ppi_ci[1][0]:.3f}")
print("ppi ci width: ", f"{ppi_ci[1][0]-ppi_ci[0][0]:.3f}")

ppi ci:  0.235 0.246
ppi ci width:  0.011
