Optionally, a PCA can be conducted here.

In [None]:
import pandas as pd

# Names for the five popularity quintiles, listed from the lowest bin to the
# highest; pd.qcut assigns them to its bins in this order.
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Load the pickled DataFrame to analyse. Pickle files can execute arbitrary
# code when loaded, so only read files from a trusted source.
df = pd.read_pickle("test/test_llm_generations.pkl")

# Cut `popularity` into five equal-frequency bins and store the bin name of
# every row in a new categorical column.
popularity_quintiles = pd.qcut(df['popularity'], q=5, labels=labels)
df['popularity_bin'] = popularity_quintiles

In [None]:
from sklearn.utils import resample
import pandas as pd

# Step 1: Determine the common sample size for each bin after class balancing.
# `popularity_bin` is a categorical column (it is produced by pd.qcut), so
# `observed=True` is passed explicitly: only bins that actually contain rows are
# grouped, and pandas does not warn about the changing `observed` default.
bins_grouped = df.groupby('popularity_bin', observed=True)
min_bin_size = min(
    (
        int(min((group['label_mini_fact'] == 1).sum(), (group['label_mini_fact'] == 0).sum()))
        for _, group in bins_grouped
    ),
    default=0,
)

# A size of zero means there is nothing to sample; fail with a clear message
# instead of silently producing an empty balanced set.
if min_bin_size == 0:
    raise ValueError(
        "Cannot balance: the data is empty or at least one popularity bin "
        "lacks samples of one label_mini_fact class (0 or 1)."
    )

# Step 2: Balance classes within each bin and resample to the common sample size
balanced_dfs = []
for bin_label, group in bins_grouped:
    # Separate the classes within the bin
    positive_class = group[group['label_mini_fact'] == 1]
    negative_class = group[group['label_mini_fact'] == 0]

    # Balance the classes by undersampling each one (without replacement) to
    # the smallest class size found in any bin
    positive_class_balanced = resample(positive_class, replace=False, n_samples=min_bin_size, random_state=42)
    negative_class_balanced = resample(negative_class, replace=False, n_samples=min_bin_size, random_state=42)
    print(f"Balancing bin '{bin_label}' with {min_bin_size} samples in each class")

    # Combine the balanced classes within the bin
    balanced_group = pd.concat([positive_class_balanced, negative_class_balanced])

    # The combined bin already has exactly 2 * min_bin_size rows, so drawing
    # that many without replacement only shuffles the rows.
    balanced_group = resample(balanced_group, replace=False, n_samples=2 * min_bin_size, random_state=42)
    balanced_dfs.append(balanced_group)

# Combine all balanced bins into a single DataFrame
balanced_df = pd.concat(balanced_dfs).reset_index(drop=True)

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.utils import resample

popularity_bins = labels

# One panel per popularity bin. `squeeze=False` always returns a 2-D array of
# axes, so taking row 0 gives a 1-D sequence even if there is a single bin.
fig, axes = plt.subplots(1, len(popularity_bins), figsize=(16, 5), squeeze=False)
axes = axes[0]

for i, bin_label in enumerate(popularity_bins):
    ax = axes[i]
    filtered_df = balanced_df[balanced_df['popularity_bin'] == bin_label].copy()

    # presumably every entry of 'embeddings-16_mini_fact' is a fixed-length
    # embedding vector, so the list forms a 2-D input for PCA — TODO confirm
    X = list(filtered_df['embeddings-16_mini_fact'])
    y = filtered_df['label_mini_fact'].values

    # A separate 2-component PCA is fitted for each bin. `random_state` is
    # fixed so the projection is reproducible when scikit-learn selects a
    # randomized SVD solver.
    pca = PCA(n_components=2, random_state=42)
    X_pca = pca.fit_transform(X)

    # Split the projected points by label so each class gets its own colour.
    X_pca_positive = X_pca[y == 1]
    X_pca_negative = X_pca[y == 0]
    ax.scatter(X_pca_positive[:, 0], X_pca_positive[:, 1], c='yellow', alpha=0.7, label='Positive (Label 1)')
    ax.scatter(X_pca_negative[:, 0], X_pca_negative[:, 1], c='purple', alpha=0.7, label='Negative (Label 0)')

    ax.set_title(f"{bin_label} Popularity")
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    ax.set_aspect('equal')
    ax.legend()


plt.tight_layout()
plt.show()

Some Tests (can be skipped)

In [None]:
import pandas as pd

# Load the pickled training DataFrame and report its number of rows.
# Pickle files can execute arbitrary code when loaded; only read trusted files.
df_train = pd.read_pickle("train/train_injection_low_popularity_with_embeddings.pkl")
print(df_train.shape[0])

In [None]:
import pandas as pd

def _unwrap_first(value):
    """Return the first element when `value` is a list; otherwise return `value` unchanged."""
    if isinstance(value, list):
        return value[0]
    return value


# Training data: report the row count, then reduce list-valued
# `closest_article` entries to their first element.
df_train = pd.read_pickle("train/train_with_popularity_unbalanced.pkl")
print(df_train.shape[0])

df_train["closest_article"] = df_train["closest_article"].apply(_unwrap_first)

# Test data: stack the two test pickles into one frame with a fresh index and
# apply the same `closest_article` normalisation.
df_test1 = pd.read_pickle("test/test_llm_generations.pkl")
df_test2 = pd.read_pickle("test/test_all_popularity_real_samples.pkl")
df_test = pd.concat((df_test1, df_test2), ignore_index=True)
df_test["closest_article"] = df_test["closest_article"].apply(_unwrap_first)

In [None]:
from itertools import chain

# Leakage check: print every `closest_article` value that occurs in both the
# training and the test data. A shared value is printed once per occurrence,
# first while walking the training list and then while walking the test list.

#train_mini_facts_docs = list(set(chain.from_iterable(df_train['closest_article'])))
train_mini_facts_docs = df_train['closest_article'].tolist()

#test_mini_facts_docs = list(set(chain.from_iterable(df_test['docs'])))
test_mini_facts_docs = df_test['closest_article'].tolist()

# Testing membership against a list inside a loop is quadratic. Use sets for
# constant-time lookups when every value on both sides is hashable; otherwise
# fall back to the lists so unhashable values are still compared by equality.
try:
    train_lookup = set(train_mini_facts_docs)
    test_lookup = set(test_mini_facts_docs)
except TypeError:
    train_lookup = train_mini_facts_docs
    test_lookup = test_mini_facts_docs

for train_mini_facts_doc_t in train_mini_facts_docs:
    if train_mini_facts_doc_t in test_lookup:
        print(train_mini_facts_doc_t)


for test_mini_facts_doc_t in test_mini_facts_docs:
    if test_mini_facts_doc_t in train_lookup:
        print(test_mini_facts_doc_t)

Results: load the prediction files from `predictions/` and choose which one to analyze (via `df` and `save_path`) to see its results.

In [44]:
import pandas as pd

# Suffixes used in the names of the figure files, one per prediction set.
save_path1 = "with_train_popularity_unbalanced"
save_path2 = "with_train_popularity_balanced"
save_path3 = "with_real_samples_with_train_popularity_balanced"
save_path4 = "with_real_samples_with_train_popularity_balanced_train_injections"

# The prediction frames, numbered to match the suffixes above.
# Pickle files can execute arbitrary code when loaded; only read trusted files.
df_predictions1 = pd.read_pickle("predictions/prediction_with_train_popularity_unbalanced.pkl")
df_predictions2 = pd.read_pickle("predictions/prediction_with_train_popularity_balanced.pkl")
df_predictions3 = pd.read_pickle("predictions/prediction_with_real_samples_with_train_popularity_balanced.pkl")
df_predictions4 = pd.read_pickle("predictions/prediction_with_real_samples_with_train_popularity_balanced_train_injections.pkl")

# Choose the prediction set that the following cells analyse. Work on a copy so
# the loaded frame stays unmodified, and keep the matching suffix next to it.
df, save_path = df_predictions4.copy(), save_path4

Bins are created

In [None]:
import pandas as pd

# Names for the five popularity quintiles, from the lowest bin to the highest.
labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Store the named quintile of every row.
popularity = df['popularity']
df['popularity_bin'] = pd.qcut(popularity, q=5, labels=labels)

# Running the same cut without labels yields the numeric interval of each bin,
# in the same order as `labels`.
bin_ranges = pd.qcut(popularity, q=5).cat.categories
bin_label_mapping = dict(zip(labels, bin_ranges))

print("Bin Ranges and Labels:")
for label, bin_range in zip(labels, bin_ranges):
    print(f"{label}: {bin_range}")

# sort=False keeps the counts in bin order instead of sorting them by size.
print("\nNumber of Samples in Each Bin:")
sample_counts = df['popularity_bin'].value_counts(sort=False)
for label, count in zip(sample_counts.index, sample_counts.values):
    print(f"{label}: {count}")


Balancing

In [None]:
from sklearn.utils import resample
import pandas as pd

# Common per-class sample size: the smallest class count found in any bin.
# `popularity_bin` is a categorical column (it is produced by pd.qcut), so
# `observed=True` is passed explicitly: only bins that actually contain rows are
# grouped, and pandas does not warn about the changing `observed` default.
bins_grouped = df.groupby('popularity_bin', observed=True)
min_bin_size = min(
    (
        int(min((group['label_mini_fact'] == 1).sum(), (group['label_mini_fact'] == 0).sum()))
        for _, group in bins_grouped
    ),
    default=0,
)

# A size of zero means there is nothing to sample; fail with a clear message
# instead of silently producing an empty balanced set.
if min_bin_size == 0:
    raise ValueError(
        "Cannot balance: the data is empty or at least one popularity bin "
        "lacks samples of one label_mini_fact class (0 or 1)."
    )

balanced_dfs = []
for bin_label, group in bins_grouped:
    # Split the bin by label.
    positive_class = group[group['label_mini_fact'] == 1]
    negative_class = group[group['label_mini_fact'] == 0]

    # Undersample both classes (without replacement) to the common size, so
    # every bin ends up with the same number of rows and a 50/50 class split.
    positive_class_balanced = resample(positive_class, replace=False, n_samples=min_bin_size, random_state=42)
    negative_class_balanced = resample(negative_class, replace=False, n_samples=min_bin_size, random_state=42)
    print(f"Balancing bin '{bin_label}' with {min_bin_size} samples in each class")

    balanced_group = pd.concat([positive_class_balanced, negative_class_balanced])

    # The combined bin already has exactly 2 * min_bin_size rows, so drawing
    # that many without replacement only shuffles the rows.
    balanced_group = resample(balanced_group, replace=False, n_samples=2 * min_bin_size, random_state=42)
    balanced_dfs.append(balanced_group)

# All balanced bins in one frame with a fresh 0..n-1 index.
balanced_df = pd.concat(balanced_dfs).reset_index(drop=True)


F1 and AUROC per popularity bin

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score



def _scores_to_frame(scores, value_column):
    """Turn a {bin label: score} dict into a DataFrame with a bin column and a score column."""
    return pd.DataFrame(list(scores.items()), columns=['popularity_bin', value_column])


print("Size of each bin:")
bin_sizes = balanced_df['popularity_bin'].value_counts().sort_index()
print(bin_sizes)

# Per-bin metrics, keyed by bin label.
f1_scores_positive = {}
f1_scores_negative = {}
auroc_scores = {}

for bin_label, group in balanced_df.groupby('popularity_bin'):
    y_true = group['label_mini_fact']
    y_score = group['pred_prob']
    # Hard predictions: probability of at least 0.5 counts as class 1.
    y_pred = (y_score >= 0.5).astype(int)

    if y_true.nunique() <= 1:
        # With fewer than two classes in the bin the metrics are not
        # computed; NaN is recorded instead.
        positive_f1 = negative_f1 = bin_auroc = np.nan
    else:
        positive_f1 = f1_score(y_true, y_pred)
        # Flipping labels and predictions makes class 0 the "positive" class,
        # which yields the F1 score of the negative class.
        negative_f1 = f1_score(1 - y_true, 1 - y_pred)
        bin_auroc = roc_auc_score(y_true, y_score)

    f1_scores_positive[bin_label] = positive_f1
    f1_scores_negative[bin_label] = negative_f1
    auroc_scores[bin_label] = bin_auroc

f1_df_positive = _scores_to_frame(f1_scores_positive, 'f1_score_positive')
f1_df_negative = _scores_to_frame(f1_scores_negative, 'f1_score_negative')
auroc_df = _scores_to_frame(auroc_scores, 'auroc')

fig, axes = plt.subplots(1, 3, figsize=(21*0.7, 6*0.7))

# One bar chart per metric: (data, value column, bar colour, title, y label).
panel_specs = [
    (f1_df_positive, 'f1_score_positive', 'dodgerblue', 'F1 Score (Positive Class)', 'F1 Score'),
    (f1_df_negative, 'f1_score_negative', 'tomato', 'F1 Score (Negative Class)', 'F1 Score'),
    (auroc_df, 'auroc', 'grey', 'AUROC for Each Popularity Bin', 'AUROC'),
]

for panel_index, (frame, value_column, bar_color, panel_title, y_label) in enumerate(panel_specs):
    ax = axes[panel_index]
    ax.bar(frame['popularity_bin'], frame[value_column], color=bar_color)
    ax.set_title(panel_title)
    ax.set_xlabel('Popularity Bin')
    # All panels share the same y range; note that the axis starts at 0.3, not 0.
    ax.set_ylim(0.3, 1)
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()

# AUROC over all balanced samples, irrespective of bin.
overall_auroc = roc_auc_score(balanced_df['label_mini_fact'], balanced_df['pred_prob'])
print(f"Overall AUROC: {overall_auroc:.4f}")
fig.savefig(f"f1_{save_path}.png", dpi=300, bbox_inches='tight')



Prob Distribution

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Raise the resolution of figures; this setting is global and also affects
# figures created after this cell.
plt.rcParams['figure.dpi'] = 300

# One histogram panel per popularity bin, all sharing the same y axis.
fig, axes = plt.subplots(1, 5, figsize=(20, 5), sharey=True)

# (label value, histogram colour, legend text), drawn in this order.
class_styles = [
    (1, 'blue', 'Positive Samples (1)'),
    (0, 'red', 'Negative Samples (0)'),
]

for i, bin_label in enumerate(labels):
    ax = axes[i]
    is_first_panel = i == 0
    bin_samples = balanced_df[balanced_df['popularity_bin'] == bin_label]

    # Overlay the predicted-probability distributions of both classes.
    for class_value, class_color, class_legend in class_styles:
        class_samples = bin_samples[bin_samples['label_mini_fact'] == class_value]
        sns.histplot(class_samples['pred_prob'], bins=20, kde=True, color=class_color, label=class_legend, ax=ax)

    ax.set_title(f"{bin_label}\nPopularity", fontsize=14)
    ax.set_xlabel('Prediction Probability', fontsize=16)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    # The y label and the legend are only set on the leftmost panel.
    if is_first_panel:
        ax.set_ylabel('Frequency', fontsize=16)
        ax.legend(fontsize=14)

plt.tight_layout()
plt.show()

fig.savefig(f"{save_path}.png", dpi=300, bbox_inches='tight')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# One histogram panel per popularity bin, all sharing the same y axis.
fig, axes = plt.subplots(1, 5, figsize=(20, 4), sharey=True)

for i, bin_label in enumerate(labels):
    ax = axes[i]
    bin_samples = balanced_df[balanced_df['popularity_bin'] == bin_label]

    positive_samples = bin_samples[bin_samples['label_mini_fact'] == 1]
    negative_samples = bin_samples[bin_samples['label_mini_fact'] == 0]

    # Overlay the predicted-probability distributions of both classes.
    sns.histplot(positive_samples['pred_prob'], bins=20, kde=True, color='blue', label='Positive Samples (1)', ax=ax)
    sns.histplot(negative_samples['pred_prob'], bins=20, kde=True, color='red', label='Negative Samples (0)', ax=ax)

    ax.set_xlabel('Prediction Probability')

    if i == 0:
        # Only the leftmost panel gets the long title, the y label and the legend.
        ax.set_title(f"Pred Prob Distribution - {bin_label} Popularity")
        ax.set_ylabel('Frequency')
        ax.legend()
    else:
        ax.set_title(f"{bin_label} Popularity")

plt.tight_layout()
plt.show()

# Name the file after the selected prediction set (`save_path`), as the other
# figure cells do. A fixed name would be overwritten by every prediction set
# and would describe only one configuration.
fig.savefig(f"probs_{save_path}.png", dpi=300, bbox_inches='tight')