In [1]:
import os
import re
import math
import pickle
import numpy as np
import pandas as pd

from tqdm import tqdm
from scipy.special import softmax
# from utils import denoise_text, preprocess_text

from sklearn.metrics import auc, roc_curve, RocCurveDisplay
from sklearn.utils.class_weight import compute_class_weight 
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

from matplotlib import pyplot as plt
from sklearn.metrics import (
    accuracy_score, roc_auc_score, 
    f1_score, accuracy_score, precision_score, recall_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay)

from IPython.display import display, Markdown, Latex

pd.set_option('display.max_colwidth', None)

import warnings
warnings.filterwarnings('ignore')

In [2]:
FILE = "datasets/cards_waterloo.csv"

In [3]:
data = pd.read_csv(FILE, low_memory=False)

In [4]:
# Load label encoder
with open('cards/models/label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)

In [5]:
# list_samples = [0, 50, 100, 200, 300, 400, "400V2", 500, 700, 1000]
seed = "9834838408490912248"
list_samples = ["0V1", "400V1"]
# "50_V2", "50_V3"
list_samples = [
#     "0_V1", "50_V1", "50_V2", "50_V2.1", "100_V2", "200_V2"
    "0_V1", 
    "0_V1_SimCSE_RANDOM_hard_negatives", "0_V1_SimCSE_SAME_BRANCH_hard_negatives",
    "50_SimCSE_SAME_BRANCH_5_hard_negatives",
    "50_V1", "50_V2", "50_V4", 
#     "50_V4_base", 
    "50_GPT-4_V3",
#     "50_V4_SimCSE_base", "50_V4_SimCSE_unsupervised", "50_V4_subsampled", "50_V4_SimCSE_RANDOM_hard_negatives",
#     "50_V4_new_pipe", 
    "100_V2", "500_V2", "100_V3", "500_V3", "GPT-4_V2"
]
partition = ["TRAIN", "VALID", "TEST"]
f1_scores = {}
reports = {}
cards_data = data[(data.DATASET=="cards")].copy(deep=True)
for n in list_samples:
    print(n)

    augmented_cards = pd.read_csv(f"datasets/augmented/{seed}/cards_augmented_{n}.csv")

    for p in partition:
        augmented_cards_partition = augmented_cards[augmented_cards.PARTITION==p]

        y_true = augmented_cards_partition.claim.values
        y_pred = augmented_cards_partition[f"cards_aug_pred"].values
            
        if p not in reports: reports[p]=[]
        report = classification_report_df(y_true, y_pred)
        reports[p].append(report)

        if p not in f1_scores: f1_scores[p]=[]
        f1_scores[p].append(f1_score(y_true, y_pred, average='macro'))

0_V1


NameError: name 'classification_report_df' is not defined

In [None]:
resumen = pd.DataFrame(f1_scores, index=list_samples).T
resumen = resumen[["0_V1", "50_V1", "50_V2", "50_V4", "100_V2", "500_V2"]]

cols = {}
for c in resumen.columns:
    if "V1" in c:
        cols[c] = c.replace("V1", "CHATGPT_V1")
    elif "V2" in c:
        cols[c] = c.replace("V2", "CHATGPT_V2")
    elif "V3" in c:
        cols[c] = c.replace("V3", "GPT-4-TEST")
    elif "V4" in c:
        cols[c] = c.replace("V4", "GPT-4")

resumen = resumen.rename(columns=cols)
resumen

In [None]:
resumen = pd.DataFrame(f1_scores, index=list_samples).T
resumen = resumen[[
    "0_V1", "0_V1_SimCSE_RANDOM_hard_negatives", "0_V1_SimCSE_SAME_BRANCH_hard_negatives", 
    "50_SimCSE_SAME_BRANCH_5_hard_negatives"]]
resumen = resumen.rename(columns={
    "0_V1": "CARDS",
    "0_V1_SimCSE_RANDOM_hard_negatives": "RANDOM (1)",
    "0_V1_SimCSE_SAME_BRANCH_hard_negatives": "SAME BRANCH (1)",
    "50_SimCSE_SAME_BRANCH_5_hard_negatives": "SAME BRANCH (2)"
})
resumen

In [None]:
resumen = pd.DataFrame(f1_scores, index=list_samples).T
resumen = resumen[[
    "0_V1", "50_V4", "50_GPT-4_V3"]]
resumen = resumen.rename(columns={
    "0_V1": "CARDS",
    "50_V4": "GPT-4 (1)",
    "50_GPT-4_V3": "GPT-4 (2)",
})
resumen

In [None]:
def styling(data):
    red = 'background-color: #f8d2d2'
    green = 'background-color: #d2f8d2'
    if data.name=="diff":
        return [red if float(v)<0 else green for v in data]
    return [None for v in data]

submpling_exp = resumen[["0_V1", "50_V4", "50_GPT-4_V3"]]
submpling_exp
# submpling_exp["diff"] = submpling_exp.iloc[:, 1] - submpling_exp.iloc[:, 0]
# submpling_exp.style.apply(styling, axis=0)

In [None]:
submpling_exp = resumen[["0_CHATGPT_V1", "50_GPT-4", ""]]
submpling_exp["diff"] = submpling_exp.iloc[:, 1] - submpling_exp.iloc[:, 0]
submpling_exp.style.apply(styling, axis=0)

In [None]:
submpling_exp = resumen[["50_GPT-4_base", "50_GPT-4_SimCSE_base"]]
submpling_exp["diff"] = submpling_exp.iloc[:, 1] - submpling_exp.iloc[:, 0]
submpling_exp.style.apply(styling, axis=0)

In [None]:
submpling_exp = resumen[["50_GPT-4", "50_GPT-4_SimCSE_unsupervised"]]
submpling_exp["diff"] = submpling_exp.iloc[:, 1] - submpling_exp.iloc[:, 0]
submpling_exp.style.apply(styling, axis=0)

In [None]:
resumen.columns

In [None]:
submpling_exp = resumen[[
    "0_V1", 
    "0_V1_SimCSE_RANDOM_hard_negatives",
    "0_V1_SimCSE_SAME_BRANCH_hard_negatives"
]]
# submpling_exp["diff"] = submpling_exp.iloc[:, 1] - submpling_exp.iloc[:, 0]
# submpling_exp.style.apply(styling, axis=0)
submpling_exp

In [None]:
submpling_exp = resumen[["50_GPT-4", "50_GPT-4_SimCSE_RANDOM_hard_negatives"]]
submpling_exp = submpling_exp.rename(columns={
    "50_GPT-4": "0_V1", 
    "50_GPT-4_SimCSE_RANDOM_hard_negatives": "0_V1_SimCSE_SAME_BRANCH_hard_negatives"})
submpling_exp["diff"] = submpling_exp.iloc[:, 1] - submpling_exp.iloc[:, 0]
submpling_exp.style.apply(styling, axis=0)

In [None]:
submpling_exp = resumen[["50_GPT-4", "50_GPT-4_new_pipe"]]
submpling_exp["diff"] = submpling_exp.iloc[:, 1] - submpling_exp.iloc[:, 0]
submpling_exp.style.apply(styling, axis=0)

In [None]:
fig = plt.figure(figsize=(15, 5), dpi=80)
for p in partition[1:]:
    plt.plot(list_samples, f1_scores[p], label=p)
plt.title("F1-score vs Data Augmented")
plt.ylabel('F1-Score')
plt.xlabel('N. Samples Generated')
plt.legend()

In [None]:
classes = le.classes_
coms = pd.DataFrame()
coms["support"] = data[
    (data.PARTITION=="TRAIN")&(data.DATASET=="cards")].claim.value_counts().sort_index().values
for i, n in enumerate(list_samples[1:]):
    aug = reports["VALID"][i+1][["f1-score"]].astype(float).values
    base = reports["VALID"][0][["f1-score"]].astype(float).values
    diff = (aug - base).flatten()
    
    coms[n] = diff[:18] 
#     print(n)
#     print(n, classes[diff[:18]>0.])
#     print(sum(diff[:18]))
#     print(diff)
#     print(sum(diff[diff>0.][:18]))
#     print()
coms["generated"] = report_generated["f1-score"][:18].apply(float)
coms.index = classes
coms.loc["sum", :] = coms.sum()
coms

red = 'background-color: #f8d2d2'
green = 'background-color: #d2f8d2'
coms.style.apply(
    lambda data, color: [red if float(v)<0 else green for v in data], 
                 color='darkorange', axis=0)

In [None]:
augmented_cards[(augmented_cards.claim == "4_5")&(augmented_cards.DATASET == "generated-gpt-4")].text

In [None]:
dataset = pd.read_csv("datasets/augmented/9834838408490912248/cards_augmented_50_V3_9834838408490912248.csv")
generated = pd.read_csv("datasets/generated_disinformation_taxonomy_CARDS_GPT-4_specific_samples_V2.csv")
# for class_ in le.classes_[1:]:
dataset["based_claims"] = None
dataset.loc[(dataset.claim=="4_5")&(dataset.DATASET=="generated-gpt-4"), "based_claims"] = generated.loc[(generated.generated_label=="4_5"), "based_claims"]

In [None]:
dataset[(dataset.claim=="4_5")&(dataset.DATASET=="generated-gpt-4")][["text", "claim", "cards_aug_pred"]]

In [None]:
bad_index = augmented_cards[(augmented_cards.claim == "4_5")&(augmented_cards.cards_aug_pred != "4_5")&(augmented_cards.PARTITION == "TEST")][["text", "cards_aug_pred"]].index

generated_4_5 = generated.loc[(generated.generated_label=="4_5")]
generated_4_5[generated_4_5.based_claims.apply(lambda x: any([True if i in bad_index else False for i in eval(x)]))]

In [None]:
classes = le.classes_
coms = pd.DataFrame()
coms["support"] = data[
    (data.PARTITION=="TRAIN")&(data.DATASET=="cards")].claim.value_counts().sort_index().values
for i, n in enumerate(list_samples[1:]):
    aug = reports["TEST"][i+1][["f1-score"]].astype(float).values
    base = reports["TEST"][0][["f1-score"]].astype(float).values
    diff = (aug - base).flatten()
    
    coms[n] = diff[:18]
#     print(n)
    print(n, classes[diff[:18]>0.])
#     print(sum(diff[:18]))
#     print(diff)
#     print(sum(diff[diff>0.][:18]))
#     print()
coms["generated"] = report_generated["f1-score"][:18].apply(float)
coms.index = classes
coms.loc["average", :] = coms.sum()
coms

In [None]:
for n in list_samples[1:]:
    augmented_cards = pd.read_csv(f"datasets/cards_augmented_{n}.csv")
    augmented_cards_test = augmented_cards[augmented_cards.PARTITION=="TEST"]

    y_true = augmented_cards_test.claim.values
    y_pred = augmented_cards_test[f"cards_aug_{n}_pred"].values

    report = classification_report_df(y_true, y_pred)
    display(Markdown(f"### n={n}"))
    display(report)

In [None]:
f1_scores["VALID"]

In [None]:
f1_scores["TEST"]

In [None]:
n = 0
VERSION = 1
augmented_cards = pd.read_csv(f"datasets/augmented/cards_augmented_{n}V{VERSION}.csv")
augmented_cards_test = augmented_cards[augmented_cards.PARTITION=="TEST"]

y_true = augmented_cards_test.claim.values
y_pred = augmented_cards_test[f"cards_aug_{n}V{VERSION}_pred"].values

print(f1_score(y_true, y_pred, average="macro"))

report = classification_report_df(y_true, y_pred)
display(Markdown(f"### n={n}"))
display(report)

In [None]:
n = 400
VERSION = 1
augmented_cards = pd.read_csv(f"datasets/augmented/cards_augmented_{n}V{VERSION}.csv")
augmented_cards_test = augmented_cards[augmented_cards.PARTITION=="TEST"]

y_true = augmented_cards_test.claim.values
y_pred = augmented_cards_test[f"cards_aug_{n}V{VERSION}_pred"].values

print(f1_score(y_true, y_pred, average="macro"))

report = classification_report_df(y_true, y_pred)
display(Markdown(f"### n={n}"))
display(report)

In [None]:
n = 400
VERSION = "V5"
augmented_cards = pd.read_csv(f"datasets/augmented/cards_augmented_{n}{VERSION}.csv")
augmented_cards_test = augmented_cards[augmented_cards.PARTITION=="TEST"]

y_true = augmented_cards_test.claim.values
y_pred = augmented_cards_test[f"cards_aug_{n}{VERSION}_pred"].values

print(f1_score(y_true, y_pred, average="macro"))

report = classification_report_df(y_true, y_pred)
display(Markdown(f"### n={n}"))
display(report)

In [None]:
comparative = pd.read_csv(f"datasets/augmented/cards_augmented_400.csv")
comparative = comparative.iloc[:,1:]
tmp = pd.read_csv(f"datasets/augmented/cards_augmented_400V2.csv")
comparative["cards_aug_400V2_pred"] = tmp["cards_aug_400V2_pred"]
comparative["cards_aug_400V2_proba"] = tmp["cards_aug_400V2_proba"]

comparative["cards_aug_400_proba"] = comparative["cards_aug_400_proba"].apply(format_scores)
comparative["cards_aug_400V2_proba"] = comparative["cards_aug_400V2_proba"].apply(format_scores)

comparative["cards_aug_400_score"] = comparative.apply(
    lambda x: x["cards_aug_400_proba"][le.transform([x["cards_aug_400_pred"]])[0]], axis=1)

comparative["cards_aug_400V2_score"] = comparative.apply(
    lambda x: x["cards_aug_400V2_proba"][le.transform([x["cards_aug_400V2_pred"]])[0]], axis=1)

In [None]:
comparative_test = comparative[comparative.PARTITION=="TEST"]
comparative_test[
    (comparative_test.claim=="4_5")&(comparative_test.cards_aug_400_pred!=comparative_test.cards_aug_400V2_pred)]

In [None]:
augmented_cards = pd.read_csv(f"datasets/augmented/cards_augmented_50_filteredV2.csv")
augmented_cards_test = augmented_cards[augmented_cards.PARTITION=="TEST"]

y_true = augmented_cards_test.claim.values
y_pred = augmented_cards_test[f"cards_aug_50_pred"].values

report = classification_report_df(y_true, y_pred)
display(Markdown(f"### n=50"))
print(f1_score(y_true, y_pred, average="macro"))
display(report)

In [None]:
augmented_cards[
    (augmented_cards.DATASET=="generated-chatgpt")&(augmented_cards.labels=="1_4")].cards_pred.value_counts()

In [None]:
generated_taxonomy = pd.read_csv("datasets/generated_disinformation_taxonomy_CARDS_CHATGPT_specific_samples_predict.csv")

# generated_taxonomy = generated_taxonomy[
#     generated_taxonomy.generated_label.isin(["1_1", "1_2", "1_3", "1_4", "1_6", "1_7", "2_1"])]

y_true = generated_taxonomy.generated_label.values
y_pred = generated_taxonomy.cards_pred.values

report = classification_report(y_true, y_pred)
print(report)

classes = np.sort(generated_taxonomy.cards_pred.unique())
c_m = confusion_matrix(y_true, y_pred)
cmp = ConfusionMatrixDisplay(
    c_m, display_labels=classes)
fig, ax = plt.subplots(figsize=(8,8))
cmp.plot(ax=ax)  


report = classification_report_df(y_true, y_pred)
report

In [None]:
generated_taxonomy = pd.read_csv("datasets/generated_disinformation_taxonomy_CARDS_CHATGPT_specific_samples_V2.csv")

y_true = generated_taxonomy.generated_label.values
y_pred = generated_taxonomy.cards_pred.values

report = classification_report(y_true, y_pred)
print(report)

classes = np.sort(generated_taxonomy.cards_pred.unique())
c_m = confusion_matrix(y_true, y_pred)
cmp = ConfusionMatrixDisplay(
    c_m, display_labels=classes)
fig, ax = plt.subplots(figsize=(8,8))
cmp.plot(ax=ax)  


report = classification_report_df(y_true, y_pred)
report

In [None]:
y_true = cards_data.claim.values
y_pred = cards_data.sec_clf_roberta_pred.values

report = classification_report(y_true, y_pred)
print(report)

In [None]:
test_generated = pd.read_csv("datasets/generated_disinformation_binary_TEST_CHATGPT.csv", low_memory=False)
test_generated["roberta_proba"] = test_generated["roberta_proba"].apply(format_scores)
test_generated["aug_roberta_proba"] = test_generated["aug_roberta_proba"].apply(format_scores)
test_generated["labels"] = 1

display(Markdown(f"### Randomly Generated TEST CHATGPT"))
models_comparative(test_generated, models)