In [4]:
import pandas as pd
from sklearn.metrics import *
from tqdm import tqdm
from utils import Hetero_Data_Processor_Filter_on_Test_since_first_post
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [5]:

# Usage
file_path_replies = r"replies_charlie_hebdo.pkl"
file_path_posts = r"posts_charlie_hebdo.pkl"


processor = Hetero_Data_Processor_Filter_on_Test_since_first_post(file_path_replies, file_path_posts, time_cut=20)
data = processor.process()


In [6]:
data

HeteroData(
  id={
    x=[1409, 106],
    y=[1409],
    train_mask=[1409],
    val_mask=[1409],
    test_mask=[1409],
  },
  reply_user_id={ x=[13169, 104] },
  (id, retweet, reply_user_id)={ edge_index=[2, 13169] },
  (reply_user_id, rev_retweet, id)={ edge_index=[2, 13169] }
)

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, to_hetero

class GAT(torch.nn.Module):
    def __init__(self, dim_h,dim_i, dim_out):
        super().__init__()
        self.conv1 = GATConv((-1, -1), dim_h, add_self_loops=False)
        self.conv2 = GATConv(dim_h, dim_i, add_self_loops=False)  # Added second GATConv layer
        self.linear = nn.Linear(dim_i, dim_out)
        self.dropout = nn.Dropout(p=0.4)

    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index).relu()
        h = self.dropout(h)
        h = self.conv2(h, edge_index).relu()  # Pass through the second GATConv layer
        h = self.dropout(h)
        h = self.linear(h)
        return h

In [8]:

model = GAT(dim_h=64,dim_i=32, dim_out=2)
model = to_hetero(model, data.metadata(), aggr='sum')

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from torch.nn.functional import cross_entropy

def evaluate(model, data, mask_names):
    model.eval()
    out = model(data.x_dict, data.edge_index_dict)
    preds = out.argmax(dim=-1)
    labels = data['id'].y

    if isinstance(mask_names, str):
        mask = data['id'][mask_names]
    else:
        mask = torch.zeros_like(data['id'].y, dtype=torch.bool)
        for name in mask_names:
            mask |= data['id'][name]

    preds_masked = preds[mask]
    labels_masked = labels[mask]
    probs = out[mask][:, 1]  # Fixed: out is a tensor

    acc = accuracy_score(labels_masked.cpu(), preds_masked.cpu())
    precision = precision_score(labels_masked.cpu(), preds_masked.cpu(), zero_division=0)
    recall = recall_score(labels_masked.cpu(), preds_masked.cpu(), zero_division=0)

    try:
        auc = roc_auc_score(labels_masked.cpu(), probs.detach().cpu())
    except ValueError:
        auc = 0.0

    return acc, precision, recall, auc


def evaluate_metrics(model, data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)['id']
        preds = out.argmax(dim=1)
        probs = out[:, 1]  # Probability of class 1

    true = data['id'].y[mask]
    pred = preds[mask]
    prob = probs[mask]

    acc = accuracy_score(true.cpu(), pred.cpu())
    prec = precision_score(true.cpu(), pred.cpu(), average='macro', zero_division=0)
    recall = recall_score(true.cpu(), pred.cpu(), average='macro', zero_division=0)
    try:
        auc = roc_auc_score(true.cpu(), prob.cpu())
    except:
        auc = 0.0

    return acc, prec, recall, auc


In [10]:
y_train = data['id'].y[data['id'].train_mask].cpu()
imbalance = (y_train == 1).sum() / len(y_train)

for epoch in range(1, 101):
            model.train()
            optimizer.zero_grad()
            out = model(data.x_dict, data.edge_index_dict)['id']
            mask = data['id'].train_mask
            loss = F.cross_entropy(out[mask], data['id'].y[mask])
            loss.backward()
            optimizer.step()

            if epoch % 100 == 0:
                 train_acc, train_prec, train_recall, train_auc= evaluate_metrics(model, data, data['id'].train_mask)
                 print(f"[Epoch {epoch}] Train Loss: {loss:.4f} | Train Recall: {train_recall:.4f} | Train Auc: {train_auc:.4f}")

# Final evaluation on val + test
final_mask = data['id'].val_mask | data['id'].test_mask
acc, prec, recall, auc = evaluate_metrics(model, data, final_mask)
print(f"[Final Val+Test] Acc: {acc:.4f} | Prec: {prec:.4f} | Recall: {recall:.4f} | AUC: {auc:.4f}")

[Epoch 100] Train Loss: 0.3127 | Train Recall: 0.7934 | Train Auc: 0.9169
[Final Val+Test] Acc: 0.6250 | Prec: 0.7857 | Recall: 0.6250 | AUC: 0.8750


#### Training Draft Model

In [None]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
#mlflow.set_experiment("spyder-experiment")
import mlflow.pytorch
mlflow.set_experiment("GAT Network 2025-06-07  Charlie Hebdo")

In [32]:
def evaluate_metrics(model, data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)['id']
        preds = out.argmax(dim=1)
        probs = out[:, 1]  # Probability of class 1

    true = data['id'].y[mask]
    pred = preds[mask]
    prob = probs[mask]

    acc = accuracy_score(true.cpu(), pred.cpu())
    prec = precision_score(true.cpu(), pred.cpu(), average='macro', zero_division=0)
    recall = recall_score(true.cpu(), pred.cpu(), average='macro', zero_division=0)
    try:
        auc = roc_auc_score(true.cpu(), prob.cpu())
    except:
        auc = 0.0

    return acc, prec, recall, auc

for time_cut in range(20, (60 * 24 * 3), 15):
    print(f"\n=== Time Cut: {time_cut} ===")
    processor = Hetero_Data_Processor_Filter_on_Test_since_first_post(file_path_replies, file_path_posts, time_cut=time_cut)
    data = processor.process()

    model = GAT(dim_h=64, dim_i=32, dim_out=2)
    model = to_hetero(model, data.metadata(), aggr='sum')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data, model = data.to(device), model.to(device)

    # Compute imbalance
    y_train = data['id'].y[data['id'].train_mask].cpu()
    imbalance = (y_train == 1).sum() / len(y_train)

    with mlflow.start_run(run_name=f"time_cut_{time_cut}"):
        for epoch in range(1, 101):
            model.train()
            optimizer.zero_grad()
            out = model(data.x_dict, data.edge_index_dict)['id']
            mask = data['id'].train_mask
            loss = F.cross_entropy(out[mask], data['id'].y[mask])
            loss.backward()
            optimizer.step()

            if epoch % 100 == 0:
                 train_acc, train_prec, train_recall, train_auc= evaluate_metrics(model, data, data['id'].train_mask)
                 print(f"[Epoch {epoch}] Train Loss: {loss:.4f} | Train Recall: {train_recall:.4f} | Train Auc: {train_auc:.4f}")
        mlflow.log_metric("train_recall", train_recall, step=epoch)
        mlflow.log_metric("train_precision", train_prec, step=epoch)
        mlflow.log_metric("train_auc", train_auc, step=epoch)

        # Final evaluation on val + test
        final_mask = data['id'].val_mask | data['id'].test_mask
        acc, prec, recall, auc = evaluate_metrics(model, data, final_mask)
        print(f"[Final Val+Test] Acc: {acc:.4f} | Prec: {prec:.4f} | Recall: {recall:.4f} | AUC: {auc:.4f}")

        mlflow.log_metric("final_acc", acc)
        mlflow.log_metric("final_precision", prec)
        mlflow.log_metric("final_recall", recall)
        mlflow.log_metric("final_auc", auc)
        mlflow.log_metric("train_imbalance_rate", imbalance.item())

        mlflow.log_metric("time_cut", time_cut)



=== Time Cut: 20 ===
[Epoch 100] Train Loss: 0.3251 | Train Recall: 0.8056 | Train Auc: 0.9113
[Final Val+Test] Acc: 0.7500 | Prec: 0.8333 | Recall: 0.7500 | AUC: 0.8750

=== Time Cut: 35 ===
[Epoch 100] Train Loss: 0.3272 | Train Recall: 0.7606 | Train Auc: 0.9146
[Final Val+Test] Acc: 0.6800 | Prec: 0.7316 | Recall: 0.7045 | AUC: 0.8636

=== Time Cut: 50 ===
[Epoch 100] Train Loss: 0.3174 | Train Recall: 0.8244 | Train Auc: 0.9094
[Final Val+Test] Acc: 0.7273 | Prec: 0.7481 | Recall: 0.7389 | AUC: 0.8889

=== Time Cut: 65 ===
[Epoch 100] Train Loss: 0.3488 | Train Recall: 0.7990 | Train Auc: 0.8958
[Final Val+Test] Acc: 0.7073 | Prec: 0.7712 | Recall: 0.7131 | AUC: 0.9024

=== Time Cut: 80 ===
[Epoch 100] Train Loss: 0.3673 | Train Recall: 0.7361 | Train Auc: 0.8826
[Final Val+Test] Acc: 0.6136 | Prec: 0.7153 | Recall: 0.6284 | AUC: 0.9089

=== Time Cut: 95 ===
[Epoch 100] Train Loss: 0.3231 | Train Recall: 0.8015 | Train Auc: 0.9160
[Final Val+Test] Acc: 0.7368 | Prec: 0.7632 | Rec