In [1]:
import pandas as pd
from sklearn.metrics import *
from tqdm import tqdm
from utils import Hetero_Data_Processor_Transfer_Learning
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HANConv, Linear
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_dataset = 'charlie_hebdo'
test_dataset = 'ottawashooting'
time_cut =60*3*24
processor = Hetero_Data_Processor_Transfer_Learning(train_dataset, test_dataset, time_cut=time_cut,test_size=0.3)
data = processor.process()

rumour
1    139
0    119
Name: count, dtype: int64


In [3]:
data

HeteroData(
  id={
    x=[2859, 106],
    y=[2859],
    train_mask=[2859],
    val_mask=[2859],
    test_mask=[2859],
  },
  reply_user_id={ x=[26437, 104] },
  (id, retweet, reply_user_id)={ edge_index=[2, 26437] },
  (reply_user_id, rev_retweet, id)={ edge_index=[2, 26437] }
)

In [4]:
class HAN(nn.Module):
    def __init__(self, dim_in, dim_out, dim_h=64, heads=4):
        super().__init__()
        self.han = HANConv(dim_in, dim_h, heads=heads,dropout=0.2, metadata=data.metadata())
        self.han2 = HANConv(dim_h, dim_h, heads=heads, dropout=0.2, metadata=data.metadata())
        self.linear = nn.Linear(dim_h, dim_out)

    def forward(self, x_dict, edge_index_dict):
        out = self.han(x_dict, edge_index_dict)
        out = self.han2(out, edge_index_dict)
        out = self.linear(out['id'])
        return out
    

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
import torch
from torch.nn.functional import cross_entropy

def evaluate(model, data, mask_names):
    model.eval()
    out = model(data.x_dict, data.edge_index_dict)
    preds = out.argmax(dim=-1)
    labels = data['id'].y

    if isinstance(mask_names, str):
        mask = data['id'][mask_names]
    else:
        mask = torch.zeros_like(data['id'].y, dtype=torch.bool)
        for name in mask_names:
            mask |= data['id'][name]

    preds_masked = preds[mask]
    labels_masked = labels[mask]
    probs = out[mask][:, 1]  # Fixed: out is a tensor

    acc = accuracy_score(labels_masked.cpu(), preds_masked.cpu())
    precision = precision_score(labels_masked.cpu(), preds_masked.cpu(), zero_division=0)
    recall = recall_score(labels_masked.cpu(), preds_masked.cpu(), zero_division=0)

    try:
        auc = roc_auc_score(labels_masked.cpu(), probs.detach().cpu())
    except ValueError:
        auc = 0.0

    return acc, precision, recall, auc




def train(model, data, optimizer, epochs=100):
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()

        out = model(data.x_dict, data.edge_index_dict)
        mask = data['id'].train_mask
        #out_id = out['id']
        loss = F.cross_entropy(out[mask], data['id'].y[mask])
        #loss = cross_entropy(out_id[data['id'].train_mask], data['id'].y[data['id'].train_mask])
        loss.backward()
        optimizer.step()

        # Train metrics
        acc, precision, recall, auc = evaluate(model, data, 'train_mask')
        print(f"[Epoch {epoch:03d}] Train - Acc: {acc:.4f} | Prec: {precision:.4f} | Recall: {recall:.4f} | AUC: {auc:.4f}")

        # Val metrics every 10 epochs
        if epoch % 10 == 0:
            acc_val, prec_val, recall_val, auc_val = evaluate(model, data, 'val_mask')
            print(f"[Epoch {epoch:03d}] Val   - Acc: {acc_val:.4f} | Prec: {prec_val:.4f} | Recall: {recall_val:.4f} | AUC: {auc_val:.4f}")

    print("\nFinal Evaluation (Val + Test):")
    acc_final, prec_final, recall_final, auc_final = evaluate(model, data, ['val_mask', 'test_mask'])
    print(f"[Final] Val+Test - Acc: {acc_final:.4f} | Prec: {prec_final:.4f} | Recall: {recall_final:.4f} | AUC: {auc_final:.4f}")
    


In [6]:

model = HAN(dim_in=-1, dim_out=2)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

In [7]:
train(model, data, optimizer, epochs=200)


[Epoch 001] Train - Acc: 0.6386 | Prec: 0.1723 | Recall: 0.0601 | AUC: 0.3109
[Epoch 002] Train - Acc: 0.6574 | Prec: 0.1062 | Recall: 0.0222 | AUC: 0.3356
[Epoch 003] Train - Acc: 0.6717 | Prec: 0.0917 | Recall: 0.0131 | AUC: 0.3639
[Epoch 004] Train - Acc: 0.6828 | Prec: 0.0833 | Recall: 0.0078 | AUC: 0.3942
[Epoch 005] Train - Acc: 0.6897 | Prec: 0.0625 | Recall: 0.0039 | AUC: 0.4276
[Epoch 006] Train - Acc: 0.6993 | Prec: 0.0000 | Recall: 0.0000 | AUC: 0.4677
[Epoch 007] Train - Acc: 0.7043 | Prec: 0.0000 | Recall: 0.0000 | AUC: 0.4921
[Epoch 008] Train - Acc: 0.7047 | Prec: 0.0000 | Recall: 0.0000 | AUC: 0.5139
[Epoch 009] Train - Acc: 0.7059 | Prec: 0.0000 | Recall: 0.0000 | AUC: 0.5349
[Epoch 010] Train - Acc: 0.7059 | Prec: 0.0000 | Recall: 0.0000 | AUC: 0.5556
[Epoch 010] Val   - Acc: 0.5116 | Prec: 0.0000 | Recall: 0.0000 | AUC: 0.6568
[Epoch 011] Train - Acc: 0.7059 | Prec: 0.0000 | Recall: 0.0000 | AUC: 0.5768
[Epoch 012] Train - Acc: 0.7059 | Prec: 0.0000 | Recall: 0.0000 

In [4]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
#mlflow.set_experiment("spyder-experiment")
import mlflow.pytorch
mlflow.set_experiment("Han Network 2025-06-14 2 Sydney Siege TF")

2025/06/14 14:55:06 INFO mlflow.tracking.fluent: Experiment with name 'Han Network 2025-06-14 2 Sydney Siege TF' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/rumour-detection-gnn/mlruns/89', creation_time=1749912906343, experiment_id='89', last_update_time=1749912906343, lifecycle_stage='active', name='Han Network 2025-06-14 2 Sydney Siege TF', tags={}>

In [6]:
def evaluate_metrics(model, data, mask):
    model.eval()
    with torch.no_grad():
        out = model(data.x_dict, data.edge_index_dict)
        preds = out.argmax(dim=1)
        probs = out[:, 1]  # Probability of class 1

    true = data['id'].y[mask]
    pred = preds[mask]
    prob = probs[mask]

    acc = accuracy_score(true.cpu(), pred.cpu())
    prec = precision_score(true.cpu(), pred.cpu(), average='macro', zero_division=0)
    recall = recall_score(true.cpu(), pred.cpu(), average='macro', zero_division=0)
    try:
        auc = roc_auc_score(true.cpu(), prob.cpu())
    except:
        auc = 0.0

    return acc, prec, recall, auc

for time_cut in range(20, (60 * 24 * 3), 15):
    print(f"\n=== Time Cut: {time_cut} ===")
    train_dataset = 'charlie_hebdo'
    #test_dataset = 'ferguson'
    test_dataset = 'sydneysiege'
    #test_dataset = 'germanwings_crash'
    #test_dataset = 'ottawashooting'
    time_cut =time_cut
    processor = Hetero_Data_Processor_Transfer_Learning(train_dataset, test_dataset, time_cut=time_cut,test_size=0.3)
    data = processor.process()


    model = HAN(dim_in=-1, dim_out=2)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data, model = data.to(device), model.to(device)

    # Compute imbalance
    y_train = data['id'].y[data['id'].train_mask].cpu()
    imbalance = (y_train == 1).sum() / len(y_train)

    with mlflow.start_run(run_name=f"time_cut_{time_cut}"):
        for epoch in range(1, 101):
            model.train()
            optimizer.zero_grad()
            out = model(data.x_dict, data.edge_index_dict)
            mask = data['id'].train_mask
            loss = F.cross_entropy(out[mask], data['id'].y[mask])
            loss.backward()
            optimizer.step()

            if epoch % 100 == 0:
                 train_acc, train_prec, train_recall, train_auc= evaluate_metrics(model, data, data['id'].train_mask)
                 print(f"[Epoch {epoch}] Train Loss: {loss:.4f} | Train Recall: {train_recall:.4f} | Train Auc: {train_auc:.4f}")
        mlflow.log_metric("train_recall", train_recall, step=epoch)
        mlflow.log_metric("train_precision", train_prec, step=epoch)
        mlflow.log_metric("train_auc", train_auc, step=epoch)

        # Final evaluation on val + test
        final_mask = data['id'].val_mask | data['id'].test_mask
        acc, prec, recall, auc = evaluate_metrics(model, data, final_mask)
        print(f"[Final Val+Test] Acc: {acc:.4f} | Prec: {prec:.4f} | Recall: {recall:.4f} | AUC: {auc:.4f}")

        mlflow.log_metric("final_acc", acc)
        mlflow.log_metric("final_precision", prec)
        mlflow.log_metric("final_recall", recall)
        mlflow.log_metric("final_auc", auc)
        mlflow.log_metric("train_imbalance_rate", imbalance.item())

        mlflow.log_metric("time_cut", time_cut)



=== Time Cut: 20 ===
rumour
0    12
1     2
Name: count, dtype: int64
[Epoch 100] Train Loss: 0.3345 | Train Recall: 0.8199 | Train Auc: 0.9115
[Final Val+Test] Acc: 0.9286 | Prec: 0.9615 | Recall: 0.7500 | AUC: 0.8750

=== Time Cut: 35 ===
rumour
0    22
1     3
Name: count, dtype: int64
[Epoch 100] Train Loss: 0.3564 | Train Recall: 0.8104 | Train Auc: 0.8908
[Final Val+Test] Acc: 0.8000 | Prec: 0.6404 | Recall: 0.7424 | AUC: 0.8333

=== Time Cut: 50 ===
rumour
0    30
1     5
Name: count, dtype: int64
[Epoch 100] Train Loss: 0.3476 | Train Recall: 0.8159 | Train Auc: 0.9081
[Final Val+Test] Acc: 0.8286 | Prec: 0.6786 | Recall: 0.7333 | AUC: 0.8467

=== Time Cut: 65 ===
rumour
0    41
1     5
Name: count, dtype: int64
[Epoch 100] Train Loss: 0.3502 | Train Recall: 0.8104 | Train Auc: 0.9065
[Final Val+Test] Acc: 0.7826 | Prec: 0.6078 | Recall: 0.7024 | AUC: 0.7805

=== Time Cut: 80 ===
rumour
0    53
1     7
Name: count, dtype: int64
[Epoch 100] Train Loss: 0.3544 | Train Recall: 0.