In [4]:
import pandas as pd
from sklearn.metrics import *
from tqdm import tqdm
from utils import HeteroDataProcessorFilterNodeonTest
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [5]:

# Usage
file_path_replies = r"/home/azureuser/rumour-detection-pheme/replies_charlie_hebdo.pkl"
file_path_posts = r"/home/azureuser/rumour-detection-pheme/posts_charlie_hebdo.pkl"

time_cut =120

processor = HeteroDataProcessorFilterNodeonTest(file_path_replies, file_path_posts, time_cut)
data = processor.process()


In [6]:
data

HeteroData(
  id={
    x=[1485, 106],
    y=[1485],
    train_mask=[1485],
    val_mask=[1485],
    test_mask=[1485],
  },
  reply_user_id={ x=[13824, 104] },
  (id, retweet, reply_user_id)={ edge_index=[2, 13824] },
  (reply_user_id, rev_retweet, id)={ edge_index=[2, 13824] }
)

In [15]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
#mlflow.set_experiment("spyder-experiment")
import mlflow.pytorch
mlflow.set_experiment("GAT Network 2024-09-28 2 layers Filter Node on test 17min-3days")

<Experiment: artifact_location='/home/azureuser/rumour-detection-pheme/mlruns/34', creation_time=1727919992274, experiment_id='34', last_update_time=1727919992274, lifecycle_stage='active', name='GAT Network 2024-09-28 2 layers Filter Node on test 17min-3days', tags={}>

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, to_hetero

class GAT(torch.nn.Module):
    def __init__(self, dim_h, dim_out):
        super().__init__()
        self.conv1 = GATConv((-1, -1), dim_h, add_self_loops=False)
        self.conv2 = GATConv(dim_h, dim_h, add_self_loops=False)  # Added second GATConv layer
        self.linear = nn.Linear(dim_h, dim_out)
        self.dropout = nn.Dropout(p=0.4)

    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index).relu()
        h = self.dropout(h)
        h = self.conv2(h, edge_index).relu()  # Pass through the second GATConv layer
        h = self.dropout(h)
        h = self.linear(h)
        return h

@torch.no_grad()
def test(mask):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
    acc = (pred[mask] == data['id'].y[mask]).sum() / mask.sum()
    return float(acc)

In [9]:

model = GAT(dim_h=64, dim_out=2)
model = to_hetero(model, data.metadata(), aggr='sum')

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)



for epoch in range(500):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)['id']
    mask = data['id'].train_mask
    loss = F.cross_entropy(out[mask], data['id'].y[mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        train_acc = test(data['id'].train_mask)
        val_acc = test(data['id'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')
    
test_acc = test(data['id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')
    


Epoch:   0 | Train Loss: 1.2335 | Train Acc: 29.62% | Val Acc: 76.19%
Epoch:  50 | Train Loss: 0.4279 | Train Acc: 80.51% | Val Acc: 35.71%
Epoch: 100 | Train Loss: 0.3397 | Train Acc: 85.94% | Val Acc: 64.29%
Epoch: 150 | Train Loss: 0.2801 | Train Acc: 87.94% | Val Acc: 66.67%
Epoch: 200 | Train Loss: 0.2451 | Train Acc: 89.58% | Val Acc: 76.19%
Epoch: 250 | Train Loss: 0.1966 | Train Acc: 92.79% | Val Acc: 78.57%
Epoch: 300 | Train Loss: 0.1721 | Train Acc: 93.93% | Val Acc: 78.57%
Epoch: 350 | Train Loss: 0.1455 | Train Acc: 95.93% | Val Acc: 83.33%
Epoch: 400 | Train Loss: 0.1153 | Train Acc: 97.36% | Val Acc: 83.33%
Epoch: 450 | Train Loss: 0.0860 | Train Acc: 98.29% | Val Acc: 80.95%
Test accuracy: 95.24%


In [10]:
test_mask = data['id'].test_mask | data['id'].val_mask
pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
true_labels = data['id'].y[test_mask]
pred_labels = pred[test_mask]
precision_score(true_labels, pred_labels, average='macro')

0.873342175066313

In [11]:
recall_score(true_labels, pred_labels, average='macro')

0.842665855143031

In [None]:
#
file_path_replies = r"/home/azureuser/rumour-detection-pheme/replies_charlie_hebdo.pkl"
file_path_posts = r"/home/azureuser/rumour-detection-pheme/posts_charlie_hebdo.pkl"

for time_cut in range(17,(60*24*3),10):
    print(time_cut)
    processor = HeteroDataProcessorFilterNodeonTest(file_path_replies, file_path_posts, time_cut)
    data = processor.process()

    model = GAT(dim_h=64, dim_out=2)
    model = to_hetero(model, data.metadata(), aggr='sum')
    
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data, model = data.to(device), model.to(device)
   
    with mlflow.start_run():

        for epoch in range(400):
            model.train()
            optimizer.zero_grad()
            out = model(data.x_dict, data.edge_index_dict)['id']
            mask = data['id'].train_mask
            loss = F.cross_entropy(out[mask], data['id'].y[mask])
            loss.backward()
            optimizer.step()
            
            if epoch % 100 == 0:
                print(epoch)
                train_acc = test(data['id'].train_mask)
                val_acc = test(data['id'].val_mask)
                print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

            
        test_acc = test(data['id'].test_mask)
        print(f'Test accuracy: {test_acc*100:.2f}%')

        test_mask = data['id'].test_mask | data['id'].val_mask
        pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
        true_labels = data['id'].y[test_mask]
        pred_labels = pred[test_mask]
        test_precision =precision_score(true_labels, pred_labels, average='macro')
        test_recall=recall_score(true_labels, pred_labels, average='macro')
        print(f'Test Recall: {test_recall*100:.2f}%')
        print(f'Test Precision: {test_precision*100:.2f}%')

        mlflow.log_metric("train_loss", loss.item(), step=epoch)
        mlflow.log_metric("train_acc", train_acc, step=epoch)
        mlflow.log_metric("val_acc", val_acc, step=epoch)
        mlflow.log_metric("test_acc", test_acc)
        mlflow.log_metric("test_precision",  test_precision)
        mlflow.log_metric("test_recall",  test_recall)

            
        mlflow.log_param("dim_h", 64)
        mlflow.log_param("dim_out", 2)
        mlflow.log_param("learning_rate", 0.001)
        mlflow.log_param("epochs", 400)
        mlflow.log_metric("time_cut", time_cut)



        
            
        mlflow.pytorch.log_model(model, f"GAT_model_{time_cut}")


17
0
Epoch:   0 | Train Loss: 0.7403 | Train Acc: 75.95% | Val Acc: 0.00%
100
Epoch: 100 | Train Loss: 0.3199 | Train Acc: 85.87% | Val Acc: 40.00%
200
Epoch: 200 | Train Loss: 0.2267 | Train Acc: 90.29% | Val Acc: 20.00%
300
Epoch: 300 | Train Loss: 0.1481 | Train Acc: 94.22% | Val Acc: 20.00%
Test accuracy: 100.00%
Test Recall: 60.00%
Test Precision: 75.00%




27
0
Epoch:   0 | Train Loss: 0.7895 | Train Acc: 71.95% | Val Acc: 0.00%
100
Epoch: 100 | Train Loss: 0.3126 | Train Acc: 86.22% | Val Acc: 53.85%
200
Epoch: 200 | Train Loss: 0.1959 | Train Acc: 90.65% | Val Acc: 76.92%
300
Epoch: 300 | Train Loss: 0.1264 | Train Acc: 93.08% | Val Acc: 76.92%
Test accuracy: 83.33%
Test Recall: 80.13%
Test Precision: 80.13%




37
0
Epoch:   0 | Train Loss: 0.7034 | Train Acc: 75.52% | Val Acc: 0.00%
100
Epoch: 100 | Train Loss: 0.3062 | Train Acc: 86.94% | Val Acc: 73.33%
200
Epoch: 200 | Train Loss: 0.2090 | Train Acc: 91.08% | Val Acc: 86.67%
300
Epoch: 300 | Train Loss: 0.1186 | Train Acc: 95.36% | Val Acc: 86.67%
Test accuracy: 85.71%
Test Recall: 82.93%
Test Precision: 82.62%




47
0
Epoch:   0 | Train Loss: 0.8573 | Train Acc: 77.66% | Val Acc: 0.00%
100
Epoch: 100 | Train Loss: 0.3300 | Train Acc: 84.51% | Val Acc: 50.00%
200
Epoch: 200 | Train Loss: 0.2426 | Train Acc: 89.22% | Val Acc: 66.67%
300
Epoch: 300 | Train Loss: 0.1650 | Train Acc: 93.36% | Val Acc: 72.22%
Test accuracy: 94.12%
Test Recall: 83.17%
Test Precision: 84.52%




57
0
Epoch:   0 | Train Loss: 0.6361 | Train Acc: 77.66% | Val Acc: 5.00%
100
Epoch: 100 | Train Loss: 0.2972 | Train Acc: 86.80% | Val Acc: 55.00%
200
Epoch: 200 | Train Loss: 0.2139 | Train Acc: 92.51% | Val Acc: 70.00%
300
Epoch: 300 | Train Loss: 0.1163 | Train Acc: 97.00% | Val Acc: 80.00%
Test accuracy: 89.47%
Test Recall: 84.47%
Test Precision: 85.03%




67
0
Epoch:   0 | Train Loss: 2.2680 | Train Acc: 27.98% | Val Acc: 100.00%
100
Epoch: 100 | Train Loss: 0.3537 | Train Acc: 84.30% | Val Acc: 38.10%
200
Epoch: 200 | Train Loss: 0.2646 | Train Acc: 88.65% | Val Acc: 66.67%
300
Epoch: 300 | Train Loss: 0.2042 | Train Acc: 92.86% | Val Acc: 76.19%
Test accuracy: 95.24%
Test Recall: 88.10%
Test Precision: 88.89%




77
0
Epoch:   0 | Train Loss: 0.9275 | Train Acc: 77.30% | Val Acc: 0.00%
100
Epoch: 100 | Train Loss: 0.2986 | Train Acc: 86.51% | Val Acc: 59.09%
200
Epoch: 200 | Train Loss: 0.1980 | Train Acc: 91.29% | Val Acc: 59.09%
300
Epoch: 300 | Train Loss: 0.1403 | Train Acc: 95.07% | Val Acc: 54.55%
Test accuracy: 90.48%
Test Recall: 72.51%
Test Precision: 75.62%




87
0
Epoch:   0 | Train Loss: 0.9208 | Train Acc: 36.97% | Val Acc: 40.74%
100
Epoch: 100 | Train Loss: 0.3322 | Train Acc: 85.94% | Val Acc: 55.56%
200
Epoch: 200 | Train Loss: 0.2349 | Train Acc: 91.22% | Val Acc: 77.78%
300
Epoch: 300 | Train Loss: 0.1447 | Train Acc: 95.57% | Val Acc: 77.78%
Test accuracy: 92.59%
Test Recall: 85.19%
Test Precision: 85.97%




97
0
Epoch:   0 | Train Loss: 0.7273 | Train Acc: 75.23% | Val Acc: 3.12%
100
Epoch: 100 | Train Loss: 0.3080 | Train Acc: 86.08% | Val Acc: 59.38%
200
Epoch: 200 | Train Loss: 0.2157 | Train Acc: 90.58% | Val Acc: 68.75%
300
Epoch: 300 | Train Loss: 0.1555 | Train Acc: 92.72% | Val Acc: 65.62%
Test accuracy: 96.88%
Test Recall: 83.53%
Test Precision: 87.01%




107
0
Epoch:   0 | Train Loss: 1.1707 | Train Acc: 36.19% | Val Acc: 47.22%
100
Epoch: 100 | Train Loss: 0.3376 | Train Acc: 84.30% | Val Acc: 55.56%
200
Epoch: 200 | Train Loss: 0.2816 | Train Acc: 88.44% | Val Acc: 66.67%
300
Epoch: 300 | Train Loss: 0.1804 | Train Acc: 93.08% | Val Acc: 72.22%
Test accuracy: 94.44%
Test Recall: 84.66%
Test Precision: 87.49%




117
0
Epoch:   0 | Train Loss: 0.6895 | Train Acc: 75.87% | Val Acc: 22.50%
100
Epoch: 100 | Train Loss: 0.3053 | Train Acc: 86.51% | Val Acc: 60.00%
200
Epoch: 200 | Train Loss: 0.2139 | Train Acc: 91.86% | Val Acc: 75.00%
300
Epoch: 300 | Train Loss: 0.1186 | Train Acc: 97.36% | Val Acc: 75.00%
Test accuracy: 95.00%
Test Recall: 85.65%
Test Precision: 87.84%




127
0
Epoch:   0 | Train Loss: 0.8364 | Train Acc: 52.53% | Val Acc: 43.48%
100
Epoch: 100 | Train Loss: 0.3157 | Train Acc: 86.08% | Val Acc: 58.70%
200
Epoch: 200 | Train Loss: 0.2434 | Train Acc: 90.29% | Val Acc: 76.09%
300
Epoch: 300 | Train Loss: 0.1600 | Train Acc: 94.79% | Val Acc: 80.43%
Test accuracy: 97.78%
Test Recall: 87.04%
Test Precision: 90.77%




137
0
Epoch:   0 | Train Loss: 0.9419 | Train Acc: 74.59% | Val Acc: 36.00%
100
Epoch: 100 | Train Loss: 0.3197 | Train Acc: 86.01% | Val Acc: 66.00%
200
Epoch: 200 | Train Loss: 0.2198 | Train Acc: 90.15% | Val Acc: 76.00%
300
Epoch: 300 | Train Loss: 0.1422 | Train Acc: 93.65% | Val Acc: 82.00%
Test accuracy: 91.84%
Test Recall: 83.77%
Test Precision: 85.67%




147
0
Epoch:   0 | Train Loss: 0.6707 | Train Acc: 77.66% | Val Acc: 40.74%
100
Epoch: 100 | Train Loss: 0.3062 | Train Acc: 86.65% | Val Acc: 72.22%
200
Epoch: 200 | Train Loss: 0.1942 | Train Acc: 92.29% | Val Acc: 70.37%
300
Epoch: 300 | Train Loss: 0.1257 | Train Acc: 96.00% | Val Acc: 75.93%
Test accuracy: 94.44%
Test Recall: 77.06%
Test Precision: 83.66%




157
0
Epoch:   0 | Train Loss: 0.9954 | Train Acc: 59.46% | Val Acc: 33.90%
