In [1]:
import pandas as pd
from sklearn.metrics import *
from tqdm import tqdm
from utils import Hetero_Data_Processor_Filter_on_Test_since_first_post
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [2]:

# Usage
file_path_replies = r"/home/azureuser/rumour-detection-pheme/replies_charlie_hebdo.pkl"
file_path_posts = r"/home/azureuser/rumour-detection-pheme/posts_charlie_hebdo.pkl"


processor = Hetero_Data_Processor_Filter_on_Test_since_first_post(file_path_replies, file_path_posts, time_cut=17)
data = processor.process()


In [3]:
data

HeteroData(
  id={
    x=[1407, 106],
    y=[1407],
    train_mask=[1407],
    val_mask=[1407],
    test_mask=[1407],
  },
  reply_user_id={ x=[13161, 104] },
  (id, retweet, reply_user_id)={ edge_index=[2, 13161] },
  (reply_user_id, rev_retweet, id)={ edge_index=[2, 13161] }
)

In [2]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
#mlflow.set_experiment("spyder-experiment")
import mlflow.pytorch
mlflow.set_experiment("GAT Network 2024-10-30 2 layers Filter Node on test time since first post")

<Experiment: artifact_location='/home/azureuser/rumour-detection-pheme/mlruns/45', creation_time=1730336815570, experiment_id='45', last_update_time=1730336815570, lifecycle_stage='active', name='GAT Network 2024-10-30 2 layers Filter Node on test time since first post', tags={}>

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, to_hetero

class GAT(torch.nn.Module):
    def __init__(self, dim_h, dim_out):
        super().__init__()
        self.conv1 = GATConv((-1, -1), dim_h, add_self_loops=False)
        self.conv2 = GATConv(dim_h, dim_h, add_self_loops=False)  # Added second GATConv layer
        self.linear = nn.Linear(dim_h, dim_out)
        self.dropout = nn.Dropout(p=0.4)

    def forward(self, x, edge_index):
        h = self.conv1(x, edge_index).relu()
        h = self.dropout(h)
        h = self.conv2(h, edge_index).relu()  # Pass through the second GATConv layer
        h = self.dropout(h)
        h = self.linear(h)
        return h

@torch.no_grad()
def test(mask):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
    acc = (pred[mask] == data['id'].y[mask]).sum() / mask.sum()
    return float(acc)

In [18]:

model = GAT(dim_h=64, dim_out=2)
model = to_hetero(model, data.metadata(), aggr='sum')

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)



for epoch in range(500):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)['id']
    mask = data['id'].train_mask
    loss = F.cross_entropy(out[mask], data['id'].y[mask])
    loss.backward()
    optimizer.step()
    
    if epoch % 50 == 0:
        train_acc = test(data['id'].train_mask)
        val_acc = test(data['id'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')
    
test_acc = test(data['id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')
    


Epoch:   0 | Train Loss: 0.9307 | Train Acc: 64.67% | Val Acc: 0.00%
Epoch:  50 | Train Loss: 0.3778 | Train Acc: 82.94% | Val Acc: 0.00%
Epoch: 100 | Train Loss: 0.3129 | Train Acc: 86.01% | Val Acc: 66.67%
Epoch: 150 | Train Loss: 0.2651 | Train Acc: 88.87% | Val Acc: 66.67%
Epoch: 200 | Train Loss: 0.2253 | Train Acc: 91.43% | Val Acc: 66.67%
Epoch: 250 | Train Loss: 0.1910 | Train Acc: 94.08% | Val Acc: 66.67%
Epoch: 300 | Train Loss: 0.1420 | Train Acc: 95.93% | Val Acc: 66.67%
Epoch: 350 | Train Loss: 0.1092 | Train Acc: 97.57% | Val Acc: 66.67%
Epoch: 400 | Train Loss: 0.0809 | Train Acc: 98.43% | Val Acc: 66.67%
Epoch: 450 | Train Loss: 0.0679 | Train Acc: 99.21% | Val Acc: 66.67%
Test accuracy: 100.00%


In [20]:
test_mask = data['id'].test_mask | data['id'].val_mask
pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
true_labels = data['id'].y[test_mask]
pred_labels = pred[test_mask]
precision_score(true_labels, pred_labels, average='macro')

0.8333333333333333

In [21]:
recall_score(true_labels, pred_labels, average='macro')

0.8333333333333333

In [7]:
#
file_path_replies = r"/home/azureuser/rumour-detection-pheme/replies_charlie_hebdo.pkl"
file_path_posts = r"/home/azureuser/rumour-detection-pheme/posts_charlie_hebdo.pkl"

for time_cut in range(3993+15,(60*24*3),15):
    print(time_cut)
    processor = Hetero_Data_Processor_Filter_on_Test_since_first_post(file_path_replies, file_path_posts, time_cut=time_cut)
    data = processor.process()
    
    model = GAT(dim_h=64, dim_out=2)
    model = to_hetero(model, data.metadata(), aggr='sum')
        
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data, model = data.to(device), model.to(device)
       
    with mlflow.start_run():
    
        for epoch in range(400):
            model.train()
            optimizer.zero_grad()
            out = model(data.x_dict, data.edge_index_dict)['id']
            mask = data['id'].train_mask
            loss = F.cross_entropy(out[mask], data['id'].y[mask])
            loss.backward()
            optimizer.step()
                
            if epoch % 100 == 0:
                print(epoch)
                train_acc = test(data['id'].train_mask)
                val_acc = test(data['id'].val_mask)
                print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')
    
                
        test_acc = test(data['id'].test_mask)
        print(f'Test accuracy: {test_acc*100:.2f}%')
    
        test_mask = data['id'].test_mask | data['id'].val_mask
        pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
        true_labels = data['id'].y[test_mask]
        pred_labels = pred[test_mask]
        test_precision =precision_score(true_labels, pred_labels, average='macro')
        test_recall=recall_score(true_labels, pred_labels, average='macro')
        print(f'Test Recall: {test_recall*100:.2f}%')
        print(f'Test Precision: {test_precision*100:.2f}%')
    
        mlflow.log_metric("train_loss", loss.item(), step=epoch)
        mlflow.log_metric("train_acc", train_acc, step=epoch)
        mlflow.log_metric("val_acc", val_acc, step=epoch)
        mlflow.log_metric("test_acc", test_acc)
        mlflow.log_metric("test_precision",  test_precision)
        mlflow.log_metric("test_recall",  test_recall)
    
                
        mlflow.log_param("dim_h", 64)
        mlflow.log_param("dim_out", 2)
        mlflow.log_param("learning_rate", 0.001)
        mlflow.log_param("epochs", 400)
        mlflow.log_metric("time_cut", time_cut)

    
    
    
            
                
        mlflow.pytorch.log_model(model, f"GAT_model_{time_cut}")


4008
0
Epoch:   0 | Train Loss: 0.8092 | Train Acc: 62.38% | Val Acc: 55.15%
100
Epoch: 100 | Train Loss: 0.3289 | Train Acc: 85.30% | Val Acc: 81.40%
200
Epoch: 200 | Train Loss: 0.2343 | Train Acc: 90.65% | Val Acc: 80.40%
300
Epoch: 300 | Train Loss: 0.1515 | Train Acc: 94.72% | Val Acc: 79.40%
Test accuracy: 89.33%
Test Recall: 74.56%
Test Precision: 77.31%




4023
0
Epoch:   0 | Train Loss: 0.7532 | Train Acc: 71.52% | Val Acc: 66.78%
100
Epoch: 100 | Train Loss: 0.2962 | Train Acc: 87.08% | Val Acc: 82.06%
200
Epoch: 200 | Train Loss: 0.1860 | Train Acc: 92.79% | Val Acc: 82.39%
300
Epoch: 300 | Train Loss: 0.1188 | Train Acc: 96.07% | Val Acc: 82.06%
Test accuracy: 85.00%
Test Recall: 79.40%
Test Precision: 76.50%




4038
0
Epoch:   0 | Train Loss: 1.0790 | Train Acc: 25.84% | Val Acc: 40.86%
100
Epoch: 100 | Train Loss: 0.3377 | Train Acc: 84.94% | Val Acc: 81.06%
200
Epoch: 200 | Train Loss: 0.2424 | Train Acc: 89.44% | Val Acc: 77.74%
300
Epoch: 300 | Train Loss: 0.2038 | Train Acc: 91.65% | Val Acc: 77.41%
Test accuracy: 83.67%
Test Recall: 72.84%
Test Precision: 72.04%




4053
0
Epoch:   0 | Train Loss: 1.0051 | Train Acc: 25.84% | Val Acc: 44.19%
100
Epoch: 100 | Train Loss: 0.3205 | Train Acc: 86.44% | Val Acc: 81.40%
200
Epoch: 200 | Train Loss: 0.2383 | Train Acc: 90.58% | Val Acc: 84.05%
300
Epoch: 300 | Train Loss: 0.1441 | Train Acc: 93.22% | Val Acc: 84.39%
Test accuracy: 83.33%
Test Recall: 80.20%
Test Precision: 76.57%




4068
0
Epoch:   0 | Train Loss: 0.6669 | Train Acc: 77.66% | Val Acc: 55.48%
100
Epoch: 100 | Train Loss: 0.2975 | Train Acc: 86.94% | Val Acc: 81.06%
200
Epoch: 200 | Train Loss: 0.1899 | Train Acc: 92.79% | Val Acc: 82.06%
300
Epoch: 300 | Train Loss: 0.1207 | Train Acc: 96.72% | Val Acc: 79.07%
Test accuracy: 88.67%
Test Recall: 77.16%
Test Precision: 77.46%




4083
0
Epoch:   0 | Train Loss: 1.2887 | Train Acc: 73.02% | Val Acc: 56.81%
100
Epoch: 100 | Train Loss: 0.2916 | Train Acc: 87.58% | Val Acc: 81.06%
200
Epoch: 200 | Train Loss: 0.2034 | Train Acc: 92.65% | Val Acc: 78.74%
300
Epoch: 300 | Train Loss: 0.1224 | Train Acc: 95.15% | Val Acc: 80.07%
Test accuracy: 83.33%
Test Recall: 74.28%
Test Precision: 73.43%




4098
0
Epoch:   0 | Train Loss: 0.7239 | Train Acc: 74.88% | Val Acc: 54.15%
100
Epoch: 100 | Train Loss: 0.3128 | Train Acc: 85.80% | Val Acc: 81.40%
200
Epoch: 200 | Train Loss: 0.2236 | Train Acc: 90.51% | Val Acc: 82.06%
300
Epoch: 300 | Train Loss: 0.1486 | Train Acc: 95.15% | Val Acc: 79.07%
Test accuracy: 91.00%
Test Recall: 76.74%
Test Precision: 79.36%




4113
0
Epoch:   0 | Train Loss: 0.9151 | Train Acc: 61.74% | Val Acc: 47.84%
100
Epoch: 100 | Train Loss: 0.3126 | Train Acc: 85.22% | Val Acc: 79.73%
200
Epoch: 200 | Train Loss: 0.2278 | Train Acc: 91.29% | Val Acc: 79.07%
300
Epoch: 300 | Train Loss: 0.1578 | Train Acc: 94.93% | Val Acc: 79.07%
Test accuracy: 92.00%
Test Recall: 77.06%
Test Precision: 80.30%




4128
0
Epoch:   0 | Train Loss: 0.9146 | Train Acc: 23.34% | Val Acc: 45.18%
100
Epoch: 100 | Train Loss: 0.3413 | Train Acc: 84.94% | Val Acc: 81.40%
200
Epoch: 200 | Train Loss: 0.2457 | Train Acc: 88.87% | Val Acc: 82.72%
300
Epoch: 300 | Train Loss: 0.1608 | Train Acc: 93.29% | Val Acc: 79.07%
Test accuracy: 82.33%
Test Recall: 74.49%
Test Precision: 71.98%




4143
0
Epoch:   0 | Train Loss: 0.7963 | Train Acc: 45.47% | Val Acc: 50.83%
100
Epoch: 100 | Train Loss: 0.3193 | Train Acc: 85.58% | Val Acc: 79.73%
200
Epoch: 200 | Train Loss: 0.2204 | Train Acc: 90.15% | Val Acc: 80.07%
300
Epoch: 300 | Train Loss: 0.1583 | Train Acc: 94.79% | Val Acc: 78.74%
Test accuracy: 83.33%
Test Recall: 76.67%
Test Precision: 73.75%




4158
0
Epoch:   0 | Train Loss: 0.7147 | Train Acc: 75.59% | Val Acc: 55.15%
100
Epoch: 100 | Train Loss: 0.3096 | Train Acc: 85.72% | Val Acc: 82.39%
200
Epoch: 200 | Train Loss: 0.2092 | Train Acc: 90.79% | Val Acc: 82.72%
300
Epoch: 300 | Train Loss: 0.1438 | Train Acc: 93.86% | Val Acc: 83.39%
Test accuracy: 87.67%
Test Recall: 79.83%
Test Precision: 78.50%




4173
0
Epoch:   0 | Train Loss: 1.0713 | Train Acc: 32.76% | Val Acc: 38.21%
100
Epoch: 100 | Train Loss: 0.3285 | Train Acc: 85.01% | Val Acc: 82.06%
200
Epoch: 200 | Train Loss: 0.2282 | Train Acc: 90.79% | Val Acc: 83.39%
300
Epoch: 300 | Train Loss: 0.1445 | Train Acc: 94.00% | Val Acc: 83.72%
Test accuracy: 84.33%
Test Recall: 80.30%
Test Precision: 76.77%




4188
0
Epoch:   0 | Train Loss: 0.8378 | Train Acc: 28.41% | Val Acc: 44.85%
100
Epoch: 100 | Train Loss: 0.3166 | Train Acc: 85.80% | Val Acc: 80.73%
200
Epoch: 200 | Train Loss: 0.2286 | Train Acc: 90.79% | Val Acc: 82.72%
300
Epoch: 300 | Train Loss: 0.1374 | Train Acc: 96.22% | Val Acc: 79.73%
Test accuracy: 91.33%
Test Recall: 77.91%
Test Precision: 79.36%




4203
0
Epoch:   0 | Train Loss: 0.8661 | Train Acc: 77.66% | Val Acc: 55.48%
100
Epoch: 100 | Train Loss: 0.3483 | Train Acc: 84.30% | Val Acc: 78.74%
200
Epoch: 200 | Train Loss: 0.2732 | Train Acc: 88.08% | Val Acc: 82.06%
300
Epoch: 300 | Train Loss: 0.1967 | Train Acc: 93.29% | Val Acc: 81.40%
Test accuracy: 90.67%
Test Recall: 79.25%
Test Precision: 80.58%




4218
0
Epoch:   0 | Train Loss: 1.1539 | Train Acc: 36.69% | Val Acc: 40.53%
100
Epoch: 100 | Train Loss: 0.3229 | Train Acc: 85.30% | Val Acc: 81.73%
200
Epoch: 200 | Train Loss: 0.2486 | Train Acc: 89.36% | Val Acc: 79.73%
300
Epoch: 300 | Train Loss: 0.1644 | Train Acc: 93.15% | Val Acc: 80.07%
Test accuracy: 89.67%
Test Recall: 76.00%
Test Precision: 78.92%




4233
0
Epoch:   0 | Train Loss: 0.9547 | Train Acc: 33.69% | Val Acc: 35.55%
100
Epoch: 100 | Train Loss: 0.3233 | Train Acc: 85.58% | Val Acc: 82.39%
200
Epoch: 200 | Train Loss: 0.2357 | Train Acc: 89.94% | Val Acc: 81.40%
300
Epoch: 300 | Train Loss: 0.1486 | Train Acc: 95.57% | Val Acc: 84.72%
Test accuracy: 90.00%
Test Recall: 80.05%
Test Precision: 80.37%




4248
0
Epoch:   0 | Train Loss: 0.8575 | Train Acc: 76.80% | Val Acc: 53.82%
100
Epoch: 100 | Train Loss: 0.3105 | Train Acc: 85.44% | Val Acc: 81.40%
200
Epoch: 200 | Train Loss: 0.2246 | Train Acc: 90.36% | Val Acc: 83.06%
300
Epoch: 300 | Train Loss: 0.1545 | Train Acc: 93.50% | Val Acc: 83.39%
Test accuracy: 83.00%
Test Recall: 79.02%
Test Precision: 75.34%




4263
0
Epoch:   0 | Train Loss: 0.8535 | Train Acc: 31.48% | Val Acc: 50.83%
100
Epoch: 100 | Train Loss: 0.3202 | Train Acc: 85.80% | Val Acc: 81.06%
200
Epoch: 200 | Train Loss: 0.2279 | Train Acc: 89.22% | Val Acc: 84.05%
300
Epoch: 300 | Train Loss: 0.1537 | Train Acc: 92.65% | Val Acc: 85.05%
Test accuracy: 82.67%
Test Recall: 81.31%
Test Precision: 76.37%




4278
0
Epoch:   0 | Train Loss: 1.2191 | Train Acc: 62.38% | Val Acc: 72.09%
100
Epoch: 100 | Train Loss: 0.3428 | Train Acc: 85.08% | Val Acc: 78.74%
200
Epoch: 200 | Train Loss: 0.2840 | Train Acc: 87.79% | Val Acc: 80.40%
300
Epoch: 300 | Train Loss: 0.2305 | Train Acc: 90.44% | Val Acc: 80.73%
Test accuracy: 88.00%
Test Recall: 78.34%
Test Precision: 77.61%




4293
0
Epoch:   0 | Train Loss: 0.8111 | Train Acc: 59.24% | Val Acc: 44.85%
100
Epoch: 100 | Train Loss: 0.3188 | Train Acc: 85.94% | Val Acc: 80.40%
200
Epoch: 200 | Train Loss: 0.2197 | Train Acc: 89.72% | Val Acc: 85.05%
300
Epoch: 300 | Train Loss: 0.1655 | Train Acc: 92.72% | Val Acc: 85.05%
Test accuracy: 84.00%
Test Recall: 80.30%
Test Precision: 76.77%




4308
0
Epoch:   0 | Train Loss: 0.7279 | Train Acc: 77.59% | Val Acc: 55.48%
100
Epoch: 100 | Train Loss: 0.3041 | Train Acc: 86.65% | Val Acc: 80.40%
200
Epoch: 200 | Train Loss: 0.2157 | Train Acc: 91.15% | Val Acc: 82.06%
300
Epoch: 300 | Train Loss: 0.1374 | Train Acc: 95.36% | Val Acc: 83.39%
Test accuracy: 83.67%
Test Recall: 80.46%
Test Precision: 76.60%




In [3]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Get the experiment ID or name
experiment_name = "GAT Network 2024-10-30 2 layers Filter Node on test time since first post"
experiment = mlflow.get_experiment_by_name(experiment_name)

if experiment is not None:
    experiment_id = experiment.experiment_id

    # Retrieve all runs in the experiment as a DataFrame
    df = mlflow.search_runs(experiment_ids=[experiment_id])
    
    # Show the DataFrame
    #print(df)
else:
    print(f"Experiment '{experiment_name}' not found.")


In [4]:
df['metrics.time_cut'].max()

3993.0

In [7]:
df[df['metrics.time_cut_posts']==4212]['metrics.time_cut_replies'].max()

2827.0

In [8]:
print(60*24*3)

4320
