In [1]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
import numpy as np
from sklearn.metrics import precision_score, recall_score
import pickle

In [22]:
# Read the pickled HeteroData graph from disk into `data`.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
GRAPH_PKL = 'charlie_hebdo_graph_dataset_node_embeddings_v2.pkl'
with open(GRAPH_PKL, mode='rb') as graph_file:
    data = pickle.load(graph_file)

In [23]:
# Display the loaded HeteroData summary (node/edge stores and their tensor shapes).
data

HeteroData(
  id={
    x=[2002, 106],
    y=[2002],
    train_mask=[2002],
    val_mask=[2002],
    test_mask=[2002],
  },
  reply_user_id={ x=[19050, 4] },
  (id, retweet, reply_user_id)={ edge_index=[2, 19050] },
  (reply_user_id, rev_retweet, id)={ edge_index=[2, 19050] }
)

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, to_hetero

class GAT(torch.nn.Module):
    """Two stacked GATConv layers followed by a linear output head.

    The module is written for a single node/edge type; the notebook lifts it
    to the heterogeneous graph with ``to_hetero`` right after construction.

    Args:
        dim_h: Output width of both GATConv layers.
        dim_out: Output width of the final linear layer.
    """

    def __init__(self, dim_h, dim_out):
        super().__init__()
        # (-1, -1): source/target input sizes are left to PyG's lazy initialization.
        self.conv1 = GATConv(in_channels=(-1, -1), out_channels=dim_h, add_self_loops=False)
        self.conv2 = GATConv(in_channels=dim_h, out_channels=dim_h, add_self_loops=False)
        self.linear = nn.Linear(in_features=dim_h, out_features=dim_out)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, edge_index):
        """Apply conv -> ReLU -> dropout twice, then the linear head."""
        hidden = self.conv1(x, edge_index)
        hidden = self.dropout(hidden.relu())
        hidden = self.conv2(hidden, edge_index)
        hidden = self.dropout(hidden.relu())
        return self.linear(hidden)

# Build the GAT and lift it to the node/edge types of the loaded graph.
model = GAT(dim_h=64, dim_out=2)
model = to_hetero(model, data.metadata(), aggr='sum')

# Place data and model on the target device before the parameters are used.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

# The first GATConv has lazily-sized inputs ((-1, -1)); one gradient-free
# forward pass materializes those parameters so the optimizer below is
# created over fully initialized tensors on the right device.
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

@torch.no_grad()
def test(mask):
    """Return the accuracy of the global `model` on the 'id' nodes selected by `mask`.

    Args:
        mask: Mask over the 'id' nodes (e.g. ``data['id'].train_mask``).

    Returns:
        Fraction of masked nodes whose argmax prediction equals the label,
        as a Python float.
    """
    model.eval()
    logits = model(data.x_dict, data.edge_index_dict)['id']
    predicted = logits.argmax(dim=-1)
    hits = predicted[mask] == data['id'].y[mask]
    return float(hits.sum() / mask.sum())

# Training on the whole graph each epoch: the loss is cross-entropy on the
# training 'id' nodes, and accuracies are logged every 50 epochs.
mask = data['id'].train_mask  # does not change between epochs
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)['id']
    loss = F.cross_entropy(out[mask], data['id'].y[mask])
    loss.backward()
    optimizer.step()

    if epoch % 50 != 0:
        continue
    train_acc = test(data['id'].train_mask)
    val_acc = test(data['id'].val_mask)
    print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Accuracy on the test split, measured once after the last epoch.
test_acc = test(data['id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')


Epoch:   0 | Train Loss: 0.7806 | Train Acc: 76.87% | Val Acc: 77.67%
Epoch:  50 | Train Loss: 0.4272 | Train Acc: 80.51% | Val Acc: 84.00%
Epoch: 100 | Train Loss: 0.3602 | Train Acc: 83.37% | Val Acc: 85.67%
Epoch: 150 | Train Loss: 0.3304 | Train Acc: 85.51% | Val Acc: 86.33%
Epoch: 200 | Train Loss: 0.3053 | Train Acc: 87.01% | Val Acc: 87.33%
Epoch: 250 | Train Loss: 0.2615 | Train Acc: 88.29% | Val Acc: 87.67%
Epoch: 300 | Train Loss: 0.2255 | Train Acc: 90.44% | Val Acc: 87.67%
Epoch: 350 | Train Loss: 0.2012 | Train Acc: 92.93% | Val Acc: 87.67%
Epoch: 400 | Train Loss: 0.1642 | Train Acc: 94.79% | Val Acc: 87.00%
Epoch: 450 | Train Loss: 0.1347 | Train Acc: 96.65% | Val Acc: 87.00%
Epoch: 500 | Train Loss: 0.1092 | Train Acc: 97.50% | Val Acc: 86.33%
Epoch: 550 | Train Loss: 0.0859 | Train Acc: 98.29% | Val Acc: 86.33%
Epoch: 600 | Train Loss: 0.0598 | Train Acc: 98.79% | Val Acc: 85.67%
Epoch: 650 | Train Loss: 0.0596 | Train Acc: 99.21% | Val Acc: 86.67%
Epoch: 700 | Train L

In [8]:
# Macro-averaged precision on the test split.
test_mask = data['id'].test_mask
model.eval()  # dropout must be off even if this cell is run on its own
with torch.no_grad():  # inference only; no autograd graph needed
    pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
# scikit-learn converts its inputs to NumPy, which fails for CUDA tensors,
# so both label vectors are moved to the CPU first (a no-op on CPU runs).
true_labels = data['id'].y[test_mask].cpu()
pred_labels = pred[test_mask].cpu()
precision_score(true_labels, pred_labels, average='macro')

0.8013392857142857

In [9]:
# Macro-averaged recall for the `true_labels` / `pred_labels` computed in the preceding cell.
recall_score(true_labels, pred_labels, average='macro')

0.7939762443438914

In [10]:

from sklearn.metrics import precision_score, recall_score

class GAT(torch.nn.Module):
    """Single GATConv layer followed by dropout and a linear output head.

    Redefines the earlier two-layer ``GAT`` with one attention layer; the
    notebook lifts it to the heterogeneous graph with ``to_hetero``.

    Args:
        dim_h: Output width of the GATConv layer.
        dim_out: Output width of the final linear layer.
    """

    def __init__(self, dim_h, dim_out):
        super().__init__()
        # (-1, -1): source/target input sizes are left to PyG's lazy initialization.
        self.conv = GATConv(in_channels=(-1, -1), out_channels=dim_h, add_self_loops=False)
        self.linear = nn.Linear(in_features=dim_h, out_features=dim_out)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, edge_index):
        """Apply conv -> ReLU -> dropout, then the linear head."""
        hidden = self.conv(x, edge_index)
        hidden = self.dropout(hidden.relu())
        return self.linear(hidden)

# Build the single-layer GAT and lift it to the node/edge types of the graph.
model = GAT(dim_h=64, dim_out=2)
model = to_hetero(model, data.metadata(), aggr='sum')

# Place data and model on the target device before the parameters are used.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

# The GATConv has lazily-sized inputs ((-1, -1)); one gradient-free forward
# pass materializes those parameters so the optimizer below is created over
# fully initialized tensors on the right device.
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

@torch.no_grad()
def test(mask):
    """Return the accuracy of the global `model` on the 'id' nodes selected by `mask`.

    Args:
        mask: Mask over the 'id' nodes (e.g. ``data['id'].train_mask``).

    Returns:
        Fraction of masked nodes whose argmax prediction equals the label,
        as a Python float.
    """
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
    true_labels = data['id'].y[mask]
    # Keep predictions on the same device as the labels: moving only one side
    # to the CPU makes the comparison fail with a device mismatch on CUDA.
    pred_labels = pred[mask]
    acc = (pred_labels == true_labels).sum() / mask.sum()
    return float(acc)

# Training on the whole graph each epoch: the loss is cross-entropy on the
# training 'id' nodes, and accuracies are logged every 5 epochs.
mask = data['id'].train_mask  # does not change between epochs
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)['id']
    loss = F.cross_entropy(out[mask], data['id'].y[mask])
    loss.backward()
    optimizer.step()

    if epoch % 5 != 0:
        continue
    train_acc = test(data['id'].train_mask)
    val_acc = test(data['id'].val_mask)
    print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Accuracy on the test split, measured once after the last epoch.
test_acc = test(data['id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Epoch:   0 | Train Loss: 0.6099 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:   5 | Train Loss: 0.5384 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  10 | Train Loss: 0.5333 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  15 | Train Loss: 0.5204 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  20 | Train Loss: 0.5275 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  25 | Train Loss: 0.5247 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  30 | Train Loss: 0.5239 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  35 | Train Loss: 0.5202 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  40 | Train Loss: 0.5196 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  45 | Train Loss: 0.5152 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  50 | Train Loss: 0.5224 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  55 | Train Loss: 0.5136 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  60 | Train Loss: 0.5124 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  65 | Train Loss: 0.5142 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  70 | Train L

In [11]:
# Macro-averaged precision on the test split.
# Re-derive the mask here instead of relying on a variable left by an earlier cell.
test_mask = data['id'].test_mask
model.eval()  # dropout must be off even if this cell is run on its own
with torch.no_grad():  # inference only; no autograd graph needed
    pred = model(data.x_dict, data.edge_index_dict)['id'].argmax(dim=-1)
# scikit-learn converts its inputs to NumPy, which fails for CUDA tensors,
# so both label vectors are moved to the CPU first (a no-op on CPU runs).
true_labels = data['id'].y[test_mask].cpu()
pred_labels = pred[test_mask].cpu()
precision_score(true_labels, pred_labels, average='macro')

0.36666666666666664

In [12]:
# Macro-averaged recall for the `true_labels` / `pred_labels` computed in the preceding cell.
recall_score(true_labels, pred_labels, average='macro')

0.497737556561086

In [13]:
import torch
import torch.nn.functional as F
from torch import nn

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HANConv, Linear





class HAN(nn.Module):
    """One HANConv layer followed by a linear classifier on the 'id' nodes.

    Args:
        dim_in: Input feature size(s) passed to HANConv (the notebook passes -1).
        dim_out: Output width of the final linear layer.
        dim_h: Output width of the HANConv layer.
        heads: Number of attention heads for HANConv.
        metadata: Graph metadata (node types, edge types) for HANConv. When
            omitted, it is read from the notebook-global ``data`` as before, so
            existing calls keep working; passing it explicitly removes the
            hidden dependency on that global.
    """

    def __init__(self, dim_in, dim_out, dim_h=64, heads=4, metadata=None):
        super().__init__()
        if metadata is None:
            # Backward-compatible default: use the graph loaded in the notebook.
            metadata = data.metadata()
        self.han = HANConv(dim_in, dim_h, heads=heads, dropout=0.5, metadata=metadata)
        self.linear = nn.Linear(dim_h, dim_out)

    def forward(self, x_dict, edge_index_dict):
        """Run HANConv on all node types and classify only the 'id' nodes."""
        out = self.han(x_dict, edge_index_dict)
        out = self.linear(out['id'])
        return out

# Build the single-layer HAN; dim_in=-1 leaves the input sizes to lazy initialization.
model = HAN(dim_in=-1, dim_out=2)

# Place data and model on the target device before the parameters are used.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

# One gradient-free forward pass materializes the lazily-sized parameters so
# the optimizer below is created over fully initialized tensors.
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

@torch.no_grad()
def test(mask):
    """Return the accuracy of the global `model` on the 'id' nodes selected by `mask`.

    Here the model already returns the 'id' logits directly.

    Args:
        mask: Mask over the 'id' nodes (e.g. ``data['id'].train_mask``).

    Returns:
        Fraction of masked nodes whose argmax prediction equals the label,
        as a Python float.
    """
    model.eval()
    logits = model(data.x_dict, data.edge_index_dict)
    predicted = logits.argmax(dim=-1)
    hits = predicted[mask] == data['id'].y[mask]
    return float(hits.sum() / mask.sum())

# Training on the whole graph each epoch: the loss is cross-entropy on the
# training 'id' nodes, and accuracies are logged every 20 epochs.
mask = data['id'].train_mask  # does not change between epochs
for epoch in range(500):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = F.cross_entropy(out[mask], data['id'].y[mask])
    loss.backward()
    optimizer.step()

    if epoch % 20 != 0:
        continue
    train_acc = test(data['id'].train_mask)
    val_acc = test(data['id'].val_mask)
    print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Accuracy on the test split, measured once after the last epoch.
test_acc = test(data['id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Epoch:   0 | Train Loss: 5.2255 | Train Acc: 65.67% | Val Acc: 64.33%
Epoch:  20 | Train Loss: 0.8749 | Train Acc: 77.44% | Val Acc: 78.00%
Epoch:  40 | Train Loss: 0.5626 | Train Acc: 77.73% | Val Acc: 78.33%
Epoch:  60 | Train Loss: 0.6310 | Train Acc: 76.30% | Val Acc: 76.67%
Epoch:  80 | Train Loss: 0.5887 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch: 100 | Train Loss: 0.4921 | Train Acc: 77.94% | Val Acc: 77.67%
Epoch: 120 | Train Loss: 1.0309 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch: 140 | Train Loss: 0.5430 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch: 160 | Train Loss: 0.4986 | Train Acc: 78.73% | Val Acc: 78.00%
Epoch: 180 | Train Loss: 0.5077 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch: 200 | Train Loss: 0.4719 | Train Acc: 79.44% | Val Acc: 77.33%
Epoch: 220 | Train Loss: 1.3693 | Train Acc: 81.51% | Val Acc: 79.00%
Epoch: 240 | Train Loss: 0.4756 | Train Acc: 80.23% | Val Acc: 78.00%
Epoch: 260 | Train Loss: 0.9111 | Train Acc: 80.37% | Val Acc: 77.67%
Epoch: 280 | Train L

In [14]:
# Macro-averaged precision on the test split.
# Re-derive the mask here instead of relying on a variable left by an earlier cell.
test_mask = data['id'].test_mask
model.eval()  # dropout must be off even if this cell is run on its own
with torch.no_grad():  # inference only; no autograd graph needed
    pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)
# scikit-learn converts its inputs to NumPy, which fails for CUDA tensors,
# so both label vectors are moved to the CPU first (a no-op on CPU runs).
true_labels = data['id'].y[test_mask].cpu()
pred_labels = pred[test_mask].cpu()
precision_score(true_labels, pred_labels, average='macro')

0.7643951946975973

In [15]:
# Macro-averaged recall for the `true_labels` / `pred_labels` computed in the preceding cell.
recall_score(true_labels, pred_labels, average='macro')

0.5722002262443439

In [16]:
import torch
import torch.nn.functional as F
from torch import nn

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HANConv, Linear





class HAN(nn.Module):
    """Two stacked HANConv layers followed by a linear classifier on the 'id' nodes.

    Args:
        dim_in: Input feature size(s) for the first HANConv (the notebook passes -1).
        dim_out: Output width of the final linear layer.
        dim_h: Output width of both HANConv layers.
        heads: Number of attention heads for both HANConv layers.
        metadata: Graph metadata (node types, edge types) for HANConv. When
            omitted, it is read from the notebook-global ``data`` as before, so
            existing calls keep working; passing it explicitly removes the
            hidden dependency on that global.
    """

    def __init__(self, dim_in, dim_out, dim_h=64, heads=4, metadata=None):
        super().__init__()
        if metadata is None:
            # Backward-compatible default: use the graph loaded in the notebook.
            metadata = data.metadata()
        self.han = HANConv(dim_in, dim_h, heads=heads, dropout=0.5, metadata=metadata)
        self.han2 = HANConv(dim_h, dim_h, heads=heads, dropout=0.5, metadata=metadata)
        self.linear = nn.Linear(dim_h, dim_out)

    def forward(self, x_dict, edge_index_dict):
        """Run both HANConv layers on all node types and classify only the 'id' nodes."""
        out = self.han(x_dict, edge_index_dict)
        out = self.han2(out, edge_index_dict)
        out = self.linear(out['id'])
        return out

# Build the two-layer HAN; dim_in=-1 leaves the input sizes to lazy initialization.
model = HAN(dim_in=-1, dim_out=2)

# Place data and model on the target device before the parameters are used.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

# One gradient-free forward pass materializes the lazily-sized parameters so
# the optimizer below is created over fully initialized tensors.
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

@torch.no_grad()
def test(mask):
    """Return the accuracy of the global `model` on the 'id' nodes selected by `mask`.

    Here the model already returns the 'id' logits directly.

    Args:
        mask: Mask over the 'id' nodes (e.g. ``data['id'].train_mask``).

    Returns:
        Fraction of masked nodes whose argmax prediction equals the label,
        as a Python float.
    """
    model.eval()
    logits = model(data.x_dict, data.edge_index_dict)
    predicted = logits.argmax(dim=-1)
    hits = predicted[mask] == data['id'].y[mask]
    return float(hits.sum() / mask.sum())

# Training on the whole graph each epoch: the loss is cross-entropy on the
# training 'id' nodes, and accuracies are logged every 20 epochs.
mask = data['id'].train_mask  # does not change between epochs
for epoch in range(500):
    model.train()
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    loss = F.cross_entropy(out[mask], data['id'].y[mask])
    loss.backward()
    optimizer.step()

    if epoch % 20 != 0:
        continue
    train_acc = test(data['id'].train_mask)
    val_acc = test(data['id'].val_mask)
    print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Accuracy on the test split, measured once after the last epoch.
test_acc = test(data['id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Epoch:   0 | Train Loss: 0.7408 | Train Acc: 74.02% | Val Acc: 73.67%
Epoch:  20 | Train Loss: 0.5444 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  40 | Train Loss: 0.4293 | Train Acc: 78.52% | Val Acc: 78.00%
Epoch:  60 | Train Loss: 0.3647 | Train Acc: 82.73% | Val Acc: 78.67%
Epoch:  80 | Train Loss: 0.3197 | Train Acc: 85.44% | Val Acc: 79.67%
Epoch: 100 | Train Loss: 0.2885 | Train Acc: 86.80% | Val Acc: 80.00%
Epoch: 120 | Train Loss: 0.2759 | Train Acc: 88.51% | Val Acc: 80.67%
Epoch: 140 | Train Loss: 0.2520 | Train Acc: 89.44% | Val Acc: 81.33%
Epoch: 160 | Train Loss: 0.2344 | Train Acc: 90.79% | Val Acc: 82.67%
Epoch: 180 | Train Loss: 0.2151 | Train Acc: 91.51% | Val Acc: 84.00%
Epoch: 200 | Train Loss: 0.2003 | Train Acc: 91.93% | Val Acc: 83.33%
Epoch: 220 | Train Loss: 0.1822 | Train Acc: 93.00% | Val Acc: 84.67%
Epoch: 240 | Train Loss: 0.1616 | Train Acc: 93.72% | Val Acc: 84.33%
Epoch: 260 | Train Loss: 0.1503 | Train Acc: 94.79% | Val Acc: 84.67%
Epoch: 280 | Train L

In [17]:
# Macro-averaged precision on the test split.
# Re-derive the mask here instead of relying on a variable left by an earlier cell.
test_mask = data['id'].test_mask
model.eval()  # dropout must be off even if this cell is run on its own
with torch.no_grad():  # inference only; no autograd graph needed
    pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)
# scikit-learn converts its inputs to NumPy, which fails for CUDA tensors,
# so both label vectors are moved to the CPU first (a no-op on CPU runs).
true_labels = data['id'].y[test_mask].cpu()
pred_labels = pred[test_mask].cpu()
precision_score(true_labels, pred_labels, average='macro')

0.7832135933401756

In [18]:
# Macro-averaged recall for the `true_labels` / `pred_labels` computed in the preceding cell.
recall_score(true_labels, pred_labels, average='macro')

0.7809389140271493

In [31]:
# NOTE(review): nothing is staged before this commit (no `git add` / `-a`); the
# recorded output reports "no changes added to commit", so no commit was created.
! git commit -m "commit 06-08"

On branch main
Your branch is up to date with 'origin/main'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   Create_dataset_classes_with_time_cut.ipynb
	modified:   Graph Neural Networks.ipynb
	modified:   graph_dataset_creation.ipynb
	modified:   posts_charlie_hebdo.pkl

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.ipynb_checkpoints/
	charlie_hebdo_graph_dataset_node_embeddings_v2.pkl
	replies_charlie_hebdo.pkl

no changes added to commit (use "git add" and/or "git commit -a")


In [32]:
# Push local commits to the remote; the recorded output reports "Everything up-to-date".
!git push

Everything up-to-date
