In [1]:
import pandas as pd
import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
import numpy as np
from sklearn.metrics import precision_score, recall_score
import pickle

In [2]:
# Load the HeteroData object from the pkl file.
# NOTE(review): pickle.load can execute arbitrary code stored in the file,
# so this dataset must only ever be loaded from a trusted source.
with open('charlie_hebdo_graph_dataset_reply_node_embeddings.pkl', 'rb') as f:
    data = pickle.load(f)

In [3]:
# Display the loaded graph's summary (node types, edge types and tensor shapes).
data

HeteroData(
  user_id={
    x=[2002, 106],
    y=[2002],
    train_mask=[2002],
    val_mask=[2002],
    test_mask=[2002],
  },
  reply_id={ x=[19050, 104] },
  (user_id, retweet, reply_id)={ edge_index=[2, 19050] },
  (reply_id, rev_retweet, user_id)={ edge_index=[2, 19050] }
)

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, to_hetero

class GAT(nn.Module):
    """Two stacked GATConv layers followed by a linear classification head.

    The model is written for a single (homogeneous) node/edge type; the
    notebook converts it to a heterogeneous model with ``to_hetero``.
    """

    def __init__(self, dim_h, dim_out):
        """Build the layers.

        Args:
            dim_h: Width of both attention layers' outputs.
            dim_out: Number of output logits per node.
        """
        super().__init__()
        # Input sizes of the first layer are given as (-1, -1), i.e. left for
        # PyG to infer on the first forward pass.
        self.conv1 = GATConv((-1, -1), dim_h, add_self_loops=False)
        self.conv2 = GATConv(dim_h, dim_h, add_self_loops=False)
        self.linear = nn.Linear(dim_h, dim_out)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x, edge_index):
        """Return per-node logits for features ``x`` and connectivity ``edge_index``."""
        # Each attention layer is followed by ReLU and then dropout.
        hidden = self.dropout(self.conv1(x, edge_index).relu())
        hidden = self.dropout(self.conv2(hidden, edge_index).relu())
        return self.linear(hidden)

# Build the homogeneous GAT and convert it to a heterogeneous model that
# follows the node/edge types of `data`, summing messages across edge types.
model = GAT(dim_h=64, dim_out=2)
model = to_hetero(model, data.metadata(), aggr='sum')

# Place the graph and the model on the target device first.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

# The first GATConv is declared with lazy (-1, -1) input sizes. Run one
# forward pass so its parameters are materialised before the optimizer
# captures them (the order recommended for lazy modules).
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

@torch.no_grad()
def test(mask):
    """Return the accuracy of the current model on the ``user_id`` nodes selected by ``mask``.

    Args:
        mask: Boolean mask over the ``user_id`` nodes.

    Returns:
        Fraction of selected nodes whose arg-max prediction equals the label,
        as a Python float.
    """
    model.eval()
    logits = model(data.x_dict, data.edge_index_dict)['user_id']
    predicted = logits.argmax(dim=-1)[mask]
    expected = data['user_id'].y[mask]
    n_correct = (predicted == expected).sum()
    return float(n_correct / mask.sum())

# Full-batch training: each epoch is one forward/backward pass over the whole
# graph, with the loss restricted to the training `user_id` nodes.
for epoch in range(100):
    model.train()  # test() below switches to eval mode, so re-enable training mode every epoch
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)['user_id']
    mask = data['user_id'].train_mask
    loss = F.cross_entropy(out[mask], data['user_id'].y[mask])
    loss.backward()
    optimizer.step()

    # Report every 5th epoch. The printed loss was computed before this
    # epoch's optimizer step; the accuracies are measured after it.
    if epoch % 5 == 0:
        train_acc = test(data['user_id'].train_mask)
        val_acc = test(data['user_id'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Final evaluation on the test nodes, using the weights from the last epoch.
test_acc = test(data['user_id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')


Epoch:   0 | Train Loss: 0.7665 | Train Acc: 23.70% | Val Acc: 27.33%
Epoch:   5 | Train Loss: 0.7085 | Train Acc: 33.76% | Val Acc: 35.67%
Epoch:  10 | Train Loss: 0.6862 | Train Acc: 88.94% | Val Acc: 87.00%
Epoch:  15 | Train Loss: 0.6583 | Train Acc: 89.65% | Val Acc: 90.00%
Epoch:  20 | Train Loss: 0.6465 | Train Acc: 89.79% | Val Acc: 90.67%
Epoch:  25 | Train Loss: 0.6148 | Train Acc: 89.79% | Val Acc: 91.67%
Epoch:  30 | Train Loss: 0.5942 | Train Acc: 90.15% | Val Acc: 91.67%
Epoch:  35 | Train Loss: 0.5934 | Train Acc: 90.51% | Val Acc: 91.00%
Epoch:  40 | Train Loss: 0.5471 | Train Acc: 90.72% | Val Acc: 91.33%
Epoch:  45 | Train Loss: 0.5332 | Train Acc: 91.01% | Val Acc: 91.67%
Epoch:  50 | Train Loss: 0.5423 | Train Acc: 91.29% | Val Acc: 92.33%
Epoch:  55 | Train Loss: 0.5090 | Train Acc: 91.79% | Val Acc: 93.00%
Epoch:  60 | Train Loss: 0.4890 | Train Acc: 91.79% | Val Acc: 93.33%
Epoch:  65 | Train Loss: 0.4655 | Train Acc: 92.36% | Val Acc: 93.33%
Epoch:  70 | Train L

In [5]:
# Macro-averaged precision of the trained model on the test `user_id` nodes.
test_mask = data['user_id'].test_mask
model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():  # evaluation does not need an autograd graph
    pred = model(data.x_dict, data.edge_index_dict)['user_id'].argmax(dim=-1)
# scikit-learn works on host memory, so move both tensors to the CPU first
# (otherwise this cell fails when the notebook runs on a GPU).
true_labels = data['user_id'].y[test_mask].cpu()
pred_labels = pred[test_mask].cpu()
precision_score(true_labels, pred_labels, average='macro')

0.9274779946421737

In [6]:
# Macro-averaged recall for the same test-set labels and predictions used for the precision above.
recall_score(true_labels, pred_labels, average='macro')

0.936897001303781

In [7]:

from sklearn.metrics import precision_score, recall_score

class GAT(nn.Module):
    """Single GATConv layer followed by a linear classification head.

    NOTE(review): this redefines the name ``GAT`` already used by the
    two-layer model earlier in the notebook; the later definition wins.
    """

    def __init__(self, dim_h, dim_out):
        """Build the layers.

        Args:
            dim_h: Width of the attention layer's output.
            dim_out: Number of output logits per node.
        """
        super().__init__()
        # Input sizes are given as (-1, -1), i.e. left for PyG to infer on
        # the first forward pass.
        self.conv = GATConv((-1, -1), dim_h, add_self_loops=False)
        self.linear = nn.Linear(dim_h, dim_out)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x, edge_index):
        """Return per-node logits for features ``x`` and connectivity ``edge_index``."""
        # Attention layer -> ReLU -> dropout -> linear head.
        hidden = self.dropout(self.conv(x, edge_index).relu())
        return self.linear(hidden)

# Build the single-layer GAT and convert it to a heterogeneous model that
# follows the node/edge types of `data`, summing messages across edge types.
model = GAT(dim_h=64, dim_out=2)
model = to_hetero(model, data.metadata(), aggr='sum')

# Place the graph and the model on the target device first.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

# The GATConv is declared with lazy (-1, -1) input sizes. Run one forward
# pass so its parameters are materialised before the optimizer captures them
# (the order recommended for lazy modules).
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

@torch.no_grad()
def test(mask):
    """Return the accuracy of the current model on the ``user_id`` nodes selected by ``mask``.

    Args:
        mask: Boolean mask over the ``user_id`` nodes.

    Returns:
        Fraction of selected nodes whose arg-max prediction equals the label,
        as a Python float.
    """
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict)['user_id'].argmax(dim=-1)
    true_labels = data['user_id'].y[mask]
    # Keep predictions on the same device as the labels: moving only one side
    # to the CPU makes the comparison fail when the notebook runs on a GPU.
    pred_labels = pred[mask]
    acc = (pred_labels == true_labels).sum() / mask.sum()
    return float(acc)

# Full-batch training: each epoch is one forward/backward pass over the whole
# graph, with the loss restricted to the training `user_id` nodes.
for epoch in range(100):
    model.train()  # test() below switches to eval mode, so re-enable training mode every epoch
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)['user_id']
    mask = data['user_id'].train_mask
    loss = F.cross_entropy(out[mask], data['user_id'].y[mask])
    loss.backward()
    optimizer.step()

    # Report every 5th epoch. The printed loss was computed before this
    # epoch's optimizer step; the accuracies are measured after it.
    if epoch % 5 == 0:
        train_acc = test(data['user_id'].train_mask)
        val_acc = test(data['user_id'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Final evaluation on the test nodes, using the weights from the last epoch.
test_acc = test(data['user_id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Epoch:   0 | Train Loss: 0.8510 | Train Acc: 27.41% | Val Acc: 28.00%
Epoch:   5 | Train Loss: 0.6648 | Train Acc: 82.51% | Val Acc: 78.67%
Epoch:  10 | Train Loss: 0.5663 | Train Acc: 84.58% | Val Acc: 82.67%
Epoch:  15 | Train Loss: 0.4597 | Train Acc: 84.87% | Val Acc: 83.33%
Epoch:  20 | Train Loss: 0.3765 | Train Acc: 86.22% | Val Acc: 84.00%
Epoch:  25 | Train Loss: 0.3197 | Train Acc: 87.22% | Val Acc: 84.00%
Epoch:  30 | Train Loss: 0.2930 | Train Acc: 88.08% | Val Acc: 83.33%
Epoch:  35 | Train Loss: 0.2690 | Train Acc: 89.01% | Val Acc: 83.67%
Epoch:  40 | Train Loss: 0.2423 | Train Acc: 89.58% | Val Acc: 83.67%
Epoch:  45 | Train Loss: 0.2317 | Train Acc: 90.29% | Val Acc: 83.67%
Epoch:  50 | Train Loss: 0.2350 | Train Acc: 90.51% | Val Acc: 84.33%
Epoch:  55 | Train Loss: 0.2129 | Train Acc: 91.08% | Val Acc: 84.67%
Epoch:  60 | Train Loss: 0.2027 | Train Acc: 91.65% | Val Acc: 84.67%
Epoch:  65 | Train Loss: 0.1975 | Train Acc: 92.08% | Val Acc: 83.67%
Epoch:  70 | Train L

In [8]:
# Macro-averaged precision of the single-layer GAT on the test `user_id` nodes.
# `test_mask` is reused from the earlier precision cell.
model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():  # evaluation does not need an autograd graph
    pred = model(data.x_dict, data.edge_index_dict)['user_id'].argmax(dim=-1)
# scikit-learn works on host memory, so move both tensors to the CPU first
# (otherwise this cell fails when the notebook runs on a GPU).
true_labels = data['user_id'].y[test_mask].cpu()
pred_labels = pred[test_mask].cpu()
precision_score(true_labels, pred_labels, average='macro')

0.6890268592252666

In [9]:
# Macro-averaged recall for the same test-set labels and predictions used for the precision above.
recall_score(true_labels, pred_labels, average='macro')

0.6825945241199478

In [10]:
import torch
import torch.nn.functional as F
from torch import nn

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HANConv, Linear





class HAN(nn.Module):
    """One HANConv layer followed by a linear classifier on the ``user_id`` nodes.

    The graph metadata is read from the notebook-level ``data`` object when
    the model is constructed.
    """

    def __init__(self, dim_in, dim_out, dim_h=64, heads=4):
        """Build the layers.

        Args:
            dim_in: Input feature size passed to HANConv.
            dim_out: Number of output logits per ``user_id`` node.
            dim_h: Output width of the HANConv layer.
            heads: Number of attention heads.
        """
        super().__init__()
        graph_metadata = data.metadata()
        self.han = HANConv(
            dim_in,
            dim_h,
            heads=heads,
            dropout=0.5,
            metadata=graph_metadata,
        )
        self.linear = nn.Linear(dim_h, dim_out)

    def forward(self, x_dict, edge_index_dict):
        """Return logits for the ``user_id`` nodes only."""
        node_embeddings = self.han(x_dict, edge_index_dict)
        return self.linear(node_embeddings['user_id'])

# Single-layer HAN; dim_in=-1 leaves the input feature sizes to be inferred lazily.
model = HAN(dim_in=-1, dim_out=2)

# Place the graph and the model on the target device first.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

# Run one forward pass so the lazily-sized parameters are materialised before
# the optimizer captures them (the order recommended for lazy modules).
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

@torch.no_grad()
def test(mask):
    """Return the accuracy of the current model on the ``user_id`` nodes selected by ``mask``.

    Args:
        mask: Boolean mask over the ``user_id`` nodes.

    Returns:
        Fraction of selected nodes whose arg-max prediction equals the label,
        as a Python float.
    """
    model.eval()
    # The HAN model already returns logits for the `user_id` nodes only.
    logits = model(data.x_dict, data.edge_index_dict)
    predicted = logits.argmax(dim=-1)[mask]
    expected = data['user_id'].y[mask]
    n_correct = (predicted == expected).sum()
    return float(n_correct / mask.sum())

# Full-batch training: each epoch is one forward/backward pass over the whole
# graph, with the loss restricted to the training `user_id` nodes.
for epoch in range(500):
    model.train()  # test() below switches to eval mode, so re-enable training mode every epoch
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    mask = data['user_id'].train_mask
    loss = F.cross_entropy(out[mask], data['user_id'].y[mask])
    loss.backward()
    optimizer.step()

    # Report every 20th epoch. The printed loss was computed before this
    # epoch's optimizer step; the accuracies are measured after it.
    if epoch % 20 == 0:
        train_acc = test(data['user_id'].train_mask)
        val_acc = test(data['user_id'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Final evaluation on the test nodes, using the weights from the last epoch.
test_acc = test(data['user_id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Epoch:   0 | Train Loss: 0.7825 | Train Acc: 27.48% | Val Acc: 28.00%
Epoch:  20 | Train Loss: 0.5739 | Train Acc: 86.65% | Val Acc: 83.33%
Epoch:  40 | Train Loss: 0.4147 | Train Acc: 93.29% | Val Acc: 88.00%
Epoch:  60 | Train Loss: 0.3042 | Train Acc: 95.86% | Val Acc: 90.00%
Epoch:  80 | Train Loss: 0.2408 | Train Acc: 97.36% | Val Acc: 89.33%
Epoch: 100 | Train Loss: 0.2083 | Train Acc: 98.64% | Val Acc: 88.67%
Epoch: 120 | Train Loss: 0.1580 | Train Acc: 99.29% | Val Acc: 88.33%
Epoch: 140 | Train Loss: 0.1421 | Train Acc: 99.50% | Val Acc: 87.33%
Epoch: 160 | Train Loss: 0.1188 | Train Acc: 99.71% | Val Acc: 88.00%
Epoch: 180 | Train Loss: 0.1113 | Train Acc: 99.86% | Val Acc: 88.67%
Epoch: 200 | Train Loss: 0.0904 | Train Acc: 99.86% | Val Acc: 87.67%
Epoch: 220 | Train Loss: 0.0786 | Train Acc: 99.93% | Val Acc: 88.33%
Epoch: 240 | Train Loss: 0.0836 | Train Acc: 99.93% | Val Acc: 88.67%
Epoch: 260 | Train Loss: 0.0700 | Train Acc: 99.93% | Val Acc: 87.67%
Epoch: 280 | Train L

In [11]:
# Macro-averaged precision of the single-layer HAN on the test `user_id` nodes.
# `test_mask` is reused from the earlier precision cell.
model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():  # evaluation does not need an autograd graph
    pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)
# scikit-learn works on host memory, so move both tensors to the CPU first
# (otherwise this cell fails when the notebook runs on a GPU).
true_labels = data['user_id'].y[test_mask].cpu()
pred_labels = pred[test_mask].cpu()
precision_score(true_labels, pred_labels, average='macro')

0.8220175438596491

In [12]:
# Macro-averaged recall for the same test-set labels and predictions used for the precision above.
recall_score(true_labels, pred_labels, average='macro')

0.8589634941329857

In [13]:
import torch
import torch.nn.functional as F
from torch import nn

import torch_geometric.transforms as T
from torch_geometric.datasets import DBLP
from torch_geometric.nn import HANConv, Linear





class HAN(nn.Module):
    """Two stacked HANConv layers followed by a linear classifier on the ``user_id`` nodes.

    The graph metadata is read from the notebook-level ``data`` object when
    the model is constructed.

    NOTE(review): this redefines the name ``HAN`` already used by the
    single-layer model earlier in the notebook; the later definition wins.
    """

    def __init__(self, dim_in, dim_out, dim_h=64, heads=4):
        """Build the layers.

        Args:
            dim_in: Input feature size passed to the first HANConv.
            dim_out: Number of output logits per ``user_id`` node.
            dim_h: Output width of both HANConv layers.
            heads: Number of attention heads in each HANConv.
        """
        super().__init__()
        # First layer maps the raw features to dim_h, the second keeps dim_h.
        self.han = HANConv(
            dim_in,
            dim_h,
            heads=heads,
            dropout=0.5,
            metadata=data.metadata(),
        )
        self.han2 = HANConv(
            dim_h,
            dim_h,
            heads=heads,
            dropout=0.5,
            metadata=data.metadata(),
        )
        self.linear = nn.Linear(dim_h, dim_out)

    def forward(self, x_dict, edge_index_dict):
        """Return logits for the ``user_id`` nodes only."""
        first_pass = self.han(x_dict, edge_index_dict)
        second_pass = self.han2(first_pass, edge_index_dict)
        return self.linear(second_pass['user_id'])

# Two-layer HAN; dim_in=-1 leaves the input feature sizes to be inferred lazily.
model = HAN(dim_in=-1, dim_out=2)

# Place the graph and the model on the target device first.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data, model = data.to(device), model.to(device)

# Run one forward pass so the lazily-sized parameters are materialised before
# the optimizer captures them (the order recommended for lazy modules).
with torch.no_grad():
    model(data.x_dict, data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

@torch.no_grad()
def test(mask):
    """Return the accuracy of the current model on the ``user_id`` nodes selected by ``mask``.

    Args:
        mask: Boolean mask over the ``user_id`` nodes.

    Returns:
        Fraction of selected nodes whose arg-max prediction equals the label,
        as a Python float.
    """
    model.eval()
    # The HAN model already returns logits for the `user_id` nodes only.
    class_ids = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)
    hits = class_ids[mask] == data['user_id'].y[mask]
    accuracy = hits.sum() / mask.sum()
    return float(accuracy)

# Full-batch training: each epoch is one forward/backward pass over the whole
# graph, with the loss restricted to the training `user_id` nodes.
for epoch in range(500):
    model.train()  # test() below switches to eval mode, so re-enable training mode every epoch
    optimizer.zero_grad()
    out = model(data.x_dict, data.edge_index_dict)
    mask = data['user_id'].train_mask
    loss = F.cross_entropy(out[mask], data['user_id'].y[mask])
    loss.backward()
    optimizer.step()

    # Report every 20th epoch. The printed loss was computed before this
    # epoch's optimizer step; the accuracies are measured after it.
    if epoch % 20 == 0:
        train_acc = test(data['user_id'].train_mask)
        val_acc = test(data['user_id'].val_mask)
        print(f'Epoch: {epoch:>3} | Train Loss: {loss:.4f} | Train Acc: {train_acc*100:.2f}% | Val Acc: {val_acc*100:.2f}%')

# Final evaluation on the test nodes, using the weights from the last epoch.
test_acc = test(data['user_id'].test_mask)
print(f'Test accuracy: {test_acc*100:.2f}%')

Epoch:   0 | Train Loss: 0.6881 | Train Acc: 83.51% | Val Acc: 81.67%
Epoch:  20 | Train Loss: 0.6121 | Train Acc: 90.22% | Val Acc: 91.67%
Epoch:  40 | Train Loss: 0.5350 | Train Acc: 92.22% | Val Acc: 92.33%
Epoch:  60 | Train Loss: 0.4833 | Train Acc: 93.29% | Val Acc: 94.00%
Epoch:  80 | Train Loss: 0.4609 | Train Acc: 94.15% | Val Acc: 94.00%
Epoch: 100 | Train Loss: 0.4339 | Train Acc: 95.29% | Val Acc: 95.33%
Epoch: 120 | Train Loss: 0.4051 | Train Acc: 96.07% | Val Acc: 95.00%
Epoch: 140 | Train Loss: 0.3738 | Train Acc: 96.57% | Val Acc: 95.67%
Epoch: 160 | Train Loss: 0.3653 | Train Acc: 97.29% | Val Acc: 94.33%
Epoch: 180 | Train Loss: 0.3405 | Train Acc: 97.86% | Val Acc: 94.33%
Epoch: 200 | Train Loss: 0.3150 | Train Acc: 98.72% | Val Acc: 95.33%
Epoch: 220 | Train Loss: 0.2973 | Train Acc: 98.79% | Val Acc: 95.67%
Epoch: 240 | Train Loss: 0.2901 | Train Acc: 99.14% | Val Acc: 94.67%
Epoch: 260 | Train Loss: 0.2757 | Train Acc: 99.36% | Val Acc: 95.33%
Epoch: 280 | Train L

In [14]:
# Macro-averaged precision of the two-layer HAN on the test `user_id` nodes.
# `test_mask` is reused from the earlier precision cell.
model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():  # evaluation does not need an autograd graph
    pred = model(data.x_dict, data.edge_index_dict).argmax(dim=-1)
# scikit-learn works on host memory, so move both tensors to the CPU first
# (otherwise this cell fails when the notebook runs on a GPU).
true_labels = data['user_id'].y[test_mask].cpu()
pred_labels = pred[test_mask].cpu()
precision_score(true_labels, pred_labels, average='macro')

0.9313233376792699

In [15]:
# Macro-averaged recall for the same test-set labels and predictions used for the precision above.
recall_score(true_labels, pred_labels, average='macro')

0.9313233376792699

In [30]:
# Shell escape: commit the staged changes from inside the notebook.
# NOTE(review): the recorded output shows this commit adding .pkl datasets and
# .ipynb_checkpoints files; consider keeping those out of version control.
! git commit -m "commit 30-06"

[main 64e6c49] commit 30-06
 9 files changed, 4605 insertions(+), 6000 deletions(-)
 create mode 100644 Dataset creation/.ipynb_checkpoints/Graph Neural Networks-checkpoint.ipynb
 rename Dataset creation/.ipynb_checkpoints/{Charlie_hebdo_v1-checkpoint.ipynb => eda-checkpoint.ipynb} (99%)
 create mode 100644 Dataset creation/Graph Neural Networks.ipynb
 create mode 100644 Dataset creation/charlie_hebdo_graph_dataset_node_embeddings.pkl
 rename Dataset creation/{charlie_hebdo_graph_dataset.pkl => charlie_hebdo_graph_dataset_reply_node_embeddings.pkl} (59%)
 rename Dataset creation/{Charlie_hebdo_v1.ipynb => eda.ipynb} (94%)


In [None]:
# Shell escape: push the commit to the remote.
# NOTE(review): the recorded output shows the push failing with HTTP 400 while
# writing about 103 MiB of objects; the committed .pkl files are the likely
# cause (TODO confirm), so consider Git LFS or a .gitignore entry for them.
!git push

Enumerating objects: 41, done.
Counting objects: 100% (41/41), done.
Delta compression using up to 2 threads
Compressing objects: 100% (35/35), done.
error: RPC failed; HTTP 400 curl 22 The requested URL returned error: 400
send-pack: unexpected disconnect while reading sideband packet
Writing objects:  45% (18/40), 103.07 MiB | 3.64 MiB/s 