# Graph Deep Learning on Graph to study Fake News

In [156]:
!pip install dgl



In [157]:
!pip install torch_geometric



In [241]:
from torch_geometric.datasets import UPFD
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, global_mean_pool
from torch.nn.modules import Linear
from torch_geometric.transforms import ToUndirected
from torch.functional import F
import torch
import argparse
import os.path as osp


The dataset is initialized there. Note that:
- `feature` can be `content` (raw content of the tweet), `bert` (content transformed by a transformer), `profile` (user profile info such as number of tweets, followers, and join date), `spacy` (content transformed by a simple NLP model)
- `dataset` is either `politifact` or `gossipcop`

In [273]:
from torch_geometric.data import DataLoader
from torch_geometric.datasets import UPFD
import os.path as osp
import torch

# Paths and settings
_file_ = '..'
file = _file_
dataset = 'gossipcop'  # or 'politifact'
path = osp.join(osp.dirname(osp.realpath(file)), '..', 'data', 'UPFD')

# Load datasets with bert and profile features separately
train_dataset_bert = UPFD(path, dataset, 'bert', 'train')
train_dataset_profile = UPFD(path, dataset, 'profile', 'train')

val_dataset_bert = UPFD(path, dataset, 'bert', 'val')
val_dataset_profile = UPFD(path, dataset, 'profile', 'val')

test_dataset_bert = UPFD(path, dataset, 'bert', 'test')
test_dataset_profile = UPFD(path, dataset, 'profile', 'test')

# Check that both datasets are aligned
assert len(train_dataset_bert) == len(train_dataset_profile)
assert len(val_dataset_bert) == len(val_dataset_profile)
assert len(test_dataset_bert) == len(test_dataset_profile)

# Function to combine features
def combine_features(dataset_bert, dataset_profile):
    combined_data = []
    for data_bert, data_profile in zip(dataset_bert, dataset_profile):
        data_bert.x = torch.cat([data_bert.x, data_profile.x], dim=-1)  # Concatenate features
        combined_data.append(data_bert)
    return combined_data

# Combine features for train, val, and test datasets
train_dataset = combine_features(train_dataset_bert, train_dataset_profile)
val_dataset = combine_features(val_dataset_bert, val_dataset_profile)
test_dataset = combine_features(test_dataset_bert, test_dataset_profile)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
print(len(train_dataset) + len(val_dataset) + len(test_dataset))

5464




In [245]:
print(train_dataset)
print(train_dataset[0].y)
print(train_dataset[0].x)
print(train_dataset[0].edge_index)
#train_dataset[i] représente l'arbre i
#Le champ x représente la valeur des noeud, il est de taille (n, 778).
#Le champ edge index représente les connexion dans l'abre. Il est de taille (2, n - 1). edge_index[0][i] est
#le père de edge_index[1][i]. En pratique, il se trouve que edge_index[1][i] est toujours un linespace, ce qui est possible
#Le champ y représente si c'est un graphe de fake news ou pas

[Data(x=[76, 778], edge_index=[2, 75], y=[1]), Data(x=[125, 778], edge_index=[2, 124], y=[1]), Data(x=[6, 778], edge_index=[2, 5], y=[1]), Data(x=[56, 778], edge_index=[2, 55], y=[1]), Data(x=[90, 778], edge_index=[2, 89], y=[1]), Data(x=[21, 778], edge_index=[2, 20], y=[1]), Data(x=[76, 778], edge_index=[2, 75], y=[1]), Data(x=[89, 778], edge_index=[2, 88], y=[1]), Data(x=[77, 778], edge_index=[2, 76], y=[1]), Data(x=[27, 778], edge_index=[2, 26], y=[1]), Data(x=[88, 778], edge_index=[2, 87], y=[1]), Data(x=[127, 778], edge_index=[2, 126], y=[1]), Data(x=[88, 778], edge_index=[2, 87], y=[1]), Data(x=[19, 778], edge_index=[2, 18], y=[1]), Data(x=[173, 778], edge_index=[2, 172], y=[1]), Data(x=[54, 778], edge_index=[2, 53], y=[1]), Data(x=[19, 778], edge_index=[2, 18], y=[1]), Data(x=[88, 778], edge_index=[2, 87], y=[1]), Data(x=[24, 778], edge_index=[2, 23], y=[1]), Data(x=[72, 778], edge_index=[2, 71], y=[1]), Data(x=[12, 778], edge_index=[2, 11], y=[1]), Data(x=[38, 778], edge_index=

### Model

In [None]:
class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels,
                 concat=False):
        super().__init__()
        self.concat = concat
        self.conv1 = GATConv(in_channels, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        self.max_pooling = global_mean_pool # They use Max_pool in the article but that doesn't work pretty well.
        self.lin1 = Linear(hidden_channels, 2 * hidden_channels)
        self.lin2 = Linear(2*hidden_channels,2)

    def forward(self, x, edge_index, batch):
        x = self.conv1(x, edge_index)
        x = torch.nn.functional.selu(x)
        x = self.conv2(x, edge_index)
        x = torch.nn.functional.selu(x)
        x = self.max_pooling(x, batch)
        x = torch.nn.functional.selu(x)
        x = self.lin1(x)
        x = torch.nn.functional.selu(x)
        x = self.lin2(x)
        return x.softmax(dim=-1)

In [275]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net(778, 256,2, concat=True).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=0.01)

In [276]:
loss_fn = torch.nn.HingeEmbeddingLoss()
def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        y = torch.tensor([[1,0] if e==1 else [0,1] for e in data.y]).to(device)
        loss = loss_fn(out,y)
        #loss = F.nll_loss(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * data.num_graphs

    return total_loss / len(train_loader.dataset)
train()

TypeError: global_mean_pool() missing 1 required positional argument: 'batch'

In [253]:
@torch.no_grad()
def test(loader):
    model.eval()

    total_correct = total_examples = 0
    for data in loader:
        data = data.to(device)
        pred = model(data.x, data.edge_index, data.batch).argmax(dim=-1)
        total_correct += int((pred ==
         data.y).sum())
        total_examples += data.num_graphs

    return total_correct / total_examples

In [254]:
for epoch in range(1, 200):
    loss = train()
    train_acc = test(train_loader)
    val_acc = test(val_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, 'f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Epoch: 01, Loss: 0.7544, Train: 0.5101, Val: 0.4853, Test: 0.4992
Epoch: 02, Loss: 0.7467, Train: 0.5101, Val: 0.4853, Test: 0.4992
Epoch: 03, Loss: 0.7462, Train: 0.4899, Val: 0.5147, Test: 0.5008
Epoch: 04, Loss: 0.7506, Train: 0.5101, Val: 0.4853, Test: 0.4992
Epoch: 05, Loss: 0.7490, Train: 0.5101, Val: 0.4853, Test: 0.4992
Epoch: 06, Loss: 0.7464, Train: 0.5101, Val: 0.4853, Test: 0.4992
Epoch: 07, Loss: 0.7439, Train: 0.6181, Val: 0.6337, Test: 0.6213
Epoch: 08, Loss: 0.7146, Train: 0.6145, Val: 0.6117, Test: 0.5967
Epoch: 09, Loss: 0.6761, Train: 0.6319, Val: 0.6429, Test: 0.6257
Epoch: 10, Loss: 0.6825, Train: 0.7134, Val: 0.7070, Test: 0.6882
Epoch: 11, Loss: 0.6640, Train: 0.6694, Val: 0.6758, Test: 0.6639
Epoch: 12, Loss: 0.6421, Train: 0.6886, Val: 0.6758, Test: 0.6803
Epoch: 13, Loss: 0.6477, Train: 0.7747, Val: 0.7875, Test: 0.7679
Epoch: 14, Loss: 0.6800, Train: 0.5989, Val: 0.6117, Test: 0.5949
Epoch: 15, Loss: 0.6604, Train: 0.7701, Val: 0.7692, Test: 0.7538
Epoch: 16,

KeyboardInterrupt: 