# Graph Deep Learning to Study Fake News

Our dataset is the one introduced in https://arxiv.org/pdf/2104.12259 .
The graph studied in the article has two kinds of nodes: one per author and one per tweet. Here, however, there is only one node per author and none per tweet, so the focus is on how users spread the news. The data stored for each author is the same as in the article, except that we lack the social-network part. In particular:
- BERT is a word embedding of the author's preferences (the output of a transformer fed with the author's averaged tweets)
- Profile is a vector representing the author's information (number of followers, likes, etc.)

In [156]:
!pip install dgl



In [157]:
!pip install torch_geometric



In [42]:
from torch_geometric.datasets import UPFD
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GATConv, global_mean_pool
from torch.nn.modules import Linear
from torch_geometric.transforms import ToUndirected
from torch.functional import F
import torch
import argparse
import os.path as osp


The dataset is initialized here. Note that:
- `feature` can be `content` (raw content of the tweet), `bert` (content transformed by a transformer), `profile` (user profile info such as number of tweets, followers, and join date), or `spacy` (content transformed by a simple NLP model)
- `dataset` is either `politifact` or `gossipcop`

In [58]:
from torch_geometric.data import DataLoader 
from torch_geometric.datasets import UPFD
import os.path as osp
import torch

# Dataset location and selection.
# `_file_` presumably stands in for `__file__`, which a notebook does not
# define; the data directory is resolved relative to it.
_file_ = '..'
file = _file_
dataset = 'gossipcop'  # alternative: 'politifact'
path = osp.join(osp.dirname(osp.realpath(file)), '..', 'data', 'UPFD')


def _load_feature_pair(split):
    """Return the ('bert', 'profile') UPFD datasets for one split, in that order."""
    return UPFD(path, dataset, 'bert', split), UPFD(path, dataset, 'profile', split)


# Each split is loaded twice: once with BERT features, once with profile features.
train_dataset_bert, train_dataset_profile = _load_feature_pair('train')
val_dataset_bert, val_dataset_profile = _load_feature_pair('val')
test_dataset_bert, test_dataset_profile = _load_feature_pair('test')

# Both feature variants of a split must contain the same number of graphs,
# otherwise they cannot be paired graph by graph.
assert len(train_dataset_bert) == len(train_dataset_profile)
assert len(val_dataset_bert) == len(val_dataset_profile)
assert len(test_dataset_bert) == len(test_dataset_profile)

# Function to combine features
def combine_features(dataset_bert, dataset_profile):
    """Pair two datasets graph by graph and concatenate their node features.

    For every pair, the ``x`` of the item from ``dataset_bert`` is replaced by
    the concatenation (along the last dimension) of its own ``x`` and the
    ``x`` of the matching item from ``dataset_profile``.

    Args:
        dataset_bert: Iterable of graph objects exposing an ``x`` tensor.
        dataset_profile: Iterable of graph objects exposing an ``x`` tensor,
            iterated in lockstep with ``dataset_bert``.

    Returns:
        A list of the (modified) items taken from ``dataset_bert``. Pairing
        stops at the shorter of the two inputs.
    """
    def _append_profile(graph, profile_graph):
        # The item yielded by `dataset_bert` is updated in place and reused.
        graph.x = torch.cat((graph.x, profile_graph.x), dim=-1)
        return graph

    return [
        _append_profile(graph, profile_graph)
        for graph, profile_graph in zip(dataset_bert, dataset_profile)
    ]

# Merge the BERT and profile variants of each split into one list of graphs.
train_dataset, val_dataset, test_dataset = (
    combine_features(bert_split, profile_split)
    for bert_split, profile_split in (
        (train_dataset_bert, train_dataset_profile),
        (val_dataset_bert, val_dataset_profile),
        (test_dataset_bert, test_dataset_profile),
    )
)

# Mini-batch loaders: only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=64, shuffle=False)
    for split in (val_dataset, test_dataset)
)

# Total number of graphs across the three splits.
print(sum(map(len, (train_dataset, val_dataset, test_dataset))))

5464




In [59]:
# Feature matrix shape of the first training graph: (number of nodes, number of features).
print(train_dataset[0].x.shape)
# Total number of nodes in the first 128 training graphs.
print(sum(graph.x.shape[0] for graph in train_dataset[0:128]))
# Number of nodes per graph in the first mini-batch only.
for e in train_loader:
    # `e.batch` holds, for every node, the index of the graph it belongs to,
    # so a bincount yields the node count of each graph in one vectorised
    # call. `minlength=128` keeps the printed list at 128 slots; slots beyond
    # the number of graphs in the batch stay at 0.
    s = torch.bincount(e.batch, minlength=128).tolist()
    print(s)
    break

torch.Size([76, 778])
6883
[7, 41, 91, 77, 69, 116, 70, 130, 107, 55, 26, 45, 28, 62, 9, 7, 8, 55, 169, 16, 91, 56, 126, 151, 118, 82, 117, 163, 10, 102, 71, 94, 42, 120, 125, 28, 20, 83, 60, 70, 8, 61, 25, 10, 91, 31, 18, 81, 192, 23, 58, 65, 64, 67, 80, 85, 70, 17, 95, 18, 48, 76, 130, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [60]:
# train_dataset[i] is the i-th tree.
#  - x holds the node values; its size is (n, 778).
#  - edge_index holds the connections of the tree; its size is (2, n - 1).
#    edge_index[0][i] is the parent of edge_index[1][i]. In practice
#    edge_index[1] always turns out to be a linspace, which is possible.
#  - y tells whether the graph is a fake-news graph or not.
print(train_dataset)
first_tree = train_dataset[0]
for field in (first_tree.y, first_tree.x, first_tree.edge_index):
    print(field)

[Data(x=[76, 778], edge_index=[2, 75], y=[1]), Data(x=[125, 778], edge_index=[2, 124], y=[1]), Data(x=[6, 778], edge_index=[2, 5], y=[1]), Data(x=[56, 778], edge_index=[2, 55], y=[1]), Data(x=[90, 778], edge_index=[2, 89], y=[1]), Data(x=[21, 778], edge_index=[2, 20], y=[1]), Data(x=[76, 778], edge_index=[2, 75], y=[1]), Data(x=[89, 778], edge_index=[2, 88], y=[1]), Data(x=[77, 778], edge_index=[2, 76], y=[1]), Data(x=[27, 778], edge_index=[2, 26], y=[1]), Data(x=[88, 778], edge_index=[2, 87], y=[1]), Data(x=[127, 778], edge_index=[2, 126], y=[1]), Data(x=[88, 778], edge_index=[2, 87], y=[1]), Data(x=[19, 778], edge_index=[2, 18], y=[1]), Data(x=[173, 778], edge_index=[2, 172], y=[1]), Data(x=[54, 778], edge_index=[2, 53], y=[1]), Data(x=[19, 778], edge_index=[2, 18], y=[1]), Data(x=[88, 778], edge_index=[2, 87], y=[1]), Data(x=[24, 778], edge_index=[2, 23], y=[1]), Data(x=[72, 778], edge_index=[2, 71], y=[1]), Data(x=[12, 778], edge_index=[2, 11], y=[1]), Data(x=[38, 778], edge_index=

### Model

In [61]:
class Net(torch.nn.Module):
    """Graph classifier: two GAT layers, mean pooling, then a two-layer head.

    Every stage except the last linear layer is followed by a SELU
    activation. The output of the last layer (2 values) goes through a
    softmax, so ``forward`` returns probabilities rather than raw logits.

    Args:
        in_channels: Size of the input node features given to the first
            ``GATConv``.
        hidden_channels: Output size of both ``GATConv`` layers; the first
            linear layer expands it to ``2 * hidden_channels``.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        # Graph attention layers operating on node features.
        self.conv1 = GATConv(in_channels, hidden_channels)
        self.conv2 = GATConv(hidden_channels, hidden_channels)
        # Readout: pools node embeddings according to the `batch` vector.
        self.mean_pooling = global_mean_pool
        # Classification head ending in 2 outputs.
        self.lin1 = Linear(hidden_channels, 2 * hidden_channels)
        self.lin2 = Linear(2 * hidden_channels, 2)

    def forward(self, x, edge_index, batch):
        """Return the softmax over the 2 outputs of the classification head.

        Args:
            x: Node feature tensor.
            edge_index: Edge list passed to both ``GATConv`` layers.
            batch: Node-to-graph assignment passed to the mean pooling.
        """
        selu = torch.nn.functional.selu
        node_emb = selu(self.conv1(x, edge_index))
        node_emb = selu(self.conv2(node_emb, edge_index))
        graph_emb = selu(self.mean_pooling(node_emb, batch))
        hidden = selu(self.lin1(graph_emb))
        scores = self.lin2(hidden)
        return scores.softmax(dim=-1)

In [84]:
# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 778 input features per node, 128 hidden channels.
model = Net(778, 128).to(device)
# Adam with the AMSGrad variant; no weight decay is applied.
optimizer = torch.optim.Adam(model.parameters(), lr=5e-2, amsgrad=True)

# Mean value of all freshly initialised parameters.
weights = list(model.parameters())
total_sum = sum(weight.data.sum() for weight in weights)
total_params = sum(weight.numel() for weight in weights)

average = total_sum / total_params
average

tensor(0.0002, device='cuda:0')

In [None]:
# NOTE(review): `Net.forward` returns softmax probabilities and `train` builds
# targets that are +1 on the wrong class and -1 on the right class. With the
# default margin of 1 this loss is therefore mean(1 - p_correct); its gradient
# with respect to the logits goes to zero once the softmax saturates, which is
# a likely cause of a flat training curve. `CrossEntropyLoss` expects raw
# logits, so switching to it also requires removing the softmax from the model.
loss_fn = torch.nn.HingeEmbeddingLoss()


def train(model):
    """Run one optimisation epoch over ``train_loader``.

    Uses the module-level ``train_loader``, ``device``, ``optimizer`` and
    ``loss_fn``.

    Args:
        model: Module called as ``model(x, edge_index, batch)`` and returning
            one row of 2 scores per graph.

    Returns:
        A ``(mean_loss, accuracy)`` tuple, both averaged over the graphs seen
        during the epoch. Both are ``nan`` when the loader yields no graph.
    """
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    seen_graphs = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        # Targets for the hinge loss, built directly on the device of `out`:
        # label 1 -> [1, -1], any other label -> [-1, 1].
        is_positive = (data.y == 1).unsqueeze(-1)
        y = torch.where(
            is_positive,
            out.new_tensor([1.0, -1.0]),
            out.new_tensor([-1.0, 1.0]),
        )
        loss = loss_fn(out, y)
        loss.backward()
        optimizer.step()
        # `loss` is a batch mean, so weight it by the batch size.
        total_loss += float(loss) * data.num_graphs
        pred = out.argmax(dim=-1)  # predicted class
        correct_predictions += int((pred == data.y).sum())
        seen_graphs += data.num_graphs

    if seen_graphs == 0:
        return float('nan'), float('nan')
    # Loss and accuracy share the same denominator: the graphs actually seen.
    return total_loss / seen_graphs, correct_predictions / seen_graphs


# Runs one epoch right away (this updates the model weights).
train(model)

TypeError: train() missing 1 required positional argument: 'model'

In [86]:
def test(loader, model):
    """Return the classification accuracy of ``model`` over ``loader``.

    Batches are moved to the module-level ``device``. The model is switched
    to evaluation mode and gradients are disabled, since nothing is
    back-propagated here.

    Args:
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``,
            ``y`` and ``num_graphs``.
        model: Module called as ``model(x, edge_index, batch)``; the predicted
            class is the argmax over its last dimension.

    Returns:
        The fraction of graphs whose predicted class equals ``y``, or ``nan``
        when the loader yields no graph.
    """
    model.eval()

    total_correct = total_examples = 0
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            pred = model(data.x, data.edge_index, data.batch).argmax(dim=-1)
            total_correct += int((pred == data.y).sum())
            total_examples += data.num_graphs

    if total_examples == 0:
        return float('nan')
    return total_correct / total_examples

In [None]:

# Train for epochs 1..199 and report the accuracy on every split after each one.
for epoch in range(1, 200):
    # `acc` is measured while the weights are being updated; the accuracies
    # printed below are recomputed with `test` once the epoch is finished.
    loss, acc = train(model)
    train_acc, val_acc, test_acc = (
        test(split_loader, model)
        for split_loader in (train_loader, val_loader, test_loader)
    )
    total_sum = 0
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Epoch: 01, Loss: 0.5167, Train Acc: 0.4899,Train: 0.4899, Val: 0.5147, Test: 0.5008
Epoch: 02, Loss: 0.5101, Train Acc: 0.4899,Train: 0.4899, Val: 0.5147, Test: 0.5008
Epoch: 03, Loss: 0.5101, Train Acc: 0.4899,Train: 0.4899, Val: 0.5147, Test: 0.5008
Epoch: 04, Loss: 0.5101, Train Acc: 0.4899,Train: 0.4899, Val: 0.5147, Test: 0.5008


The accuracy stays the same from one epoch to the next; we need to compute the ROC curve to be sure.