# GCN Model

# Import bibliotek

In [10]:
import pandas as pd
import torch
import json
from torch import nn
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import torch.optim as optim

### Wczytanie datasetu z plików

In [11]:
def load_data():
    data_edges = pd.read_csv('DE_edges.csv')
    data_target = pd.read_csv('DE_target.csv')
    with open('DE.json') as f:
        node_features_json = json.load(f)
    node_features_df = pd.DataFrame.from_dict(node_features_json, orient='index')
    node_features_df.index.name = 'id'
    node_features_df.reset_index(inplace=True)
    return data_edges, data_target, node_features_df

### Preprocesowanie danych

In [12]:
def preprocess_data(data_target, data_edges, node_features_df):
    data_target = data_target.drop(columns=['id'])
    data_target = data_target.rename(columns={'new_id': 'id'})
    data_edges = data_edges.rename(columns={'from': 'from_id', 'to': 'to_id'})
    data_target['mature'] = data_target['mature'].astype(int)
    data_target['partner'] = data_target['partner'].astype(int)
    data_target['days'] = (data_target['days'] - data_target['days'].mean()) / data_target['days'].std()
    data_target['views'] = (data_target['views'] - data_target['views'].mean()) / data_target['views'].std()
    node_features = torch.tensor(data_target.drop(columns=['id']).values, dtype=torch.float)
    data_target['id'] = data_target['id'].astype('int64')
    node_features_df['id'] = node_features_df['id'].astype('int64')
    data_target = pd.merge(data_target, node_features_df, on='id')
    data_target = data_target.fillna(0)
    return data_target, data_edges, node_features

### Przygotowanie danych

In [13]:
def prepare_data(data_target, data_edges, node_features):
    edge_index = torch.tensor(data_edges.values, dtype=torch.long).t().contiguous()
    edge_index = torch.cat([edge_index, edge_index[[1, 0]]], dim=1)
    node_ids = data_target['id']
    node_to_idx = {node_id: idx for idx, node_id in enumerate(node_ids)}
    labels = torch.tensor(data_target['mature'].values, dtype=torch.long)
    x = torch.eye(len(node_ids))
    train_indices, test_indices = train_test_split(range(len(node_ids)), test_size=0.20, stratify=labels)
    train_mask = torch.zeros(len(node_ids), dtype=torch.bool).scatter_(0, torch.tensor(train_indices), True)
    test_mask = torch.zeros(len(node_ids), dtype=torch.bool).scatter_(0, torch.tensor(test_indices), True)
    data = Data(x=node_features, edge_index=edge_index, y=labels, train_mask=train_mask, test_mask=test_mask)
    return data

### Zdfiniowanie modelu GCN

In [14]:
class GCN(nn.Module):
    def __init__(self, node_features):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(node_features.shape[1], 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

### Funkcja trenująca

In [15]:
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

### Funkcja walidująca

In [16]:
def test(model, data):
    model.eval()
    out = model(data)
    pred = out.argmax(dim=1)
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum().item()
    accuracy = correct / int(data.test_mask.sum())
    return accuracy

### Główna pętla programu

In [20]:
def main():
    data_edges, data_target, node_features_df = load_data()
    data_target, data_edges, node_features = preprocess_data(data_target, data_edges, node_features_df)
    data = prepare_data(data_target, data_edges, node_features)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GCN(node_features).to(device)
    data = data.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    #criterion = nn.NLLLoss()
    criterion = nn.CrossEntropyLoss() # +1% accuracy
    epochs = 2000
    for epoch in range(epochs):
        loss = train(model, data, optimizer, criterion)
        acc = test(model, data)
        print(f'Epoch: {epoch + 1:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')

In [21]:
if __name__ == "__main__":
    main()

Epoch: 001, Loss: 0.7119, Acc: 0.4305
Epoch: 002, Loss: 0.6973, Acc: 0.5968
Epoch: 003, Loss: 0.6866, Acc: 0.6100
Epoch: 004, Loss: 0.6788, Acc: 0.6079
Epoch: 005, Loss: 0.6726, Acc: 0.6068
Epoch: 006, Loss: 0.6690, Acc: 0.6074
Epoch: 007, Loss: 0.6674, Acc: 0.6074
Epoch: 008, Loss: 0.6655, Acc: 0.6068
Epoch: 009, Loss: 0.6660, Acc: 0.6068
Epoch: 010, Loss: 0.6652, Acc: 0.6074
Epoch: 011, Loss: 0.6639, Acc: 0.6095
Epoch: 012, Loss: 0.6633, Acc: 0.6126
Epoch: 013, Loss: 0.6607, Acc: 0.6153
Epoch: 014, Loss: 0.6577, Acc: 0.6179
Epoch: 015, Loss: 0.6565, Acc: 0.6253
Epoch: 016, Loss: 0.6555, Acc: 0.6347
Epoch: 017, Loss: 0.6550, Acc: 0.6379
Epoch: 018, Loss: 0.6541, Acc: 0.6432
Epoch: 019, Loss: 0.6546, Acc: 0.6447
Epoch: 020, Loss: 0.6545, Acc: 0.6437
Epoch: 021, Loss: 0.6528, Acc: 0.6426
Epoch: 022, Loss: 0.6503, Acc: 0.6432
Epoch: 023, Loss: 0.6484, Acc: 0.6416
Epoch: 024, Loss: 0.6466, Acc: 0.6421
Epoch: 025, Loss: 0.6459, Acc: 0.6411
Epoch: 026, Loss: 0.6464, Acc: 0.6411
Epoch: 027, 