# GCN Model

### Import bibliotek

In [None]:
import pandas as pd
import torch
import json
from torch import nn
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
import torch.optim as optim
import matplotlib.pyplot as plt

### Wczytanie datasetu z plików

In [None]:
# Directory prefix (including the trailing slash) that load_data prepends to
# each dataset file name.
SOURCE_PATH = 'dataset/'

def load_data():
    """Read the three dataset files located under ``SOURCE_PATH``.

    Returns:
        A tuple ``(data_edges, data_target, node_features_df)``:
        the contents of ``DE_edges.csv`` and ``DE_target.csv`` as DataFrames,
        and ``DE.json`` converted to a DataFrame with one row per top-level
        JSON key, the key itself being exposed as the ``id`` column.
    """
    edges_file, target_file, features_file = (
        SOURCE_PATH + file_name
        for file_name in ('DE_edges.csv', 'DE_target.csv', 'DE.json')
    )

    edges = pd.read_csv(edges_file)
    targets = pd.read_csv(target_file)

    with open(features_file) as handle:
        raw_features = json.load(handle)

    # Each top-level JSON key becomes a row; move it from the index into 'id'.
    features = (
        pd.DataFrame.from_dict(raw_features, orient='index')
        .rename_axis('id')
        .reset_index()
    )
    return edges, targets, features

### Preprocesowanie danych

In [None]:
def _standardize(column):
    """Return ``column`` as z-scores; only centre it when its spread is 0 or undefined."""
    centered = column - column.mean()
    spread = column.std()
    # A constant (std == 0) or single-row (std is NaN) column would otherwise
    # turn into NaN/inf and poison every downstream computation.
    if pd.isna(spread) or spread == 0:
        return centered
    return centered / spread


def preprocess_data(data_target, data_edges, node_features_df):
    """Clean the raw frames and build the node feature tensor.

    Args:
        data_target: Frame with columns ``id``, ``new_id``, ``mature``,
            ``partner``, ``days`` and ``views``. The original ``id`` column is
            discarded and ``new_id`` takes its place.
        data_edges: Edge list frame; its ``from``/``to`` columns are renamed
            to ``from_id``/``to_id``.
        node_features_df: Frame with an ``id`` column convertible to int64.
            It is not modified.

    Returns:
        A tuple ``(data_target, data_edges, node_features)`` where
        ``data_target`` is the cleaned target frame inner-joined with
        ``node_features_df`` on ``id`` (missing values replaced by 0),
        ``data_edges`` is the renamed edge frame, and ``node_features`` is a
        float tensor with one row per row of the returned ``data_target``.
        The tensor holds every target column except ``id`` and the label
        ``mature``.
    """
    data_target = data_target.drop(columns=['id']).rename(columns={'new_id': 'id'})
    data_edges = data_edges.rename(columns={'from': 'from_id', 'to': 'to_id'})

    data_target['mature'] = data_target['mature'].astype(int)
    data_target['partner'] = data_target['partner'].astype(int)
    data_target['days'] = _standardize(data_target['days'])
    data_target['views'] = _standardize(data_target['views'])
    data_target['id'] = data_target['id'].astype('int64')

    # 'mature' is the prediction target; feeding it to the model as an input
    # feature would leak the label, so it is excluded together with the key.
    feature_columns = [
        column for column in data_target.columns if column not in ('id', 'mature')
    ]

    # Work on a copy so the caller's frame keeps its original 'id' dtype.
    node_features_df = node_features_df.assign(
        id=node_features_df['id'].astype('int64')
    )
    # Empty left suffix: the target columns keep their names even if the
    # right-hand frame happens to share one.
    data_target = pd.merge(
        data_target, node_features_df, on='id', suffixes=('', '_json')
    )
    data_target = data_target.fillna(0)

    # Built after the merge so feature rows stay aligned with the rows (and
    # therefore the labels) of the returned frame even if the join drops nodes.
    node_features = torch.tensor(
        data_target[feature_columns].values, dtype=torch.float
    )
    return data_target, data_edges, node_features

### Przygotowanie danych

In [None]:
def prepare_data(data_target, data_edges, node_features, test_size=0.20, random_state=None):
    """Assemble the graph ``Data`` object with a stratified train/test split.

    Args:
        data_target: Frame with an ``id`` column (node identifiers) and a
            ``mature`` column (labels); row ``i`` describes node ``i``.
        data_edges: Frame whose first two columns hold the source and target
            node identifiers of each edge.
        node_features: Tensor with one row per row of ``data_target``.
        test_size: Fraction of nodes placed in the test mask.
        random_state: Seed forwarded to ``train_test_split``; ``None`` keeps
            the split random on every call.

    Returns:
        A ``Data`` object with ``x``, ``edge_index`` (each edge stored in both
        directions), ``y``, ``train_mask`` and ``test_mask``.

    Raises:
        ValueError: If ``node_features`` and ``data_target`` disagree on the
            number of nodes.
    """
    node_ids = data_target['id']
    num_nodes = len(node_ids)
    if node_features.shape[0] != num_nodes:
        raise ValueError(
            f'node_features has {node_features.shape[0]} rows '
            f'but data_target has {num_nodes}'
        )

    # Rows of x / y are in data_target order, so edges must reference row
    # positions rather than raw node ids.
    node_to_idx = {node_id: idx for idx, node_id in enumerate(node_ids)}
    endpoints = pd.concat(
        [
            data_edges.iloc[:, 0].map(node_to_idx),
            data_edges.iloc[:, 1].map(node_to_idx),
        ],
        axis=1,
    )
    # Edges touching a node that is absent from data_target map to NaN and
    # cannot be represented; drop them.
    endpoints = endpoints.dropna().astype('int64')

    edge_index = torch.tensor(endpoints.values, dtype=torch.long).t().contiguous()
    # Append the reversed edges so message passing works in both directions.
    edge_index = torch.cat([edge_index, edge_index[[1, 0]]], dim=1)

    labels = torch.tensor(data_target['mature'].values, dtype=torch.long)

    train_indices, test_indices = train_test_split(
        range(num_nodes),
        test_size=test_size,
        stratify=labels,
        random_state=random_state,
    )
    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[torch.tensor(train_indices)] = True
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask[torch.tensor(test_indices)] = True

    return Data(
        x=node_features,
        edge_index=edge_index,
        y=labels,
        train_mask=train_mask,
        test_mask=test_mask,
    )

### Zdefiniowanie modelu GCN

In [None]:
class GCN(nn.Module):
    """Two-layer graph convolutional network returning per-node log-probabilities.

    The input width is read from the second dimension of the ``node_features``
    tensor passed to the constructor; the hidden layer has 32 channels and the
    output layer has 2.
    """

    def __init__(self, node_features):
        super().__init__()
        in_channels = node_features.shape[1]
        hidden_channels, out_channels = 32, 2
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Run both convolutions on ``data.x`` / ``data.edge_index``.

        Dropout is applied between the layers only while the module is in
        training mode.
        """
        edges = data.edge_index
        hidden = self.conv1(data.x, edges).relu()
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, edges)
        return logits.log_softmax(dim=1)

### Funkcja trenująca

In [None]:
def train(model, data, optimizer, criterion):
    """Perform one optimisation step on the nodes selected by ``data.train_mask``.

    Args:
        model: Module called as ``model(data)`` that returns per-node outputs.
        data: Object exposing ``y`` and ``train_mask``.
        optimizer: Optimizer holding the model parameters.
        criterion: Loss callable taking ``(outputs, targets)``.

    Returns:
        The training loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    selected = data.train_mask
    outputs = model(data)
    step_loss = criterion(outputs[selected], data.y[selected])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

### Funkcja walidująca

In [None]:
def test(model, data):
    model.eval()
    out = model(data)
    pred = out.argmax(dim=1)
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum().item()
    accuracy = correct / int(data.test_mask.sum())
    return accuracy

### Funkcja rysująca wykresy acc/loss

In [None]:
def plot_results(loss_values, accuracy_values):
    """Show the loss and accuracy curves side by side in one figure.

    Args:
        loss_values: Sequence of per-epoch loss values (left panel).
        accuracy_values: Sequence of per-epoch accuracy values (right panel).
    """
    # (series, legend label, y-axis label, title) for each panel, left to right.
    panels = (
        (loss_values, 'Train Loss', 'Loss', 'Loss over time'),
        (accuracy_values, 'Test Accuracy', 'Accuracy', 'Accuracy over time'),
    )

    plt.figure(figsize=(12, 6))
    for position, (series, legend_label, y_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(series, label=legend_label)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.title(title)
        plt.legend()

    plt.tight_layout()
    plt.show()

### Główna pętla programu

In [None]:
def main():
    """Load the dataset, train the GCN for 2000 epochs and plot the curves.

    After every epoch the training loss and the accuracy on the test mask are
    printed and recorded; both histories are plotted once training finishes.
    """
    edges_df, target_df, features_df = load_data()
    target_df, edges_df, feature_tensor = preprocess_data(
        target_df, edges_df, features_df
    )
    graph = prepare_data(target_df, edges_df, feature_tensor)

    # Use the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = GCN(feature_tensor).to(device)
    graph = graph.to(device)

    adam = optim.Adam(net.parameters(), lr=0.01)
    loss_fn = nn.CrossEntropyLoss()

    epochs = 2000
    loss_history, accuracy_history = [], []
    for epoch in range(epochs):
        loss = train(net, graph, adam, loss_fn)
        acc = test(net, graph)
        loss_history.append(loss)
        accuracy_history.append(acc)
        print(f'Epoch: {epoch + 1:03d}, Loss: {loss:.4f}, Acc: {acc:.4f}')

    plot_results(loss_history, accuracy_history)

In [None]:
# Run the training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()