In [211]:
import os
import gc
import random
import time
import datetime
import math

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn.init as init
from torch.utils.data.sampler import SubsetRandomSampler

import dgl
from dgl.nn.pytorch import GraphConv
from dgl.dataloading import GraphDataLoader
from dgl.data import DGLDataset
from geopy import distance
import ast

In [212]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [213]:
neighb = pd.read_csv('gnn_neighbor.csv')
neighb['distances'] = neighb.apply(lambda x: ast.literal_eval(x.distances), axis=1)
df_train = pd.read_csv('df_train.csv')
df_train_NN = df_train[df_train.columns[df_train.columns.str.contains('NN')]]
df_train = df_train.loc[:, ~df_train.columns.str.contains('NN')]
target_train = pd.read_csv('target_train.csv')
df_train = pd.merge(df_train, target_train, on='row_id')

df_test = pd.read_csv('df_test.csv')
df_test_NN = df_test[df_test.columns[df_test.columns.str.contains('NN')]]
df_test = df_test.loc[:, ~df_test.columns.str.contains('NN')]
target_test = pd.read_csv('target_test.csv')
df_test = pd.merge(df_test, target_test, on='row_id')
features = df_train.drop(['row_id', 'cfips', 'microbusiness_density', 'active'], axis=1).columns.tolist()

In [214]:
# df_train.describe()
df_train['cfips'] = df_train['cfips'].apply(lambda x: neighb.loc[neighb['original_cfips'] == x, 'cfips'].values[0])
df_test['cfips'] = df_test['cfips'].apply(lambda x: neighb.loc[neighb['original_cfips'] == x, 'cfips'].values[0])

In [233]:
class GraphDataset(DGLDataset):
    def __init__(self, edges, properties, features, train=True):
        self.edges = edges
        self.properties = properties
        self.features = features
        self.train = train
        super().__init__(name="microbussiness")

    def process(self):
        self.graphs = []
        self.labels = []

        for index, group in self.properties.groupby('idx'):
            filtered_edges = self.edges[self.edges['cfips'].isin(group['cfips'])]

            if filtered_edges.empty:
                continue

            src = []
            dst = []
            w = []

            for _, row in filtered_edges.iterrows():
                neighbours = ast.literal_eval(row['neighbors'])
                distances = row['distances']

                for neighbor, distance in zip(neighbours, distances):
                    if neighbor in group['cfips'].values:
                        src.append(row['cfips'])
                        dst.append(neighbor)
                        w.append(distance)

            if len(w) == 0:
                continue

            w = np.array(w)
            min_w = np.min(w)
            max_w = np.max(w)
            w = (w - min_w) / (max_w - min_w)

            node_id_mapping = {node_id: i for i, node_id in enumerate(group['cfips'])}
            src = [node_id_mapping[x] for x in src]
            dst = [node_id_mapping[x] for x in dst]

            g = dgl.graph((src, dst), num_nodes=len(group))
            g.ndata['x'] = torch.tensor(group[self.features].values, dtype=torch.float) # node feature
            g.edata['w'] = torch.tensor(w, dtype=torch.float)  # scalar integer feature
            g = dgl.add_self_loop(g)
            self.graphs.append(g)
            if self.train:
                self.labels.append(torch.tensor(group["target"].values, dtype=torch.float))

    def __getitem__(self, i):
        if self.train:
            return self.graphs[i].to(device), self.labels[i].to(device)

        return self.graphs[i].to(device)

    def __len__(self):
        return len(self.graphs)
    
train_graph_dataset = GraphDataset(edges=neighb, properties=df_train, features=features)
val_graph_dataset = GraphDataset(edges=neighb, properties=df_test, features=features,)


In [None]:
class GGN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GGN, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.convh = nn.ModuleList([GraphConv(h_feats, h_feats) for _ in range(20)])
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        for c in self.convh:
            h = c(g, h)
            h = F.relu(h)
        h = self.conv2(g, h)
        h = torch.tanh(h)
        return h

In [None]:
def smape_loss(output, target):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE) between output and target.
    """
    return torch.mean(torch.abs(output - target) / (torch.abs(output) + torch.abs(target) + 1e-8))

def train(model, train_loader, val_loader, num_epochs, optimizer, criterion, patience=10):
    best_val_loss = float('inf')
    best_model = None
    counter = 0
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for i, (g, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            output = model(g, g.ndata['x'])
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        train_loss /= len(train_loader)
        val_loss = evaluate(model, val_loader, criterion)
        print('Epoch: {:04d}'.format(epoch+1),
              'train_loss: {:.4f}'.format(train_loss),
              'val_loss: {:.4f}'.format(val_loss))
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict()
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping...")
                break
    
    model.load_state_dict(best_model)

def evaluate(model, loader, criterion):
    model.eval()
    loss = 0.0
    with torch.no_grad():
        for i, (g, labels) in enumerate(loader):
            output = model(g, g.ndata['x'])
            loss += criterion(output, labels).item()
    
    return loss / len(loader)

model = GGN(in_feats=len(features), h_feats=64, num_classes=1).to(device)
train_loader = GraphDataLoader(train_graph_dataset, batch_size=1, shuffle=False)
val_loader = GraphDataLoader(val_graph_dataset, batch_size=1, shuffle=False)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = smape_loss

train(model, train_loader, val_loader, num_epochs=50, optimizer=optimizer, criterion=criterion, patience=10)


  assert input.numel() == input.storage().size(), (


DGLError: Invalid key "0". Must be one of the edge types.

In [None]:
# create GraphDataset object for test data
test_graph_dataset = GraphDataset(edges=neighb, properties=df_test, features=features, train=True)

# create DataLoader for test data
test_loader = GraphDataLoader(test_graph_dataset, batch_size=1, shuffle=False)

# loop through test DataLoader and generate predictions
model.eval()
predictions = []
with torch.no_grad():
    for i, (g, _) in enumerate(test_loader):
        output = model(g, g.ndata['x'])
        predictions.append(output.item())

# add predictions to the test dataframe
df_test['predictions'] = predictions


  assert input.numel() == input.storage().size(), (


RuntimeError: a Tensor with 3085 elements cannot be converted to Scalar