In [2]:
import os
import gc
import random
import time
import datetime
import math

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn.init as init
from torch.utils.data.sampler import SubsetRandomSampler

import dgl
from dgl.nn.pytorch import GraphConv
from dgl.dataloading import GraphDataLoader
from dgl.data import DGLDataset
from geopy import distance
import ast

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
SEED = 333


def seedBasic(seed=SEED):
    """Seed the Python and NumPy global random number generators.

    Also exports ``PYTHONHASHSEED``; that variable is only read at interpreter
    start-up, so it affects sub-processes rather than the running kernel.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)


def seedTorch(seed=SEED):
    """Seed PyTorch (CPU and current CUDA device) and force deterministic cuDNN."""
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


def seedEverything(seed=SEED):
    """Seed every RNG used by the notebook: Python, NumPy and PyTorch."""
    for seeder in (seedBasic, seedTorch):
        seeder(seed)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are exactly zero contribute 0 instead of 0/0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of the same shape. Lists, NumPy arrays and
        pandas Series are accepted; values are compared positionally.

    Returns
    -------
    float
        ``100 * mean`` of the per-element errors (``nan`` for empty input).
    """
    # Coerce up front so plain lists work and Series are not index-aligned.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Divide only where at least one value is non-zero; the rest stays 0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap = np.divide(num, dem, out=np.zeros_like(num), where=pos_ind)

    return 100 * np.mean(smap)


def vsmape(y_true, y_pred):
    """Element-wise symmetric absolute percentage error, in percent.

    Same per-element formula as ``smape`` but without the final mean:
    ``100 * |y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``, with 0 where
    both values are exactly zero.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values of the same shape. Lists, NumPy arrays and
        pandas Series are accepted; values are compared positionally.

    Returns
    -------
    numpy.ndarray
        Array of per-element errors with the same shape as the inputs.
    """
    # Coerce up front so plain lists work and Series are not index-aligned.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Divide only where at least one value is non-zero; the rest stays 0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap = np.divide(num, dem, out=np.zeros_like(num), where=pos_ind)

    return 100 * smap

In [None]:
# Build the county adjacency table used as the graph's edge list.
# Each CSV row holds a county's cfips, its lat/long and a Python-literal
# string listing the cfips of adjacent counties.
neighbours = pd.read_csv("/kaggle/input/godaddy-metadata/clean_data/county_neighbours.csv")
neighbours['neighbours'] = neighbours['neighbours'].apply(ast.literal_eval)

# Re-index counties to dense ids 0..N-1 so they can be used directly as node ids.
neighbours['original_cfips'] = neighbours['cfips']
neighbours['cfips'] = range(0, len(neighbours))

# original cfips -> dense node id (plain Python ints so str() below stays literal_eval-able)
replace_dict = dict(zip(neighbours['original_cfips'].tolist(), neighbours['cfips'].tolist()))

# Translate neighbour lists to dense ids, dropping neighbours that are not in
# the table. The list is stored back as a string because GraphDataset.process
# parses this column with ast.literal_eval.
neighbours['neighbours'] = neighbours['neighbours'].apply(
    lambda original_ids: str([replace_dict[c] for c in original_ids if c in replace_dict])
)

# Distance in whole miles from each county to each of its neighbours, in the
# same order as the neighbour list. Because cfips now equals the row position,
# coordinates can be looked up by position instead of scanning the frame.
county_coords = neighbours[['lat', 'long']].to_numpy()
neighbours['distances'] = neighbours.apply(
    lambda x: [
        round(distance.distance(tuple(county_coords[cfips]), (x['lat'], x['long'])).miles)
        for cfips in ast.literal_eval(x.neighbours)
    ],
    axis=1,
)

neighbours.tail()
# Sanity columns: neighbour count vs. distance count per county (should match).
neighbours['nn'] = neighbours['neighbours'].apply(lambda x: len(ast.literal_eval(x)))
neighbours['dn'] = neighbours['distances'].apply(len)
len(neighbours[neighbours['nn'] != neighbours['dn']])

# Apply the same dense re-indexing to the main frame.
# NOTE(review): `raw` is defined outside this section; this line is not
# idempotent, so re-running the cell on an already re-indexed `raw` will fail.
remapped_cfips = raw['cfips'].map(replace_dict)
if remapped_cfips.isna().any():
    missing = sorted(raw.loc[remapped_cfips.isna(), 'cfips'].unique().tolist())
    raise KeyError(f"cfips missing from county_neighbours.csv: {missing[:10]}")
raw['cfips'] = remapped_cfips.astype(neighbours['cfips'].dtype)

class GraphDataset(DGLDataset):
    """One graph per ``dcount`` group, all sharing the same edge list.

    Parameters
    ----------
    edges : pandas.DataFrame
        One row per node with columns ``cfips`` (source node id),
        ``neighbours`` (string holding a Python list of destination node ids)
        and ``distances`` (list of edge weights, same order as ``neighbours``).
    properties : pandas.DataFrame
        Node features with a ``dcount`` column used for grouping and, when
        ``train`` is true, a ``target`` column.
        NOTE(review): rows inside each group are used positionally as nodes
        0..N-1, so each group is presumably sorted by node id — confirm.
    features : list of str
        Columns of ``properties`` stored as node feature ``x``.
    train : bool
        When true, labels are collected and ``__getitem__`` returns
        ``(graph, labels)``; otherwise it returns only the graph.
    """

    def __init__(self, edges, properties, features, train=True):
        # Attributes are set before super().__init__ because the DGLDataset
        # constructor is what triggers process().
        self.edges = edges
        self.properties = properties
        self.features = features
        self.train = train
        super().__init__(name="microbussiness")

    def process(self):
        """Build the shared edge list, scale edge weights and create the graphs."""
        src = []
        dst = []
        w = []
        for _, row in self.edges.iterrows():
            neighbours = ast.literal_eval(row['neighbours'])
            distances = row['distances']
            if len(neighbours) != len(distances):
                # Surface inconsistent rows; they would misalign edges and weights.
                print(row)
            src.extend([row['cfips']] * len(neighbours))
            dst.extend(neighbours)
            w.extend(distances)

        # Min-max scale the weights to [0, 1]; fall back to zeros when there
        # are no edges or every distance is identical (avoids 0/0 -> NaN).
        w = np.array(w, dtype=float)
        if w.size:
            min_w = np.min(w)
            span = np.max(w) - min_w
            w = (w - min_w) / span if span > 0 else np.zeros_like(w)

        self.graphs = []
        self.labels = []
        for dcount, group in self.properties.groupby('dcount'):
            # Create a graph and add it to the list of graphs and labels.
            g = dgl.graph((src, dst), num_nodes=len(group))
            g.ndata['x'] = torch.tensor(group[self.features].values, dtype=torch.float)  # node features
            g.edata['w'] = torch.tensor(w, dtype=torch.float)  # scaled distance per edge
            g = dgl.add_self_loop(g)
            self.graphs.append(g)
            if self.train:
                self.labels.append(group["target"].values)

        if self.train:
            # Convert the label list to tensor for saving.
            self.labels = torch.tensor(np.array(self.labels), dtype=torch.float)

    def __getitem__(self, i):
        """Return graph ``i`` (and its labels when training), moved to ``device``."""
        if self.train:
            return self.graphs[i].to(device), self.labels[i].to(device)

        return self.graphs[i].to(device)

    def __len__(self):
        """Number of graphs, i.e. number of ``dcount`` groups."""
        return len(self.graphs)
    
class GGN(nn.Module):
    """Stack of ``GraphConv`` layers producing one bounded output per node.

    Layout: input conv -> ``n_hidden`` hidden convs (ReLU after each) ->
    output conv followed by ``tanh``, so outputs lie in (-1, 1).

    Note: every ``GraphConv`` is called with the graph and node features only,
    so edge data stored on the graph is not used by this model.

    Parameters
    ----------
    in_feats : int
        Size of the input node features.
    h_feats : int
        Width of the hidden layers.
    num_classes : int
        Size of the per-node output.
    n_hidden : int, optional
        Number of hidden ``h_feats -> h_feats`` layers (default 12).
    """

    def __init__(self, in_feats, h_feats, num_classes, n_hidden=12):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feats)
        self.convh = nn.ModuleList([GraphConv(h_feats, h_feats) for _ in range(n_hidden)])
        self.conv2 = GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        """Run the stack on graph ``g`` with node features ``in_feat``."""
        h = F.relu(self.conv1(g, in_feat))
        for conv in self.convh:
            h = F.relu(conv(g, h))
        return torch.tanh(self.conv2(g, h))
    
class PseudoHuberLoss(nn.Module):
    """Pseudo-Huber loss: ``beta^2 * (sqrt(((x - target) / beta)^2 + 1) - 1)``.

    ``reduction`` selects how the element-wise loss is collapsed and must be a
    key of ``reductions``: ``'mean'``, ``'sum'`` or ``'none'``.
    """

    reductions = {
        'mean': torch.mean,
        'sum': torch.sum,
        'none': lambda x: x,
    }

    def __init__(self, beta=1, reduction='mean'):
        super().__init__()
        self.reduction = reduction
        self.beta = beta

    def extra_repr(self):
        """Hyper-parameters shown in the module's ``repr``."""
        return f'beta={self.beta:g}, reduction={self.reduction!r}'

    def forward(self, x, target):
        """Return the reduced Pseudo-Huber loss between ``x`` and ``target``."""
        scaled_residual = (x - target) / self.beta
        elementwise = self.beta**2 * (torch.sqrt(scaled_residual.pow(2) + 1) - 1)
        reduce = self.reductions[self.reduction]
        return reduce(elementwise)
    
def get_lr_scheduler(optimizer, batch_size = 8):
    """Return a ``LambdaLR`` implementing a ramp-up / sustain / decay schedule.

    The schedule is expressed in absolute learning rates: linear ramp from
    ``lr_start`` to ``lr_max`` over ``lr_ramp_ep`` epochs, hold for
    ``lr_sus_ep`` epochs, then exponential decay towards ``lr_min``.

    ``LambdaLR`` multiplies each parameter group's initial learning rate by
    the lambda's return value, so the absolute target is divided by that
    initial rate before being handed to the scheduler.

    Parameters
    ----------
    optimizer : torch.optim.Optimizer
        Optimizer whose parameter groups are scheduled.
    batch_size : int, optional
        Unused; kept for interface compatibility.

    Returns
    -------
    torch.optim.lr_scheduler.LambdaLR
    """
    lr_start   = 0.000001
    lr_max     = 0.01
    lr_min     = 0.0000001
    lr_ramp_ep = 5
    lr_sus_ep  = 0
    lr_decay   = 0.

    def lrfn(epoch):
        """Absolute learning rate the schedule prescribes for ``epoch``."""
        if epoch < lr_ramp_ep:
            lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        elif epoch < lr_ramp_ep + lr_sus_ep:
            lr = lr_max
        else:
            lr = (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min
        print("Learning rate", lr)
        return lr

    def as_factor(base_lr):
        """Wrap ``lrfn`` as the multiplicative factor LambdaLR expects."""
        # A zero base rate cannot be scaled to any target; keep it at zero.
        return lambda epoch: (lrfn(epoch) / base_lr) if base_lr else 0.0

    # LambdaLR scales 'initial_lr' when it is already present (resumed
    # optimizer), otherwise the group's current 'lr'.
    base_lrs = [group.get('initial_lr', group['lr']) for group in optimizer.param_groups]
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, [as_factor(b) for b in base_lrs])
    return lr_scheduler

In [None]:
def validation_smape(pred, df, valid_indices):
    """Score predictions on the validation rows with SMAPE.

    ``pred`` is turned into a density as ``(pred + 1) * microbusiness_density``,
    written to ``df['predicted_microbusiness_density']`` for ``valid_indices``
    (``df`` is modified in place), and compared against
    ``df['target_microbusiness_density']``.

    Parameters
    ----------
    pred : array-like
        One prediction per selected row, in row order. Column vectors of
        shape (N, 1) are flattened.
    df : pandas.DataFrame
        Frame holding ``microbusiness_density`` and
        ``target_microbusiness_density``.
    valid_indices
        Row selector passed to ``df.loc``.

    Returns
    -------
    float
        SMAPE in percent.
    """
    # Flatten so an (N, 1) model output lines up with the N selected rows.
    growth = np.asarray(pred).reshape(-1) + 1
    base_density = df.loc[valid_indices, 'microbusiness_density'].to_numpy()
    df.loc[valid_indices, 'predicted_microbusiness_density'] = growth * base_density

    return smape(df.loc[valid_indices,'target_microbusiness_density'], df.loc[valid_indices,'predicted_microbusiness_density'])

def validate(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` with the Pseudo-Huber loss.

    Parameters
    ----------
    model : nn.Module
        Model to evaluate; wrapped in ``nn.DataParallel`` if it is not already.
    val_loader : iterable
        Yields ``(graph, labels)`` batches; node features are read from
        ``graph.ndata["x"]``.

    Returns
    -------
    (float, numpy.ndarray)
        Mean batch loss and the model outputs of all batches concatenated
        along the first axis. For an empty loader: ``(nan, empty array)``.
    """
    if not isinstance(model, nn.DataParallel):
        model = nn.DataParallel(model)

    model = model.to(device)
    model.eval()

    batch_losses = []
    batch_preds = []
    loss_fn = PseudoHuberLoss()

    with torch.no_grad():
        for g, labels in val_loader:
            y = model(g, g.ndata["x"])
            # Give labels the output's shape; otherwise e.g. (N, 1) vs (1, N)
            # silently broadcasts to an (N, N) loss matrix.
            loss = loss_fn(y, labels.reshape(y.shape))

            batch_preds.append(y.to('cpu').detach().numpy())
            batch_losses.append(loss.item())

    if not batch_preds:
        return float('nan'), np.array([])

    loss = float(np.mean(batch_losses))
    pred = np.concatenate(batch_preds, axis=0)
    return loss, pred

In [None]:
def train(model, train_loader, val_loader, args):
    """Train ``model`` and checkpoint the best epoch to ``fold.pth``.

    The monitored loss is the validation loss when ``val_loader`` is non-empty
    and the training loss otherwise; a checkpoint is written whenever the
    monitored loss improves.

    Parameters
    ----------
    model : nn.Module
        Model to train; wrapped in ``nn.DataParallel`` if it is not already,
        so saved state-dict keys carry the wrapper's prefix.
    train_loader, val_loader : iterable
        Yield ``(graph, labels)`` batches; node features are read from
        ``graph.ndata["x"]``.
    args : dict
        Reads ``"optimizer"`` (``'adam'``, ``'adamw'``, anything else -> SGD),
        ``"lr"``, ``"scheduler"`` (``"LambdaLR"`` or anything else ->
        MultiStepLR using ``"lrscheduler_start"``, ``"lrscheduler_step"``,
        ``"lrscheduler_decay"``), ``"batch_size"`` and ``"n_epochs"``.

    Returns
    -------
    float
        Best monitored loss seen during training.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.set_grad_enabled(True)

    best_loss = 1000000
    epoch = 0

    if not isinstance(model, nn.DataParallel):
        model = nn.DataParallel(model)

    model = model.to(device)
    # Set up the optimizer
    trainables = [p for p in model.parameters() if p.requires_grad]
    print('Total parameter number is : {:.3f} million'.format(sum(p.numel() for p in model.parameters()) / 1e6))
    print('Total trainable parameter number is : {:.3f} million'.format(sum(p.numel() for p in trainables) / 1e6))

    if args["optimizer"] == 'adam':
        optimizer = torch.optim.Adam(trainables, lr=args["lr"], weight_decay=5e-7, betas=(0.95, 0.999))
    elif args["optimizer"] == "adamw":
        optimizer = torch.optim.AdamW(trainables, lr=args["lr"], weight_decay=5e-7, amsgrad=True)
    else:
        optimizer = torch.optim.SGD(trainables, lr=args["lr"], momentum=0.9, nesterov=True, weight_decay=5e-7)

    if args["scheduler"] == "LambdaLR":
        scheduler = get_lr_scheduler(optimizer, batch_size = args["batch_size"])
    else:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, list(range(args["lrscheduler_start"], 1000, args["lrscheduler_step"])),gamma=args["lrscheduler_decay"])

    loss_fn = PseudoHuberLoss()

    print("#epochs=%s" % (epoch))
    print("start training...")

    # NOTE(review): this runs epochs 0..n_epochs inclusive, i.e. n_epochs + 1
    # passes over the data — confirm that is intended.
    for epoch in range(args["n_epochs"] + 1):
        begin_time = time.time()
        batch_losses = []
        diverged = False
        model.train()
        print('---------------')
        print(datetime.datetime.now())
        print("current #epochs=%s" % (epoch))

        for g, labels in train_loader:
            optimizer.zero_grad()

            y = model(g, g.ndata["x"])
            # Give labels the output's shape; (N, 1) minus (N,) would silently
            # broadcast to an (N, N) loss matrix.
            loss = loss_fn(y, labels.reshape(y.shape))

            if torch.isnan(loss):
                # Report where the NaN comes from and stop before the
                # optimizer step can write NaNs into the weights.
                print("NaN training loss; NaN in input features:", torch.any(g.ndata["x"].isnan()))
                print("NaN in model output:", torch.any(y.isnan()))
                diverged = True
                break

            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())

        if diverged:
            print("stopping training at epoch {:d} because the loss diverged".format(epoch))
            break

        train_loss = float(np.mean(batch_losses)) if batch_losses else float('nan')
        print('Train loss: {:.6f}'.format(train_loss))

        # Select checkpoints on a single metric: validation loss when
        # available, training loss otherwise.
        monitored_loss = train_loss
        if len(val_loader) > 0:
            print('start validation')
            val_loss, pred = validate(model, val_loader)
            print("validation loss: {:.6f}".format(val_loss))
            print('validation finished')
            monitored_loss = val_loss

        if monitored_loss < best_loss:
            best_loss = monitored_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_loss': best_loss,
            }, 'fold.pth')

        scheduler.step()

        finish_time = time.time()
        print('epoch {:d} training time: {:.3f}'.format(epoch, finish_time-begin_time))

    return best_loss

# Training configuration consumed by train().
args = {
    "lr": 0.001,
    "lrscheduler_start": 15,
    "lrscheduler_step": 10,
    "lrscheduler_decay": 0.5,
    "warmup": True,
    "optimizer": ["adam", "adamw", "sgd"][2],
    "scheduler": ["LambdaLR"][0],
    "batch_size": 128,
    "fold": 0,
    "n_epochs": 20
}

# Seed every RNG before the model is initialised so weight init and training
# are repeatable across Restart & Run All.
seedEverything()

# Last non-test period (minus one) is passed to get_data as `ts`.
ts = raw.loc[raw.istest==0, 'dcount'].max() - 1
train_val_dif = 0

proccessed_df, train_df, valid_df, features = get_data(raw.copy(),
                                                       smoothing=False,
                                                       ts=ts,
                                                       train_val_dif=train_val_dif)
train_graph_dataset = GraphDataset(edges=neighbours, properties=train_df, features=features)
val_graph_dataset = GraphDataset(edges=neighbours, properties=valid_df, features=features)

# NOTE(review): the loaders use their own batch sizes; args["batch_size"] is
# only forwarded to get_lr_scheduler.
train_loader = GraphDataLoader(
    train_graph_dataset, batch_size=5, drop_last=False
)
val_loader = GraphDataLoader(
    val_graph_dataset, batch_size=1, drop_last=False
)
model = GGN(len(features), len(features)*4, 1)

train(model, train_loader, val_loader, args)
# NOTE(review): the submission uses the weights from the final epoch, not the
# best checkpoint saved to fold.pth — confirm that is intended.
submission_df = submission(model, raw.copy())
submission_df.to_csv('submission.csv', index=False)