In [1]:
import argparse
import math
import os
import random
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import Dataset, DataLoader
from torch_geometric.nn import GATConv
from tqdm import trange

  from .autonotebook import tqdm as notebook_tqdm


## model

In [4]:
def reset_parameters(named_parameters):
    """Re-initialise every parameter yielded by ``named_parameters()`` in place.

    Args:
        named_parameters: a *callable* (typically the bound method
            ``module.named_parameters``) returning ``(name, parameter)`` pairs.

    1-D parameters are drawn from U(-1/sqrt(n), 1/sqrt(n)) where ``n`` is the
    parameter length; every other parameter gets Xavier-normal initialisation.
    """
    for _, param in named_parameters():
        if param.dim() == 1:
            # Requires the `math` module (previously missing from the imports,
            # so this branch raised NameError for any 1-D parameter).
            bound = 1.0 / math.sqrt(param.size(0))
            nn.init.uniform_(param, -bound, bound)
        else:
            nn.init.xavier_normal_(param)

class Graph_Linear(nn.Module):
    """Linear layer with a separate weight matrix (and bias row) per node.

    ``W`` has shape (num_nodes, input_size, hidden_size) and ``b`` has shape
    (num_nodes, hidden_size). Note that ``b`` is always registered as a
    parameter; the ``bias`` flag only controls whether it is added in
    ``forward``.
    """

    def __init__(self, num_nodes, input_size, hidden_size, bias=True):
        super().__init__()
        self.bias = bias
        # Registration order (W, then b) is kept so initialisation and
        # state_dict layout stay the same.
        self.W = nn.Parameter(torch.zeros(num_nodes, input_size, hidden_size))
        self.b = nn.Parameter(torch.zeros(num_nodes, hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise W and b with the module-level ``reset_parameters``."""
        reset_parameters(self.named_parameters)

    def forward(self, x):
        """Apply node i's weights to row i of ``x``.

        ``x`` is batched-multiplied against ``W`` one node at a time:
        (num_nodes, 1, input_size) @ (num_nodes, input_size, hidden_size),
        then the singleton dimension is dropped again.
        """
        projected = torch.bmm(x.unsqueeze(1), self.W).squeeze(1)
        if not self.bias:
            return projected
        return projected + self.b

class Graph_GRUCell(nn.Module):
    """GRU cell whose input and hidden projections are per-node ``Graph_Linear`` layers."""

    def __init__(self, num_nodes, input_size, hidden_size, bias=True):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        # Each projection emits the three gate pre-activations side by side.
        self.x2h = Graph_Linear(num_nodes, input_size, 3 * hidden_size, bias=bias)
        self.h2h = Graph_Linear(num_nodes, hidden_size, 3 * hidden_size, bias=bias)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise all parameters of both projections."""
        reset_parameters(self.named_parameters)

    def forward(self, x, hidden):
        """Run one GRU step and return the next hidden state.

        Args:
            x: input for this step, one row per node.
            hidden: previous hidden state, one row per node.
        """
        from_input = self.x2h(x)
        from_hidden = self.h2h(hidden)
        # Original author's note (translated): "not sure why this was commented out" --
        # it referred to disabled `.squeeze()` calls on the two projections above.
        # Graph_Linear.forward already squeezes dim 1, so they are left out here.
        x_reset, x_update, x_cand = torch.chunk(from_input, 3, dim=1)
        h_reset, h_update, h_cand = torch.chunk(from_hidden, 3, dim=1)

        reset_gate = torch.sigmoid(x_reset + h_reset)
        update_gate = torch.sigmoid(x_update + h_update)
        candidate = torch.tanh(x_cand + (reset_gate * h_cand))
        # Blend: update_gate == 1 keeps the old state, == 0 takes the candidate.
        return candidate + update_gate * (hidden - candidate)

class Graph_GRUModel(nn.Module):
    """Unrolls a ``Graph_GRUCell`` over a sequence and returns the final hidden state.

    Args:
        num_nodes: number of nodes (each node has its own GRU weights).
        input_dim: feature size of each time step.
        hidden_dim: size of the hidden state.
        bias: whether the cell's projections add their bias. This flag used to
            be accepted but silently ignored; it is now forwarded to the cell.
            The default (True) behaves exactly as before.
    """

    def __init__(self, num_nodes, input_dim, hidden_dim, bias=True):
        super(Graph_GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.gru_cell = Graph_GRUCell(num_nodes, input_dim, hidden_dim, bias=bias)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise all parameters of the wrapped cell."""
        reset_parameters(self.named_parameters)

    def forward(self, x, hidden=None):
        """Encode a sequence.

        Args:
            x: tensor indexed as (seq_len, num_nodes, input_dim); dimension 0
               is iterated as time, dimension 1 sizes the hidden state.
            hidden: optional initial hidden state of shape
               (num_nodes, hidden_dim); zeros (same device/dtype as ``x``)
               when omitted.

        Returns:
            The hidden state after the last time step.
        """
        if hidden is None:
            hidden = torch.zeros(x.size(1), self.hidden_dim, device=x.device, dtype=x.dtype)
        for step in range(x.size(0)):
            hidden = self.gru_cell(x[step], hidden)
        return hidden

class CategoricalGraphAtt(nn.Module):
    """Sequence encoder + intra-sector GAT + inter-sector GAT with a fused output head.

    Args:
        input_dim: number of features per company per time step.
        window_size: number of time steps in each input window.
        output_dim: number of output classes (cum_labels + 1).
        hidden_dim: width of every hidden representation.
        inner_edge: company-to-company edge index, expected shape (2, num_edges).
        outer_edge: sector-to-sector edge index, expected shape (2, num_edges).
        company_to_sector: dict mapping company index (0..num_company-1) to its sector.
        verbose: when True (default, the historical behaviour) ``forward``
            prints the shape of every intermediate tensor.

    Note: ``self.bn`` is constructed (and therefore part of the state_dict) but
    is not applied anywhere in ``forward``.
    """

    def __init__(self, input_dim, window_size, output_dim, hidden_dim, inner_edge, outer_edge,
                 company_to_sector, verbose=True):
        super(CategoricalGraphAtt, self).__init__()

        # basic parameters
        self.input_dim = input_dim      # feature_size
        self.window_size = window_size
        self.output_dim = output_dim    # cum_labels + 1
        self.hidden_dim = hidden_dim
        self.inner_edge = inner_edge    # expected dimension: (2, num_edges)
        self.outer_edge = outer_edge    # expected dimension: (2, num_edges)
        self.company_to_sector = company_to_sector  # dictionary to map each company to its sector
        self.num_sector = len(set(company_to_sector.values()))
        self.num_company = len(set(company_to_sector.keys()))
        self.verbose = verbose

        # hidden layers (construction order preserved so initialisation is unchanged)
        self.bn = nn.BatchNorm1d(input_dim * window_size, momentum=None)  # currently unused in forward
        self.sequence_encoder = Graph_GRUModel(self.num_company, input_dim, hidden_dim)
        self.inner_gat = GATConv(hidden_dim, hidden_dim)  # GAT to conduct intra-sector relation
        self.cat_gat = GATConv(hidden_dim, hidden_dim)    # GAT to conduct inter-sector relation
        self.fusion = nn.Linear(hidden_dim * 3, hidden_dim)

        # output layer
        self.logit_f = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    @staticmethod
    def _edges_on(edge_index, device):
        """Return ``edge_index`` on ``device`` (no-op if already there or not a tensor).

        The edge indices are plain attributes, so ``model.to(device)`` does not
        move them; this keeps ``forward`` usable on any device.
        """
        if torch.is_tensor(edge_index):
            return edge_index.to(device)
        return edge_index

    def _log_shape(self, label, tensor):
        """Print ``label`` and the tensor's shape when ``verbose`` is enabled."""
        if self.verbose:
            print(label, tensor.shape)

    def _company_sector_rows(self, sector_order, num_companies, device=None):
        """Map each company index to the row of its sector in ``sector_order``.

        Args:
            sector_order: sequence of sector identifiers, one per row of the
                stacked sector-embedding matrix.
            num_companies: number of leading company indices to map.
            device: device for the returned index tensor.

        Returns:
            LongTensor of shape (num_companies,) whose i-th entry is the row
            index of company i's sector.
        """
        row_of_sector = {sector: row for row, sector in enumerate(sector_order)}
        rows = [row_of_sector[self.company_to_sector[company_idx]]
                for company_idx in range(num_companies)]
        return torch.tensor(rows, dtype=torch.long, device=device)

    def sector_separater(self, full_embeddings):
        '''
        Separate the embeddings into different sectors based on a given mapping.

            Args:
            - full_embeddings (torch.Tensor): The embeddings tensor with shape (num_entities, hidden_dim)
                                            representing all entities.

            Returns:
            - sectors_embeddings (dict): A dictionary where keys are sector identifiers and values are
                                        tensors containing embeddings of all entities belonging to that sector.
                                        Keys appear in order of first occurrence; rows within a sector
                                        are in ascending company-index order.
        '''
        # Group company indices per sector first, then gather each group with a
        # single indexing op instead of stacking row tensors one by one.
        members = {}
        for company_idx in range(len(full_embeddings)):
            members.setdefault(self.company_to_sector[company_idx], []).append(company_idx)

        return {
            sector: full_embeddings[
                torch.tensor(indices, dtype=torch.long, device=full_embeddings.device)
            ]
            for sector, indices in members.items()
        }

    def forward(self, daily_data_batch):
        """Compute clamped cumulative class probabilities for every company.

        Args:
            daily_data_batch: tensor indexed as (window_size, num_company, input_dim).

        Returns:
            Tensor of shape (num_company, output_dim): the running sum of the
            softmax over classes, clamped to [eps, 1 - eps].
        """
        device = daily_data_batch.device

        self._log_shape("Shape of daily_data_batch", daily_data_batch)
        sequence_embeddings = self.sequence_encoder(daily_data_batch)
        self._log_shape("Shape of sequence_embeddings:", sequence_embeddings)

        # Conduct intra-sector embedding
        intra_sector_embeddings = self.inner_gat(sequence_embeddings, self._edges_on(self.inner_edge, device))
        self._log_shape("Shape of intra_sector_embeddings:", intra_sector_embeddings)

        # MaxPool over the companies of each sector to get one embedding per sector
        per_sector = self.sector_separater(intra_sector_embeddings)
        sector_order = sorted(per_sector.keys())
        sectors_embeddings = torch.stack(
            [per_sector[sector].max(dim=0).values for sector in sector_order], dim=0
        )  # (num_sectors, hidden_dim)
        self._log_shape("Shape of sectors_embeddings:", sectors_embeddings)

        # Conduct inter-sector embedding
        sectors_embeddings = self.cat_gat(sectors_embeddings, self._edges_on(self.outer_edge, device))  # (num_sectors, hidden_dim)
        self._log_shape("Shape of sectors_embeddings:", sectors_embeddings)

        # fusion
        # Give every company the embedding of *its own* sector. The previous
        # repeat_interleave-by-count version was only correct when companies
        # were stored grouped by ascending sector; for that layout this lookup
        # yields the identical tensor.
        company_rows = self._company_sector_rows(
            sector_order, sequence_embeddings.size(0), device=sectors_embeddings.device
        )
        sectors_embeddings = sectors_embeddings.index_select(0, company_rows)
        self._log_shape("Shape of sectors_embeddings:", sectors_embeddings)

        fusion_vec = torch.cat((sequence_embeddings, sectors_embeddings, intra_sector_embeddings), dim=-1)
        fusion_vec = torch.relu(self.fusion(fusion_vec))
        self._log_shape("Shape of fusion_vec", fusion_vec)

        # output
        logits = self.logit_f(fusion_vec.float())
        logits = F.softmax(logits, dim=1)
        logits = torch.cumsum(logits, dim=1)
        eps = 5e-8
        # Keep values strictly inside (0, 1).
        logits = torch.clamp(logits, min=eps, max=1 - eps)

        return logits


## training

In [5]:
class All_Company_Dataset(Dataset):
    """Pairs ``x[i]`` with ``y[i]``; the length is taken from ``x``."""

    def __init__(self, x=None, y=None):
        self.x, self.y = x, y

    def __getitem__(self, index):
        """Return the ``(features, labels)`` pair stored at ``index``."""
        features = self.x[index]
        labels = self.y[index]
        return features, labels

    def __len__(self):
        return len(self.x)
    
def set_seed(seed=0):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    The ``random`` module is seeded as well because the helpers in this
    notebook that call ``random.sample`` and ``random.shuffle`` would
    otherwise stay non-deterministic.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

def generate_company_ids(num_ids, start=1000, end=9999):
    """Draw ``num_ids`` distinct integers from the inclusive range [start, end].

    Raises:
        ValueError: if the range holds fewer than ``num_ids`` values.
    """
    id_pool = range(start, end + 1)
    if num_ids > (end - start + 1):
        raise ValueError("Cannot generate the specified number of distinct IDs in the given range.")
    return random.sample(id_pool, num_ids)

def map_companies_to_sectors(company_ids, num_entities_per_sector):
    """Randomly assign companies to 1-based sector labels.

    Sector ``k`` (1-based position in ``num_entities_per_sector``) appears
    ``num_entities_per_sector[k-1]`` times in the label pool; the pool is
    shuffled and zipped with ``company_ids``.
    """
    # One label per slot: sector 1 repeated count_1 times, sector 2 count_2 times, ...
    label_pool = [
        position + 1
        for position, size in enumerate(num_entities_per_sector)
        for _ in range(size)
    ]

    # Randomise which company lands in which sector.
    random.shuffle(label_pool)

    return dict(zip(company_ids, label_pool))

def generate_inner_edges(df: pd.DataFrame) -> list:
    """
    Build bidirectional edges between every pair of companies sharing a cluster.

    Parameters:
    - df (pd.DataFrame): Dataframe with columns 'id' (company ID) and 'Cluster'.

    Returns:
    - list: ``(a, b)`` immediately followed by ``(b, a)`` for each unordered
      pair inside a cluster, clusters visited in ``groupby`` order.
    """
    edges = []
    for _, members in df.groupby('Cluster'):
        member_ids = members['id'].tolist()
        for source, target in combinations(member_ids, 2):
            edges += [(source, target), (target, source)]
    return edges

def generate_outer_edges(df: pd.DataFrame) -> list:
    """Build bidirectional edges between every pair of distinct clusters.

    Prints the number of distinct clusters, then returns ``(a, b)`` followed
    by ``(b, a)`` for each unordered pair, in order of first appearance of the
    clusters in ``df['Cluster']``.
    """
    unique_sectors = df['Cluster'].unique().tolist()
    print(f'num sectors: {len(unique_sectors)}')
    return [
        edge
        for first, second in combinations(unique_sectors, 2)
        for edge in ((first, second), (second, first))
    ]

def generate_pseudo_data(days, window_size, number_of_companies, feature_size, num_labels=9):
    """Create random features and labels for smoke-testing the pipeline.

    Args:
        days: number of samples (one per day).
        window_size: time steps per sample.
        number_of_companies: companies per sample.
        feature_size: features per company per time step.
        num_labels: label columns per company. Defaults to 9, the value that
            used to be hard-coded.

    Returns:
        x: float32 array of shape (days, window_size, number_of_companies,
           feature_size) with values in [0, 1).
        y: integer array of shape (days, number_of_companies, num_labels)
           with entries drawn from {1, -1}.
    """
    # Features are drawn before labels so the global NumPy RNG is consumed in
    # the same order as before.
    x = np.random.rand(days, window_size, number_of_companies, feature_size).astype(np.float32)
    y = np.random.choice([1, -1], size=(days, number_of_companies, num_labels))
    return x, y

def generate_pseudo_data_iterator(is_training, batch_size, feature_size, window_size, company_id_list, days):
    """Wrap freshly generated pseudo data in a ``DataLoader``.

    Only the *number* of companies in ``company_id_list`` influences the
    generated arrays. Batches are shuffled when ``is_training`` is truthy.
    """
    ordered_ids = sorted(company_id_list)
    features, labels = generate_pseudo_data(days, window_size, len(ordered_ids), feature_size)
    print("Shape of x:", features.shape)
    print("Shape of y:", labels.shape)

    # Each dataset item is one day, so shuffling reorders days between epochs.
    shuffle_days = True if is_training else False
    return DataLoader(
        All_Company_Dataset(x=features, y=labels),
        batch_size=batch_size,
        shuffle=shuffle_days,
    )


def map_companyID_to_index(company_id_list):
    """
    Map each original company ID to its position in ``company_id_list``.

    Note: the order of the original data should be identical to company_id_list.
    """
    id_to_index = {}
    for position, company_id in enumerate(company_id_list):
        id_to_index[company_id] = position
    return id_to_index

def map_sector_to_index(sectors):
    """Map each sector label to its position in ``sectors``."""
    return dict((sector, position) for position, sector in enumerate(sectors))

def remap_edges(id_to_index, edges):
    """Return a copy of ``edges`` with every endpoint translated through ``id_to_index``.

    ``edges`` is indexed as a 2-row tensor (sources in row 0, targets in
    row 1); the input tensor is left unmodified.
    """
    translated = edges.clone()
    for row in (0, 1):
        translated[row] = torch.tensor([id_to_index[raw_id] for raw_id in edges[row].tolist()])
    return translated

def remap_company_to_sector(company_to_sector, companyID_to_index, sector_to_index):
    """Re-key a company->sector dict by company index and sector index.

    Raises:
        ValueError: if a company or a sector has no entry (or a ``None``
            entry) in its index mapping. The company is checked first.
    """
    index_map = {}
    for company, sector in company_to_sector.items():
        company_idx = companyID_to_index.get(company)
        if company_idx is None:
            raise ValueError(f"No index found for company '{company}' in companyID_to_index.")

        sector_idx = sector_to_index.get(sector)
        if sector_idx is None:
            raise ValueError(f"No index found for sector '{sector}' in sector_to_index.")

        index_map[company_idx] = sector_idx
    return index_map



In [60]:
# Load the company/cluster table and the full company-id list from disk
# (this cell reads real files; it does not generate pseudo data).
# NOTE(review): absolute, machine-specific paths -- consider a configurable DATA_DIR.
num_entity = 15786
company_cluster = pd.read_csv('/tmp2/cwlin/explainable_credit/Att_NeuDP_GAT/data/company_cluster.csv')
company_ids = pd.read_csv('/tmp2/cwlin/explainable_credit/Att_NeuDP_GAT/data/all_company_ids.csv').id.tolist()

sectors = sorted(company_cluster.Cluster.unique())
# company id -> cluster label. Built with zip() instead of a `for id, cluster`
# loop so the builtin `id` is not overwritten in the notebook namespace.
company_cluster_dict = dict(zip(company_cluster.id, company_cluster.Cluster))

# Position-based index maps used to remap edges and the company->sector dict.
companyID_to_index = map_companyID_to_index(company_ids)
sector_to_index = map_sector_to_index(sectors)

In [17]:
# Build the company-level and sector-level edge lists and convert each to a
# (2, num_edges) int64 tensor.
inner_edge = torch.tensor(generate_inner_edges(company_cluster), dtype=torch.int64).t()
outer_edge = torch.tensor(generate_outer_edges(company_cluster), dtype=torch.int64).t()

print("Shape of inner_edge:", inner_edge.shape)
print("Shape of outer_edge:", outer_edge.shape)

# Persist the raw-ID edge tensors in the current working directory.
torch.save(inner_edge, 'inner_edge.pt')
torch.save(outer_edge, 'outer_edge.pt')

num sectors: 9
Shape of inner_edge: torch.Size([2, 50235366])
Shape of outer_edge: torch.Size([2, 72])


In [62]:
# Run configuration shared by the model and the data loader.
feature_size = 14  # features per company per time step (x_fea_01 .. x_fea_14)
window_size = 12   # time-lagged observations per sample (w_01 .. w_12)
batch_size = 1     # DataLoader batch size (number of dates per batch)
hidden_dim = 32    # width of every hidden representation in the model
cum_labels = 8     # cumulative label horizons; the model emits cum_labels + 1 classes

device = torch.device('cpu')  # Device to run the model

In [66]:
# Translate raw company IDs / sector labels in the edge tensors into the
# position-based indices from companyID_to_index / sector_to_index.
inner_edge_idx = remap_edges(companyID_to_index, inner_edge)
outer_edge_idx = remap_edges(sector_to_index, outer_edge)

In [77]:
# company index -> sector index; raises ValueError if any company or sector has no index.
company_to_sector_idx = remap_company_to_sector(company_cluster_dict, companyID_to_index, sector_to_index)

In [67]:
# Persist the index-remapped edge tensors in the current working directory.
torch.save(inner_edge_idx, 'inner_edge_idx.pt')
torch.save(outer_edge_idx, 'outer_edge_idx.pt')

In [78]:
# Persist the company-index -> sector-index dict next to the edge tensors.
torch.save(company_to_sector_idx, 'company_to_sector_idx.pt')

In [79]:
# Build the model on the index-remapped graph; output width is cum_labels + 1.
model = CategoricalGraphAtt(feature_size, window_size, cum_labels+1, hidden_dim, inner_edge_idx, outer_edge_idx, company_to_sector_idx)
# Move the model to the specified device (CPU for this example); input data is not moved here.
model.to(device)
# Cast floating-point parameters/buffers to float32; as the cell's last expression this also displays the model.
model.to(torch.float)


CategoricalGraphAtt(
  (bn): BatchNorm1d(168, eps=1e-05, momentum=None, affine=True, track_running_stats=True)
  (sequence_encoder): Graph_GRUModel(
    (gru_cell): Graph_GRUCell(
      (x2h): Graph_Linear()
      (h2h): Graph_Linear()
    )
  )
  (inner_gat): GATConv(32, 32, heads=1)
  (cat_gat): GATConv(32, 32, heads=1)
  (fusion): Linear(in_features=96, out_features=32, bias=True)
  (logit_f): Linear(in_features=32, out_features=9, bias=True)
)

In [82]:
# train_iterator = generate_pseudo_data_iterator(True, batch_size, feature_size, window_size, company_ids, days=30)


def load_dataset(is_training, filename, batch_size, feature_size, window_size, company_id_list):
    """Read a per-(date, company) CSV and build a per-date DataLoader over all companies.

    The CSV is read with a header row. The first two columns are treated as
    info columns (date, id), the next ``feature_size * window_size`` columns as
    features, and all remaining columns as labels. Feature columns are looked
    up by name as ``x_fea_{feature:02d}_w_{lag:02d}`` and label columns as
    ``y_cum_01`` .. ``y_cum_08``.

    Args:
        is_training: shuffle the dates in the DataLoader when truthy.
        filename: CSV path; read as gzip when the name contains ".gz".
        batch_size: DataLoader batch size (number of dates per batch).
        feature_size: number of features per time lag.
        window_size: number of time lags per sample.
        company_id_list: every company id, in the row order used for x and y.
            NOTE(review): ids present in the CSV are assumed to all be in this
            list -- confirm against the data.

    Returns:
        iterator: DataLoader over an ``All_Company_Dataset`` with
            x of shape (num_dates, window_size, num_companies, feature_size), float32, and
            y of shape (num_dates, num_companies, 9), int32.
            Companies without a row on a date get all-zero features and
            all ``-1`` labels.
        results_dict: date -> DataFrame with columns ['date', 'id'] for the
            companies that do have a row on that date, indexed by the
            company's position in ``company_id_list``.
    """
    # Check the compression
    compression = "gzip" if ".gz" in filename else None
    # Get infos, features, and labels (No for_column)
    # Read the data & skip the header
    all_df = pd.read_csv(filename, compression=compression, header=0)    

    # Fill missing values
    features_coverage = 2 + feature_size * window_size  # first column index past the feature block
    all_df.iloc[:, :2] = all_df.iloc[:, :2].fillna("") # filled missing values in info columns (date, id) with an empty string
    all_df.iloc[:, 2:features_coverage] = all_df.iloc[:, 2:features_coverage].fillna(0.0) # feature_df
    all_df.iloc[:, features_coverage:] = all_df.iloc[:, features_coverage:].fillna(0) # label_df
    
    # Replace other events as 0: 2 -> 0
    all_df.iloc[:, features_coverage:] = all_df.iloc[:, features_coverage:].replace(2, 0) # label_df
    
    # get all features
    x, y = [], []
    results_dict = dict()

    date_group = all_df.groupby('date')
    # One sample per date, in ascending date order.
    for date in all_df.date.sort_values().unique():
        df = date_group.get_group(date)
        df_date_id = df.sort_values(by='id').set_index('id')

        # create rows with all companies fill with 0 if no data exists else fill with original data
        df_all_company_at_t = pd.DataFrame(0, index=company_id_list, columns=df.columns)
        df_all_company_at_t.loc[df_date_id.index, :] = df_date_id # fill original data to df_all_company_at_t if value exists
        # 'id' became the index of df_date_id above, so restore the column from the index.
        df_all_company_at_t['id'] = df_all_company_at_t.index

        # extracts label values from df_all_company_at_t
        label_df = df_all_company_at_t.loc[:, ["y_cum_{:02d}".format(h) for h in range(1, 1+8)]]
        label_df['y_cum_09'] = 1 # every company will default in the infinite future
        # Companies with no row on this date get -1 in every label column (including y_cum_09).
        label_df.loc[label_df.index.difference(df_date_id.index), :] = -1
        label = np.array(label_df.values, dtype=np.int32)

        # Switch to positional indices so results_dict rows are keyed by position in company_id_list.
        df_all_company_at_t.index = range(len(df_all_company_at_t))
        results_dict[date] = df_all_company_at_t.loc[df_all_company_at_t.id.isin(df_date_id.index), :][['date', 'id']]

        # time-lagged observations at time t-delta+1, ... t-1, where delta can be 1,6,12
        feature_window = []
        for rnn_length in range(1, window_size+1):
            # feature
            feature_df = df_all_company_at_t.loc[:, ['x_fea_{:02d}_w_{:02d}'.format(feat_i, rnn_length) for feat_i in range(1, feature_size+1)]]
            feature = np.array(feature_df.values, dtype=np.float32)
            feature_window.append(feature)
        feature_window = np.stack(feature_window, axis=0)

        x.append(feature_window) # per date: (window_size, num_companies, feature_size)
        y.append(label)         # per date: (num_companies, 9)
        
    x = np.stack(x) # (num_dates, window_size, num_companies, feature_size)
    y = np.stack(y) # (num_dates, num_companies, 9)

    dataset = All_Company_Dataset(x=x, y=y)
    iterator = DataLoader(dataset, batch_size=batch_size, shuffle=True if is_training else False)
    return iterator, results_dict

# NOTE(review): absolute, machine-specific path -- consider making it configurable.
data_dir = '/tmp2/cwlin/explainable_credit/explainable_credit_new/data'
train_data = os.path.join(data_dir, 'train_cum.csv')

# Keep the per-date results under a real name instead of `_`, which IPython
# uses for the last cell output.
train_inputs, train_results_by_date = load_dataset(True, train_data, batch_size, feature_size, window_size, company_ids)

# Smoke test: run a single forward pass on the first batch only.
t = trange(len(train_inputs))
for i, (features, labels) in zip(t, train_inputs):
    # Use a local name for the actual batch length so the global `batch_size`
    # configuration value is not overwritten by this loop.
    dates_in_batch = features.size(0)
    for day in range(dates_in_batch):
        # Inputs must live on the same device as the model (they were previously
        # only cast to float32, never moved).
        inputs = features[day].to(device=device, dtype=torch.float32)
        targets = labels[day].to(device)
        print("shape of targets:", targets.shape)
        predict = model(inputs)
        print("shape of predict:", predict.shape)
    break

  0%|          | 0/325 [00:00<?, ?it/s]

shape of targets: torch.Size([15786, 9])
Shape of daily_data_batch torch.Size([12, 15786, 14])
Shape of sequence_embeddings: torch.Size([15786, 32])


  0%|          | 0/325 [00:04<?, ?it/s]

Shape of intra_sector_embeddings: torch.Size([15786, 32])
Shape of sectors_embeddings: torch.Size([9, 32])
Shape of sectors_embeddings: torch.Size([9, 32])
Shape of sectors_embeddings: torch.Size([15786, 32])
Shape of fusion_vec torch.Size([15786, 32])
shape of predict: torch.Size([15786, 9])



