### Imports

In [1]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from torch_geometric.data import HeteroData
from torch_geometric.nn import HeteroConv, SAGEConv
from torch_geometric.nn.models import GAE
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, average_precision_score
import time
import os

# Notebook housekeeping helpers used at the end of most cells:
# clear_output() wipes whatever the current cell has displayed so far, and
# gc.collect() forces a garbage-collection pass. Because collect() is the last
# expression of the cell, its integer return value is echoed as the cell output.
from IPython.display import clear_output
from gc import collect
clear_output();
collect()

42

### Load Data

In [2]:
# Resolve the shared data folder: "<parent of the current working directory>/All Data".
cur_dir = os.getcwd()
par_dir = os.path.split(cur_dir)[0]
all_data = os.path.join(par_dir, 'All Data')

# The listing is neither bound to a name nor displayed; the call itself still
# raises if the folder cannot be listed (e.g. it does not exist).
os.listdir(all_data)

clear_output()
collect()

0

In [3]:
# Read the combined bid records from "<All Data>/Bid Info Downloader/combined_bids.csv".
primary_folder = os.path.join(all_data, 'Bid Info Downloader')
primary_data = os.path.join(primary_folder, 'combined_bids.csv')
prim_df = pd.read_csv(filepath_or_buffer=primary_data)

# Quick sanity check: (rows, columns) followed by the first five records.
print(prim_df.shape)
prim_df.head(5)

(52881, 12)


Unnamed: 0,Contract ID,NAICS,Date Signed,Contracting Agency ID,PSC,Region,Business Entity ID,Action Obligation ($),Modification Count,Total Modified Action Obligation ($),Competition Type,Bids
0,GS11P14MAP0340,236220,2014-10-16,4740,Z2AA,Southeast,RWDWFG6WGRK9,644325,1,28426,Restricted,1
1,IND0407CT66810,236220,2006-12-21,1406,Y199,West,U4K9M66MUHR9,68450,0,0,Restricted,1
2,W912LD06C0036,236220,2006-09-25,2100,Y111,Northeast,EATDZJL6JFJ5,12573650,15,556384,Open,5
3,GS11P06ZGC0339,236220,2006-10-17,4740,J039,Southeast,QKCFMFL4MKT6,11943,1,1504,Restricted,1
4,HHSD200200618928C,236220,2006-08-30,7523,Y111,Midwest,TTHNY8N2PQR3,192500,1,38349,Open,1


In [4]:
# One LabelEncoder per entity type; each is kept as its own global so the
# integer codes can be mapped back to the original identifiers.
vendor_enc, contract_enc, agency_enc = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace the raw identifiers with integer codes, written to new columns of
# prim_df (the original identifier columns are left in place).
prim_df['vendor_id'] = vendor_enc.fit_transform(prim_df['Business Entity ID'])
prim_df['contract_id'] = contract_enc.fit_transform(prim_df['Contract ID'])
prim_df['agency_id'] = agency_enc.fit_transform(prim_df['Contracting Agency ID'])

# Number of distinct entities of each type, kept for reporting.
num_vendors, num_contracts, num_agencies = (
    prim_df[id_col].nunique() for id_col in ('vendor_id', 'contract_id', 'agency_id')
)

clear_output()
collect()

0

In [5]:
# feature engineering, adding node attributes

def _most_frequent_or_unknown(values):
    """Return the first mode of ``values``, or ``'Unknown'`` when there is none.

    ``Series.mode()`` is evaluated exactly once per call; the result is empty
    when the group holds no non-missing values, in which case the placeholder
    string ``'Unknown'`` is returned instead.
    """
    modes = values.mode()
    return 'Unknown' if modes.empty else modes.iloc[0]


def process_features(df, group_key, num_cols, cat_cols):
    """Build one feature vector per distinct value of ``group_key``.

    Rows of ``df`` are grouped by ``group_key``. Each column in ``num_cols`` is
    reduced to its group mean and each column in ``cat_cols`` to its most
    frequent value (``'Unknown'`` if the group has none). The aggregated table
    is then encoded with a freshly fitted ``ColumnTransformer``: numeric
    columns are standardised and categorical columns are one-hot encoded.
    Because the transformer is fitted inside this function, scaling statistics
    and one-hot categories are specific to each call.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table containing ``group_key`` and every listed column.
    group_key : str
        Column whose distinct values define the groups.
    num_cols : list of str
        Columns aggregated with the mean and passed through ``StandardScaler``.
    cat_cols : list of str
        Columns aggregated with the mode and passed through ``OneHotEncoder``.
        If a column appears in both lists, the categorical aggregation wins.

    Returns
    -------
    tuple
        ``(keys, features)`` where ``keys`` is the array of group keys in the
        order produced by ``groupby`` and ``features`` is a ``torch.float``
        tensor whose i-th row belongs to ``keys[i]``.
    """
    # Numeric aggregations first, categorical second, so a column listed in
    # both ends up with the categorical rule (same precedence as a dict merge).
    aggregations = {col: 'mean' for col in num_cols}
    aggregations.update({col: _most_frequent_or_unknown for col in cat_cols})
    grouped = df.groupby(group_key).agg(aggregations).reset_index()

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        # Dense output is required so the matrix can be handed to torch.tensor.
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)
    ])
    features = preprocessor.fit_transform(grouped)
    return grouped[group_key].values, torch.tensor(features, dtype=torch.float)


In [6]:
%%time
# Every node type averages the same four numeric columns; only the
# categorical columns differ between vendors, contracts and agencies.
numeric_feature_cols = [
    'Action Obligation ($)',
    'Modification Count',
    'Total Modified Action Obligation ($)',
    'Bids',
]

vendor_ids, vendor_x = process_features(
    prim_df, 'vendor_id', num_cols=numeric_feature_cols, cat_cols=['Region', 'NAICS']
)
contract_ids, contract_x = process_features(
    prim_df, 'contract_id', num_cols=numeric_feature_cols, cat_cols=['PSC', 'Competition Type']
)
agency_ids, agency_x = process_features(
    prim_df, 'agency_id', num_cols=numeric_feature_cols, cat_cols=['PSC']
)

clear_output()
collect()

CPU times: total: 35.3 s
Wall time: 36.4 s


48

In [7]:
%%time
# Create the heterogeneous graph container and attach each node type's
# feature matrix as its `x` attribute.
data = HeteroData()
for _node_type, _node_x in (
    ('vendor', vendor_x),
    ('contract', contract_x),
    ('agency', agency_x),
):
    data[_node_type].x = _node_x

clear_output()
collect()

CPU times: total: 234 ms
Wall time: 250 ms


0

In [8]:
%%time
# construct edges
# Each row of prim_df yields one vendor->contract edge and one contract->agency
# edge. Row 0 of an edge tensor holds source ids, row 1 holds target ids.
# The id columns are stacked into a single (2, num_rows) ndarray first:
# passing a Python list of ndarrays to torch.tensor is slow and makes PyTorch
# emit a UserWarning.
vendor_contract_edges = torch.tensor(
    np.stack([prim_df['vendor_id'].to_numpy(), prim_df['contract_id'].to_numpy()]),
    dtype=torch.long,
)
contract_agency_edges = torch.tensor(
    np.stack([prim_df['contract_id'].to_numpy(), prim_df['agency_id'].to_numpy()]),
    dtype=torch.long,
)

# Register every relation together with its reverse; flip(0) swaps the source
# and target rows so the reverse edge type points the other way.
data['vendor', 'bids_on', 'contract'].edge_index = vendor_contract_edges
data['contract', 'bid_received_from', 'vendor'].edge_index = vendor_contract_edges.flip(0)
data['contract', 'awarded_by', 'agency'].edge_index = contract_agency_edges
data['agency', 'awards', 'contract'].edge_index = contract_agency_edges.flip(0)

clear_output()
collect()

CPU times: total: 344 ms
Wall time: 314 ms


0