In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from torch_geometric.data import HeteroData
import torch
import torch_geometric.transforms as T
import torch.nn.functional as F
from torch_geometric.nn import to_hetero, GraphConv

In [3]:
#read in the credit card transaction data
# The data is from IBM's credit card fraud detection dataset on Kaggle
# https://www.kaggle.com/datasets/ealtman2019/credit-card-transactions.

df = pd.read_csv('data/credit_card_transactions-ibm_v2.csv')

# Take a small sample of the data for computational efficiency
# To avoid extreme class imbalance, I will force 50% of the sample to be fraud

s1 = df[df['Is Fraud?'] == 'No'].sample(n=5000, random_state=4)
s2 = df[df['Is Fraud?'] == 'Yes'].sample(n=5000, random_state=4)

s = pd.concat([s1, s2]).reset_index(drop=True)

#add user information from user file to the data
udf = pd.read_csv('data/sd254_users.csv')
udf.reset_index(inplace=True)
udf = udf[['index','Current Age','Retirement Age','Gender','State','Zipcode','Yearly Income - Person','Total Debt','FICO Score','Num Credit Cards']]
s = pd.merge(s, udf, how='left', left_on='User',right_on='index')

# Drop zip and state as they are missing values
s.drop(columns=['Zip'], inplace=True)

# Fill in value where no error
s['Errors?'].fillna('N', inplace=True)

# Convert Is Fraud to binary
s['Is Fraud?'] = s['Is Fraud?'].map({'Yes': 1, 'No': 0})

# Separate time into hours and minutes
s['Hour'] = s['Time'].str.split(':').str[0].astype(int)
s['Minute'] = s['Time'].str.split(':').str[1].astype(int)

# Drop time
s.drop(columns='Time', inplace=True)

#convert dollars to floats
for col in ['Yearly Income - Person', 'Total Debt', 'Amount']:
    s[col] = s[col].str.replace('$', '').astype(float)

# replace unknown merhcant states with unknown
s['Merchant State']  = s['Merchant State'] .fillna('unknown')
# create an indicator for unknown state
s['Unknown State'] = s.apply(lambda row: 1 if row['Merchant State'] == 'unknown' else 0, axis=1)

# Convert categorical to numeric with label encoder 
# (for simplicity using label encoding despite introduction of ordinality)
le = LabelEncoder()
s['Use Chip'] = le.fit_transform(s['Use Chip'])

s['Merchant Name'] = le.fit_transform(s['Merchant Name'])

s['Merchant City'] = le.fit_transform(s['Merchant City'])

s['Errors?'] = le.fit_transform(s['Errors?'])

s['User'] = le.fit_transform(s['User'])

s['Gender'] = le.fit_transform(s['Gender'])

s['State'] = le.fit_transform(s['State'])

s['Zipcode'] = le.fit_transform(s['Zipcode'])

# create unique id for transaction from 0 to n-1
s['Transaction ID'] = range(0, len(s))
s.info()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  s['Errors?'].fillna('N', inplace=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   User                    10000 non-null  int64  
 1   Card                    10000 non-null  int64  
 2   Year                    10000 non-null  int64  
 3   Month                   10000 non-null  int64  
 4   Day                     10000 non-null  int64  
 5   Amount                  10000 non-null  float64
 6   Use Chip                10000 non-null  int32  
 7   Merchant Name           10000 non-null  int64  
 8   Merchant City           10000 non-null  int32  
 9   Merchant State          10000 non-null  object 
 10  MCC                     10000 non-null  int64  
 11  Errors?                 10000 non-null  int32  
 12  Is Fraud?               10000 non-null  int64  
 13  index                   10000 non-null  int64  
 14  Current Age             10000 non-null 

In [5]:
# 1. Logistic Regression for Predicting Fraud (baseline)

# We will run a logistic regression to predict fraud as a baseline for comparison to the GNN model

X = s.drop(columns=['Is Fraud?', 'User', 'Merchant Name','Transaction ID','Merchant State'])
y = s['Is Fraud?']
#scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

lr = LogisticRegression(max_iter=1000)
lr.fit(X,y)
y_pred = lr.predict(X)

print('Logistic Regression Results')
print('accuracy',accuracy_score(y, y_pred))
print('precision',precision_score(y, y_pred))
print('recall',recall_score(y, y_pred))
print('f1 score',f1_score(y, y_pred))

Logistic Regression Results
accuracy 0.764
precision 0.8139866793529972
recall 0.6844
f1 score 0.7435897435897436


In [6]:
# 2. GNN for Predicting Fraud

# create a heterogenous graph
data = HeteroData()
 
# add nodes
data['user'].num_nodes = s['User'].nunique()
data['merchant'].num_nodes = s['Merchant Name'].nunique()
data['transaction'].num_nodes = s['Transaction ID'].nunique()

#create edges

data['user','makes','transaction'].edge_index = torch.tensor(s[['User', 'Transaction ID']].values.T, dtype=torch.long)
data['transaction','reaches','merchant'].edge_index = torch.tensor(s[['Transaction ID', 'Merchant Name']].values.T, dtype=torch.long)

# add transaction features
transaction_features = torch.tensor(s.drop(columns=['Is Fraud?', 'User', 'Merchant Name','Transaction ID','Merchant State']).values, dtype=torch.float)
# print(transaction_features.shape)
data['transaction'].x = transaction_features

target = torch.tensor(s['Is Fraud?'].values, dtype=torch.float).unsqueeze(0).T
# print(target.shape)
data['transaction'].y = target

#add user node features
user_feats = s[['Current Age','Retirement Age','Gender','State','Zipcode','Yearly Income - Person','Total Debt','FICO Score','Num Credit Cards']].drop_duplicates().reset_index().drop(columns='index',axis=1)
user_features = torch.tensor(user_feats.values,dtype=torch.float)
# print(user_features.shape)
data['user'].x = user_features

#add merchant node features (just one for unknown states)
x = s[['Merchant Name','Unknown State']].drop_duplicates()
y = x.groupby('Merchant Name').prod()

#merchant data is not 1x1 to merchant id, so created a column to represent if the merchant has no known states or if all states are known
data['merchant'].x= torch.tensor(y['Unknown State'],dtype=torch.float).unsqueeze(0).T


# Add reverse edges and normalize features
data = T.ToUndirected()(data)
data = T.NormalizeFeatures()(data)

node_types, edge_types = data.metadata()

print(node_types)

print(edge_types)

print(data)

['user', 'merchant', 'transaction']
[('user', 'makes', 'transaction'), ('transaction', 'reaches', 'merchant'), ('transaction', 'rev_makes', 'user'), ('merchant', 'rev_reaches', 'transaction')]
HeteroData(
  user={
    num_nodes=1511,
    x=[1511, 9],
  },
  merchant={
    num_nodes=2010,
    x=[2010, 1],
  },
  transaction={
    num_nodes=10000,
    x=[10000, 22],
    y=[10000, 1],
  },
  (user, makes, transaction)={ edge_index=[2, 10000] },
  (transaction, reaches, merchant)={ edge_index=[2, 10000] },
  (transaction, rev_makes, user)={ edge_index=[2, 10000] },
  (merchant, rev_reaches, transaction)={ edge_index=[2, 10000] }
)


In [7]:
# Define the GNN model with two layers of GraphConv layers
class GNN(torch.nn.Module):

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # -1 is lazy initialization of the shape, which will be inferred from the first forward pas
        self.conv1 = GraphConv((-1, -1), hidden_channels)
        self.conv2 = GraphConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

model = GNN(hidden_channels=64, out_channels=2)

# Convert the model to a heterogenous model
# The model will now take a dictionary of node features and a dictionary of edge indices
model = to_hetero(model, data.metadata(), aggr='sum')

 
# Define the optimizer to be used in training
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)

 
# Create dictionaries of node features and edge indices from graph data
node_x_dict = {'user':data['user'].x, 'transaction': data['transaction'].x, 'merchant': data['merchant'].x}

edge_ind_dict =  {('user', 'makes', 'transaction'): data['user', 'makes', 'transaction'].edge_index,
                  ('transaction', 'reaches', 'merchant'): data['transaction', 'reaches', 'merchant'].edge_index,
                  ('transaction', 'rev_makes', 'user'): data['transaction', 'rev_makes', 'user'].edge_index,
                  ('merchant', 'rev_reaches', 'transaction'): data['merchant', 'rev_reaches', 'transaction'].edge_index
                 }

 
# Train the model
model.train()
for epoch in range(2000):
    # Zero the gradients
    optimizer.zero_grad()
    # Perform a forward pass
    out = model(node_x_dict, edge_ind_dict)
    # Compute the loss based on the nodes in the 'transaction' type
    actual = data['transaction'].y.squeeze().long()
    loss = F.cross_entropy(out['transaction'], actual)
    # Perform a backward pass
    loss.backward()
    # Update the parameters
    optimizer.step()
    # Print the loss at every 100th epoch
    # This is useful for tracking convergence
    if epoch % 100 == 0:
        print(float(loss))

 
# Evaluate the model on the training data
model.eval()
y_pred = model(node_x_dict, edge_ind_dict)
y_pred = y_pred['transaction'].argmax(dim=1)

y = data['transaction'].y.squeeze().long()

print('GNN Results')
print('accuracy',accuracy_score(y, y_pred))
print('precision',precision_score(y, y_pred))
print('recall',recall_score(y, y_pred))
print('f1 score',f1_score(y, y_pred))



2.372481107711792
0.7023478746414185
0.5558483600616455
0.5189362168312073
0.536408543586731
0.5142029523849487
0.518513560295105
0.5026587843894958
0.4921687841415405
0.4867297410964966
0.48084062337875366
0.4843391478061676
0.5168926119804382
0.506732702255249
0.49973657727241516
0.4980401396751404
0.48850879073143005
0.4860280752182007
0.4808429181575775
0.8215529322624207
GNN Results
accuracy 0.7901
precision 0.7784603570742945
recall 0.811
f1 score 0.7943971005975121
