# Load the Dataset

In [1]:
import pandas as pd

# Relative path of the sampled dataset CSV
subset_csv = "../data/processed/paysim_subset.csv"
df = pd.read_csv(subset_csv)

# Report the (rows, columns) shape, then preview the first rows via head()
print("Shape:", df.shape)
df.head()

Shape: (5000, 11)


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,278,CASH_IN,330218.42,C632336343,20866.0,351084.42,C834976624,452419.57,122201.15,0,0
1,15,PAYMENT,11647.08,C1264712553,30370.0,18722.92,M215391829,0.0,0.0,0,0
2,10,CASH_IN,152264.21,C1746846248,106589.0,258853.21,C1607284477,201303.01,49038.8,0,0
3,403,TRANSFER,1551760.63,C333676753,0.0,0.0,C1564353608,3198359.45,4750120.08,0,0
4,206,CASH_IN,78172.3,C813403091,2921331.58,2999503.88,C1091768874,415821.9,337649.6,0,0


# Understand Column Meanings

In [2]:
# Schema overview: column dtypes and non-null counts (info() prints to stdout).
df.info()

# Only the final expression of a cell is rendered automatically, so the
# summary statistics have to be displayed explicitly or they are discarded.
display(df.describe())

# Number of rows per transaction type (rendered as the cell's result).
df["type"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            5000 non-null   int64  
 1   type            5000 non-null   object 
 2   amount          5000 non-null   float64
 3   nameOrig        5000 non-null   object 
 4   oldbalanceOrg   5000 non-null   float64
 5   newbalanceOrig  5000 non-null   float64
 6   nameDest        5000 non-null   object 
 7   oldbalanceDest  5000 non-null   float64
 8   newbalanceDest  5000 non-null   float64
 9   isFraud         5000 non-null   int64  
 10  isFlaggedFraud  5000 non-null   int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 429.8+ KB


type
CASH_OUT    1755
PAYMENT     1711
CASH_IN     1105
TRANSFER     405
DEBIT         24
Name: count, dtype: int64

# Basic Sanity Checks

In [3]:
# Unique types
print(df['type'].unique())

# Number of unique senders and receivers
print("Unique Senders:", df['nameOrig'].nunique())
print("Unique Recipients:", df['nameDest'].nunique())

# Any missing data?
# Placed last on purpose: a bare expression is only rendered when it is the
# final statement of the cell, so anywhere else the per-column null counts
# would be computed and then silently dropped.
df.isnull().sum()

['CASH_IN' 'PAYMENT' 'TRANSFER' 'CASH_OUT' 'DEBIT']
Unique Senders: 5000
Unique Recipients: 4982


# Filter to Relevant Transactions

In [4]:
# Keep only TRANSFER and CASH_OUT rows, then replace the leftover row labels
# with a fresh default index (the old index is dropped, not kept as a column).
relevant_types = ['TRANSFER', 'CASH_OUT']
is_relevant = df['type'].isin(relevant_types)
df = df[is_relevant].copy().reset_index(drop=True)

# Normalize Entity Names

In [5]:
# Lower-case every account id and tag it with its role in the transaction:
# 'src_' for the originating account, 'dst_' for the destination account.
# Any prefix already present is stripped first so that re-running this cell
# does not stack prefixes (e.g. 'src_src_c123'); the vectorised .str accessor
# replaces the per-row Python lambda.
df['nameOrig'] = 'src_' + df['nameOrig'].str.lower().str.removeprefix('src_')
df['nameDest'] = 'dst_' + df['nameDest'].str.lower().str.removeprefix('dst_')

In [6]:
# Write the current df to CSV; index=False keeps the row index out of the file.
graph_ready_csv = "../data/processed/paysim_graph_ready.csv"
df.to_csv(graph_ready_csv, index=False)