# Data Exploration

#### Purpose: Initial data exploration and quality assessment

Author: Devbrew LLC

Created: 2025-10-01

Last Modified: 2025-10-01

## Notebook Configuration


#### Load packages and configure environment

In [8]:
import warnings
from pathlib import Path
import json
from typing import Optional, Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
# Silence only deprecation-style chatter from the plotting/data stack.
# A blanket "ignore" would also hide warnings that matter for a data-quality
# review (e.g. pandas mixed-dtype or chained-assignment warnings).
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Display options: show wide/long frames without truncation, 4-decimal floats.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
pd.set_option("display.float_format", "{:.4f}".format)

# Plotting configuration
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (12, 6)
plt.rcParams["font.size"] = 10

# Reproducibility: seed NumPy's global generator once, up front.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Record library versions alongside the outputs.
print("✅ Environment configured successfully")
print(f"  - pandas: {pd.__version__}")
print(f"  - numpy: {np.__version__}")

✅ Environment configured successfully
  - pandas: 2.3.3
  - numpy: 2.3.3


## Path Configurations

Sets up project directory structure and validates data availability.

In [13]:
# Project paths
def _find_project_root(marker: str = "data_catalog") -> Path:
    """Locate the project root by looking for the ``marker`` directory.

    ``cwd.parent`` is tried first so that launching the kernel from
    ``notebooks/`` resolves exactly as before; the working directory itself
    and the remaining ancestors are then searched so the notebook also works
    when started from the project root or a nested folder.

    Args:
        marker: Name of a directory that must exist directly under the root.

    Returns:
        The first candidate containing ``marker``; ``cwd.parent`` if none does.
    """
    cwd = Path.cwd()
    # cwd.parents repeats cwd.parent; the duplicate check is harmless.
    for candidate in (cwd.parent, cwd, *cwd.parents):
        if (candidate / marker).is_dir():
            return candidate
    # Nothing found: keep the historical default so downstream paths are still defined.
    return cwd.parent


PROJECT_ROOT = _find_project_root()
DATA_DIR = PROJECT_ROOT / "data_catalog"
PROCESSED_DIR = DATA_DIR / "processed"
NOTEBOOKS_DIR = PROJECT_ROOT / "notebooks"

# Dataset paths
IEEE_CIS_DIR = DATA_DIR / "ieee-fraud"  # IEEE-CIS Fraud Detection Dataset
PAYSIM_DIR = DATA_DIR / "paysim"  # PaySim Dataset
OFAC_DIR = DATA_DIR / "ofac"  # OFAC Sanctions Dataset

# Create output directories (idempotent: safe on every re-run)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)


# Validate data availability
def validate_data_path(
    ieee_dir: Optional[Path] = None,
    paysim_dir: Optional[Path] = None,
    ofac_dir: Optional[Path] = None,
) -> dict[str, bool]:
    """Check that every required dataset file exists and print a report.

    One ``<label>: ✅/❌`` line is printed per dataset, followed by a summary
    line stating whether anything is missing.

    Args:
        ieee_dir: Directory holding the IEEE-CIS CSV files. Defaults to the
            notebook-level ``IEEE_CIS_DIR``.
        paysim_dir: Directory holding the PaySim log CSV. Defaults to the
            notebook-level ``PAYSIM_DIR``.
        ofac_dir: Directory holding the ``sdn/`` and ``consolidated/`` OFAC
            folders. Defaults to the notebook-level ``OFAC_DIR``.

    Returns:
        Mapping of dataset label to ``True`` when its file is present.
    """
    # Defaults are resolved at call time, so the notebook constants are only
    # required when no override is supplied.
    ieee_dir = IEEE_CIS_DIR if ieee_dir is None else Path(ieee_dir)
    paysim_dir = PAYSIM_DIR if paysim_dir is None else Path(paysim_dir)
    ofac_dir = OFAC_DIR if ofac_dir is None else Path(ofac_dir)
    sdn_dir = ofac_dir / "sdn"
    consolidated_dir = ofac_dir / "consolidated"

    required_files = {
        'IEEE Train Transaction': ieee_dir / "train_transaction.csv",
        'IEEE Train Identity': ieee_dir / "train_identity.csv",
        'IEEE Test Transaction': ieee_dir / "test_transaction.csv",
        'IEEE Test Identity': ieee_dir / "test_identity.csv",
        'PaySim': paysim_dir / "PS_20174392719_1491204439457_log.csv",
        'OFAC SDN': sdn_dir / "sdn.csv",
        'OFAC SDN ADD': sdn_dir / "add.csv",
        'OFAC SDN ALT': sdn_dir / "alt.csv",
        'OFAC SDN COMMENTS': sdn_dir / "sdn_comments.csv",
        'OFAC Consolidated': consolidated_dir / "cons_prim.csv",
        'OFAC Consolidated ADD': consolidated_dir / "cons_add.csv",
        'OFAC Consolidated ALT': consolidated_dir / "cons_alt.csv",
        'OFAC Consolidated COMMENTS': consolidated_dir / "cons_comments.csv",
    }

    # is_file() rather than exists(): a directory that merely carries the
    # expected file name is not a usable dataset.
    paths_status = {name: path.is_file() for name, path in required_files.items()}

    print("Data Availability Check:")
    print("-" * 60)
    for name, found in paths_status.items():
        status = "✅" if found else "❌"
        print(f"{name}: {status}")

    if all(paths_status.values()):
        print("✅ All required datasets are found.")
    else:
        print("\n⚠️  Warning: Some datasets are missing. Check data_catalog/README.md")

    return paths_status

# Run the availability check now; the returned label -> bool mapping is kept in `paths_status`.
paths_status = validate_data_path()


Data Availability Check:
------------------------------------------------------------
IEEE Train Transaction: ✅
IEEE Train Identity: ✅
IEEE Test Transaction: ✅
IEEE Test Identity: ✅
PaySim: ✅
OFAC SDN: ✅
OFAC SDN ADD: ✅
OFAC SDN ALT: ✅
OFAC SDN COMMENTS: ✅
OFAC Consolidated: ✅
OFAC Consolidated ADD: ✅
OFAC Consolidated ALT: ✅
OFAC Consolidated COMMENTS: ✅
✅ All required datasets are found.
