In [0]:
# 1. Configuration
# Single source of truth for project-wide settings and parameters.
# Keep this as the first cell so every later cell can rely on these names.

# --- Data locations ---------------------------------------------------------
BASE_PATH = "/Volumes/workspace/HomeCreditDefaultRisk/creditrisk_data"
TRAIN_FILE, TEST_FILE = "application_train.csv", "application_test.csv"
# Extend this list with other external data files as needed.
EXTERNAL_DATA = ["bureau.csv", "previous_application.csv"]

# --- Unity Catalog objects --------------------------------------------------
CATALOG = "creditrisk_catalog"
BRONZE_SCHEMA, SILVER_SCHEMA, GOLD_SCHEMA = (
    "bronze_creditrisk",
    "silver_creditrisk",
    "gold_creditrisk",
)
TRAIN_TABLE, TEST_TABLE = "train_dataset", "test_dataset"

# --- Feature engineering ----------------------------------------------------
# Each list is a starting point; append further columns as needed.
SELECTED_FEATURES = ["SK_ID_CURR", "TARGET", "AMT_INCOME_TOTAL"]
CATEGORICAL_FEATURES = ["NAME_CONTRACT_TYPE", "CODE_GENDER"]
NUMERICAL_FEATURES = ["AMT_CREDIT", "AMT_ANNUITY"]

# --- Model parameters -------------------------------------------------------
MODEL_TYPE = "lightgbm"
LGBM_PARAMS = dict(
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100,
)
KFOLDS = 5
RANDOM_SEED = 42

# --- Access control principals ----------------------------------------------
ADMIN_GROUP = "admins@company.com"
ANALYST_GROUP = "analysts@company.com"
AIMLDS_GROUP = "AIMLDS@ds.com"

# --- Output settings --------------------------------------------------------
OUTPUT_PATH = "/dbfs/FileStore/creditrisk/output/"
SUBMISSION_FILE = "submission.csv"

# --- Logging ----------------------------------------------------------------
LOG_LEVEL = "INFO"


In [0]:
# 2. Data loading and sanity checks
# Load the application tables in this cell so it runs on a fresh kernel
# (Restart & Run All) instead of depending on `app_train` / `app_test`
# left over from an earlier session.
app_train = spark.read.csv(f"{BASE_PATH}/{TRAIN_FILE}", header=True, inferSchema=True)
app_test = spark.read.csv(f"{BASE_PATH}/{TEST_FILE}", header=True, inferSchema=True)

# Grain check: the expected grain is one row per SK_ID_CURR, so the total row
# count must equal the distinct key count. Print both and flag any mismatch.
for label, frame in (("train", app_train), ("test ", app_test)):
    n_rows = frame.count()
    n_ids = frame.select("SK_ID_CURR").distinct().count()
    print(f"{label} rows:", n_rows)
    print(f"{label} distinct SK_ID_CURR:", n_ids)
    if n_rows != n_ids:
        print(f"WARNING: {label.strip()} is not one row per SK_ID_CURR "
              f"({n_rows - n_ids} surplus rows)")

# Target distribution
app_train.groupBy("TARGET").count().orderBy("TARGET").show()

In [0]:
# 3. Schema Creation and Validation
# Create the catalog first, then the bronze, silver and gold schemas, all
# driven by the config variables.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")
spark.sql(f"USE CATALOG {CATALOG}")

# Quote catalog and schema identifiers the same way for every schema, so all
# three statements stay valid if a configured name ever needs backticks.
for layer_schema in (BRONZE_SCHEMA, SILVER_SCHEMA, GOLD_SCHEMA):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{CATALOG}`.`{layer_schema}`")

# Validate grain (table must exist first, otherwise this will error)
# Uncomment the following lines only after you have written app_train to the target table
# st = spark.table(f"{CATALOG}.{SILVER_SCHEMA}.app_train")
# print(st.count(), st.select("SK_ID_CURR").distinct().count())

In [0]:
# 4. Access Control and Privileges
# Applies the ownership and privilege policy to the bronze, silver and gold
# schemas using the catalog, schema and group names from the config cell,
# then prints the resulting grants for verification.

layer_schemas = (BRONZE_SCHEMA, SILVER_SCHEMA, GOLD_SCHEMA)

# Ensure the catalog exists and make it the current catalog.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG}")
spark.sql(f"USE CATALOG {CATALOG}")

# Create schemas (idempotent).
for layer_schema in layer_schemas:
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS `{CATALOG}`.`{layer_schema}`")

# Ownership: every schema is owned by the admin group.
# NOTE(review): the GRANT/REVOKE statements below run after ownership has been
# transferred — confirm the executing principal is still allowed to manage
# grants on these schemas at that point.
for layer_schema in layer_schemas:
    spark.sql(f"ALTER SCHEMA `{CATALOG}`.`{layer_schema}` OWNER TO `{ADMIN_GROUP}`")

# Privileges for bronze: admins only.
spark.sql(f"GRANT ALL PRIVILEGES ON SCHEMA `{CATALOG}`.`{BRONZE_SCHEMA}` TO `{ADMIN_GROUP}`")
spark.sql(f"REVOKE ALL PRIVILEGES ON SCHEMA `{CATALOG}`.`{BRONZE_SCHEMA}` FROM `{ANALYST_GROUP}`")
spark.sql(f"REVOKE ALL PRIVILEGES ON SCHEMA `{CATALOG}`.`{BRONZE_SCHEMA}` FROM `{AIMLDS_GROUP}`")

# Privileges for silver: admins and the AIMLDS group; analysts have no access.
spark.sql(f"GRANT ALL PRIVILEGES ON SCHEMA `{CATALOG}`.`{SILVER_SCHEMA}` TO `{ADMIN_GROUP}`")
spark.sql(f"GRANT ALL PRIVILEGES ON SCHEMA `{CATALOG}`.`{SILVER_SCHEMA}` TO `{AIMLDS_GROUP}`")
spark.sql(f"REVOKE ALL PRIVILEGES ON SCHEMA `{CATALOG}`.`{SILVER_SCHEMA}` FROM `{ANALYST_GROUP}`")

# Privileges for gold: admins get everything, the other groups are read-only.
# NOTE(review): in Unity Catalog a SELECT grant is normally usable only
# together with USE CATALOG / USE SCHEMA on the parent objects — confirm the
# analyst and AIMLDS groups receive those elsewhere.
spark.sql(f"GRANT ALL PRIVILEGES ON SCHEMA `{CATALOG}`.`{GOLD_SCHEMA}` TO `{ADMIN_GROUP}`")
spark.sql(f"GRANT SELECT ON SCHEMA `{CATALOG}`.`{GOLD_SCHEMA}` TO `{ANALYST_GROUP}`")
spark.sql(f"GRANT SELECT ON SCHEMA `{CATALOG}`.`{GOLD_SCHEMA}` TO `{AIMLDS_GROUP}`")

# Table-level grants (uncomment after table creation)
# spark.sql(f"GRANT SELECT ON TABLE `{CATALOG}`.`{GOLD_SCHEMA}`.`{TRAIN_TABLE}` TO `{ANALYST_GROUP}`")

# Show grants. spark.sql() only returns a DataFrame, so .show() is required
# for the grants of every schema to actually be printed.
for layer_schema in layer_schemas:
    print(f"Grants on {CATALOG}.{layer_schema}:")
    spark.sql(f"SHOW GRANTS ON SCHEMA `{CATALOG}`.`{layer_schema}`").show(truncate=False)
# spark.sql(f"SHOW GRANTS ON TABLE `{CATALOG}`.`{GOLD_SCHEMA}`.`{TRAIN_TABLE}`").show(truncate=False)

In [0]:
# Transfer ownership of the bronze, silver and gold schemas to the admin group.
owner_clause = f"OWNER TO `{ADMIN_GROUP}`"
spark.sql(f"ALTER SCHEMA `{CATALOG}`.`{BRONZE_SCHEMA}` {owner_clause}")
spark.sql(f"ALTER SCHEMA `{CATALOG}`.`{SILVER_SCHEMA}` {owner_clause}")
spark.sql(f"ALTER SCHEMA `{CATALOG}`.`{GOLD_SCHEMA}` {owner_clause}")

In [0]:
# Bronze schema policy (from config): full access for admins, nothing for the
# analyst and AIMLDS groups.
bronze_schema_ref = f"`{CATALOG}`.`{BRONZE_SCHEMA}`"
spark.sql(f"GRANT ALL PRIVILEGES ON SCHEMA {bronze_schema_ref} TO `{ADMIN_GROUP}`")
spark.sql(f"REVOKE ALL PRIVILEGES ON SCHEMA {bronze_schema_ref} FROM `{ANALYST_GROUP}`")
spark.sql(f"REVOKE ALL PRIVILEGES ON SCHEMA {bronze_schema_ref} FROM `{AIMLDS_GROUP}`")

In [0]:
# Silver schema policy (from config): full access for the admin and AIMLDS
# groups, nothing for analysts.
silver_schema_ref = f"`{CATALOG}`.`{SILVER_SCHEMA}`"
spark.sql(f"GRANT ALL PRIVILEGES ON SCHEMA {silver_schema_ref} TO `{ADMIN_GROUP}`")
spark.sql(f"GRANT ALL PRIVILEGES ON SCHEMA {silver_schema_ref} TO `{AIMLDS_GROUP}`")
spark.sql(f"REVOKE ALL PRIVILEGES ON SCHEMA {silver_schema_ref} FROM `{ANALYST_GROUP}`")

In [0]:
# Gold schema policy (from config): full access for admins, SELECT only for
# the analyst and AIMLDS groups.
gold_schema_ref = f"`{CATALOG}`.`{GOLD_SCHEMA}`"
spark.sql(f"GRANT ALL PRIVILEGES ON SCHEMA {gold_schema_ref} TO `{ADMIN_GROUP}`")
spark.sql(f"GRANT SELECT ON SCHEMA {gold_schema_ref} TO `{ANALYST_GROUP}`")
spark.sql(f"GRANT SELECT ON SCHEMA {gold_schema_ref} TO `{AIMLDS_GROUP}`")

In [0]:
# A table-level grant fails if the table is missing, so materialize the gold
# training table from app_train first when it does not exist yet.
gold_train_table = f"{CATALOG}.{GOLD_SCHEMA}.{TRAIN_TABLE}"
table_missing = not spark.catalog.tableExists(gold_train_table)
if table_missing:
    delta_writer = app_train.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(gold_train_table)

# With the table guaranteed to exist, give analysts read access to it.
spark.sql(f"GRANT SELECT ON TABLE `{CATALOG}`.`{GOLD_SCHEMA}`.`{TRAIN_TABLE}` TO `{ANALYST_GROUP}`")

In [0]:
# Show grants using config
# spark.sql() only returns a DataFrame; without .show() the first three
# results are discarded and the last one is rendered as a bare DataFrame repr.
# Print every result explicitly so the grants can be reviewed.
grant_targets = [
    f"SCHEMA `{CATALOG}`.`{BRONZE_SCHEMA}`",
    f"SCHEMA `{CATALOG}`.`{SILVER_SCHEMA}`",
    f"SCHEMA `{CATALOG}`.`{GOLD_SCHEMA}`",
    f"TABLE `{CATALOG}`.`{GOLD_SCHEMA}`.`{TRAIN_TABLE}`",
]
for grant_target in grant_targets:
    print(f"SHOW GRANTS ON {grant_target}")
    spark.sql(f"SHOW GRANTS ON {grant_target}").show(truncate=False)

### 7. Documentation/Notes
#
The workspace was running Unity Catalog metastore privilege version 1.0, which does not support `USAGE` privileges. Governance was therefore implemented using schema- and table-level `ALL PRIVILEGES` and `SELECT` grants, with column-level restrictions for sensitive credit attributes.

In [0]:
# 6. Analysis and Reporting
# Target distribution
# %sql
# SELECT
#   TARGET,
#   COUNT(*) AS customers,
#   ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER (), 2) AS pct
# FROM {GOLD_SCHEMA}.{TRAIN_TABLE}
# GROUP BY TARGET
# ORDER BY TARGET;

# Risk flag prevalence
# %sql
# SELECT
#   ROUND(100.0 * AVG(severe_dpd_flag), 2)       AS severe_dpd_pct,
#   ROUND(100.0 * AVG(frequent_late_payer_flag), 2) AS frequent_late_payer_pct,
#   ROUND(100.0 * AVG(high_bureau_risk_flag), 2) AS high_bureau_risk_pct
# FROM {GOLD_SCHEMA}.{TRAIN_TABLE};

In [0]:
# %sql
# SELECT
#   ROUND(100.0 * AVG(severe_dpd_flag), 2)       AS severe_dpd_pct,
#   ROUND(100.0 * AVG(frequent_late_payer_flag), 2) AS frequent_late_payer_pct,
#   ROUND(100.0 * AVG(high_bureau_risk_flag), 2) AS high_bureau_risk_pct
# FROM workspace.gold_creditrisk.train_dataset_b;


In [0]:
# # 5. Feature Engineering and Utility Functions
# def missing_data(df):
#     total_count = df.count()
#     null_counts = df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).collect()[0]
#     result = spark.createDataFrame(
#         [(c, null_counts[c], (null_counts[c] / total_count * 100) if total_count > 0 else 0) for c in df.columns],
#         ["Column", "Total", "Percent"]
#     )
#     return result.orderBy(F.col("Total").desc())

In [0]:
# # 2. Data Loading and Initial Exploration
# from pyspark.sql import functions as F

# app_train = spark.read.csv(f"{BASE_PATH}/{TRAIN_FILE}", header=True, inferSchema=True)
# app_test = spark.read.csv(f"{BASE_PATH}/{TEST_FILE}", header=True, inferSchema=True)

# print("train rows:", app_train.count())
# print("test rows:", app_test.count())

# app_train.select("SK_ID_CURR", "TARGET").show(5)

# # Sanity checks: one row per SK_ID_CURR
# print("train distinct SK_ID_CURR:", app_train.select("SK_ID_CURR").distinct().count())
# print("test  distinct SK_ID_CURR:", app_test.select("SK_ID_CURR").distinct().count())

# # Target distribution
# app_train.groupBy("TARGET").count().orderBy("TARGET").show()