In [7]:
import os
import glob
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import model_inference


SyntaxError: invalid decimal literal (1896117820.py, line 26)

In [None]:
# Build a .py script that takes a snapshot date, loads a model artefact, makes an inference and saves the predictions to the datamart

## set up pyspark session

In [None]:
# Start (or reuse, via getOrCreate) a Spark session named "dev" on the local master
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Raise the log threshold to ERROR so Spark warnings do not clutter cell output
spark.sparkContext.setLogLevel("ERROR")

## set up config

In [None]:
# Snapshot date to score, as a yyyy-mm-dd string (parsed with "%Y-%m-%d" when the config is built)
snapshot_date_str = "2016-05-01"
# File name of the pickled model artefact; appended to the model bank directory in the config
model_name = "credit_model_2017_03_01.pkl"


In [None]:
# Collect every run parameter in one dict so later cells read from a single place
config = {
    "snapshot_date_str": snapshot_date_str,
    "snapshot_date": datetime.strptime(snapshot_date_str, "%Y-%m-%d"),
    "model_name": model_name,
    "model_bank_directory": "/app/notebooks/06_model_training/model_bank/",
}
# Derived entry: full path of the artefact = model bank directory + model file name
config["model_artefact_filepath"] = f'{config["model_bank_directory"]}{config["model_name"]}'

pprint.pprint(config)

## load model artefact from model bank

In [None]:
# Deserialize the model artefact from the model bank.
# NOTE(review): pickle.load executes arbitrary code from the file — only load artefacts from a trusted model bank.
with open(config["model_artefact_filepath"], "rb") as artefact_file:
    model_artefact = pickle.load(artefact_file)

print(f"Model loaded successfully! {config['model_artefact_filepath']}")

## load feature store

In [None]:
# Root directory of the gold-layer inference feature store.
# NOTE(review): assumes Hive-style partition folders named "snapshot_date=<yyyy-mm-dd>" — confirm against the writer.
feature_location = "/app/datamart/gold/inference_feature_store"

# Build paths with os.path.join: feature_location has no trailing slash, so plain
# string concatenation would fuse it with "snapshot_date=..." into a non-existent path.
specific_feature_dir = os.path.join(feature_location, f"snapshot_date={config['snapshot_date_str']}")
specific_feature_file = os.path.join(specific_feature_dir, "*.parquet")

print(f"Looking for feature files in: {specific_feature_dir}")

features_sdf = None
feature_row_count = 0  # counted once per load attempt; every count() is a full Spark job

# Option 1: read only the partition that belongs to this snapshot date
try:
    parquet_files = glob.glob(specific_feature_file)
    if parquet_files:
        print(f"Found {len(parquet_files)} Parquet file(s) for the specific date")
        # basePath keeps the snapshot_date partition column in the result; reading the
        # partition folder on its own would drop it, and the prediction step selects it.
        features_sdf = (
            spark.read
            .option("basePath", feature_location)
            .parquet(specific_feature_dir)
        )
        feature_row_count = features_sdf.count()
    else:
        print(f"No Parquet files found in specific partition: {specific_feature_dir}")
except Exception as e:
    # Best effort: any failure here falls through to the full-store read below
    print(f"Error checking specific partition: {e}")
    features_sdf = None
    feature_row_count = 0

# Option 2: read the whole feature store and filter down to the snapshot date
if features_sdf is None or feature_row_count == 0:
    print("Trying to load from entire feature store directory...")

    # Look up to two directory levels deep for any Parquet file
    all_parquet_files = (
        glob.glob(os.path.join(feature_location, "*.parquet"))
        or glob.glob(os.path.join(feature_location, "*", "*.parquet"))
        or glob.glob(os.path.join(feature_location, "*", "*", "*.parquet"))
    )
    if not all_parquet_files:
        print("No Parquet files found in feature store directory")
        raise ValueError(f"No Parquet files found in {feature_location}")

    print(f"Found {len(all_parquet_files)} Parquet file(s) in feature store")
    snapshot_filter = col("snapshot_date") == config["snapshot_date_str"]

    try:
        features_sdf = spark.read.parquet(feature_location).filter(snapshot_filter)
        feature_row_count = features_sdf.count()
    except Exception as e:
        print(f"Error loading from feature store: {e}")
        # Final fallback: let Spark reconcile differing file schemas across partitions.
        # TODO: pass an explicit schema via spark.read.schema(...) once the feature columns are fixed.
        try:
            print("Attempting to load with mergeSchema=true...")
            features_sdf = (
                spark.read
                .option("mergeSchema", "true")
                .parquet(feature_location)
                .filter(snapshot_filter)
            )
            feature_row_count = features_sdf.count()
        except Exception as final_error:
            raise ValueError(f"Failed to load features: {final_error}") from final_error

# Check if we finally have data
if features_sdf is None or feature_row_count == 0:
    raise ValueError(f"No features found for snapshot date: {config['snapshot_date_str']}")

print(f"extracted features_sdf: {feature_row_count} rows for snapshot_date: {config['snapshot_date_str']}")

# Show schema for debugging
print("Feature schema:")
features_sdf.printSchema()

# Show a sample of the data
print("Sample data:")
features_sdf.show(5)

# Convert to Pandas for sklearn processing
features_pdf = features_sdf.toPandas()
print(f"Converted to Pandas DataFrame with shape: {features_pdf.shape}")

## preprocess data for modeling

In [8]:
# Model inputs are the columns whose name starts with the "fe_" prefix
feature_cols = [name for name in features_pdf.columns if name.startswith('fe_')]

# Apply the standard scaler stored in the model artefact to the selected columns
transformer_stdscaler = model_artefact["preprocessing_transformers"]["stdscaler"]
X_inference = transformer_stdscaler.transform(features_pdf[feature_cols])

print('X_inference', X_inference.shape[0])
X_inference

X_inference 8974


array([[ 1.37343573,  0.38727197, -1.30102033, ..., -0.63976417,
        -0.3378844 ,  0.7008268 ],
       [-1.1576494 ,  0.35743607, -0.71008323, ...,  0.70437269,
         0.2513317 ,  1.06643414],
       [ 2.58915379, -0.4282426 , -1.43122681, ..., -0.09431733,
        -1.54577741,  0.04658209],
       ...,
       [-0.26080821, -1.07468707, -1.48130622, ..., -2.14948311,
         1.39048282, -0.91554248],
       [ 0.36698062,  0.51656086, -0.68003558, ..., -1.72091773,
        -0.87799916, -0.8770575 ],
       [ 1.2239622 , -0.3685708 , -0.77017853, ...,  0.85047452,
         0.4673776 ,  0.51802313]], shape=(8974, 20))

## model prediction inference

In [7]:
# Pull the estimator out of the loaded artefact
model = model_artefact["model"]

# Keep column 1 of predict_proba — presumably the positive-class probability; confirm against model.classes_
y_inference = model.predict_proba(X_inference)[:, 1]

# Output table: row identifiers plus the model name and its score for each row
y_inference_pdf = features_pdf[["msno", "snapshot_date"]].assign(
    model_name=config["model_name"],
    model_predictions=y_inference,
)
y_inference_pdf

NameError: name 'model_artefact' is not defined

## save model inference to datamart gold table

In [21]:
# Create the gold-layer output directory for this model's predictions.
# The folder is named after the model file without its extension; splitext is used
# instead of [:-4] so the stem stays correct for extensions that are not 4 characters.
model_stem = os.path.splitext(config["model_name"])[0]
gold_directory = f"/app/datamart/gold/model_predictions/{model_stem}/"
print(gold_directory)

# exist_ok avoids the check-then-create race and makes the cell safe to re-run
os.makedirs(gold_directory, exist_ok=True)

# save gold table - IRL connect to database to write
# The date comes from config rather than the global snapshot_date_str, which a later
# cell re-assigns; the file name then always matches the features that were scored.
snapshot_suffix = config["snapshot_date_str"].replace("-", "_")
partition_name = f"{model_stem}_predictions_{snapshot_suffix}.parquet"
filepath = gold_directory + partition_name
spark.createDataFrame(y_inference_pdf).write.mode("overwrite").parquet(filepath)
print('saved to:', filepath)

datamart/gold/model_predictions/credit_model_2017_03_01/


NameError: name 'y_inference_pdf' is not defined

## backfill

In the model inference pipeline context, backfill means running inference for historical dates that were missed because they fall before the date the model was first deployed.

Example:

MODEL DEPLOYED: March 1, 2024
INFERENCE STARTS: March 1, 2024 → Predicts for March 2024 only
MISSING: January 2024, February 2024 predictions ❌

MODEL DEPLOYED: March 1, 2024  
BACKFILL RUN: March 1, 2024 → Generates predictions for:
- January 2024 ✅
- February 2024 ✅  
- March 2024 ✅

In [11]:
# set up config
# NOTE(review): this re-assigns snapshot_date_str; the backfill loop iterates over the generated dates instead — confirm it is still needed
snapshot_date_str = "2016-05-01"

# Date range to backfill; both ends are parsed with "%Y-%m-%d" and the end date is inclusive
start_date_str = "2016-04-01"
end_date_str = "2016-4-30"  # month is not zero-padded, unlike start_date_str; strptime's %m still accepts it

In [12]:
# generate list of dates to process
def generate_first_of_month_dates(start_date_str, end_date_str):
    """Return the first day of every month touched by a date range.

    Args:
        start_date_str: Range start as a "yyyy-mm-dd" string. The result begins
            at the first day of this date's month, even if that is before the
            start date itself.
        end_date_str: Range end as a "yyyy-mm-dd" string (inclusive).

    Returns:
        A list of "yyyy-mm-dd" strings, one per month, in ascending order.
        Empty when the end date's month is before the start date's month.

    Raises:
        ValueError: If either argument does not match the "%Y-%m-%d" format.
    """
    date_format = "%Y-%m-%d"
    range_start = datetime.strptime(start_date_str, date_format)
    range_end = datetime.strptime(end_date_str, date_format)

    # Flatten (year, month) into one running month index so that stepping
    # across a year boundary needs no special case for December.
    first_month_index = range_start.year * 12 + (range_start.month - 1)
    last_month_index = range_end.year * 12 + (range_end.month - 1)

    first_of_month_dates = []
    for month_index in range(first_month_index, last_month_index + 1):
        year, zero_based_month = divmod(month_index, 12)
        first_of_month_dates.append(
            datetime(year, zero_based_month + 1, 1).strftime(date_format)
        )

    return first_of_month_dates

# Expand the backfill range into the list of first-of-month snapshot dates to process
dates_str_lst = generate_first_of_month_dates(start_date_str, end_date_str)


In [13]:
# Run model_inference.main once per snapshot date with the model_name set earlier
# NOTE(review): presumably main() is the script version of the inference cells above — confirm
for snapshot_date in dates_str_lst:
    print(snapshot_date)
    model_inference.main(snapshot_date, model_name)

2023-01-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 1, 1, 0, 0),
 'snapshot_date_str': '2023-01-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-01-01 00:00:00
X_inference 8974
datamart/gold/model_predictions/credit_model_2024_09_01/
saved to: datamart/gold/model_predictions/credit_model_2024_09_01/credit_model_2024_09_01_predictions_2023_01_01.parquet


---completed job---


2023-02-01


---starting job---


{'model_artefact_filepath': 'model_bank/credit_model_2024_09_01.pkl',
 'model_bank_directory': 'model_bank/',
 'model_name': 'credit_model_2024_09_01.pkl',
 'snapshot_date': datetime.datetime(2023, 2, 1, 0, 0),
 'snapshot_date_str': '2023-02-01'}
Model loaded successfully! model_bank/credit_model_2024_09_01.pkl
extracted features_sdf 8974 2023-02-01 

## Check datamart

In [14]:
# getOrCreate returns the already-running session if one exists, otherwise starts a new one
spark = (
    pyspark.sql.SparkSession.builder
    .appName("dev")
    .master("local[*]")
    .getOrCreate()
)

# Limit Spark logging to ERROR to keep warnings out of the output
spark.sparkContext.setLogLevel("ERROR")

In [15]:
# Folder that holds the saved prediction outputs for this model
folder_path = "/app/datamart/gold/model_predictions/credit_model_2017_03_01/"

# glob already returns paths that include folder_path, so no basename/re-prefix step is needed.
# Matching only "*.parquet" skips stray entries; sorting gives a stable read order.
files_list = sorted(glob.glob(os.path.join(folder_path, "*.parquet")))
if not files_list:
    # Fail with a clear message instead of an opaque Spark error on an empty path list
    raise FileNotFoundError(f"No prediction files found in {folder_path}")

# Parquet is self-describing, so the CSV-only "header" option is not passed
df = spark.read.parquet(*files_list)
print("row_count:", df.count())

df.show()

row_count: 215376
+-----------+-------------+--------------------+-------------------+
|Customer_ID|snapshot_date|          model_name|  model_predictions|
+-----------+-------------+--------------------+-------------------+
| CUS_0x2ff7|   2023-09-01|credit_model_2024...|0.30120009183883667|
| CUS_0x303a|   2023-09-01|credit_model_2024...|0.28625059127807617|
| CUS_0x305c|   2023-09-01|credit_model_2024...| 0.3428035378456116|
| CUS_0x3082|   2023-09-01|credit_model_2024...|0.28813859820365906|
| CUS_0x308d|   2023-09-01|credit_model_2024...| 0.3139890134334564|
| CUS_0x3101|   2023-09-01|credit_model_2024...|0.22602568566799164|
| CUS_0x3127|   2023-09-01|credit_model_2024...| 0.2247123271226883|
| CUS_0x3161|   2023-09-01|credit_model_2024...| 0.2053392231464386|
| CUS_0x3187|   2023-09-01|credit_model_2024...|0.19738353788852692|
| CUS_0x31b8|   2023-09-01|credit_model_2024...|0.39255911111831665|
| CUS_0x31c0|   2023-09-01|credit_model_2024...|0.36157599091529846|
| CUS_0x3214|   