In [0]:
%pip install kagglehub

In [0]:

import os
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
from pyspark.sql.functions import datediff, col
from pyspark.sql.functions import col, explode_outer, lpad, regexp_replace
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import posexplode, concat, lit
import seaborn as sns


# Render every column of a pandas frame instead of eliding the middle ones.
pd.options.display.max_columns = None

# Set kagglehub's cache location, then fetch the dataset and report the
# directory that the download call returns.
os.environ.update(KAGGLEHUB_CACHE='/Volumes/workspace/gjain/project_health')
path = kagglehub.dataset_download(
    "bonifacechosen/nhis-healthcare-claims-and-fraud-dataset"
)
print(path)

In [0]:
# Read the claims CSV into a Spark DataFrame.
#
# The file is located through `path` (the directory returned by the kagglehub
# download earlier in the notebook) instead of a hard-coded ".../versions/3/"
# location: the download call does not pin a dataset version, so on a fresh
# cache the version directory is not guaranteed to be 3.
# NOTE(review): the file name itself is still fixed; confirm it is unchanged if
# the dataset publishes a new version.
claims_csv = os.path.join(path, "simulated_healthcare_claims (1).csv")

# CSV header -> column name used by the rest of the notebook.
column_renames = {
    "Amount Billed": "paid_amt",
    "Fraud Type": "fraud",
    "Date Admitted": "admission_dt",
    "Date Discharged": "discharge_dt",
    "Patient ID": "patient_id",
    "Age": "age",
    "Gender": "gender",
}

raw = spark.read.csv(claims_csv, header=True, inferSchema=True)
for source_name, target_name in column_renames.items():
    raw = raw.withColumnRenamed(source_name, target_name)
display(raw)

In [0]:
# Pull the Spark frame into pandas and parse the admission/discharge dates.
df = raw.toPandas()
df['admission_dt'] = pd.to_datetime(df['admission_dt'])
df['discharge_dt'] = pd.to_datetime(df['discharge_dt'])

# Age bands are left-closed: [0, 18), [18, 30), ... The top edge is unbounded
# so that ages of 100 and above fall into '75+' (with a top edge of 100 and
# right=False they were assigned NaN, contradicting the label).
age_bins = [0, 18, 30, 45, 60, 75, float("inf")]
age_labels = ['0-17', '18-29', '30-44', '45-59', '60-74', '75+']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)


# Feature engineering: length of stay (LOS) in whole days.
los_days = (df['discharge_dt'] - df['admission_dt']).dt.days
missing_los = int(los_days.isnull().sum())
# A plain integer cast fails as soon as one date is missing, so the null
# report could never show a non-zero count. Keep the plain 'int' dtype when
# nothing is missing (same result as before) and fall back to pandas'
# nullable 'Int64' only when there are gaps.
df['length_of_stay'] = los_days.astype('Int64' if missing_los else 'int')
print("Number of null length_of_stay values:", df['length_of_stay'].isnull().sum())

In [0]:
# Drift analysis needs two samples: cut the frame at its positional midpoint
# and give each half its own 0-based index.
split_idx = len(df) // 2

df1, df2 = (
    part.reset_index(drop=True)
    for part in (df.iloc[:split_idx], df.iloc[split_idx:])
)

# Report the row count of each half.
print("First half:", len(df1), sep="\n")
print("\nSecond half:", len(df2), sep="\n")

In [0]:
import importlib.util
import os
import tempfile

import requests

# Download the raw psi.py file from GitHub.
# SECURITY NOTE(review): this fetches Python source from the tip of a branch
# and executes it in the kernel, so whatever is on that branch at run time is
# what runs. Consider pinning the URL to a reviewed commit or vendoring the file.
raw_url = "https://raw.githubusercontent.com/mwburke/population-stability-index/refs/heads/master/psi.py"
response = requests.get(raw_url, timeout=30)
# Stop on an HTTP error instead of writing an error page to disk and
# executing it as a module.
response.raise_for_status()

# Stage the module in a throwaway directory: nothing named psi.py in the
# working directory is overwritten or deleted, and the temporary copy is
# removed even if loading the module raises.
with tempfile.TemporaryDirectory() as staging_dir:
    module_path = os.path.join(staging_dir, "psi.py")
    with open(module_path, "wb") as f:
        f.write(response.content)

    spec = importlib.util.spec_from_file_location('psi', module_path)
    psi = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(psi)

calculate_psi = psi.calculate_psi

# Compare the age column of the two halves using 10 quantile buckets.
# NOTE(review): both inputs are one-dimensional Series; confirm against psi.py
# that `axis` has no effect for 1-D input.
result = calculate_psi(df1['age'],df2['age'],buckettype='quantiles', buckets=10, axis=1)
print(result)

In [0]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("white")

# NOTE(review): the seeded synthetic samples below are not used by the plot in
# this cell; they are kept only in case a later cell reads them. Remove them if
# nothing does.
rs = np.random.RandomState(5)
initial = rs.normal(size = 100)
new = rs.normal(loc = 0.2, size = 120)

# Overlay the age density of each half on the same axes.
# `fill=True` is the current spelling of the deprecated `shade=True`; the
# labels feed the legend so the two curves can be told apart.
plot = sns.kdeplot(df1['age'], fill=True, label="First half")
plot = sns.kdeplot(df2['age'], fill=True, label="Second half")
# Tick labels are hidden: the figure is meant for comparing shapes only.
plot.set(yticklabels=[], xticklabels = [])
plot.legend()
sns.despine(left=True)