In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   
from collections import defaultdict
import re
import Levenshtein
from itertools import combinations
from scipy.stats import gmean
import profiler as pf

# NOTE(review): machine-specific absolute working directory; the notebook only runs
# unchanged on a machine that has this exact path — consider a configurable project root.
os.chdir('C:/Users/Admin/Documents/GitHub/Data-Guide')

In [None]:
# Where the transformed pipeline exports are read from, and where this notebook writes results.
input_dir = "C:/Users/Admin/Documents/GitHub/Data-Guide/data_pipeline/transformed_feb_18"

output_dir = "C:/Users/Admin/Documents/GitHub/Data-Guide/data_pipeline/analyses_feb_18"
# exist_ok=True replaces the check-then-create pair and keeps the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

# Dataset name -> CSV path. Commented-out entries are exports that are not loaded here.
csv_files = {
    "aged_AR": os.path.join(input_dir, "transformed_aged_AR.csv"),
    # "statement_submission": os.path.join(input_dir, "statement_submission_report.csv"),
    "integrated_payments": os.path.join(input_dir, "transformed_integrated_payments.csv"),
    # "billing_statement": os.path.join(input_dir, "billing_statement_report.csv"),
    # "outstanding_claims": os.path.join(input_dir, "outstanding_claims_report.csv"),
    # "unresolved_claims": os.path.join(input_dir, "unresolved_claims_report.csv"),
    # "fee_schedule": os.path.join(input_dir, "fee_schedule.csv"),
    # "openings": os.path.join(input_dir, "openings.csv"),
    # "schedule": os.path.join(input_dir, "schedule.csv"),
    "patient_list": os.path.join(input_dir, "transformed_patient_details.csv"),
    # "processed_payments": os.path.join(input_dir, "ZR - Credit Card Processed Payments.csv"),
    "transaction_details": os.path.join(input_dir, "transformed_transaction_details.csv"),
    "charges": os.path.join(input_dir, "transformed_incurred_charges.csv"),
    # "treatment_tracker": os.path.join(input_dir, "ZR - Treatment Tracker.csv"),
}

# Load every configured CSV into a DataFrame keyed by dataset name.
dataframes = {dataset: pd.read_csv(file_path) for dataset, file_path in csv_files.items()}

In [None]:
def curve(x, n = 2):
    """Return the mean of `x` rounded to `n` decimals, ignoring zeros, infinities and missing values.

    Args:
        x (pd.Series): Values to average.
        n (int): Number of decimal places to round to.

    Returns:
        The rounded mean of the remaining values (NaN when nothing remains).
    """
    usable = x.replace([0, np.inf, -np.inf], np.nan).dropna()
    average = usable.mean()
    return round(average, n)

# Patient Data Only

## Initial processing and simple checks

In [None]:
patients = dataframes["patient_list"]

# A patient is "surviving" when flagged ACTIVE or when the last visit was within the past year.
is_active = patients['Status'] == 'ACTIVE'
one_year_ago = pd.Timestamp.today() - pd.DateOffset(years=1)
visited_recently = pd.to_datetime(patients['Last Visit']) > one_year_ago
patients['surviving'] = is_active | visited_recently

In [None]:
patients['surviving'].value_counts()

In [None]:
# Flag patients whose first and last visit fall on the same date (a single visit).
patients['lone_vist'] = patients['Last Visit'].eq(patients['First Visit'])
patients['lone_vist'].value_counts()

In [None]:
# Days each patient has been with the practice, computed column-wise instead of
# re-parsing three dates per row inside DataFrame.apply.
first_visit = pd.to_datetime(patients['First Visit'])
last_visit = pd.to_datetime(patients['Last Visit'])
next_appointment = pd.to_datetime(patients['Next Appointment Date'])

days_to_today = (pd.Timestamp.today() - first_visit).dt.days
days_to_next_appointment = (next_appointment - first_visit).dt.days
days_to_last_visit = (last_visit - first_visit).dt.days

# Surviving patients: the later of today / next appointment (np.fmax ignores a missing
# appointment, matching the previous max(a, nan) == a behaviour).
# Everyone else: the span between first and last visit.
patients['lifespan'] = np.fmax(days_to_today, days_to_next_appointment).where(
    patients['surviving'], days_to_last_visit
)

In [None]:
# Helper objects from the project-local `profiler` module, both bound to the patient table.
# NOTE(review): `temporal` is not referenced again in the visible notebook — confirm it is still needed.
temporal = pf.TemporalAnalyzer(patients, output_dir)
plotters = pf.DataProfilerPlots(patients)

In [None]:
numeric = pf.NumericProfiler(patients['lifespan'])
numeric.profile()

In [None]:
plotters.histogram('lifespan', f"{output_dir}/lifespan_hist.png")
plotters.kde_plot('lifespan', f"{output_dir}/lifespan_kde.png")
plotters.box_plot('lifespan', f"{output_dir}/lifespan_box.png")

In [None]:
patients['Cleaned Carrier'].value_counts().head(30)

In [None]:
# Carriers kept as their own category; every other carrier is bucketed as 'OTHER'.
carriers = [
    'Delta Dental',
    'Blue Cross Blue Shield',
    'MetLife',
    'Aetna',
    'Guardian',
    'CIGNA Global Health',
    'United Healthcare Dental',
    'Humana',
    'Sun Life Financial (PO Box 2940 Clinton IA)',
    'Principal Financial Group',
    'Unum (Administered by Starmount)',
]

In [None]:
# Keep the listed carriers as-is and collapse everything else (including missing) into 'OTHER'.
is_listed_carrier = patients['Cleaned Carrier'].isin(carriers)
patients['carrier'] = patients['Cleaned Carrier'].where(is_listed_carrier, 'OTHER')
patients['carrier'].value_counts()

In [None]:
# Bucket patients into the four target ZIP codes, everything else as 'OTHER'.
# pd.read_csv infers a numeric dtype for an all-digit ZIP column, in which case a
# membership test against strings never matches; compare on a normalised text form
# instead (60640, 60640.0 and ' 60640' all become '60640').
target_zips = ['60640', '60613', '60625', '60618']
zip_text = (
    patients['ZIP Code']
    .astype(str)
    .str.strip()
    .str.replace(r'\.0$', '', regex=True)
)
patients['ZIP'] = zip_text.where(zip_text.isin(target_zips), 'OTHER')
patients['ZIP'].value_counts()

In [None]:
# Flag plans whose name contains 'DCS' / 'DNU'.
# str.contains yields NaN where the plan is missing, so these are three-valued columns;
# later cells depend on that (the 'DCS_True'/'DNU_True' dummies and the `!= 'nan'` filters).
patients['DCS'] = patients['Pat. Prim. Plan'].str.contains('DCS')
patients['DNU'] = patients['Pat. Prim. Plan'].str.contains('DNU')

In [None]:
patients.loc[(patients['lifespan'].isna()) & (patients['Status'].isin(['ACTIVE', 'INACTIVE'])) & (~patients['First Visit'].isna())].drop(columns=['Patient', 'Primary Guarantor', 'Primary Contact', 'Last Name', 'Chart Number', 'Ascend Patient ID', 'Date Of Birth', 'Phone', 'Email', 'Address']).head(30)

In [None]:
# Active/inactive patients with a first visit but no computable lifespan get a lifespan of 0.
missing_lifespan = patients['lifespan'].isna()
known_status = patients['Status'].isin(['ACTIVE', 'INACTIVE'])
has_first_visit = patients['First Visit'].notna()
patients.loc[missing_lifespan & known_status & has_first_visit, 'lifespan'] = 0

In [None]:
# Columns carried into the survival / return analyses.
prediction_columns = [
    'surviving',
    'lone_vist',
    'lifespan',
    'Discount Plan',
    'Pat. Prim. Fee Schedule',
    'carrier',
    'ZIP',
    'student',
    'patientAge',
    'DCS',
    'DNU',
]

## Survival Analyses

In [None]:
# Cohorts for the analyses: active/inactive patients that have a first visit on record.
in_cohort = patients['Status'].isin(['ACTIVE', 'INACTIVE']) & patients['First Visit'].notna()

# Compare real datetimes: a string comparison against '2024-02-01' is only correct when
# the column happens to be ISO-formatted text.
first_visit_before_cutoff = pd.to_datetime(patients['First Visit']) < pd.Timestamp('2024-02-01')

return_df = patients.loc[in_cohort & first_visit_before_cutoff, prediction_columns].copy()
survival_df = patients.loc[in_cohort, prediction_columns].copy()

In [None]:
return_df

In [None]:
survival_df

In [None]:
numeric = pf.NumericProfiler(survival_df['lifespan'])
numeric.profile()

In [None]:
import lifelines
from lifelines import KaplanMeierFitter, CoxPHFitter

In [None]:
# Fit the Kaplan-Meier estimator.
# lifelines expects event_observed to be True when the event (the patient leaving) was
# observed. `surviving` patients are still in care, i.e. right-censored, so the event
# indicator is the negation of `surviving`.
kmf = KaplanMeierFitter()
kmf.fit(survival_df['lifespan'], event_observed=~survival_df['surviving'])

# Plot survival curve
kmf.plot_survival_function()

In [None]:
# One-hot encode the categorical predictors (first level dropped), then discard the single-visit flag.
encoded_survival = pd.get_dummies(survival_df, drop_first=True)
s_df = encoded_survival.drop(columns=['lone_vist'])
s_df

In [None]:
# Fit Cox model on all encoded predictors.
# The event is the patient leaving: `surviving` rows are censored, so the event column
# is its negation, and `surviving` itself must not remain as a covariate.
cox_df = s_df.assign(churned=~s_df['surviving']).drop(columns=['surviving'])
cph = CoxPHFitter()
cph.fit(cox_df, duration_col='lifespan', event_col='churned')

# Print results and plot
cph.print_summary()
cph.plot()

In [None]:
# Fit Cox model on a reduced set of predictors.
# As above, the event is the patient leaving, i.e. the negation of `surviving`.
cox_reduced_df = (
    s_df.loc[:, ['surviving', 'lifespan', 'patientAge', 'DCS_True', 'DNU_True', 'carrier_Humana']]
    .assign(churned=lambda d: ~d['surviving'])
    .drop(columns=['surviving'])
)
cph = CoxPHFitter()
cph.fit(cox_reduced_df, duration_col='lifespan', event_col='churned')

# Print results and plot
cph.print_summary()
cph.plot()

## Survival Curves

In [None]:
s_df2 = survival_df.copy()

# Convert categorical predictors to strings for stratification
columns_to_convert = ['Discount Plan', 'carrier', 'Pat. Prim. Fee Schedule', 'ZIP', 'student', 'DCS', 'DNU']
s_df2[columns_to_convert] = s_df2[columns_to_convert].astype(str)

# Create age bins for Kaplan-Meier stratification.
# The top bin is open-ended so that "50+" really covers everyone over 50 (the previous
# upper edge of 80 silently dropped older patients), and include_lowest keeps patients
# aged exactly 18 in "18-30". Patients under 18 remain unbinned (NaN).
s_df2['Age Group'] = pd.cut(
    s_df2['patientAge'],
    bins=[18, 30, 50, np.inf],
    labels=["18-30", "30-50", "50+"],
    include_lowest=True,
)

In [None]:
def plot_km_survival(df, time_col, event_col, group_col):
    """
    Draw one Kaplan-Meier survival curve per category of `group_col` on a shared figure.

    Args:
        df (pd.DataFrame): Survival data, one row per subject.
        time_col (str): Name of the duration column.
        event_col (str): Name of the event indicator column (1 = event, 0 = censored).
        group_col (str): Name of the categorical column that defines the strata.

    Returns:
        None (the figure is shown as a side effect)
    """
    fitter = KaplanMeierFitter()
    plt.figure(figsize=(10, 6))

    # Missing group values are skipped; groups are drawn in order of first appearance.
    group_values = df[group_col].dropna().unique()
    for group_value in group_values:
        members = df.loc[df[group_col] == group_value]
        fitter.fit(
            members[time_col],
            event_observed=members[event_col],
            label=str(group_value),
        )
        fitter.plot_survival_function()

    plt.title(f"Kaplan-Meier Survival Curve by {group_col}")
    plt.xlabel("Time (Days)")
    plt.ylabel("Survival Probability")
    plt.legend(title=group_col)
    plt.grid(True)
    plt.show()

In [None]:
# `surviving` marks patients still in care (censored); the event the estimator needs is its negation.
plot_km_survival(s_df2.assign(churned=lambda d: ~d['surviving']), 'lifespan', 'churned', 'Age Group')

In [None]:
# Shorter carrier list used only for the survival curves. It gets its own name so that
# re-running the earlier `patients['carrier']` cell is not affected by a re-bound `carriers`.
km_carriers = ['Delta Dental', 'Blue Cross Blue Shield', 'MetLife']
s_df2['carrier'] = s_df2['carrier'].where(s_df2['carrier'].isin(km_carriers), 'OTHER')
s_df2['carrier'].value_counts()

In [None]:
# columns_to_convert = ['Discount Plan', 'carrier', 'Pat. Prim. Fee Schedule', 'ZIP', 'student', 'DCS', 'DNU']
# `surviving` marks patients still in care (censored); the event the estimator needs is its negation.
plot_km_survival(s_df2.assign(churned=lambda d: ~d['surviving']), 'lifespan', 'churned', 'carrier')

In [None]:
s_df2['Pat. Prim. Fee Schedule'].value_counts()

In [None]:
# Fee schedules shown individually; all others (including the text 'nan') are pooled.
# NOTE(review): ' Juniper Office Fees' starts with a space — confirm it matches the data exactly.
fee_schedules = [
    'Careington Care Platinum PPO 2025',
    'Delta Dental Premier',
    'CIGNA 2022 Z219',
    ' Juniper Office Fees',
]
is_listed_schedule = s_df2['Pat. Prim. Fee Schedule'].isin(fee_schedules)
s_df2['fee_schedule'] = s_df2['Pat. Prim. Fee Schedule'].where(is_listed_schedule, 'OTHER or None')
s_df2['fee_schedule'].value_counts()

In [None]:
# columns_to_convert = ['Discount Plan', 'carrier', 'Pat. Prim. Fee Schedule', 'ZIP', 'student', 'DCS', 'DNU']
# `surviving` marks patients still in care (censored); the event the estimator needs is its negation.
plot_km_survival(s_df2.assign(churned=lambda d: ~d['surviving']), 'lifespan', 'churned', 'fee_schedule')

In [None]:
# `surviving` marks patients still in care (censored); the event the estimator needs is its negation.
plot_km_survival(s_df2.assign(churned=lambda d: ~d['surviving']), 'lifespan', 'churned', 'ZIP')

In [None]:
# `surviving` marks patients still in care (censored); the event the estimator needs is its negation.
plot_km_survival(s_df2.assign(churned=lambda d: ~d['surviving']), 'lifespan', 'churned', 'student')

In [None]:
# Rows whose DCS flag is the text 'nan' (plan missing) are excluded.
# `surviving` marks patients still in care (censored); the event the estimator needs is its negation.
plot_km_survival(
    s_df2.loc[s_df2['DCS'] != 'nan'].assign(churned=lambda d: ~d['surviving']),
    'lifespan', 'churned', 'DCS',
)

In [None]:
# Rows whose DNU flag is the text 'nan' (plan missing) are excluded.
# `surviving` marks patients still in care (censored); the event the estimator needs is its negation.
plot_km_survival(
    s_df2.loc[s_df2['DNU'] != 'nan'].assign(churned=lambda d: ~d['surviving']),
    'lifespan', 'churned', 'DNU',
)

# Transaction & Charges 


## Processing

In [None]:
transactions = dataframes["transaction_details"].copy()
charges = dataframes["charges"].copy()

In [None]:
charges

In [None]:
# Rebuild `charges` from the loaded export each time so the sign flip is idempotent:
# flipping in place meant a second run of this cell flipped the sign back and the
# filter below then discarded every row.
charges = dataframes["charges"].assign(
    Amount=lambda d: -d['Amount'],
    Date=lambda d: pd.to_datetime(d['Date (Modified)']),
)
# Keep only rows that are positive after the flip.
charges = charges.loc[charges['Amount'] > 0]

In [None]:
charges

In [None]:
# Per-patient charge summary: totals, first/last charge date and a yearly charge rate.
charge_data = charges.groupby('Ascend Patient ID').agg({
    "Amount": [
        ("Total Charges", "sum"),
        ("Average Charge", lambda x: x.mean().round()),
        ("Max Charge", "max"),
    ],
    "Date": [
        ("Number of Charges", "count"),
        ("First Charge Date", "min"),
        ("Last Charge Date", "max"),
        ("Timespan", lambda x: (x.max() - x.min()).days),
    ],
}).assign(
    # TODO: change to semiannual charges.
    # Annualise only when the history spans more than a year; otherwise use the raw
    # count (this also avoids dividing by a zero-day timespan).
    charges_per_year=lambda d: (
        (d[("Date", "Number of Charges")] * 365 / d[("Date", "Timespan")])
        .where(d[("Date", "Timespan")] > 365, d[("Date", "Number of Charges")])
        # np.nan, not None: replace(..., None) has version-dependent semantics in pandas.
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
        .round()
        .astype("int")
    )
)

charge_data

In [None]:
charge_plotters = pf.DataProfilerPlots(charge_data)

In [None]:
# Profile and plot every non-datetime column of the per-patient charge summary.
# NOTE(review): where `charge_data` has two-level columns, `col` is a tuple and the file
# name below embeds the tuple's repr — confirm that is the intended naming.
for col in charge_data.select_dtypes(exclude=['datetime64[ns]']).columns:
    print(f"Numeric Profiling for {col}")
    print(pf.NumericProfiler(charge_data[col]).profile())
    print("\n")
    charge_plotters.histogram(col, f"{output_dir}/{col}_hist.png")

In [None]:
transactions['Date'] = pd.to_datetime(transactions['Date'])

# Independent copies of the ledger, one per transaction category.
transaction_category = transactions['Category']
procedures = transactions[transaction_category == 'Procedures'].copy()
insurance_payments = transactions[transaction_category == 'Insurance Payments'].copy()
guarantor_payments = transactions[transaction_category == 'Guarantor Payments'].copy()
adjustments = transactions[transaction_category == 'Credit Adjustments'].copy()

In [None]:
transactions

In [None]:
procedures

In [None]:
# Keyword flag columns checked per patient ("Any <word> Procedure").
proc_words = [
    'evaluation',
    'intraoral',
    'periapical',
    'prophylaxis',
    'bitewing',
    'resin',
    'composite',
    'images',
    'posterior',
]

# 'Proc Treatment Area' values kept as their own bucket (with a short label);
# any other area is treated as 'Other'.
proc_types = {
    'Th7, Th8, Th9, Th10': "Th7-10",
    'Th23, Th24, Th25, Th26': 'Th23-26',
    'Mouth ': 'Mouth',
}
# TODO: for 'Proc. Description', keep the top 5 values and bucket the rest as Other.

In [None]:
def _top_value(values):
    """Return the most frequent value of `values`, or None when there is none."""
    modes = values.mode()  # computed once instead of twice per group
    return modes.iloc[0] if not modes.empty else None


procedures_by_patient = procedures.groupby('Ascend Patient ID')

# Per-patient procedure summary: charges, date range, treatment areas and a yearly rate.
proc_data = procedures_by_patient.agg({
    "Charges": [
        ("Total Charges", "sum"),
        ("Average Charge", lambda x: x.mean().round()),
        ("Max Charge", "max"),
    ],
    "Date": [
        ("Number of Procedures", "count"),
        ("First Procedure Date", "min"),
        ("Last Procedure Date", "max"),
        ("Procedure Timespan", lambda x: (x.max() - x.min()).days),
    ],
    "Proc Treatment Area": [
        ("Top Treatment Area", _top_value),
        ("Number of Treatment Areas", "nunique"),
    ],
}).assign(
    # Annualise only when the history spans more than a year; otherwise use the raw
    # count (this also avoids dividing by a zero-day timespan).
    procedures_per_year=lambda d: (
        (d[("Date", "Number of Procedures")] * 365 / d[("Date", "Procedure Timespan")])
        .where(d[("Date", "Procedure Timespan")] > 365, d[("Date", "Number of Procedures")])
        # np.nan, not None: replace(..., None) has version-dependent semantics in pandas.
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
        .round()
        .astype("int")
    )
)

# One flag per keyword: did the patient have any procedure marked with it?
for w in proc_words:
    proc_data[f"Any {w} Procedure"] = procedures_by_patient[w].any()

proc_data


In [None]:
def _most_common(values):
    """Return the most frequent non-missing value, or None when the group has none."""
    counts = values.value_counts()
    # Guard: indexing [0] on an empty result raised IndexError for all-missing groups.
    return counts.index[0] if not counts.empty else None


# Bucket treatment areas: the mapped areas keep their short label, everything else is 'Other'.
procedures['Proc Area'] = procedures['Proc Treatment Area'].map(proc_types).fillna('Other')

# Group by Patient ID and Proc Area, then aggregate
proc_data = procedures.groupby(['Ascend Patient ID', 'Proc Area']).agg({
    "Charges": [
        ("Total Charges", "sum"),
        ("Average Charge", lambda x: x.mean().round()),
        ("Procedure Count", "count"),  # Count occurrences
    ],
    "Proc. Description": [
        ("Most Common Procedure", _most_common),
        ("Number of Distinct Procedures", "nunique"),
    ],
})
proc_data

In [None]:
# Pivot the 'Proc Area' index level into columns; missing combinations become 0.
proc_data_flat = proc_data.unstack(fill_value=0).copy()

# Collapse each multi-level column label into a single " - "-separated string.
flat_column_names = []
for column_parts in proc_data_flat.columns:
    joined = " - ".join(str(part) for part in column_parts)
    flat_column_names.append(joined.strip())
proc_data_flat.columns = flat_column_names
proc_data_flat.head(30)

In [None]:
# Among other things, calculate the percent of the bills that are paid by insurance and the ratio of patient:insurance payments
# Rebuild from `transactions` so the sign flip is idempotent: flipping in place negated
# the credits again on every re-run of this cell.
guarantor_payments = (
    transactions.loc[transactions['Category'] == 'Guarantor Payments']
    .assign(Credits=lambda d: -d['Credits'])
)
guarantor_payments

In [None]:
# Per-patient guarantor payment summary: totals, date range and a yearly payment rate.
payor_data = guarantor_payments.groupby('Ascend Patient ID').agg({
    "Credits": [
        ("Total Guarantor Payment", "sum"),
        ("Average Guarantor Payment", lambda x: x.mean().round()),
        ("Max Guarantor Payment", "max"),
    ],
    "Date": [
        ("Number of Guarantor Payments", "count"),
        ("First Guarantor Payment Date", "min"),
        ("Last Guarantor Payment Date", "max"),
        ("Guarantor Payment Timespan", lambda x: (x.max() - x.min()).days),
    ],
}).assign(
    # Annualise only when the history spans more than a year; otherwise use the raw
    # count (this also avoids dividing by a zero-day timespan).
    guarantor_payments_per_year=lambda d: (
        (d[("Date", "Number of Guarantor Payments")] * 365 / d[("Date", "Guarantor Payment Timespan")])
        .where(d[("Date", "Guarantor Payment Timespan")] > 365, d[("Date", "Number of Guarantor Payments")])
        # np.nan, not None: replace(..., None) has version-dependent semantics in pandas.
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
        .round()
        .astype("int")
    )
)

payor_data

In [None]:
# Rebuild from `transactions` so the sign flip is idempotent: flipping in place negated
# the credits again on every re-run of this cell.
insurance_payments = (
    transactions.loc[transactions['Category'] == 'Insurance Payments']
    .assign(Credits=lambda d: -d['Credits'])
)
insurance_payments

In [None]:
# Per-patient insurance payment summary: totals, date range and a yearly payment rate.
insurer_data = insurance_payments.groupby('Ascend Patient ID').agg({
    "Credits": [
        ("Total Insurance Payment", "sum"),
        ("Average Insurance Payment", lambda x: x.mean().round()),
        ("Max Insurance Payment", "max"),
    ],
    "Date": [
        ("Number of Insurance Payments", "count"),
        ("First Insurance Payment Date", "min"),
        ("Last Insurance Payment Date", "max"),
        ("Insurance Payment Timespan", lambda x: (x.max() - x.min()).days),
    ],
}).assign(
    # Annualise only when the history spans more than a year; otherwise use the raw
    # count (this also avoids dividing by a zero-day timespan).
    insurance_payments_per_year=lambda d: (
        (d[("Date", "Number of Insurance Payments")] * 365 / d[("Date", "Insurance Payment Timespan")])
        .where(d[("Date", "Insurance Payment Timespan")] > 365, d[("Date", "Number of Insurance Payments")])
        # np.nan, not None: replace(..., None) has version-dependent semantics in pandas.
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
        .round()
        .astype("int")
    )
)

insurer_data

In [None]:
# Join datasets together, including the survival data


# Time to Payment

In [None]:
transactions

In [None]:
transactions['Category'].value_counts()

In [None]:
# Single signed amount per ledger row: the charge for procedures and charge adjustments,
# the credit for every other category. Vectorised instead of a row-wise apply.
is_charge_row = transactions['Category'].isin(['Procedures', 'Charge Adjustments'])
transactions['Amount'] = transactions['Charges'].where(is_charge_row, transactions['Credits'])
pf.NumericProfiler(transactions['Amount']).profile()

In [None]:
# One row per patient / category / date / procedure description, with the treatment
# areas collected and the non-zero amounts summed into a single "Value".
timeline_columns = ["Ascend Patient ID", "Category", 'Date', 'Proc. Description', 'Proc Treatment Area', 'Amount']
timeline_keys = ["Ascend Patient ID", "Category", 'Date', 'Proc. Description']

financial_timeline = (
    transactions.loc[:, timeline_columns]
    # Melting a single value column and dropping the variable name is just a rename.
    .rename(columns={'Amount': 'Value'})
    .reset_index(drop=True)
    .sort_values(["Ascend Patient ID", 'Date'])
    .query('Value != 0')
    .groupby(timeline_keys)
    .agg({
        "Proc Treatment Area": [
            ("Number of Treatment Areas", "nunique"),
            ("Treatment Areas", lambda x: ", ".join(x.dropna())),
        ],
        "Value": [
            ("Value", "sum"),
        ],
    })
    .reset_index()
)

In [None]:
financial_timeline.columns

In [None]:
# Flatten the two-level column labels: use the aggregate name where there is one,
# otherwise the original column name.
# Guarded so a re-run is a no-op: applied to already-flat names, the old mapping
# replaced each name with its second character.
if isinstance(financial_timeline.columns, pd.MultiIndex):
    financial_timeline.columns = [
        lower if lower != '' else upper
        for upper, lower in financial_timeline.columns
    ]
financial_timeline.head(60)

In [None]:
financial_timeline.to_csv(os.path.join(input_dir, "financial_timeline.csv"), index=False)

In [None]:
def allocate_payments(procedures, insurance_payments, guarantor_payments, adjustments):
    """Match insurance payments, guarantor payments and adjustments to procedures.

    For each payment type, every payment is assigned to a procedure of the same patient
    and procedure description that is dated on or before the payment and has not yet
    received a payment of that type. The most recent such procedure is preferred; while
    the payment exceeds that procedure's charge, the next earlier candidate is tried,
    ending on the earliest candidate if none is large enough.

    The input frames are not modified.

    Args:
        procedures (pd.DataFrame): Procedure rows with "Ascend Patient ID",
            "Proc. Description", "Date" and "Value" (the charge).
        insurance_payments (pd.DataFrame): Payment rows with the same four columns.
        guarantor_payments (pd.DataFrame): Payment rows with the same four columns.
        adjustments (pd.DataFrame): Adjustment rows with the same four columns.

    Returns:
        pd.DataFrame: A sorted copy of `procedures` (by patient, description, date) with
        "<type> Payment Date", "<type> Payment Amount", "<type> Time to Payment" (days)
        and "<type> Percent Paid" columns for Insurance, Guarantor and Adjustment.
    """
    payment_sources = [
        (insurance_payments, "Insurance"),
        (guarantor_payments, "Guarantor"),
        (adjustments, "Adjustment"),
    ]
    sort_keys = ["Ascend Patient ID", "Proc. Description", "Date"]

    # Work on a copy: previously the caller's frame received the new, never-filled
    # columns while the allocations went to a separate sorted copy.
    procedures = procedures.copy()
    for _, payment_type in payment_sources:
        procedures[f"{payment_type} Payment Date"] = pd.NaT
        # Float from the start; amounts written below are floats.
        procedures[f"{payment_type} Payment Amount"] = 0.0
        procedures[f"{payment_type} Time to Payment"] = None
        procedures[f"{payment_type} Percent Paid"] = None

    # Ensure datetime format, then sort so that, within one patient and description,
    # a later row is a later procedure.
    procedures["Date"] = pd.to_datetime(procedures["Date"])
    procedures = procedures.sort_values(sort_keys).reset_index(drop=True)

    for payments, payment_type in payment_sources:
        amount_col = f"{payment_type} Payment Amount"
        # assign() returns a new frame, so the caller's payment frame is left untouched.
        payments = (
            payments.assign(Date=pd.to_datetime(payments["Date"]))
            .sort_values(sort_keys)
            .reset_index(drop=True)
        )

        for _, payment in payments.iterrows():
            pid = payment["Ascend Patient ID"]
            pdate = payment["Date"]
            pdesc = payment["Proc. Description"]
            pvalue = abs(payment["Value"])

            # Procedures on or before the payment date that are still unpaid for this type.
            candidates = procedures.index[
                (procedures["Ascend Patient ID"] == pid)
                & (procedures["Proc. Description"] == pdesc)
                & (procedures["Date"] <= pdate)
                & (procedures[amount_col] == 0)
            ]
            if len(candidates) == 0:
                continue

            # Start from the most recent candidate; step back while the payment is
            # larger than the charge and an earlier candidate exists.
            position = len(candidates) - 1
            target = candidates[position]
            proc_charge = procedures.at[target, "Value"]
            while pvalue > proc_charge and position > 0:
                position -= 1
                target = candidates[position]
                proc_charge = procedures.at[target, "Value"]

            procedures.at[target, f"{payment_type} Payment Date"] = pdate
            procedures.at[target, amount_col] = pvalue
            procedures.at[target, f"{payment_type} Time to Payment"] = (pdate - procedures.at[target, "Date"]).days
            procedures.at[target, f"{payment_type} Percent Paid"] = round((pvalue / proc_charge), 2)

    return procedures


In [None]:
# Re-split by category, this time from the aggregated financial timeline.
timeline_category = financial_timeline['Category']
procedures = financial_timeline[timeline_category == 'Procedures'].copy()
insurance_payments = financial_timeline[timeline_category == 'Insurance Payments'].copy()
guarantor_payments = financial_timeline[timeline_category == 'Guarantor Payments'].copy()
insurance_adjustments = financial_timeline[timeline_category == 'Credit Adjustments'].copy()

In [None]:
time_to_payments = allocate_payments(procedures, insurance_payments, guarantor_payments, insurance_adjustments)
time_to_payments.head(30)

In [None]:
time_to_payments.sort_values(by=["Ascend Patient ID", "Date", 'Proc. Description']).head(30)

In [None]:
time_to_payments.sort_values(by=["Ascend Patient ID", "Date", 'Proc. Description']).to_csv(os.path.join(input_dir, "time_to_payments.csv"), index=False)

In [None]:
financial_timeline.groupby(["Ascend Patient ID", "Category", 'Date']).agg({
    "Value": ["sum", ("Average", "mean"), "max",],
    "Proc. Description": [
        ("Number Procedures", "count"),
        ("Number Distinct Procedures", "nunique")
                          ]
}).sort_values(["Ascend Patient ID", 'Date']).head(30)

In [None]:
def _finite_max(values):
    """Largest value after discarding infinities and missing entries."""
    return values.replace([np.inf, -np.inf], np.nan).dropna().max()


def _count_nonzero(values):
    """Number of entries that are present, non-zero and finite."""
    return values.loc[~values.isin([0, np.inf, -np.inf])].count()


# Payments exclude adjustments: insurance plus guarantor only.
time_to_payments['Total Payments'] = (
    time_to_payments['Insurance Payment Amount'] + time_to_payments['Guarantor Payment Amount']
)

# Charge, payment and timing metrics per procedure description.
grouped_payments = time_to_payments.groupby(["Proc. Description"]).agg({
    "Value": [
        ("Total Charges", "sum"),
        ("Average Charges", curve),
        ("Largest Charge", _finite_max),
    ],
    "Total Payments": [
        ("Total Payments", "sum"),
        ("Average Total Payment", curve),
        ("Largest Total Payment", _finite_max),
    ],
    "Insurance Payment Amount": [
        ("Total Insurance Payments", "sum"),
        ("Average Insurance Payment", curve),
        ("Largest Insurance Payment", _finite_max),
        ("Number of Insurance Payments", _count_nonzero),
    ],
    "Insurance Time to Payment": [("Average Time to Insurance Payment", curve)],
    "Insurance Percent Paid": [("Average Percent Paid by Insurance", curve)],
    "Guarantor Payment Amount": [
        ("Total Guarantor Payments", "sum"),
        ("Average Guarantor Payment", curve),
        ("Largest Guarantor Payment", _finite_max),
        ("Number of Guarantor Payments", _count_nonzero),
    ],
    "Guarantor Time to Payment": [("Average Time to Guarantor Payment", curve)],
    "Guarantor Percent Paid": [("Average Percent Paid by Guarantor", curve)],
    "Adjustment Payment Amount": [
        ("Total Adjustments", "sum"),
        ("Average Adjustment", curve),
        ("Largest Adjustment", _finite_max),
        ("Number of Adjustments", _count_nonzero),
    ],
    "Adjustment Time to Payment": [("Average Time to Adjustment", curve)],
    "Adjustment Percent Paid": [("Average Percent Adjusted", curve)],
})

# Keep only the metric names as column labels.
grouped_payments.columns = grouped_payments.columns.droplevel(0)

# Ratio of the billed charge to what remains after the average adjustment.
share_adjusted = grouped_payments['Average Percent Adjusted']
grouped_payments['Adjustment Inflation'] = (1 / (1 - share_adjusted)).round(2)

grouped_payments.sort_values("Total Charges", ascending=False).head(30)

In [None]:
time_to_payments.sort_values(by=["Ascend Patient ID", "Date", 'Proc. Description']).head(30)

In [None]:
time_to_payments.groupby(["Ascend Patient ID"]).agg({
    "Insurance Time to Payment": ["mean"],
    "Guarantor Time to Payment": ["mean"],
    "Adjustment Time to Payment": ["mean"]
})

In [None]:
# Attach patient attributes to every allocated procedure row.
# merge() defaults to an inner join, so procedures whose patient ID is absent from
# `patients` are dropped here.
full_timing = time_to_payments.merge(patients, on='Ascend Patient ID').sort_values(by=["Ascend Patient ID", "Date", 'Proc. Description'])
full_timing.head(30)

In [None]:
full_timing.columns

In [None]:
def _finite_max(values):
    """Largest value after discarding infinities and missing entries."""
    return values.replace([np.inf, -np.inf], np.nan).dropna().max()


def _count_nonzero(values):
    """Number of entries that are present, non-zero and finite."""
    return values.loc[~values.isin([0, np.inf, -np.inf])].count()


# Payments exclude adjustments: insurance plus guarantor only.
full_timing['Total Payments'] = full_timing['Insurance Payment Amount'] + full_timing['Guarantor Payment Amount']

# Charge, payment and timing metrics per carrier and procedure description.
grouped_insurance_payments = full_timing.groupby(['Cleaned Carrier', "Proc. Description"]).agg({
    "Value": [
        ("Total Charges", "sum"),
        ("Number of Charges", "count"),
        ("Average Charge", curve),
        ("Largest Charge", _finite_max),
    ],
    "Total Payments": [
        ("Total Payments", "sum"),
        ("Average Total Payment", curve),
        ("Largest Total Payment", _finite_max),
    ],
    "Insurance Payment Amount": [
        ("Total Insurance Payments", "sum"),
        ("Average Insurance Payment", curve),
        ("Largest Insurance Payment", _finite_max),
        ("Number of Insurance Payments", _count_nonzero),
    ],
    "Insurance Time to Payment": [("Average Time to Insurance Payment", curve)],
    "Insurance Percent Paid": [("Average Percent Paid by Insurance", curve)],
    "Guarantor Payment Amount": [
        ("Total Guarantor Payments", "sum"),
        ("Average Guarantor Payment", curve),
        ("Largest Guarantor Payment", _finite_max),
        ("Number of Guarantor Payments", _count_nonzero),
    ],
    "Guarantor Time to Payment": [("Average Time to Guarantor Payment", curve)],
    "Guarantor Percent Paid": [("Average Percent Paid by Guarantor", curve)],
    "Adjustment Payment Amount": [
        ("Total Adjustments", "sum"),
        ("Average Adjustment", curve),
        ("Largest Adjustment", _finite_max),
        ("Number of Adjustments", _count_nonzero),
    ],
    "Adjustment Time to Payment": [("Average Time to Adjustment", curve)],
    "Adjustment Percent Paid": [("Average Percent Adjusted", curve)],
})

# Keep only the metric names as column labels.
grouped_insurance_payments.columns = grouped_insurance_payments.columns.droplevel(0)

# Ratio of the billed charge to what remains after the average adjustment.
# Uses 1 / (1 - p), the same definition as the per-procedure table; the previous 1 / p
# grew as the adjusted share shrank, the opposite of an inflation factor.
grouped_insurance_payments['Adjustment Inflation'] = round(
    1 / (1 - grouped_insurance_payments['Average Percent Adjusted']), 2
)


In [None]:

grouped_insurance_payments.sort_values("Total Payments", ascending=False).head(30)

In [None]:
# The carrier and procedure description live in the index; reset_index() turns them into
# columns so that index=False does not drop them from the exported file.
(
    grouped_insurance_payments
    .sort_values(['Cleaned Carrier', 'Total Charges'], ascending=False)
    .reset_index()
    .to_csv(os.path.join(input_dir, "insurance_payment_metrics.csv"), index=False)
)

In [None]:
def _finite_max(values):
    """Largest value after discarding infinities and missing entries."""
    return values.replace([np.inf, -np.inf], np.nan).dropna().max()


def _count_nonzero(values):
    """Number of entries that are present, non-zero and finite."""
    return values.loc[~values.isin([0, np.inf, -np.inf])].count()


# Payments exclude adjustments: insurance plus guarantor only.
full_timing['Total Payments'] = full_timing['Insurance Payment Amount'] + full_timing['Guarantor Payment Amount']

# Carrier-level roll-up. Amount-level adjustment statistics are deliberately left out;
# the column is omitted entirely rather than passed with an empty aggregation list.
insurance_payments_and_timing = full_timing.groupby(['Cleaned Carrier']).agg({
    "Patient": [
        ("Number of Patients", "nunique"),
    ],
    "Value": [
        ("Total Charges", "sum"),
        ("Number of Charges", "count"),
        ("Average Charge", curve),
    ],
    "Total Payments": [
        ("Total Payments", "sum"),
    ],
    "Insurance Payment Amount": [
        ("Total Insurance Payments", "sum"),
        ("Number of Insurance Payments", _count_nonzero),
    ],
    "Insurance Time to Payment": [("Average Time to Insurance Payment", curve)],
    "Insurance Percent Paid": [("Average Percent Paid by Insurance", curve)],
    "Guarantor Payment Amount": [
        ("Total Guarantor Payments", "sum"),
        ("Average Guarantor Payment", curve),
        ("Largest Guarantor Payment", _finite_max),
        ("Number of Guarantor Payments", _count_nonzero),
    ],
    "Guarantor Time to Payment": [("Average Time to Guarantor Payment", curve)],
    "Guarantor Percent Paid": [("Average Percent Paid by Guarantor", curve)],
    "Adjustment Percent Paid": [("Average Percent Adjusted", curve)],
})

# Keep only the metric names as column labels.
insurance_payments_and_timing.columns = insurance_payments_and_timing.columns.droplevel(0)

# Ratio of the billed charge to what remains after the average adjustment.
# Uses 1 / (1 - p), the same definition as the per-procedure table; the previous 1 / p
# grew as the adjusted share shrank, the opposite of an inflation factor.
insurance_payments_and_timing['Adjustment Inflation'] = round(
    1 / (1 - insurance_payments_and_timing['Average Percent Adjusted']), 2
)


In [None]:
insurance_payments_and_timing.sort_values("Total Payments", ascending=False).head(30)

In [None]:
patients.columns

In [None]:
# Patient-mix profile per carrier: head count, ZIP spread, age, tenure and overdue counts.
carrier_groups = patients.groupby('Cleaned Carrier')
insurance_info = carrier_groups.agg({
    'Patient': [
        ("Number of Patients", "nunique"),
    ],
    'ZIP': [
        ("ZIP Codes", lambda zips: zips.value_counts().to_dict()),
    ],
    'patientAge': [
        ("Average Age", lambda ages: curve(ages, n=0)),
        ("Oldest Patient", lambda ages: ages.max()),
        ("Youngest Patient", lambda ages: ages.min()),
    ],
    'lifespan': [
        # Zeros are kept in this mean (unlike curve), only infinities and gaps are dropped.
        ("Average Lifespan", lambda spans: round(spans.replace([np.inf, -np.inf], np.nan).dropna().mean(), 0)),
        ("Longest Patient", lambda spans: spans.max()),
        ('Number of Single-Visit Patients', lambda spans: spans.eq(0).sum()),
    ],
    'overdue': [
        ('Number of Overdue Patients', lambda flags: flags.eq(True).sum()),
    ],
})
# Keep only the metric names as column labels.
insurance_info.columns = insurance_info.columns.droplevel(0)

In [None]:
# Express the single-visit and overdue counts as a percentage of each carrier's patients.
patients_per_carrier = insurance_info['Number of Patients']
insurance_info['Percent Single Visit'] = (
    insurance_info['Number of Single-Visit Patients'] / patients_per_carrier * 100
).round(2)
insurance_info['Percent Overdue'] = (
    insurance_info['Number of Overdue Patients'] / patients_per_carrier * 100
).round(2)
insurance_info.sort_values("Number of Patients", ascending=False).head(30)

In [None]:
Carrier_Decision_Data = insurance_info.merge(insurance_payments_and_timing, left_index=True, right_index=True, how='outer')

In [None]:
# The carrier name is the index of the merged frame; reset_index() turns it into a column
# so that index=False does not drop it from the exported file.
(
    Carrier_Decision_Data
    .sort_values("Number of Patients_x", ascending=False)
    .reset_index()
    .to_csv(os.path.join(input_dir, "Carrier_Decision_Data.csv"), index=False)
)

In [None]:
# Calculate time to payment
# Merge on insurance, plan, age, etc.

In [None]:
financial_timeline

In [None]:


# NOTE(review): `parallel_coordinates`, `df`, `numeric_cols` and `class_column` are not
# defined or imported anywhere in the visible notebook, so this cell fails on a fresh
# kernel — confirm where they are meant to come from or remove the cell.
parallel_coordinates(df[numeric_cols.to_list() + [class_column]], class_column, colormap=plt.get_cmap("tab10"))