🔹 How much revenue is locked in outstanding AR? 
<br> ---- sum total vs total payments
<br>🔹 Which claims remain unresolved longest? 
<br> ---- combine with transactions
<br>🔹 Which payers cause the most delays?
<br> ---- combine with patient details
<br>🔹 Do certain procedures have longer collection times?
<br> ---- combine with transactions
<br>🔹 Do older patients or certain insurance plans have slower payments?
<br> ---- combine with patient details
<br>🔹 What % of AR is overdue? 
<br> ---- sum total vs total payments
<br>🔹 How likely is a bill in "90+ Days" to remain unpaid?
<br>🔹 Which insurance companies pay on time vs. delay payments?
<br> ---- combine with patient details and insurance details
<br>🔹 How often do denied claims result in collection delays?
<br>🔹 Can we predict claims risk and optimize payment reminders?

In [None]:
# Result placeholders, one per business question listed above; each starts
# at 0. Key spellings (including "liklihood_of_default") are kept exactly.
answers = dict.fromkeys(
    (
        "revenue_in_AR",
        "longest_claims",
        "delayed_payers",
        "procedure_collection_times",
        "slower_payments",
        "overdue_AR_percentage",
        "liklihood_of_default",
        "insurance_punctuality",
        "denied_claims_to_delays",
        "claims_risk",
    ),
    0,
)

Datasets Needed
<br>🔹 Aged AR (long form)<br>🔹 Outstanding Claims<br>🔹 Insurance Payments & Adjustments<br>🔹 Processed Payments <br>🔹 Statement Submissions

# Initialization and loading data

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   
from collections import defaultdict
import re
import Levenshtein
from itertools import combinations
from scipy.stats import gmean
import profiler as pf

# Switch the working directory to the Data-Guide checkout.
# NOTE(review): absolute, machine-specific path; this raises on any machine
# where the directory does not exist.
os.chdir('C:/Users/Admin/Documents/GitHub/Data-Guide')

In [None]:
# Reference date of the data pull; the aging-bucket windows built further
# down are measured back from this date.
pull_date = pd.Timestamp('2025-02-18')

In [None]:
# Directory the input CSVs are read from.
input_dir = "C:/Users/Admin/Documents/GitHub/Data-Guide/data_pipeline/transformed_feb_18"

# Directory for analysis outputs. exist_ok=True creates it when missing and
# is a no-op when it already exists, without the race between a separate
# existence check and the create call.
output_dir = "C:/Users/Admin/Documents/GitHub/Data-Guide/data_pipeline/analyses_feb_18"
os.makedirs(output_dir, exist_ok=True)

# Dataset name -> CSV path. Commented-out entries are datasets that are not
# loaded in this notebook.
csv_files = {
    "aged_AR" : os.path.join(input_dir, "transformed_aged_AR.csv"),
    "aged_AR_long" : os.path.join(input_dir, "transformed_aged_AR_long.csv"),
    "statement_submission" : os.path.join(input_dir, "transformed_statement_submission.csv"),
    "integrated_payments" : os.path.join(input_dir, "transformed_integrated_payments.csv"),
    #"billing_statement" : os.path.join(input_dir, "billing_statement_report.csv"),
    "outstanding_claims" : os.path.join(input_dir, "transformed_outstanding_claims.csv"),
    # "unresolved_claims" : os.path.join(input_dir, "unresolved_claims_report.csv"),
    #"fee_schedule" : os.path.join(input_dir, "fee_schedule.csv"),
    #"openings" : os.path.join(input_dir,"openings.csv"),
    #"schedule" : os.path.join(input_dir,"schedule.csv"),
#    "patient_details" : os.path.join(input_dir, "transformed_patient_details.csv"),
    "active_patients" : os.path.join(input_dir, "transformed_active_patient_details.csv"),
    "processed_payments": os.path.join(input_dir, "transformed_processed_payments.csv"),
    "payments": os.path.join(input_dir, "transformed_payments.csv"),
    "incurred_charges": os.path.join(input_dir, "transformed_incurred_charges.csv"),
    "transaction_details" : os.path.join(input_dir, "transformed_transaction_details.csv"),
    # "treatment_tracker" : os.path.join(input_dir, "ZR - Treatment Tracker.csv"),
    # "merged_data" : os.path.join(input_dir, "merged_data.csv"),
    'carrier_decision_data' : os.path.join(input_dir, 'Carrier_Decision_Data.csv'),
    'insurance_payment_metrics' : os.path.join(input_dir, 'insurance_payment_metrics.csv'),
    "financial_timeline" : os.path.join(input_dir, "financial_timeline.csv"),
    'time_to_payments' : os.path.join(input_dir, "time_to_payments.csv"),
}

# Read every listed CSV into a DataFrame, keyed by dataset name.
dataframes = {dataset: pd.read_csv(file_path) for dataset, file_path in csv_files.items()}

In [None]:
# Preview the first 30 rows of the long-form aged AR table.
dataframes['aged_AR_long'].head(30)

In [None]:
class AgedARVisualizer:
    """Parallel-coordinates views of aged AR amounts across aging buckets.

    The input is a long-form frame with 'Responsible Party',
    'Ascend Patient ID', 'Bucket' and 'Amount' columns. On construction it is
    pivoted to one row per (Responsible Party, Ascend Patient ID) pair with
    one summed-amount column per aging bucket.
    """

    def __init__(self, dataframe):
        """Copy *dataframe* and pivot it into the wide, per-bucket layout."""
        self.df = dataframe.copy()
        self.buckets = ['0-30', '31-60', '61-90', '91+']
        self.df = self._reshape_data()

    def _reshape_data(self):
        """
        Pivot the long-form data to one row per (Responsible Party,
        Ascend Patient ID) pair, with the summed Amount per bucket as columns.

        Returns:
            DataFrame with the two key columns plus one column per bucket;
            combinations with no data are filled with 0.
        """
        pivot_df = self.df.pivot_table(index=['Responsible Party', 'Ascend Patient ID'],
                                       columns='Bucket', values='Amount', aggfunc='sum').fillna(0)
        # A bucket that never occurs in the data produces no pivot column, and
        # the plot methods would then fail on cols=self.buckets. Add it as 0.
        for bucket in self.buckets:
            if bucket not in pivot_df.columns:
                pivot_df[bucket] = 0.0
        pivot_df.reset_index(inplace=True)
        return pivot_df

    def plot_subplots(self):
        """
        Create a subplot for each Responsible Party.
        """
        # Imported locally so the class does not depend on a later notebook
        # cell having already imported the name.
        from pandas.plotting import parallel_coordinates

        parties = self.df['Responsible Party'].unique()
        if len(parties) == 0:
            # plt.subplots rejects a zero-row grid; nothing to draw anyway.
            return

        # squeeze=False always yields a 2-D axes array, so the single-party
        # case needs no special handling.
        _, axes = plt.subplots(len(parties), 1, figsize=(12, 6 * len(parties)),
                               sharex=True, squeeze=False)

        for ax, party in zip(axes[:, 0], parties):
            subset = self.df[self.df['Responsible Party'] == party]
            parallel_coordinates(subset, class_column='Responsible Party', cols=self.buckets, ax=ax, colormap='tab10')
            ax.set_title(f'Parallel Coordinates Plot for {party}')
            ax.set_ylabel("Amount")
            # Every line in a subplot belongs to the same party, so the
            # legend carries no information.
            ax.legend().remove()

        # The x axis is shared; label it once on the bottom subplot.
        axes[-1, 0].set_xlabel("Aging Buckets")
        plt.show()

    def plot_colored(self):
        """
        Create a single Parallel Coordinates plot with color-coded Responsible Party (excluding Total & Write-Off).
        """
        from pandas.plotting import parallel_coordinates

        filtered_df = self.df[~self.df['Responsible Party'].isin(['Total', 'Write-Off'])]
        plt.figure(figsize=(12, 8))
        parallel_coordinates(filtered_df, class_column='Responsible Party', cols=self.buckets, colormap='tab10')
        plt.title("Parallel Coordinates Plot by Responsible Party")
        plt.ylabel("Amount")
        plt.xlabel("Aging Buckets")
        plt.legend(title='Responsible Party')
        plt.show()


# Data Manipulation

In [None]:
# Work on a copy so the frame held in `dataframes` is not modified.
aged_ar_long = dataframes['aged_AR_long'].copy()
# Row count per aging bucket (displayed as the cell output).
aged_ar_long.value_counts('Bucket')

In [None]:
# Aging bucket -> (window start, window end), counted back in days from
# pull_date. The open-ended '91+' bucket uses a fixed 2000-01-01 floor.
ar_mapping = {
    '0-30': (pull_date - pd.Timedelta(days=30), pull_date),
    '31-60': (pull_date - pd.Timedelta(days=60), pull_date - pd.Timedelta(days=31)),
    '61-90': (pull_date - pd.Timedelta(days=90), pull_date - pd.Timedelta(days=61)),
    '91+': (pd.Timestamp('2000-01-01'), pull_date - pd.Timedelta(days=91)),
}

In [None]:
# Attach each row's (start, end) date window, looked up from its aging bucket.
aged_ar_long['timespan'] = aged_ar_long['Bucket'].map(ar_mapping)

# Keep only rows with a non-zero amount that are not 'Total' rows.
ar_long = aged_ar_long.loc[
    (aged_ar_long['Amount'] != 0) & (aged_ar_long['Responsible Party'] != 'Total')
].copy()

# Look the window bounds up per bucket instead of indexing into the tuple
# column: a bucket absent from ar_mapping leaves NaN in 'timespan', and
# indexing NaN raises TypeError. Mapping gives such rows NaT bounds instead.
ar_long['timespan_start'] = ar_long['Bucket'].map(
    {bucket: span[0] for bucket, span in ar_mapping.items()}
)
ar_long['timespan_end'] = ar_long['Bucket'].map(
    {bucket: span[1] for bucket, span in ar_mapping.items()}
)

In [None]:
# Preview the first 30 filtered AR rows, including their timespan bounds.
ar_long.head(30)

In [None]:
class AgedARVisualizer:
    """Parallel-coordinates views of aged AR amounts, with diagnostic prints.

    Redefines the class of the same name from the earlier cell, adding
    column-listing prints at each step. The input is a long-form frame with
    'Responsible Party', 'Ascend Patient ID', 'Bucket' and 'Amount' columns;
    on construction it is pivoted to one row per (Responsible Party,
    Ascend Patient ID) pair with one summed-amount column per aging bucket.
    """

    def __init__(self, dataframe):
        """Copy *dataframe*, report its columns, and pivot it to wide form."""
        self.df = dataframe.copy()
        self.buckets = ['0-30', '31-60', '61-90', '91+']

        print("🔍 Checking initial DataFrame columns:", self.df.columns)

        self.df = self._reshape_data()

    def _reshape_data(self):
        """
        Pivot the long-form data to one row per (Responsible Party,
        Ascend Patient ID) pair, with the summed Amount per bucket as columns.

        Returns:
            DataFrame with the two key columns plus one column per bucket;
            combinations with no data are filled with 0.
        """
        pivot_df = self.df.pivot_table(index=['Responsible Party', 'Ascend Patient ID'],
                                       columns='Bucket', values='Amount', aggfunc='sum').fillna(0)
        # A bucket that never occurs in the data produces no pivot column, and
        # the plot methods would then fail on cols=self.buckets. Add it as 0.
        for bucket in self.buckets:
            if bucket not in pivot_df.columns:
                pivot_df[bucket] = 0.0
        pivot_df.reset_index(inplace=True)

        print("✅ Pivoted DataFrame columns:", pivot_df.columns)  # Debugging

        return pivot_df

    def plot_subplots(self):
        """
        Create a subplot for each Responsible Party.
        """
        # Imported locally so the class does not depend on a later notebook
        # cell having already imported the name.
        from pandas.plotting import parallel_coordinates

        parties = self.df['Responsible Party'].unique()
        if len(parties) == 0:
            # plt.subplots rejects a zero-row grid; nothing to draw anyway.
            return

        # squeeze=False always yields a 2-D axes array, so the single-party
        # case needs no special handling.
        _, axes = plt.subplots(len(parties), 1, figsize=(12, 6 * len(parties)),
                               sharex=True, squeeze=False)

        for ax, party in zip(axes[:, 0], parties):
            subset = self.df[self.df['Responsible Party'] == party]

            print(f"🔍 Debugging: Subset for {party} -> Columns:", subset.columns)  # Debugging

            if 'Responsible Party' not in subset.columns:
                print(f"❌ 'Responsible Party' is missing from subset for {party}!")

            parallel_coordinates(subset, class_column='Responsible Party', cols=self.buckets, ax=ax, colormap='tab10')
            ax.set_title(f'Parallel Coordinates Plot for {party}')
            ax.set_ylabel("Amount")
            # Every line in a subplot belongs to the same party, so the
            # legend carries no information.
            ax.legend().remove()

        # The x axis is shared; label it once on the bottom subplot.
        axes[-1, 0].set_xlabel("Aging Buckets")
        plt.show()

    def plot_colored(self):
        """
        Create a single Parallel Coordinates plot with color-coded Responsible Party (excluding Total & Write-Off).
        """
        from pandas.plotting import parallel_coordinates

        filtered_df = self.df[~self.df['Responsible Party'].isin(['Total', 'Write-Off'])]

        print("🔍 Filtered DataFrame (plot_colored) columns:", filtered_df.columns)  # Debugging

        if 'Responsible Party' not in filtered_df.columns:
            print("❌ 'Responsible Party' is missing from filtered_df!")

        plt.figure(figsize=(12, 8))
        # alpha=0.5 keeps overlapping lines distinguishable.
        parallel_coordinates(filtered_df, class_column='Responsible Party', cols=self.buckets, colormap='tab10', alpha=0.5)
        plt.title("Parallel Coordinates Plot by Responsible Party")
        plt.ylabel("Amount")
        plt.xlabel("Aging Buckets")
        plt.legend(title='Responsible Party')
        plt.show()


In [None]:
# Example usage
# NOTE(review): aged_ar_long is already a DataFrame, so pd.DataFrame(...) only
# wraps it in a new frame object; AgedARVisualizer takes its own copy anyway.
df = pd.DataFrame(aged_ar_long)
    
visualizer = AgedARVisualizer(df)
visualizer.plot_subplots()  # Option 1: Subplots per Responsible Party
visualizer.plot_colored()   # Option 2: Color-coded plot



In [None]:
from pandas.plotting import parallel_coordinates

# Parallel Coordinates plot of Amount by Bucket colored by Responsible Party

# One axis per aging bucket. Passing only the long-form 'Amount' column gives
# parallel_coordinates a single axis, so every row is a one-point line and
# nothing is drawn; the buckets have to be columns for lines to connect them.
bucket_order = ['0-30', '31-60', '61-90', '91+']

# Pivot to one row per (Responsible Party, Ascend Patient ID) pair with the
# summed Amount per bucket; missing buckets and combinations become 0.
plot_data = (
    aged_ar_long
    .pivot_table(index=['Responsible Party', 'Ascend Patient ID'],
                 columns='Bucket', values='Amount', aggfunc='sum')
    .reindex(columns=bucket_order)
    .fillna(0)
    .reset_index()
)

# Create the parallel coordinates plot
plt.figure(figsize=(12, 6))
parallel_coordinates(plot_data, class_column='Responsible Party', cols=bucket_order, color=plt.cm.Set1.colors)
plt.title('Parallel Coordinates Plot of Amount by Bucket colored by Responsible Party')
plt.xlabel('Attributes')
plt.ylabel('Amount')
plt.show()



In [None]:
# Rows of the financial timeline whose Category is "Procedures", copied so
# the loaded frame is not modified, with 'Date' parsed to datetimes.
procedures = dataframes['financial_timeline'].loc[
    lambda timeline: timeline['Category'] == "Procedures"
].copy()
procedures['Date'] = pd.to_datetime(procedures['Date'])
procedures.head(30)

In [None]:
# Performing the join
merged_df = procedures.merge(ar_long, on="Ascend Patient ID", how="inner")

# Filtering to ensure the procedure date falls within the AR timespan
matched_df = merged_df[
    (merged_df["Date"] >= merged_df["timespan_start"]) & (merged_df["Date"] <= merged_df["timespan_end"])
].drop_duplicates().drop(columns=["timespan", "timespan_start", "timespan_end", 'Category']).sort_values(['Ascend Patient ID', "Date", "Proc. Description"])

In [None]:
# First 30 Guarantor rows outside the "91+" bucket.
matched_df.query('`Responsible Party` == "Guarantor" and Bucket != "91+"').head(30)

In [None]:
# Work on a copy so the frame held in `dataframes` is not modified.
time_to_payments = dataframes['time_to_payments'].copy()
time_to_payments['Date'] = pd.to_datetime(time_to_payments['Date'])

# Total applied against each row: insurance + guarantor + adjustment amounts.
time_to_payments['total_paid'] = (
    time_to_payments['Insurance Payment Amount']
    + time_to_payments['Guarantor Payment Amount']
    + time_to_payments['Adjustment Payment Amount']
)

time_to_payments['remaining_balance'] = time_to_payments['Value'] - time_to_payments['total_paid']

# Keep only rows that still carry a balance. np.isclose, unlike `!= 0`,
# treats floating-point residue from the arithmetic above as settled. NaN
# balances are never "close" to 0, so they are still kept. The .copy() makes
# the filtered frame independent, so adding row_id below does not raise
# SettingWithCopyWarning.
time_to_payments = time_to_payments.loc[
    ~np.isclose(time_to_payments['remaining_balance'], 0)
].copy()

# Preserve the original row label as a column so rows stay identifiable
# after merges.
time_to_payments['row_id'] = time_to_payments.index
time_to_payments.head(30)


In [None]:
# Pair every open row with every open row of the same patient (self-merge),
# then collect the row_ids whose balance is exactly the negative of the
# paired row's balance.
balanced = pd.merge(
    time_to_payments,
    time_to_payments,
    on='Ascend Patient ID',
    suffixes=('_1', '_2'),
)
drop_ind = balanced.loc[
    balanced['remaining_balance_1'] == -balanced['remaining_balance_2'],
    'row_id_1',
].values
drop_ind

In [None]:
# Show the open rows whose row_id is not in drop_ind.
# NOTE(review): the result is only displayed, not assigned back.
time_to_payments.loc[(~time_to_payments['row_id'].isin(drop_ind))]

# AR Collection Efficiency & Write-Off Risk<br>
📌 Goal: Identify delinquent accounts, aging trends, and recovery probability.<br>
✅ Steps:<br>

Calculate % of AR in each aging bucket (0-30, 31-60, 61-90, 91+ days).<br>
Rank patients & insurance plans by collections risk.<br>
Identify patterns in write-offs vs. successful collections.<br>
✅ Datasets Used:<br>
Aged AR (long form), Outstanding Claims, Processed Payments<br>
📌 Business Impact:<br>
🚀 Reduces bad debt write-offs.<br>
🚀 Optimizes collection strategy based on payer trends.<br>

# AR Aging Forecasting & Collections Prioritization <br>
📌 Goal: Predict which AR accounts are likely to default.<br>
✅ Approach:<br>

Use time series forecasting (Prophet, ARIMA, LSTMs) to predict AR trends.<br>
Train a classification model to rank overdue accounts by likelihood of non-payment.<br>
✅ Datasets Used:<br>
Aged AR, Processed Payments, Financial Timeline<br>
📌 Business Impact:<br>
🚀 Reduces bad debt by prioritizing high-risk accounts.<br>
🚀 Improves long-term financial planning.<br>