Questions
<br>🔹 Who is an active vs. inactive patient? <br>🔹 Are high-value patients leaving? <br>🔹 Which patient segments need retention efforts?

Datasets
<br>🔹 All Patient Details <br>🔹 Active Patient Details <br>🔹 Incurred Charges <br>🔹 Guarantor Payments

# Active vs. Inactive Patient Analysis (Attrition Risk) <br>
📌 Goal: Identify patients who haven't returned & optimize outreach efforts.<br>
✅ Steps:<br>

Compare active vs. inactive patient lists (who hasn’t visited in 12-24 months?).<br>
Segment patients by treatment history, insurance plan, visit frequency.<br>
Rank patient segments by retention risk (likelihood to never return).<br>
✅ Datasets Used:<br>
All Patient Details, Active Patient Details, Incurred Charges<br>
📌 Business Impact:<br>
🚀 Enables targeted retention campaigns.<br>
🚀 Reduces lost revenue from patient churn.<br>

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   
from collections import defaultdict
import re
import Levenshtein
from itertools import combinations
from scipy.stats import gmean
import profiler as pf
import math

# Switch the process working directory to the project checkout.
# NOTE(review): the path is hard-coded for one machine.
os.chdir('C:/Users/Admin/Documents/GitHub/Data-Guide')

In [None]:
# Fixed reference date, parsed into a pandas Timestamp.
# presumably the day the source data was extracted (the data folders are named "feb_18") — TODO confirm
pull_date = pd.to_datetime('2025-02-18')

In [None]:
# Procedure group label -> list of procedure description strings that belong to it.
# The notebook looks descriptions up by exact membership in these lists, so every
# spelling variant has to be listed explicitly; anything not listed falls back to "Other".
# NOTE(review): a few placements look surprising from the names alone
# (e.g. "Custom Abutment" under Fillings) — confirm they are intended.
procedure_map = {
    "Crowns": [
        "Crown - 3/4 porcelain/ceramic", "Full Cast HNM Crown", "Full Porcelain/Ceramic Crown",
        "Implant supported crown - porcelain fused to high noble alloys",
        "Porcelain/HNM Crown", "Porcelain/HNM Pontic", "Porcelain/Noble Crown",
        "Retainer crown - porcelain fused to high noble metal", "Re-cement or re-bond crown", 
        "Core Buildup w/ Any Pins",
    ],
    
    "Prophies": ["Prophylaxis - Adult", "Prophylaxis - Child", "Topical Applic Fluoride Varnish", 
        "Topical Application of Fluoride", "Sealant", "StellaLife Gel", "StellaLife Rinse"],
    
    "Fillings": [
        "Anterior Resin Composite 1s", "Anterior Resin Composite 2s", "Anterior Resin Composite 3s", 
        "Anterior Resin Composite 4+s", "Posterior Resin Composite 1s", "Posterior Resin Composite 2s", 
        "Posterior Resin Composite 3s", "Posterior Resin Composite 4+s",
        "Custom Abutment"
    ],
    
    # The comprehensive-series description appears twice: once with a hyphen and
    # once with an en dash.
    "Imaging": [
        "2D Oral/Facial Photo Images", "Bitewing Four Images", "Bitewing Single Image", "Bitewing Two Images",
        "Intraoral - comprehensive series of radiographic images", "Intraoral Periapical Add'l", 
        "Intraoral Periapical Images", "Panoramic Image", "Intraoral – comprehensive series of radiographic images"
    ],
    
    "Evaluations": [
        "Comprehensive Evaluation", "Periodic Evaluation", "Limited Evaluation", "Re-eval - Post-op Office Visit",
        "Periodontal Evaluation"
    ],
    
    "SRP": ["Scaling & Root Planing (1-3)", "Scaling & Root Planing (4-8)", "Scaling in presence of generalized gingival inflammation, full mouth"],
    
    "Perio Maintenance": ["Periodontal Maintenance"],
    
    "Appliance": [
        "Occlusal guard - hard appliance, full arch", "Orthodontic Retention", 
        "Replacement of lost or broken retainer - mandibular", "Re-cement or re-bond fixed retainer - maxillary",
        "Recement/bnd inlay/onlay/part", "Recemnt/bnd cast/prefab pst/cor"
    ],
    
    # Explicitly listed "Other" items; unlisted descriptions also end up in "Other".
    "Other": [
        "Bone Replacement Graft", "Palliative treatment of dental pain - per visit",
        "Removal of fixed orthodontic appliances for reasons other than completion of treatment",
        "Cancelled Appointment", "Late cancellation fee", "Teeth White - In Office", "Teeth White - Take Home",
        "Diagnostic/Study Models", "Editorial change to the descriptor", 
        "Misc Invoice", "Routine Extraction", 
        "Remove Coronal Remnants - primary tooth", "Limited Occlusal Adjustment",
        "External Bleaching-Office-Arch"
    ],

    "Dental Wellness Plan": ["Dental Wellness Plan"],
    
    "Dentures & Partials": [
        "Interim Lower Partial Denture", "Interim Upper Partial Denture", "Lower Partial w/ Resin Base"
    ],
}

In [None]:
# Folder holding the transformed pipeline CSVs (input) and folder for this
# notebook's results (output).
# NOTE(review): both are absolute, machine-specific paths.
input_dir = "C:/Users/Admin/Documents/GitHub/Data-Guide/data_pipeline/transformed_feb_18"

output_dir = "C:/Users/Admin/Documents/GitHub/Data-Guide/data_pipeline/analyses_feb_18"
# exist_ok=True creates the folder only when it is missing, without a separate
# check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# Dataset name -> CSV path. Commented-out entries are datasets that are not
# loaded by this notebook; uncomment one to load it.
csv_files = {
    #"aged_AR" : os.path.join(input_dir, "transformed_aged_AR.csv"),
    #"aged_AR_long" : os.path.join(input_dir, "transformed_aged_AR_long.csv"),
    #"statement_submission" : os.path.join(input_dir, "transformed_statement_submission.csv"),
    #"integrated_payments" : os.path.join(input_dir, "transformed_integrated_payments.csv"),
    #"billing_statement" : os.path.join(input_dir, "billing_statement_report.csv"),
    #"outstanding_claims" : os.path.join(input_dir, "transformed_outstanding_claims.csv"),
    # "unresolved_claims" : os.path.join(input_dir, "unresolved_claims_report.csv"),
    #"fee_schedule" : os.path.join(input_dir, "fee_schedule.csv"),
    #"openings" : os.path.join(input_dir,"openings.csv"),
    #"schedule" : os.path.join(input_dir,"schedule.csv"),
    "patient_details" : os.path.join(input_dir, "transformed_patient_details.csv"),
    "active_patients" : os.path.join(input_dir, "transformed_active_patient_details.csv"),
    #"processed_payments": os.path.join(input_dir, "transformed_processed_payments.csv"),
    #"payments": os.path.join(input_dir, "transformed_payments.csv"),
    "incurred_charges": os.path.join(input_dir, "transformed_incurred_charges.csv"),
    "transaction_details" : os.path.join(input_dir, "transformed_transaction_details.csv"),
    # "treatment_tracker" : os.path.join(input_dir, "ZR - Treatment Tracker.csv"),
    # "merged_data" : os.path.join(input_dir, "merged_data.csv"),
    #'carrier_decision_data' : os.path.join(input_dir, 'Carrier_Decision_Data.csv'),
    #'insurance_payment_metrics' : os.path.join(input_dir, 'insurance_payment_metrics.csv'),
    "financial_timeline" : os.path.join(input_dir, "financial_timeline.csv"),
    #'time_to_payments' : os.path.join(input_dir, "time_to_payments.csv"),
}

# Read every listed CSV into a DataFrame, keyed by dataset name.
dataframes = {dataset: pd.read_csv(file_path) for dataset, file_path in csv_files.items()}

# Markov Model

In [None]:
# Take the transaction-level dataset from the loaded frames and preview its first 30 rows.
transactions = dataframes['transaction_details']
transactions.head(30)

In [None]:
# Build one row per patient / category / date / procedure description:
#   1. keep only 'Procedures' transactions and the columns needed below,
#   2. expose the charge amount under the name 'Value',
#   3. order by patient and date and drop rows whose value is exactly 0,
#   4. per group, count the distinct treatment areas, list the non-missing
#      areas as a comma-separated string, and total the value.
# The aggregation leaves a two-level column header.
procedure_timeline = (
    transactions.loc[
        transactions['Category'] == 'Procedures',
        ["Ascend Patient ID", "Category", 'Date', 'Proc. Description', 'Proc Treatment Area', 'Charges'],
    ]
    .rename(columns={'Charges': 'Value'})
    .sort_values(["Ascend Patient ID", 'Date'])
    .loc[lambda rows: rows['Value'] != 0]
    .groupby(["Ascend Patient ID", "Category", 'Date', 'Proc. Description'])
    .agg({
        "Proc Treatment Area": [
            ("Number of Treatment Areas", "nunique"),
            ("Treatment Areas", lambda areas: ", ".join(areas.dropna())),
        ],
        "Value": [
            ("Value", "sum"),
        ],
    })
    .reset_index()
)

In [None]:
# Flatten the two-level header: use the lower label when it is non-empty,
# otherwise fall back to the upper label (the group-by key columns).
procedure_timeline.columns = [
    lower if lower != '' else upper
    for upper, lower in procedure_timeline.columns
]

In [None]:
# Preview the first 30 rows of the procedure timeline.
procedure_timeline.head(30)

In [None]:
# Invert procedure_map once into description -> group, so each row costs one
# dictionary lookup instead of a scan through every group's list.
# setdefault keeps the first group that lists a description, which is the same
# winner a front-to-back scan over procedure_map would pick.
_description_to_group = {}
for _group, _descriptions in procedure_map.items():
    for _description in _descriptions:
        _description_to_group.setdefault(_description, _group)

# Descriptions that are not listed anywhere are labelled "Other".
procedure_timeline["proc_group"] = (
    procedure_timeline["Proc. Description"]
    .map(_description_to_group)
    .fillna("Other")
)

In [None]:
# Per procedure group: total of 'Value' and number of rows, largest total first.
procedure_timeline.groupby(['proc_group']).agg({
    'Value': ['sum', 'count']
}).sort_values(('Value', 'sum'), ascending=False)

In [None]:
# Distinct procedure descriptions that were labelled 'Other'.
procedure_timeline.loc[procedure_timeline['proc_group'] == 'Other', 'Proc. Description'].unique()

In [None]:
# Within the 'Other' group: total of 'Value' and number of rows per procedure
# description, largest total first.
procedure_timeline.loc[procedure_timeline['proc_group'] == 'Other'].groupby(['Proc. Description']).agg({
    'Value': ['sum', 'count']
}).sort_values(('Value', 'sum'), ascending=False)

In [None]:
def _join_as_listed(descriptions):
    """Comma-join the procedure descriptions in the order they appear."""
    return ", ".join(descriptions)


def _join_sorted_distinct(groups):
    """Comma-join the distinct procedure groups in alphabetical order."""
    return ", ".join(sorted(pd.unique(groups)))


# One row per patient and date. The 'Groups' string (sorted, de-duplicated
# procedure groups of that day) is what later cells treat as the visit's state.
_visit_summary = {
    'Value': [('Total Charges', 'sum')],
    "Proc. Description": [
        ("Number of Procedures", "count"),
        ("Number Distinct Procedures", "nunique"),
        ("Procedures", _join_as_listed),
    ],
    "proc_group": [
        ("Number of Proc Groups", "nunique"),
        ("Groups", _join_sorted_distinct),
    ],
}

state_space = (
    procedure_timeline
    .groupby(['Ascend Patient ID', 'Date'])
    .agg(_visit_summary)
    .reset_index()
)

In [None]:
# Flatten the two-level header (lower label when non-empty, else the upper one).
state_space.columns = [
    lower if lower != '' else upper
    for upper, lower in state_space.columns
]

# Parse 'Date' into datetimes, then order each patient's rows chronologically
# so the shift-based prior/next lookups follow visit order.
state_space = (
    state_space
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(['Ascend Patient ID', 'Date'])
)

In [None]:
# For every visit, look one row back and one row forward within the same
# patient. The first visit of a patient has no prior and the last has no next,
# so those cells are left missing.
_visits_by_patient = state_space.groupby("Ascend Patient ID")[["Groups", "Date"]]
_previous_visit = _visits_by_patient.shift(1)
_following_visit = _visits_by_patient.shift(-1)

state_space["Prior Groups"] = _previous_visit["Groups"]
state_space["Next Groups"] = _following_visit["Groups"]

state_space["Prior Date"] = _previous_visit["Date"]
state_space["Next Date"] = _following_visit["Date"]

# Whole-day gaps to the neighbouring visits.
_gap_before = state_space["Date"] - state_space["Prior Date"]
_gap_after = state_space["Next Date"] - state_space["Date"]
state_space["Days Since Prior"] = _gap_before.dt.days
state_space["Days Until Next"] = _gap_after.dt.days

In [None]:
# Display the visit-level table with its prior/next columns.
state_space

In [None]:
# Percentage of all rows in state_space taken by each 'Groups' value,
# for the 30 most frequent values.
state_space['Groups'].value_counts().map(lambda x: 100 *x / state_space.shape[0]).head(30)

In [None]:
# Running (cumulative) fraction of all rows covered by the most frequent
# 'Groups' values, shown for the top 30.
state_space['Groups'].value_counts().map(lambda x: x / state_space.shape[0]).cumsum().head(30)

In [None]:
def _mean_days_rounded(days):
    """Mean of the non-missing day gaps, rounded to a whole number."""
    return round(np.nanmean(days), 0)


def _share_of_all_visits(patient_ids):
    """Number of rows in the group divided by the total row count of state_space."""
    return patient_ids.count() / state_space.shape[0]


# One row per observed (current state, next state) pair.
# NOTE(review): 'Transition Rate' here is relative to ALL rows of state_space,
# not to the rows that start in the given state — confirm that is intended.
transition_space = (
    state_space
    .groupby(['Groups', 'Next Groups'])
    .agg({
        'Total Charges': [('Total Charges', 'sum')],
        'Days Until Next': [('Mean Days Until Next Procedure', _mean_days_rounded)],
        'Ascend Patient ID': [
            ('Number of Patients', 'nunique'),
            ('Number of Transitions', 'count'),
            ('Transition Rate', _share_of_all_visits),
        ],
    })
    .reset_index()
)

# Flatten the two-level header (lower label when non-empty, else the upper one).
transition_space.columns = [
    lower if lower != '' else upper
    for upper, lower in transition_space.columns
]

# Show the 30 most frequent transitions.
transition_space.sort_values('Number of Transitions', ascending=False).head(30)

In [None]:
# Per-visit view with the neighbouring states and the day gaps to them.
steps = state_space[['Prior Groups', 'Days Since Prior', 'Groups', 'Days Until Next', 'Next Groups', 'Ascend Patient ID']].copy()

# Give the open ends of each patient's sequence explicit pseudo-states.
# Filling through the frame (rather than `steps[col].fillna(..., inplace=True)`)
# avoids in-place modification of a column selection, which pandas warns about
# and which has no effect on `steps` under Copy-on-Write.
steps = steps.fillna({'Next Groups': 'End', 'Prior Groups': 'Start'})

steps

In [None]:
# Two views of the same visits, renamed to a common Start/End/Days/ID layout:
#   left  : prior state -> this visit's state
#   right : this visit's state -> next state
left = steps.loc[:, ['Prior Groups', 'Groups', 'Days Since Prior', 'Ascend Patient ID']].copy()
right = steps.loc[:, ['Groups', 'Next Groups', 'Days Until Next', 'Ascend Patient ID']].copy()

left.columns = ['Start', 'End', 'Days', 'ID']
right.columns = ['Start', 'End', 'Days', 'ID']

# `right` already holds every visit-to-visit transition and every exit to 'End'.
# The only rows `left` adds are the entries from 'Start'; all its other rows
# mirror a row of `right`. Taking just those entry rows yields each transition
# exactly once. De-duplicating the full concatenation instead would also merge
# genuinely repeated transitions (same patient, same states, same day gap) and
# undercount them.
entries = left.loc[left['Start'].eq('Start')]
transitions = pd.concat([entries, right], axis=0)

In [None]:
# Display the long-format transition table (Start, End, Days, ID).
transitions

In [None]:
def _rounded_mean_gap(days):
    """Mean of the non-missing day gaps, rounded to a whole number."""
    return round(np.nanmean(days), 0)


# One row per (Start, End) edge: average gap in days, number of transition
# rows, and number of distinct patients.
graph_data = (
    transitions
    .groupby(['Start', 'End'])
    .agg({
        'Days': [('Average Time Between', _rounded_mean_gap)],
        'ID': [('Count', 'count'), ('Patients', 'nunique')],
    })
    .reset_index()
)

# Flatten the two-level header (lower label when non-empty, else the upper one).
graph_data.columns = [
    lower if lower != '' else upper
    for upper, lower in graph_data.columns
]

# Show the edges, most frequent first.
graph_data.sort_values('Count', ascending=False)

In [None]:
# Total number of outgoing transitions per starting state, largest first.
node_data = (
    graph_data
    .groupby('Start')['Count']
    .sum()
    .sort_values(ascending=False)
    .rename('Total')
    .reset_index()
)

node_data

In [None]:
# Attach each edge's denominator: the total outgoing count of its start state.
graph_data = graph_data.merge(node_data, on='Start', how='left')

# Edges into 'End' get a different denominator: the total count of all edges
# that end in 'End'.
# NOTE(review): this makes the rate of an X -> End edge "share of all exits"
# rather than "share of X's outgoing transitions" — confirm that is intended.
_is_exit_edge = graph_data['End'] == 'End'
end_total = graph_data.loc[_is_exit_edge, 'Count'].sum()
graph_data.loc[_is_exit_edge, 'Total'] = end_total

graph_data['Transition Rate'] = graph_data['Count'].div(graph_data['Total'])
graph_data

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
from matplotlib.patches import Patch
import numpy as np
import networkx as nx
import pandas as pd
import math

class MarkovChainVisualizer:
    """Render a table of state transitions as a directed graph.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per transition. The columns 'Start', 'End', 'Transition Rate'
        and 'Count' are read; 'Average Time Between' is optional and, when
        present, drives the edge colours. The frame is copied on construction.
    """

    # (keywords, node colour, legend label) in priority order: a node takes the
    # colour of the first group that has a keyword contained in its name.
    # NOTE(review): the notebook's standalone get_node_category checks 'Crowns'
    # first, whereas routine care wins here — confirm which order is wanted.
    _KEYWORD_GROUPS = (
        (('Evaluations', 'Imaging', 'Prophies'), 'skyblue', 'Evaluations / Imaging / Prophies'),
        (('Fillings',), 'orange', 'Fillings'),
        (('SRP',), 'green', 'SRP'),
        (('Crowns',), 'red', 'Crowns'),
    )
    _DEFAULT_PRIORITY = 999
    _DEFAULT_COLOR = 'lightgray'
    _MIN_EDGE_WIDTH = 0.5

    def __init__(self, df):
        self.df = df.copy()
        self.graph = nx.DiGraph()
        self._build_graph()

    @classmethod
    def _categorize(cls, name):
        """Return ``(priority, colour)`` of the first keyword group matching *name*."""
        label = str(name)
        for priority, (keywords, color, _legend) in enumerate(cls._KEYWORD_GROUPS):
            if any(keyword in label for keyword in keywords):
                return priority, color
        return cls._DEFAULT_PRIORITY, cls._DEFAULT_COLOR

    @staticmethod
    def _node_size(count):
        """Log-scaled marker size that stays positive for counts of 0 or 1."""
        if pd.isna(count) or count < 1:
            count = 1
        # +1 keeps a count of 1 visible (log10(1) alone would give size 0).
        return math.log10(count + 1) * 1200

    @classmethod
    def _edge_width(cls, rate):
        """Log-scaled line width for a transition rate given as a fraction.

        Missing, zero or very small rates fall back to a thin minimum width
        instead of producing a negative width or a math domain error.
        """
        if pd.isna(rate) or rate <= 0:
            return cls._MIN_EDGE_WIDTH
        return max(math.log10(rate * 100) * 5, cls._MIN_EDGE_WIDTH)

    def _build_graph(self):
        """Build the directed graph from the DataFrame.

        Every edge stores its transition rate (``weight``) and average time
        (``avg_time``). Every node stores ``count``: the larger of its summed
        outgoing and summed incoming edge counts, so the value does not depend
        on which row happened to be processed last.
        """
        outgoing = {}
        incoming = {}
        for _, row in self.df.iterrows():
            start = row['Start']
            end = row['End']
            count = row['Count']
            if pd.isna(count):
                count = 0

            has_start = not pd.isna(start)
            has_end = not pd.isna(end)
            if has_start:
                self.graph.add_node(start)
                outgoing[start] = outgoing.get(start, 0) + count
            if has_end:
                self.graph.add_node(end)
                incoming[end] = incoming.get(end, 0) + count
            if has_start and has_end:
                self.graph.add_edge(
                    start, end,
                    weight=row['Transition Rate'],
                    avg_time=row.get('Average Time Between', None),
                )

        for node in self.graph.nodes:
            self.graph.nodes[node]['count'] = max(outgoing.get(node, 0), incoming.get(node, 0))

    def draw_graph(self, figsize=(60, 40)):
        """Draw the chain on a circular layout and show it.

        Nodes are placed around the circle in sorted order ('Start' first, then
        by keyword-group priority, then by descending count, then by name,
        'End' last). Edge width follows the transition rate and edge colour the
        log of the average time between the two states.

        Parameters
        ----------
        figsize : tuple, optional
            Figure size in inches passed to ``plt.subplots``.
        """
        fig, ax = plt.subplots(figsize=figsize)

        # --- Sort nodes ---
        def sort_key(name):
            if name == "Start":
                priority = -2
            elif name == "End":
                priority = 1000
            else:
                priority = self._categorize(name)[0]
            return priority, -self.graph.nodes[name].get('count', 1), name

        sorted_nodes = sorted(self.graph.nodes, key=sort_key)

        # --- Sorted layout ---
        # circular_layout places nodes in the order it is given them, so the
        # layout has to be computed from the sorted list; reordering the
        # returned dict afterwards would not move any node.
        pos = nx.circular_layout(sorted_nodes)

        node_sizes = [self._node_size(self.graph.nodes[n].get('count', 1)) for n in sorted_nodes]
        node_colors = [self._categorize(n)[1] for n in sorted_nodes]

        nx.draw_networkx_nodes(
            self.graph, pos,
            nodelist=sorted_nodes,
            node_color=node_colors,
            node_size=node_sizes,
            edgecolors='black',
            ax=ax,
        )

        # --- Edge colouring: normalise over the log of the positive average times ---
        edges = list(self.graph.edges(data=True))
        avg_times = [
            d['avg_time'] for (_, _, d) in edges
            if pd.notna(d.get('avg_time')) and d['avg_time'] > 0
        ]
        if avg_times:
            log_times = np.log(avg_times)
            norm = mcolors.Normalize(vmin=min(log_times), vmax=max(log_times))
        else:
            norm = mcolors.Normalize(vmin=0, vmax=1)
        # pyplot.get_cmap is used because matplotlib.cm.get_cmap no longer
        # exists in recent matplotlib releases.
        cmap = plt.get_cmap('viridis')

        for u, v, d in edges:
            avg_time = d.get('avg_time')
            if pd.notna(avg_time) and avg_time > 0:
                color = cmap(norm(np.log(avg_time)))
            else:
                color = 'gray'  # no usable average time for this edge

            nx.draw_networkx_edges(
                self.graph, pos,
                edgelist=[(u, v)],
                width=self._edge_width(d.get('weight', 0.01)),
                edge_color=[color],
                alpha=0.8,
                arrows=True,
                arrowstyle='-|>',
                arrowsize=30,
                min_source_margin=15,
                min_target_margin=25,
                connectionstyle='arc3,rad=0.2',
                ax=ax,
            )

        nx.draw_networkx_labels(self.graph, pos, font_size=10, font_weight='bold', ax=ax)

        # --- Colorbar: edges are drawn one by one, so build the mappable by hand ---
        if avg_times:
            sm = cm.ScalarMappable(cmap=cmap, norm=norm)
            sm.set_array([])  # required to avoid warning
            cbar = fig.colorbar(sm, ax=ax, fraction=0.02, pad=0.01)
            cbar.set_label("Log(Average Days Between)", fontsize=14)

        # --- Legend, generated from the same table that colours the nodes ---
        legend_elements = [
            Patch(facecolor=color, edgecolor='black', label=legend)
            for _keywords, color, legend in self._KEYWORD_GROUPS
        ]
        legend_elements.append(
            Patch(facecolor=self._DEFAULT_COLOR, edgecolor='black', label='Uncategorized')
        )
        ax.legend(
            handles=legend_elements,
            title="Node Categories",
            loc='upper right',
            fontsize=12,
            title_fontsize=13,
        )

        ax.set_title("Markov Chain of Patient Procedure Transitions", fontsize=20)
        ax.set_axis_off()
        plt.show()


In [None]:

# Draw the full transition graph from the edge table built above (graph_data).
import pandas as pd

df = graph_data.copy()  # work on a copy of the edge table
viz = MarkovChainVisualizer(df)
viz.draw_graph()


In [None]:
# Display the long-format transition table again before simplifying its states.
transitions

In [None]:
# --- Keyword groups and the simplified label each maps to, in priority order ---
# Earlier entries win: a state name containing both 'Crowns' and 'Evaluations'
# is labelled 'Crowns'.
keyword_groups = [
    (['Crowns'], 'Crowns'),
    (['SRP'], 'SRP'),
    (['Fillings'], 'Fillings'),
    (['Evaluations', 'Imaging', 'Prophies'], 'Evaluations / Imaging / Prophies'),
    (['Start'], 'Start'),
    (['End'], 'End')
]


def get_node_category(name):
    """Return the simplified label for a state name.

    The label belongs to the first entry of ``keyword_groups`` that has a
    keyword occurring as a substring of *name*; ``'Other'`` when none matches.
    """
    matching_labels = (
        label
        for keywords, label in keyword_groups
        if any(keyword in name for keyword in keywords)
    )
    return next(matching_labels, 'Other')

In [None]:
# Copy of the transition table with two extra columns holding the simplified
# category of each endpoint; the original Start/End values stay visible next
# to them.
transitions_simplified = transitions.assign(
    Start_simple=transitions['Start'].apply(get_node_category),
    End_simple=transitions['End'].apply(get_node_category),
)

transitions_simplified

In [None]:
# Replace both endpoint columns by their simplified category so the grouping
# below works on the coarse states.
for _endpoint in ('Start', 'End'):
    transitions_simplified[_endpoint] = transitions_simplified[_endpoint].map(get_node_category)

transitions_simplified

In [None]:
def _rounded_mean_gap_simplified(days):
    """Mean of the non-missing day gaps, rounded to a whole number."""
    return round(np.nanmean(days), 0)


# One row per simplified (Start, End) edge: average gap in days, number of
# transition rows, and number of distinct patients.
graph_data_simplified = (
    transitions_simplified
    .groupby(['Start', 'End'])
    .agg({
        'Days': [('Average Time Between', _rounded_mean_gap_simplified)],
        'ID': [('Count', 'count'), ('Patients', 'nunique')],
    })
    .reset_index()
)

# Flatten the two-level header (lower label when non-empty, else the upper one).
graph_data_simplified.columns = [
    lower if lower != '' else upper
    for upper, lower in graph_data_simplified.columns
]

# Show the edges, most frequent first.
graph_data_simplified.sort_values('Count', ascending=False)

In [None]:
# Total number of outgoing transitions per simplified starting state, largest
# first. (Row order before grouping does not affect the per-state sums.)
node_data_simplified = (
    graph_data_simplified
    .groupby('Start')['Count']
    .sum()
    .sort_values(ascending=False)
    .rename('Total')
    .reset_index()
)

node_data_simplified

In [None]:
# Attach each edge's denominator: the total outgoing count of its start state.
graph_data_simplified = graph_data_simplified.merge(node_data_simplified, on='Start', how='left')

# Edges into 'End' use the total count of all edges ending in 'End' instead.
# NOTE(review): that turns the rate of an X -> End edge into "share of all
# exits" rather than "share of X's outgoing transitions" — confirm intended.
_is_exit_edge_simplified = graph_data_simplified['End'] == 'End'
end_total = graph_data_simplified.loc[_is_exit_edge_simplified, 'Count'].sum()
graph_data_simplified.loc[_is_exit_edge_simplified, 'Total'] = end_total

graph_data_simplified['Transition Rate'] = graph_data_simplified['Count'].div(graph_data_simplified['Total'])
graph_data_simplified

In [None]:
# Draw the transition graph for the simplified (coarse) states.
df = graph_data_simplified.copy()  # work on a copy of the simplified edge table
viz = MarkovChainVisualizer(df)
viz.draw_graph()