In [36]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   
from collections import defaultdict
import re

os.chdir('C:/Users/Admin/Documents/GitHub/Data-Guide')

In [141]:
def convert_unix_timestamps(df, column, in_milliseconds=True):
    """
    Convert Unix timestamps in a specified column to datetime.

    Args:
        df (pd.DataFrame): The input DataFrame.
        column (str): The name of the column containing Unix timestamps.
        in_milliseconds (bool): Whether the timestamps are in milliseconds. Default is True.

    Returns:
        pd.DataFrame: The DataFrame with the converted datetime column.
    """
    try:
        factor = 1000 if in_milliseconds else 1
        coll = pd.to_datetime(df[column] / factor, unit='s', errors='coerce')
        print(f"Successfully converted {column} to datetime.")
    except Exception as e:
        coll = df[column]
        print(f"Error converting column {column}: {e}")
    return coll

In [99]:
input_dir = "C:/Users/Admin/Documents/GitHub/Data-Guide/data_pipeline/pull_jan_28" 

output_dir = "C:/Users/Admin/Documents/GitHub/Data-Guide/data_pipeline/transformed_jan_28"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the data
csv_files = {
        "aged_AR" : os.path.join(input_dir, "aged_ar_report.csv"),
        "statement_submission" : os.path.join(input_dir, "statement_submission_report.csv"),
        "integrated_payments" : os.path.join(input_dir, "integrated_payments_report.csv"),
        #"billing_statement" : os.path.join(input_dir, "billing_statement_report.csv"),
        "outstanding_claims" : os.path.join(input_dir, "outstanding_claims_report.csv"),
        "unresolved_claims" : os.path.join(input_dir, "unresolved_claims_report.csv"),
        #"fee_schedule" : os.path.join(input_dir, "fee_schedule.csv"),
        #"openings" : os.path.join(input_dir,"openings.csv"),
        #"schedule" : os.path.join(input_dir,"schedule.csv"),
        "patient_list" : os.path.join(input_dir, "ZR - Patient List with Details.csv"),
        "processed_payments": os.path.join(input_dir, "ZR - Credit Card Processed Payments.csv"),
        "transaction_details" : os.path.join(input_dir, "ZR - Transaction Detail.csv"),
        "treatment_tracker" : os.path.join(input_dir, "ZR - Treatment Tracker.csv"),
    }

 # Load datasets
dataframes = {dataset: pd.read_csv(file_path) for dataset, file_path in csv_files.items()}

In [100]:
for d in dataframes.keys():
    print(d)
    print(dataframes[d].columns)
    print("\n")
    print(dataframes[d].head())
    print("\n")

aged_AR
Index(['id', 'guarantor', 'phoneNumber', 'billingStatement', 'claimsPending',
       'chargeBalance', 'suspendedCredits', 'hasInvalidHistories',
       'lastPayment.datedAs', 'lastPayment.amount', 'before30.amount',
       'before30.insurancePortion', 'before30.guarantorPortion',
       'before30.writeOff', 'before60.amount', 'before60.insurancePortion',
       'before60.guarantorPortion', 'before60.writeOff', 'before90.amount',
       'before90.insurancePortion', 'before90.guarantorPortion',
       'before90.writeOff', 'over90.amount', 'over90.insurancePortion',
       'over90.guarantorPortion', 'over90.writeOff', 'balance.amount',
       'balance.insurancePortion', 'balance.guarantorPortion',
       'balance.writeOff'],
      dtype='object')


               id             guarantor     phoneNumber  billingStatement  \
0  14000002286730        Herbert, Ken D  (847) 204-5464      1.400002e+13   
1  14000003021513        Doucette, Joel  (414) 467-7414      1.400005e+13   
2  14

In [101]:
dataframes['aged_AR']['Ascend Patient ID'] = dataframes['aged_AR']['id']
dataframes['outstanding_claims']['Ascend Patient ID'] = dataframes['outstanding_claims']['patient.id']
dataframes['statement_submission']['Ascend Patient ID'] = dataframes['statement_submission']['patient.id']


for key in dataframes:
    if key not in ['integrated_payments', 'unresolved_claims', 'processed_payments']:
        print(key)
        print(dataframes[key]['Ascend Patient ID'].head())
        df_rows = dataframes[key]['Ascend Patient ID'].apply(lambda x: str(x).isnumeric() and 'Total' not in str(x))
        print(df_rows.head())
        dataframes[key] = dataframes[key].loc[df_rows, :]
        print(dataframes[key].head())
        dataframes[key]['merge_key'] = dataframes[key]['Ascend Patient ID'].astype('Int64').astype(str)

aged_AR
0    14000002286730
1    14000003021513
2    14000008567895
3    14000007712809
4    14000007960989
Name: Ascend Patient ID, dtype: int64
0    True
1    True
2    True
3    True
4    True
Name: Ascend Patient ID, dtype: bool
               id             guarantor     phoneNumber  billingStatement  \
0  14000002286730        Herbert, Ken D  (847) 204-5464      1.400002e+13   
1  14000003021513        Doucette, Joel  (414) 467-7414      1.400005e+13   
2  14000008567895  Fitzpatrick, Caitlin  (847) 507-1188      1.400005e+13   
3  14000007712809          Onyx, Sappho  (650) 384-5159      1.400005e+13   
4  14000007960989        Burns, Mikaela  (708) 310-0899               NaN   

   claimsPending  chargeBalance  suspendedCredits  hasInvalidHistories  \
0            NaN            0.0             -56.2                False   
1            2.0          381.4               0.0                False   
2            1.0          264.0               0.0                False   
3       

In [72]:
# Merge the data
dataframes
merged_df = dataframes['patient_list'].copy()
for key in dataframes:
        if key in ['statement_submission', 'aged_AR']:
                print(key)
                merged_df = merged_df.merge(dataframes[key], on='Ascend Patient ID', how='left')

aged_AR
statement_submission


In [44]:
merged_df.to_csv(f"{output_dir}/merged_data.csv", index=False)

# Transform the Data

In [102]:
transformed_data = defaultdict(pd.DataFrame)

Transformations for Aged AR:<br>
    - ~~Flag for "remaining guarantor portion"~~<br>
    - ~~Total amount and distributions based on bucket - overlaid or grouped~~<br>
	- Wide-to-long<br>
	- - Aging bucket, responsible party, amount<br>

## Row sums

In [103]:
df_t = dataframes['aged_AR'].copy()

df_t['total_guarantorPortion'] = df_t.loc[:, ['before30.guarantorPortion', 'before60.guarantorPortion', 'before90.guarantorPortion', 'over90.guarantorPortion']].sum(axis=1)
df_t['remaining_guarantorPortion'] = df_t['total_guarantorPortion'] > 0

df_t['total_insurancePortion'] = df_t.loc[:, ['before30.insurancePortion', 'before60.insurancePortion', 'before90.insurancePortion', 'over90.insurancePortion']].sum(axis=1)
df_t['remaining_insurancePortion'] = df_t['total_insurancePortion'] > 0

## Wide to Long

In [104]:
df_t
df_long = df_t.melt(id_vars=['id', 'Ascend Patient ID'], 
                    value_vars=['before30.amount', 'before60.amount', 'before90.amount', 'over90.amount',
                                'before30.guarantorPortion', 'before60.guarantorPortion', 'before90.guarantorPortion', 'over90.guarantorPortion',
                                'before30.insurancePortion', 'before60.insurancePortion', 'before90.insurancePortion', 'over90.insurancePortion'], 
                    var_name='Aging Bucket', 
                    value_name='Amount')

df_long[['Bucket', 'Responsible Party']] = df_long['Aging Bucket'].str.split('.', expand=True)
df_long['Responsible Party'] = df_long['Responsible Party'].str.replace('amount', 'Total').str.replace('guarantorPortion', 'Guarantor').str.replace('insurancePortion', 'Insurance')
df_long['Bucket'] = df_long['Bucket'].str.replace('before30', '0-30').str.replace('before60', '0-60').str.replace('before90', '0-90').str.replace('over90', '91+')

df_long = df_long.drop(columns=['Aging Bucket'])

print(df_long.head())

               id  Ascend Patient ID  Amount Bucket Responsible Party
0  14000002286730     14000002286730     0.0   0-30             Total
1  14000003021513     14000003021513   381.4   0-30             Total
2  14000008567895     14000008567895   264.0   0-30             Total
3  14000007712809     14000007712809     0.0   0-30             Total
4  14000007960989     14000007960989   197.0   0-30             Total


In [105]:
transformed_data['aged_AR'] = df_t.copy()
transformed_data['aged_AR'].to_csv(f"{output_dir}/transformed_aged_AR.csv", index=False)

transformed_data['aged_AR_long'] = df_long.copy()
transformed_data['aged_AR_long'].to_csv(f"{output_dir}/transformed_aged_AR_long.csv", index=False)

Transformations for Integrated Payments <br>
    - ~~Parse transactionCardholderName to pull "discover", "visa", etc~~ <br>
    - ~~Check "ZZ" for transactionCardholderName~~ <br>
    - ~~Amount over time~~ <br>
    - ~~Pair and remove refunds and voids~~ <br>

## Rolling Sums

In [138]:
df_t = dataframes['integrated_payments'].copy()

df_t.head()


Unnamed: 0,id,transactionDateTime,transactionCardholderName,transactionId,transactionSource,transactionReferenceNumber,ledgerType,transactionAmount,transactionCardLogo
0,14000059563439,1738100073993,MAZARIEGOS/DAVIS,1249350830,TERMINAL,Mazariegos Davis efeda813a3a64eee9d5ba6cb29949ed7,CREDIT_CARD_PAYMENT,80.4,14000000000234
1,14000059566743,1738099006000,seth B,1249166446,ONLINE,d5befb54e5e44a2aaa,CREDIT_CARD_PAYMENT,89.0,14000000000234
2,14000059566745,1738098817000,Kurt K,1249133868,ONLINE,5b55f00af9814aaba6,CREDIT_CARD_PAYMENT,86.0,14000000000255
3,14000059558724,1738098646990,NELSON/DAVID C,1249106703,TERMINAL,Nelson David 2fc5768b39214beb9c981a74f6145246,CREDIT_CARD_PAYMENT,64.8,14000000000255
4,14000059554657,1738097446620,GARRITY/PATRICK,1248896607,TERMINAL,Garrity Patrick f443b87c19194664b2d32929f7dd82db,CREDIT_CARD_PAYMENT,6.0,14000000000234


In [143]:

# Rolling Sum

# Ensure the transactionDateTime column is in datetime format
df_t['transactionDateTime'] = convert_unix_timestamps(df_t,'transactionDateTime')

# Set the transactionDateTime as the index
df_t.set_index('transactionDateTime', inplace=True)

# Calculate the rolling sum and count of transactions week-over-week
df_t['rolling_sum'] = df_t['transactionAmount'].rolling('7D').sum()
df_t['transaction_count'] = df_t['transactionAmount'].rolling('7D').count()

df_t['event'] = df_t['ledgerType'].str.replace('CREDIT_CARD_', '').str.lower()

# Reset the index
df_t.reset_index(inplace=True)

print(df_t[['transactionDateTime', 'transactionAmount', 'ledgerType', 'event', 'rolling_sum', 'transaction_count']].head(20))

Successfully converted transactionDateTime to datetime.
             transactionDateTime  transactionAmount           ledgerType  \
0  2025-01-28 21:34:33.992999936              80.40  CREDIT_CARD_PAYMENT   
1  2025-01-28 21:16:46.000000000              89.00  CREDIT_CARD_PAYMENT   
2  2025-01-28 21:13:37.000000000              86.00  CREDIT_CARD_PAYMENT   
3  2025-01-28 21:10:46.990000128              64.80  CREDIT_CARD_PAYMENT   
4  2025-01-28 20:50:46.620000000               6.00  CREDIT_CARD_PAYMENT   
5  2025-01-28 18:29:21.433000192              86.00  CREDIT_CARD_PAYMENT   
6  2025-01-27 23:26:39.496000000              80.20  CREDIT_CARD_PAYMENT   
7  2025-01-27 22:45:28.563000064             177.80  CREDIT_CARD_PAYMENT   
8  2025-01-27 22:29:17.956000000             382.61  CREDIT_CARD_PAYMENT   
9  2025-01-27 22:28:41.963000064             294.39  CREDIT_CARD_PAYMENT   
10 2025-01-27 22:25:38.256000000              20.00  CREDIT_CARD_PAYMENT   
11 2025-01-27 22:23:42.132999936

## Substring extraction

In [144]:
# Extract card type from transactionCardholderName
df_t['Card Type'] = df_t['transactionCardholderName'].str.extract(
    '(visa|discover|mc|mastercard|amex|americanexpress)', 
    flags=re.IGNORECASE, 
    expand=False
).str.lower()

# Fill missing values with 'unknown'
df_t['Card Type'] = df_t['Card Type'].fillna('unknown')

print(df_t.head(20))

             transactionDateTime              id   transactionCardholderName  \
0  2025-01-28 21:34:33.992999936  14000059563439            MAZARIEGOS/DAVIS   
1  2025-01-28 21:16:46.000000000  14000059566743                      seth B   
2  2025-01-28 21:13:37.000000000  14000059566745                      Kurt K   
3  2025-01-28 21:10:46.990000128  14000059558724  NELSON/DAVID C               
4  2025-01-28 20:50:46.620000000  14000059554657             GARRITY/PATRICK   
5  2025-01-28 18:29:21.433000192  14000059527505                         NaN   
6  2025-01-27 23:26:39.496000000  14000059468254             CARDHOLDER/VISA   
7  2025-01-27 22:45:28.563000064  14000059463874             MORRIS/ANDREW B   
8  2025-01-27 22:29:17.956000000  14000059461411             CARDHOLDER/VISA   
9  2025-01-27 22:28:41.963000064  14000059461310             CARDHOLDER/VISA   
10 2025-01-27 22:25:38.256000000  14000059460750                         NaN   
11 2025-01-27 22:23:42.132999936  140000

In [145]:
transformed_data['integrated_payments'] = df_t.copy()
transformed_data['integrated_payments'].to_csv(f"{output_dir}/transformed_integrated_payments.csv", index=False)

Transformations for Outstanding Claims
    <br>- ~~Subscriber DoB > 20 years from Guarantor DoB~~
	<br>- Everything by Insurer
	<br>- ~~Parse and consolidate group plan name~~
	<br>- Balance - Estimate?
	<br>- ~~Flag for student plans~~
	<br>- ~~- patient DoB vs subscriber DoB~~
	<br>- Aggregate insurance carriers that are state-specific
	<br>- - Blue Cross Blue Shield
	<br>- - Delta Dental

In [109]:
df_t = dataframes['outstanding_claims'].copy()

df_t['student'] = df_t['student'] = (df_t['subscriber.firstName'] != df_t['patient.firstName']) & \
                  (pd.to_datetime(df_t['subscriber.dateOfBirth']) > pd.to_datetime(df_t['patient.dateOfBirth']) + pd.DateOffset(years=18))

df_t['student'] = df_t['student'].replace({True: 'Student', False: 'Non-Student'})

df_t.student.value_counts()

student
Non-Student    107
Name: count, dtype: int64

In [110]:
df_t = dataframes['outstanding_claims'].copy()

In [111]:
transformed_data['outstanding_claims'] = df_t.copy()

Transformations for Patient Details
	<br>- ~~Calculate Patient Lifespan~~
	<br>- - ~~Today - First Visit and Active~~
	<br>- - ~~Last Visit - First Visit~~
	<br>- - ~~Next Appointment Date - First Visit~~
	<br>- ~~Aggregate insurance carriers that are state-specific~~
	<br>- ~~Year of Birth~~
	<br>- ~~Month of Birth~~
	<br>- ~~Last Visit != Last Procedure~~
	<br>- ~~Remove test patients~~
	<br>- ~~View uppercase and lowercase patients, empty values~~
	<br>- ~~Patient != Guarantor~~
	<br>- - ~~Flag for student plans~~
	<br>- Parse and aggregate plans
	<br>- - Awaiting client response
	<br>- ~~Drop "685806 - " from Pat. Prim. Fee Schedule and Discount Plan~~
	<br>- Parse address
	<br>- ~~Group dates to month~~
	<br>- ~~Flag who hasn't been in 6 months~~
	<br>- ~~Flag for hasNextAppointment~~
	<br>- - ~~and isn't in the past~~
	<br>- Flag for Medicare/Medicaid

In [147]:
df_t = dataframes['patient_list'].copy()

In [148]:
# Filter rows where Patient is all uppercase or all lowercase
uppercase_patients = df_t[df_t['Patient'].str.isupper()]
lowercase_patients = df_t[df_t['Patient'].str.islower()]

# Combine the results
filtered_patients = pd.concat([uppercase_patients, lowercase_patients])

print(filtered_patients)
# Isolate patients with "test" in the name or three or more repeated letters
test_patients = df_t[df_t['Patient'].str.lower().str.contains(r'test', case=False, regex=True)]
repeated_letter_patients = df_t[df_t['Patient'].str.contains(r'(.)\1{2,}', case=False, regex=True)]

# Combine the results
patients_to_drop = pd.concat([test_patients, repeated_letter_patients]).drop_duplicates()

print(patients_to_drop)

# Drop the isolated patients from the original dataframe
df_t = df_t[~df_t.index.isin(patients_to_drop.index)]

                                Patient                  Primary Guarantor  \
156                     BRESEMAN, GRACE                    BRESEMAN, GRACE   
573                GERSHEVICH, VALERIYA               GERSHEVICH, VALERIYA   
996                    LUMSDEN, RICHARD                   LUMSDEN, RICHARD   
1397                    ROMEU, ANGELICA                    ROMEU, ANGELICA   
1753                WEHMEIER, CHARLOTTE                WEHMEIER, CHARLOTTE   
242                            cccc, cc                           cccc, cc   
631                         hall, sasha                        hall, sasha   
1644  the return of the test, testy two  the return of the test, testy two   
1836                     zwick, emanuel                      Zwick, Qinnan   

                        Primary Contact               Last Name Chart Number  \
156                     BRESEMAN, GRACE                BRESEMAN          NaN   
573                GERSHEVICH, VALERIYA              GERSHE

  repeated_letter_patients = df_t[df_t['Patient'].str.contains(r'(.)\1{2,}', case=False, regex=True)]


In [149]:
df_t['patientAge'] = (pd.to_datetime('today') - pd.to_datetime(df_t['Date Of Birth'])).dt.days // 365.25
df_t['student'] = (df_t['Patient'] != df_t['Primary Guarantor']) & (df_t['patientAge'] < 25)

df_t['student'] = df_t['student'].replace({True: 'Student', False: 'Non-Student'})

df_t.student.value_counts()

student
Non-Student    1806
Student          28
Name: count, dtype: int64

In [150]:
# Ensure the date columns are in datetime format
df_t['First Visit'] = pd.to_datetime(df_t['First Visit'])
df_t['Last Visit'] = pd.to_datetime(df_t['Last Visit'])
df_t['Next Appointment Date'] = pd.to_datetime(df_t['Next Appointment Date'])
df_t['Last Procedure Date'] = pd.to_datetime(df_t['Last Procedure Date'])

# Calculate the patient lifespan
df_t['Lifespan (Today - First Visit)'] = (pd.to_datetime('today') - df_t['First Visit']).dt.days
df_t['Lifespan (Last Visit - First Visit)'] = (df_t['Last Visit'] - df_t['First Visit']).dt.days
df_t['Lifespan (Next Appointment Date - First Visit)'] = (df_t['Next Appointment Date'] - df_t['First Visit']).dt.days

df_t['Time Since Last Visit'] = (pd.to_datetime('today') - df_t['Last Visit']).dt.days
df_t['Visit and Procedure Mismatch'] = df_t['Last Visit'] != df_t['Last Procedure Date']

df_t['hasNextAppointment'] = df_t['Next Appointment Date'].notnull() & (df_t['Next Appointment Date'] > pd.to_datetime('today'))
df_t['overdue'] = (df_t['Time Since Last Visit'] > 183) & (df_t['hasNextAppointment'] == False)

print(df_t[['First Visit', 'Last Visit', 'Last Procedure Date', 'Next Appointment Date', 'Lifespan (Today - First Visit)', 'Lifespan (Last Visit - First Visit)', 'Lifespan (Next Appointment Date - First Visit)', 'Time Since Last Visit',
            'Visit and Procedure Mismatch', 'hasNextAppointment', 'overdue']].head())

  First Visit Last Visit Last Procedure Date Next Appointment Date  \
0  2024-06-27 2024-07-26          2024-07-26            2025-01-31   
1  2024-12-09 2025-01-10          2025-01-10            2025-05-09   
2         NaT        NaT                 NaT                   NaT   
3  2022-03-24 2024-11-25          2023-12-18                   NaT   
4  2023-02-21 2024-11-04          2024-11-04            2025-04-22   

   Lifespan (Today - First Visit)  Lifespan (Last Visit - First Visit)  \
0                           215.0                                 29.0   
1                            50.0                                 32.0   
2                             NaN                                  NaN   
3                          1041.0                                977.0   
4                           707.0                                622.0   

   Lifespan (Next Appointment Date - First Visit)  Time Since Last Visit  \
0                                           218.0         

In [151]:
df_t['Pat. Prim. Fee Schedule'] = df_t['Pat. Prim. Fee Schedule'].str.replace('685806 - ', '')
df_t['Discount Plan'] = df_t['Discount Plan'].str.replace('685806 - ', '')

print(df_t[['Pat. Prim. Fee Schedule', 'Discount Plan']].head())

             Pat. Prim. Fee Schedule Discount Plan
0                                NaN           NaN
1  Careington Care Platinum PPO 2025           NaN
2                                NaN           NaN
3  Careington Care Platinum PPO 2025           NaN
4               Delta Dental Premier           NaN


In [152]:
# Define a function to clean the carrier names
def clean_carrier_name(carrier):
    if pd.isnull(carrier):
        return carrier
    if isinstance(carrier, str):
        if "Blue Cross Blue Shield" in carrier:
            return "Blue Cross Blue Shield"
        elif "Delta Dental" in carrier:
            return "Delta Dental"
        else:
            return re.sub(r' of \w+', '', carrier)
    return carrier

# Extract state information
def extract_state(carrier):
    if pd.isnull(carrier):
        return carrier
    if isinstance(carrier, str):
        match = re.search(r' of (\w+)', carrier)
        return match.group(1) if match else None
    return carrier

# Apply the function to create a new column
df_t['Affiliate State'] = df_t['Pat. Prim. Carrier'].apply(extract_state)

# Apply the function to create a new column
df_t['Cleaned Carrier'] = df_t['Pat. Prim. Carrier'].apply(clean_carrier_name)

print(df_t[['Pat. Prim. Carrier', 'Cleaned Carrier', 'Affiliate State']].head(20))

                    Pat. Prim. Carrier           Cleaned Carrier  \
0                                  NaN                       NaN   
1   Blue Cross Blue Shield of Illinois    Blue Cross Blue Shield   
2                                  NaN                       NaN   
3   Blue Cross Blue Shield of Illinois    Blue Cross Blue Shield   
4           Delta Dental of California              Delta Dental   
5             United Healthcare Dental  United Healthcare Dental   
6             United Healthcare Dental  United Healthcare Dental   
7                              MetLife                   MetLife   
8               Delta Dental of Kansas              Delta Dental   
9                              MetLife                   MetLife   
10                 CIGNA Global Health       CIGNA Global Health   
11            Delta Dental of Illinois              Delta Dental   
12                               Aetna                     Aetna   
13  Blue Cross Blue Shield of Illinois    Blue C

In [21]:
#!pip install fuzzywuzzy



In [21]:
from fuzzywuzzy import process, fuzz
import itertools

# Extract unique values from the 'Pat. Prim. Plan' column
plans = df_t['Pat. Prim. Plan'].dropna().unique()

# Calculate similarity scores for all pairs
similarity_scores = []
for plan1, plan2 in itertools.combinations(plans, 2):
    score = fuzz.ratio(plan1, plan2)
    similarity_scores.append((plan1, plan2, score))

# Sort the pairs by similarity score in descending order
similarity_scores.sort(key=lambda x: x[2], reverse=True)

# Convert similarity scores to a DataFrame
similarity_df = pd.DataFrame(similarity_scores, columns=['Plan 1', 'Plan 2', 'Similarity Score'])

# Save the DataFrame to a CSV file
similarity_df.to_csv("similarity_scores.csv", index=False)



KeyboardInterrupt: 

In [118]:
df_t.head()

Unnamed: 0,Patient,Primary Guarantor,Primary Contact,Last Name,Chart Number,Ascend Patient ID,Date Of Birth,Phone,Email,Address,...,student,Lifespan (Today - First Visit),Lifespan (Last Visit - First Visit),Lifespan (Next Appointment Date - First Visit),Time Since Last Visit,Visit and Procedure Mismatch,hasNextAppointment,overdue,Affiliate State,Cleaned Carrier
0,"Abelmann, Anders","Abelmann, Anders","Abelmann, Anders",Abelmann,,14000007592477,06/18/1980,(312) 804-6686,anders.abelmann@gmail.com,"2104 W Balmoral Ave, Chicago, IL, 60625",...,Non-Student,215.0,29.0,218.0,186.0,False,True,False,,
1,"Aberle, Kyle","Aberle, Kyle","Aberle, Kyle",Aberle,,14000008693320,07/12/1985,(309) 838-0212,kyle.aberle@gmail.com,"1708 W Carmen Ave, Chicago, IL, 60640",...,Non-Student,50.0,32.0,151.0,18.0,False,True,False,Illinois,Blue Cross Blue Shield
2,"Abrahms, Rhonda","Abrahms, Rhonda","Abrahms, Rhonda",Abrahms,,14000006720773,09/03/1956,,,"none, none, IL, 60640",...,Non-Student,,,,,True,False,False,,
3,"Abramowitz, Rebecca","Abramowitz, Rebecca","Abramowitz, Rebecca",Abramowitz,RA5143S,14000002734466,04/11/1981,(786) 975-7455,rebecca_abramowitz@yahoo.com,"4555 N Ravenswood, Unit 204, Chicago, IL, 60640",...,Non-Student,1041.0,977.0,,64.0,True,False,False,Illinois,Blue Cross Blue Shield
4,"Abud, Kevin","Abud, Kevin","Abud, Kevin",Abud,KA5164K,14000006871467,04/14/1975,(312) 848-4142,kevinabud@gmail.com,"3042 W Wilson Ave, Chicago, IL, 60625",...,Non-Student,707.0,622.0,791.0,85.0,False,True,False,California,Delta Dental


In [154]:
# Process the address data
parsed_data = []

addresses = df_t['Address']

for address in addresses:
    parts = [p.strip() for p in address.split(",")]  # Split by comma and remove extra spaces
    
    # Handle different address structures dynamically
    street = parts[0] if len(parts) > 0 else ""
    apartment = parts[1] if len(parts) > 3 else ""  # If there's a second part but before city/state
    city = parts[-3].lower() if len(parts) > 2 else ""  # City is the third-to-last part
    state = parts[-2] if len(parts) > 1 else ""  # State is the second-to-last part
    zip_code = parts[-1] if len(parts) > 0 else ""  # ZIP is always last

    parsed_data.append({
        "Street": street,
        #"Apartment": apartment,
        "City": city,
        "State": state,
        "ZIP Code": zip_code
    })

# Convert to DataFrame for display
df_addresses = pd.DataFrame(parsed_data)

df_addresses.head(20)
# Add parsed columns back to the original dataframe
df_t = pd.concat([df_t.reset_index(drop=True), df_addresses.reset_index(drop=True)], axis=1)

In [155]:
df_t.head()

Unnamed: 0,Patient,Primary Guarantor,Primary Contact,Last Name,Chart Number,Ascend Patient ID,Date Of Birth,Phone,Email,Address,...,Time Since Last Visit,Visit and Procedure Mismatch,hasNextAppointment,overdue,Affiliate State,Cleaned Carrier,Street,City,State,ZIP Code
0,"Abelmann, Anders","Abelmann, Anders","Abelmann, Anders",Abelmann,,14000007592477,06/18/1980,(312) 804-6686,anders.abelmann@gmail.com,"2104 W Balmoral Ave, Chicago, IL, 60625",...,186.0,False,True,False,,,2104 W Balmoral Ave,chicago,IL,60625
1,"Aberle, Kyle","Aberle, Kyle","Aberle, Kyle",Aberle,,14000008693320,07/12/1985,(309) 838-0212,kyle.aberle@gmail.com,"1708 W Carmen Ave, Chicago, IL, 60640",...,18.0,False,True,False,Illinois,Blue Cross Blue Shield,1708 W Carmen Ave,chicago,IL,60640
2,"Abrahms, Rhonda","Abrahms, Rhonda","Abrahms, Rhonda",Abrahms,,14000006720773,09/03/1956,,,"none, none, IL, 60640",...,,True,False,False,,,none,none,IL,60640
3,"Abramowitz, Rebecca","Abramowitz, Rebecca","Abramowitz, Rebecca",Abramowitz,RA5143S,14000002734466,04/11/1981,(786) 975-7455,rebecca_abramowitz@yahoo.com,"4555 N Ravenswood, Unit 204, Chicago, IL, 60640",...,64.0,True,False,False,Illinois,Blue Cross Blue Shield,4555 N Ravenswood,chicago,IL,60640
4,"Abud, Kevin","Abud, Kevin","Abud, Kevin",Abud,KA5164K,14000006871467,04/14/1975,(312) 848-4142,kevinabud@gmail.com,"3042 W Wilson Ave, Chicago, IL, 60625",...,85.0,False,True,False,California,Delta Dental,3042 W Wilson Ave,chicago,IL,60625


In [156]:
transformed_data['patient_details'] = df_t.copy()
transformed_data['patient_details'].to_csv(f"{output_dir}/transformed_patient_details.csv", index=False)

Transformations for Processed Payments
	<br>- Break out and parse transaction types
	<br>- Parse Reference Number to pull out Aenta, DD, Cigna, etc
	<br>- Split dataset into Insurance, Patient, Office, etc
	<br>- Group by card type, insurance, 
	<br>- Amount by date
	<br>- Calculate differences between Not Available dates and other dates to get time to various metrics, like time to full payment
	<br>- Calculate running totals 
	<br>- Maybe make dataset that's more of as point-in-time option
	<br>- Flip negative and positive

In [122]:
df_t = dataframes['processed_payments'].copy()

Transformations for Statement Submissions
	<br>- Drop or flag negative balance
	<br>- Grouping by generatedFrom, source, statement type, month
	<br>- Balance over time total
	<br>- Balance over time - patient

In [123]:
df_t = dataframes['statement_submission'].copy()

In [124]:
transformed_data['statement_submission'] = df_t.copy()

Tranformations for Transaction Details
	<br>- Charge by Category
	<br>- Categorical Comparisons
	<br>- Charges by date
	<br>- Possibly split based on Category
	<br>- Flag for referrals based on Proc Code
	<br>- Group by substrings on Proc Description

In [125]:
df_t = dataframes['transaction_details'].copy()

In [126]:
transformed_data['transaction_details'] = df_t.copy()

Transformations for Treatment Tracker
	<br>- Add month grouping
	<br>- Remove test patients
	<br>- Remove canceled and/or invalidated (?)

In [127]:
df_t = dataframes['treatment_tracker'].copy()

Transformations for Unresolved Claims
	<br>- Plans by carrier grouping
	<br>- Parse StateId
	<br>- Group by: Carrier, Plan

In [128]:
df_t = dataframes['unresolved_claims'].copy()

In [135]:
transformed_data['patient_details'].dtypes

Patient                                                   object
Primary Guarantor                                         object
Primary Contact                                           object
Last Name                                                 object
Chart Number                                              object
Ascend Patient ID                                          int64
Date Of Birth                                             object
Phone                                                     object
Email                                                     object
Address                                                   object
Unnamed: 10                                              float64
Status                                                    object
First Visit                                       datetime64[ns]
Last Visit                                        datetime64[ns]
Last Procedure Date                               datetime64[ns]
Next Appointment Date    

In [136]:
# Merge the data
transformed_data
merged_df = transformed_data['patient_details'].copy()
for key in transformed_data:
        if key in ['statement_submission', 'aged_AR']:
                print(key)
                merged_df = merged_df.merge(transformed_data[key], on='Ascend Patient ID', how='left')

aged_AR
statement_submission


In [137]:
merged_df.to_csv(f"{output_dir}/merged_data.csv", index=False)

In [157]:
merged_df.head

<bound method NDFrame.head of                    Patient     Primary Guarantor       Primary Contact  \
0         Abelmann, Anders      Abelmann, Anders      Abelmann, Anders   
1             Aberle, Kyle          Aberle, Kyle          Aberle, Kyle   
2          Abrahms, Rhonda       Abrahms, Rhonda       Abrahms, Rhonda   
3      Abramowitz, Rebecca   Abramowitz, Rebecca   Abramowitz, Rebecca   
4              Abud, Kevin           Abud, Kevin           Abud, Kevin   
...                    ...                   ...                   ...   
3736       Zimbler, Daniel       Zimbler, Daniel       Zimbler, Daniel   
3737  Ziolkowski, Kimberly  Ziolkowski, Kimberly  Ziolkowski, Kimberly   
3738  Ziolkowski, Kimberly  Ziolkowski, Kimberly  Ziolkowski, Kimberly   
3739        zwick, emanuel         Zwick, Qinnan         Zwick, Qinnan   
3740         Zwick, Qinnan         Zwick, Qinnan         Zwick, Qinnan   

       Last Name Chart Number  Ascend Patient ID Date Of Birth  \
0       Abelman