In [69]:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
import src.util as util
import yaml
import copy
from tqdm import tqdm
import os
from datetime import datetime
import openpyxl
from transformers import BertTokenizer, BertModel
import torch
import math

In [70]:
config_data=util.config_load()

In [71]:
df_inp=util.pickle_load(config_data['raw_dataset_path_train_inp'])
df_outp=util.pickle_load(config_data['raw_dataset_path_train_outp'])
df_ben=util.pickle_load(config_data['raw_dataset_path_train_ben'])

In [159]:
df_inp_outp_ben=util.pickle_load( config_data["raw_dataset_path_test_ben"])

In [168]:
data_clus=df_inp_outp_ben[df_inp_outp_ben.Provider=='PRV57172']

In [169]:
data_clus

Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DiagnosisGroupCode,...,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,AGE,DOD_Flag,TotalIPAnnualAmt,TotalOPAnnualAmt
0,BENE100000,CLM126832,PRV57172,50,PHY383401,,,,0,,...,2,2,0,0,120,30,71,0,0,150
1,BENE100000,CLM351838,PRV57172,70,PHY370909,,PHY363377,,0,,...,2,2,0,0,120,30,71,0,0,150
1252,BENE100330,CLM378370,PRV57172,70,PHY339481,,,V7611,0,,...,1,2,0,0,500,370,83,0,0,870
1253,BENE100330,CLM529900,PRV57172,300,PHY338968,PHY402352,PHY402352,,0,,...,1,2,0,0,500,370,83,0,0,870
1254,BENE100330,CLM644917,PRV57172,50,PHY402772,,,,0,,...,1,2,0,0,500,370,83,0,0,870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553813,BENE98825,CLM466566,PRV57172,50,PHY408107,,,,0,,...,2,2,0,0,50,60,47,0,0,110
554601,BENE99033,CLM444066,PRV57172,40,PHY428926,,PHY423231,,0,,...,2,2,0,0,40,20,73,0,0,60
555965,BENE99406,CLM570221,PRV57172,70,PHY414495,,,,0,,...,2,2,0,0,180,110,86,0,0,290
555966,BENE99406,CLM570222,PRV57172,70,PHY361788,,,,0,,...,2,2,0,0,180,110,86,0,0,290


In [196]:
data_clus[data_clus.ClmDiagnosisCode_8=='34670']

Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DiagnosisGroupCode,...,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,AGE,DOD_Flag,TotalIPAnnualAmt,TotalOPAnnualAmt


In [21]:
def standardize_conditions(df_ben_outp):
    for i in ['RenalDiseaseIndicator','ChronicCond_Alzheimer',
        'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
        'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
        'ChronicCond_Depression', 'ChronicCond_Diabetes',
        'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
        'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke','Gender']:
        df_ben_outp[i] = df_ben_outp[i].apply(lambda val: 0 if val =='2' else 1)
    

In [22]:
standardize_conditions(df_inp_outp_ben)

In [23]:
df_inp_outp_ben.columns

Index(['BeneID', 'ClaimID', 'Provider', 'InscClaimAmtReimbursed',
       'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
       'ClmAdmitDiagnosisCode', 'DeductibleAmtPaid', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'Claim_period', 'Beneficiary_cost', 'Count_diag_code',
       'Count_proc_code', 'Admit_Period', 'Is_admit', 'Gender', 'Race',
       'RenalDiseaseIndicator', 'State', 'County', 'NoOfMonths_PartACov',
       'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'Chro

In [24]:
df_inp_outp_ben.drop(columns=['IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt','OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt','TotalIPAnnualAmt', 'TotalOPAnnualAmt'],inplace=True)

In [25]:
df_inp_outp_ben.groupby('Provider')['ClaimID'].count().reset_index()

Unnamed: 0,Provider,ClaimID
0,PRV51001,25
1,PRV51003,132
2,PRV51004,149
3,PRV51005,1165
4,PRV51007,72
...,...,...
5405,PRV57759,28
5406,PRV57760,22
5407,PRV57761,82
5408,PRV57762,1


In [26]:
def pad_code(code):
    if pd.isna(code):
        return code  # Keep NaN as is
    return str(code).zfill(4)  # Pad with zeros to make it 4 characters

In [27]:
for i in [ 'ClmAdmitDiagnosisCode', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3']:
    df_inp_outp_ben[i].astype(str)
    df_inp_outp_ben[i].replace(r'\.0$', '', regex=True,inplace=True)
    df_inp_outp_ben[i].replace(r'nan', np.nan, regex=True,inplace=True)
    df_inp_outp_ben[i]=df_inp_outp_ben[i].apply(pad_code)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_inp_outp_ben[i].replace(r'\.0$', '', regex=True,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_inp_outp_ben[i].replace(r'nan', np.nan, regex=True,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediat

In [12]:
df_inp_outp_ben[df_inp_outp_ben.BeneID=='BENE99999']

Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DiagnosisGroupCode,...,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,AGE,DOD_Flag
558205,BENE99999,CLM187739,PRV55510,60,PHY318212,,,,0,,...,1,0,0,1,1,1,0,0,73,0
558206,BENE99999,CLM359214,PRV55826,80,PHY341548,,,V7283,0,,...,1,0,0,1,1,1,0,0,73,0
558207,BENE99999,CLM475422,PRV54670,80,PHY351000,,,,0,,...,1,0,0,1,1,1,0,0,73,0
558208,BENE99999,CLM591853,PRV57336,30,PHY422310,PHY363262,,,0,,...,1,0,0,1,1,1,0,0,73,0
558209,BENE99999,CLM712394,PRV51690,700,PHY379564,,,78720,0,,...,1,0,0,1,1,1,0,0,73,0
558210,BENE99999,CLM750563,PRV54670,50,PHY359641,,,7224,0,,...,1,0,0,1,1,1,0,0,73,0


In [28]:
nominal=['ClmDiagnosisCode_1','ClmDiagnosisCode_2','ClmDiagnosisCode_3','ClmDiagnosisCode_4','ClmDiagnosisCode_5','ClmDiagnosisCode_6','ClmDiagnosisCode_7','ClmDiagnosisCode_8','ClmDiagnosisCode_9','ClmDiagnosisCode_10']

In [29]:
def OHE_cat(data, encoder_col = None, encoder = None) -> pd.DataFrame:

    data_ohe = data[nominal]

    if encoder == None:
        # Create Object
        encoder = OneHotEncoder(handle_unknown = 'ignore',
                                drop = 'if_binary')
        encoder.fit(data_ohe)
        encoder_col = encoder.get_feature_names_out(data_ohe.columns)


    # Transform the data
    data_encoded = encoder.transform(data_ohe).toarray()
    data_encoded = pd.DataFrame(data_encoded,
                                index = data_ohe.index,
                                columns = encoder_col)


    return data_encoded, encoder_col, encoder

In [30]:
df=df_inp_outp_ben[['ClmDiagnosisCode_1','ClmDiagnosisCode_2','ClmDiagnosisCode_3','ClmDiagnosisCode_4','ClmDiagnosisCode_5','ClmDiagnosisCode_6','ClmDiagnosisCode_7','ClmDiagnosisCode_8','ClmDiagnosisCode_9','ClmDiagnosisCode_10']]

In [31]:
OHE_cat_df, OHE_col, OHE_encoder = OHE_cat(data = df)

In [18]:
OHE_cat_df

Unnamed: 0,ClmDiagnosisCode_1_0010,ClmDiagnosisCode_1_0011,ClmDiagnosisCode_1_0019,ClmDiagnosisCode_1_0020,ClmDiagnosisCode_1_0021,ClmDiagnosisCode_1_0022,ClmDiagnosisCode_1_0023,ClmDiagnosisCode_1_0024,ClmDiagnosisCode_1_0025,ClmDiagnosisCode_1_0029,...,ClmDiagnosisCode_10_V741,ClmDiagnosisCode_10_V7651,ClmDiagnosisCode_10_V850,ClmDiagnosisCode_10_V851,ClmDiagnosisCode_10_V8531,ClmDiagnosisCode_10_V8533,ClmDiagnosisCode_10_V854,ClmDiagnosisCode_10_V860,ClmDiagnosisCode_10_V8801,ClmDiagnosisCode_10_nan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
558207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
558208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
558209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [42]:
data_combined=df_inp_outp_ben[0:100].merge(OHE_cat_df[0:100],left_index=True,right_index=True)

In [33]:
data_combined

Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DiagnosisGroupCode,...,ClmDiagnosisCode_10_V741,ClmDiagnosisCode_10_V7651,ClmDiagnosisCode_10_V850,ClmDiagnosisCode_10_V851,ClmDiagnosisCode_10_V8531,ClmDiagnosisCode_10_V8533,ClmDiagnosisCode_10_V854,ClmDiagnosisCode_10_V860,ClmDiagnosisCode_10_V8801,ClmDiagnosisCode_10_nan
0,BENE100000,CLM126832,PRV57172,50,PHY383401,,,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,BENE100000,CLM351838,PRV57172,70,PHY370909,,PHY363377,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,BENE100001,CLM229075,PRV55158,70,PHY383478,,,64880,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,BENE100001,CLM258631,PRV54966,70,PHY347511,,,36401,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,BENE100001,CLM332544,PRV54890,90,PHY343317,PHY408367,PHY408367,78009,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,BENE100020,CLM397394,PRV51433,800,PHY387026,,,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
96,BENE100020,CLM491511,PRV51749,90,PHY376641,,PHY376641,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
97,BENE100021,CLM119587,PRV55209,20,PHY362544,,PHY362544,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
98,BENE100021,CLM174738,PRV55368,50,PHY382037,,,,0,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [86]:
data_comb

Unnamed: 0,BeneID,Provider,InscClaimAmtReimbursed,Count_diag_code,Count_proc_code,Claim_period,Admit_Period,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,...,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,Is_admit,State,County,RenalDiseaseIndicator
0,BENE100000,PRV57172,120,2,4,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,49.0,430.0,1.0
1,BENE100001,PRV52145,10,2,2,0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,33.0,420.0,1.0
2,BENE100001,PRV54683,100,3,2,0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,33.0,420.0,1.0
3,BENE100001,PRV54890,90,9,2,2,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,33.0,420.0,1.0
4,BENE100001,PRV54966,1130,11,8,0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,33.0,420.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363295,BENE99999,PRV51690,700,1,2,0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,29.0,150.0,1.0
363296,BENE99999,PRV54670,130,2,4,0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,29.0,150.0,1.0
363297,BENE99999,PRV55510,60,1,2,0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,29.0,150.0,1.0
363298,BENE99999,PRV55826,80,2,2,0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,29.0,150.0,1.0


In [43]:
data_combined.columns.to_frame(index=False).to_csv('data_check.csv', header=False, index=False)

In [65]:
data_combined.drop(columns=['AttendingPhysician',
'OperatingPhysician',
'OtherPhysician',
'ClmAdmitDiagnosisCode',
'DeductibleAmtPaid',
'DiagnosisGroupCode',
'ClmDiagnosisCode_1',
'ClmDiagnosisCode_2',
'ClmDiagnosisCode_3',
'ClmDiagnosisCode_4',
'ClmDiagnosisCode_5',
'ClmDiagnosisCode_6',
'ClmDiagnosisCode_7',
'ClmDiagnosisCode_8',
'ClmDiagnosisCode_9',
'ClmDiagnosisCode_10',
'ClmProcedureCode_1',
'ClmProcedureCode_2',
'ClmProcedureCode_3',
'ClmProcedureCode_4',
'ClmProcedureCode_5',
'Beneficiary_cost',
'Gender',
'RenalDiseaseIndicator',
'AGE'],inplace=True)

KeyError: "['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician', 'ClmAdmitDiagnosisCode', 'DeductibleAmtPaid', 'DiagnosisGroupCode', 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5', 'Beneficiary_cost', 'Gender', 'RenalDiseaseIndicator', 'AGE'] not found in axis"

In [50]:
data_prilim=df_inp_outp_ben[['BeneID', 'Provider', 'InscClaimAmtReimbursed',
        'Admit_Period', 'Is_admit', 'Gender',
       'RenalDiseaseIndicator', 'State', 'County', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke', 'AGE',
       'DOD_Flag','Claim_period', 'Beneficiary_cost',
       'Count_diag_code', 'Count_proc_code', ]]

In [45]:
data_combined['State']=data_combined['State'].astype('int64')
data_combined['County']=data_combined['County'].astype('int64')

In [75]:
data_comb.dtypes

BeneID                              object
Provider                            object
InscClaimAmtReimbursed               int64
Count_diag_code                      int64
Count_proc_code                      int64
Claim_period                         int64
Admit_Period                       float64
ChronicCond_Alzheimer              float64
ChronicCond_Heartfailure           float64
ChronicCond_KidneyDisease          float64
ChronicCond_Cancer                 float64
ChronicCond_ObstrPulmonary         float64
ChronicCond_Depression             float64
ChronicCond_Diabetes               float64
ChronicCond_IschemicHeart          float64
ChronicCond_Osteoporasis           float64
ChronicCond_rheumatoidarthritis    float64
ChronicCond_stroke                 float64
Is_admit                           float64
State                              float64
County                             float64
dtype: object

In [46]:
data_combined.fillna(0,inplace=True)

In [47]:
data_batch1=data_combined[0:100000]
data_batch2=data_combined[100001:200000]
data_batch3=data_combined[200001:300000]

In [48]:
first_500_prov=data_combined['Provider'].unique().tolist()[0:2000]
second_500_prov=data_combined['Provider'].unique().tolist()[2001:4000]
third_500_prov=data_combined['Provider'].unique().tolist()[4001:6000]
# forth_500_prov=data_prilim['Provider'].unique().tolist()[3001:4000]
# fifth_1000_prov=data_prilim['Provider'].unique().tolist()[4001:6000]
# # sixth_1000_prov=data_prilim['Provider'].unique().tolist()[3001:4000]
# # seventh_2000_prov=data_prilim['Provider'].unique().tolist()[4001:6000]

In [49]:
data_batch1=data_combined[data_combined.Provider.isin(first_500_prov)]
data_batch2=data_combined[data_combined.Provider.isin(second_500_prov)]
data_batch3=data_combined[data_combined.Provider.isin(third_500_prov)]
# data_batch4=data_prilim[data_prilim.Provider.isin(forth_500_prov)]
# data_batch5=data_prilim[data_prilim.Provider.isin(fifth_1000_prov)]
# data_batch6=data_prilim[data_prilim.Provider.isin(sixth_1000_prov)]
# data_batch7=data_prilim[data_prilim.Provider.isin(seventh_2000_prov)]

In [66]:
data_batch1

Unnamed: 0,ClaimID,Provider,InscClaimAmtReimbursed,Claim_period,Count_diag_code,Count_proc_code,Admit_Period,Is_admit,Race,State,...,ClmDiagnosisCode_10_V741,ClmDiagnosisCode_10_V7651,ClmDiagnosisCode_10_V850,ClmDiagnosisCode_10_V851,ClmDiagnosisCode_10_V8531,ClmDiagnosisCode_10_V8533,ClmDiagnosisCode_10_V854,ClmDiagnosisCode_10_V860,ClmDiagnosisCode_10_V8801,ClmDiagnosisCode_10_nan
0,CLM126832,PRV57172,50,0,1,2,0.0,0,1,49,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,CLM351838,PRV57172,70,0,1,2,0.0,0,1,49,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,CLM229075,PRV55158,70,0,1,2,0.0,0,1,33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,CLM258631,PRV54966,70,0,2,2,0.0,0,1,33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,CLM332544,PRV54890,90,2,9,2,0.0,0,1,33,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,CLM397394,PRV51433,800,0,3,2,0.0,0,1,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
96,CLM491511,PRV51749,90,0,1,2,0.0,0,1,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
97,CLM119587,PRV55209,20,0,1,2,0.0,0,1,34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
98,CLM174738,PRV55368,50,0,2,2,0.0,0,1,34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [90]:
import pandas as pd
from statsmodels.formula.api import ols

# Example list of DataFrames (batches)
data_batches = [data_batch1, data_batch2, data_batch3]  # Replace with your actual data batches

# Initialize a list to store coefficients for each batch
coefficients_list = []

# Iterate over each batch of data
for data_comb in data_batches:
    # Convert 'Provider' to categorical for fixed effects
    data_comb['Provider'] = data_comb['Provider'].astype('category')
    
    # Fit fixed effects model
    model = ols('InscClaimAmtReimbursed ~ C(Provider) + Claim_period+ Admit_Period+ Is_admit +   RenalDiseaseIndicator + '
                ' ChronicCond_Alzheimer + ChronicCond_Heartfailure + '
                'ChronicCond_KidneyDisease + ChronicCond_Cancer + ChronicCond_ObstrPulmonary + '
                'ChronicCond_Depression + ChronicCond_Diabetes + ChronicCond_IschemicHeart + '
                'ChronicCond_Osteoporasis + ChronicCond_rheumatoidarthritis + ChronicCond_stroke  +'
                'Count_diag_code + Count_proc_code ',
                data=data_comb).fit()
    
    # Store coefficients in the list
    coefficients_list.append(model.params)
    print(model.summary())

# Combine coefficients into a single DataFrame
coefficients_df = pd.concat(coefficients_list, axis=1)



# Create a DataFrame for better visualization
mean_coefficients_df = pd.DataFrame(coefficients_df)

# Print the mean coefficients
print(mean_coefficients_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_comb['Provider'] = data_comb['Provider'].astype('category')


                              OLS Regression Results                              
Dep. Variable:     InscClaimAmtReimbursed   R-squared:                       0.530
Model:                                OLS   Adj. R-squared:                  0.526
Method:                     Least Squares   F-statistic:                     158.1
Date:                    Tue, 15 Oct 2024   Prob (F-statistic):               0.00
Time:                            13:03:37   Log-Likelihood:            -2.7170e+06
No. Observations:                  284988   AIC:                         5.438e+06
Df Residuals:                      282972   BIC:                         5.459e+06
Df Model:                            2015                                         
Covariance Type:                nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_comb['Provider'] = data_comb['Provider'].astype('category')


                              OLS Regression Results                              
Dep. Variable:     InscClaimAmtReimbursed   R-squared:                       0.525
Model:                                OLS   Adj. R-squared:                  0.510
Method:                     Least Squares   F-statistic:                     36.21
Date:                    Tue, 15 Oct 2024   Prob (F-statistic):               0.00
Time:                            13:04:26   Log-Likelihood:            -6.5797e+05
No. Observations:                   68071   AIC:                         1.320e+06
Df Residuals:                       66056   BIC:                         1.338e+06
Df Model:                            2014                                         
Covariance Type:                nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_comb['Provider'] = data_comb['Provider'].astype('category')


                              OLS Regression Results                              
Dep. Variable:     InscClaimAmtReimbursed   R-squared:                       0.574
Model:                                OLS   Adj. R-squared:                  0.504
Method:                     Least Squares   F-statistic:                     8.191
Date:                    Tue, 15 Oct 2024   Prob (F-statistic):               0.00
Time:                            13:04:32   Log-Likelihood:                -98512.
No. Observations:                   10088   AIC:                         1.999e+05
Df Residuals:                        8663   BIC:                         2.102e+05
Df Model:                            1424                                         
Covariance Type:                nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------

In [52]:
data_batch1

Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,Claim_period,Count_diag_code,Count_proc_code,Admit_Period,Is_admit,Race,...,ClmDiagnosisCode_10_V741,ClmDiagnosisCode_10_V7651,ClmDiagnosisCode_10_V850,ClmDiagnosisCode_10_V851,ClmDiagnosisCode_10_V8531,ClmDiagnosisCode_10_V8533,ClmDiagnosisCode_10_V854,ClmDiagnosisCode_10_V860,ClmDiagnosisCode_10_V8801,ClmDiagnosisCode_10_nan
0,BENE100000,CLM126832,PRV57172,50,0,1,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,BENE100000,CLM351838,PRV57172,70,0,1,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,BENE100001,CLM229075,PRV55158,70,0,1,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,BENE100001,CLM258631,PRV54966,70,0,2,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,BENE100001,CLM332544,PRV54890,90,2,9,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,BENE100020,CLM397394,PRV51433,800,0,3,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
96,BENE100020,CLM491511,PRV51749,90,0,1,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
97,BENE100021,CLM119587,PRV55209,20,0,1,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
98,BENE100021,CLM174738,PRV55368,50,0,2,2,0.0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [59]:
data_batch1.drop(columns=['BeneID'],inplace=True)

KeyError: "['BeneID'] not found in axis"

In [58]:
formula

'InscClaimAmtReimbursed ~ C(Provider) + Admit_Period + BeneID + ChronicCond_Alzheimer + ChronicCond_Cancer + ChronicCond_Depression + ChronicCond_Diabetes + ChronicCond_Heartfailure + ChronicCond_IschemicHeart + ChronicCond_KidneyDisease + ChronicCond_ObstrPulmonary + ChronicCond_Osteoporasis + ChronicCond_rheumatoidarthritis + ChronicCond_stroke + ClaimID + Claim_period + ClmDiagnosisCode_10_0042 + ClmDiagnosisCode_10_00845 + ClmDiagnosisCode_10_0088 + ClmDiagnosisCode_10_0135 + ClmDiagnosisCode_10_0138 + ClmDiagnosisCode_10_0179 + ClmDiagnosisCode_10_0185 + ClmDiagnosisCode_10_0260 + ClmDiagnosisCode_10_0261 + ClmDiagnosisCode_10_0262 + ClmDiagnosisCode_10_0311 + ClmDiagnosisCode_10_0319 + ClmDiagnosisCode_10_0340 + ClmDiagnosisCode_10_03811 + ClmDiagnosisCode_10_03812 + ClmDiagnosisCode_10_03819 + ClmDiagnosisCode_10_03843 + ClmDiagnosisCode_10_03849 + ClmDiagnosisCode_10_0389 + ClmDiagnosisCode_10_04102 + ClmDiagnosisCode_10_04104 + ClmDiagnosisCode_10_04109 + ClmDiagnosisCode_10_0

In [63]:
independent_vars

Index(['Admit_Period', 'ChronicCond_Alzheimer', 'ChronicCond_Cancer',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_Heartfailure', 'ChronicCond_IschemicHeart',
       'ChronicCond_KidneyDisease', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Osteoporasis',
       ...
       'Count_diag_code', 'Count_proc_code', 'County', 'DOD_Flag', 'Is_admit',
       'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'Provider', 'Race',
       'State'],
      dtype='object', length=42817)

In [67]:
independent_vars

Index(['Admit_Period', 'ChronicCond_Alzheimer', 'ChronicCond_Cancer',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_Heartfailure', 'ChronicCond_IschemicHeart',
       'ChronicCond_KidneyDisease', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Osteoporasis',
       ...
       'Count_diag_code', 'Count_proc_code', 'County', 'DOD_Flag', 'Is_admit',
       'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', 'Provider', 'Race',
       'State'],
      dtype='object', length=42817)

In [68]:
 # Convert 'Provider' to categorical for fixed effects
data_batch1['Provider'] = data_batch1['Provider'].astype('category')
    
independent_vars = data_batch1.columns.difference(['InscClaimAmtReimbursed'])

# Create the formula
formula = 'InscClaimAmtReimbursed ~ C(Provider) + ' + ' + '.join(independent_vars)

# Fit the OLS model
    # Fit fixed effects model
model = ols(formula,
                data=data_batch1).fit()

RecursionError: maximum recursion depth exceeded

In [61]:
import pandas as pd
from statsmodels.formula.api import ols

# Example list of DataFrames (batches)
data_batches = [data_batch1]  # Replace with your actual data batches

# Initialize a list to store coefficients for each batch
coefficients_list = []

# Iterate over each batch of data
for data_comb in data_batches:
    # Convert 'Provider' to categorical for fixed effects
    data_comb['Provider'] = data_comb['Provider'].astype('category')
    
    independent_vars = data_comb.columns.difference(['InscClaimAmtReimbursed'])

# Create the formula
    formula = 'InscClaimAmtReimbursed ~ C(Provider) + ' + ' + '.join(independent_vars)

# Fit the OLS model
    # Fit fixed effects model
    model = ols(formula,
                data=data_comb).fit()
    
    # Store coefficients in the list
    coefficients_list.append(model.params)
    print(model.summary())

# Combine coefficients into a single DataFrame
coefficients_df = pd.concat(coefficients_list, axis=1)



# Create a DataFrame for better visualization
mean_coefficients_df = pd.DataFrame(coefficients_df)

# Print the mean coefficients
print(mean_coefficients_df)

RecursionError: maximum recursion depth exceeded

In [91]:
data_comb[0:450]

Unnamed: 0,BeneID,Provider,InscClaimAmtReimbursed,Count_diag_code,Count_proc_code,Claim_period,Admit_Period,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,...,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,Is_admit,State,County,RenalDiseaseIndicator
30443,BENE112164,PRV52476,5000,9,0,5,5.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,12.0,50.0,1.0
30543,BENE112205,PRV54029,300,3,2,0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,23.0,800.0,1.0
30575,BENE112218,PRV55299,10,1,2,0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,34.0,150.0,1.0
30629,BENE112238,PRV52793,30,1,2,0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,14.0,141.0,1.0
30632,BENE112239,PRV53293,90,9,4,6,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,18.0,991.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44854,BENE117468,PRV53835,50,1,2,0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,22.0,90.0,1.0
44890,BENE117481,PRV56507,500,11,6,0,0.0,1.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,44.0,160.0,1.0
45036,BENE117534,PRV51958,800,1,2,0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,10.0,50.0,1.0
45097,BENE11755,PRV52747,300,1,2,0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,14.0,141.0,1.0


In [96]:
data_sample=data_comb[0:450]

In [97]:
from sklearn.neighbors import LocalOutlierFactor

# Fit LOF
lof = LocalOutlierFactor()
data_sample['Anomaly'] = lof.fit_predict(data_sample[0:450].drop(['BeneID','Provider'],axis=1))

# Identify anomalies (Anomalies labeled as -1)
anomalies = data_sample[data_sample['Anomaly'] == -1]
print(anomalies)

           BeneID  Provider  InscClaimAmtReimbursed  Count_diag_code  \
31443  BENE112513  PRV51258                   28000                9   
32780  BENE113054  PRV55601                     200                2   
34049  BENE113548  PRV57711                   28000                9   
34845  BENE113825  PRV56199                     200                3   
35619  BENE114124  PRV57399                   57000                9   
36127  BENE114308  PRV55044                     700                8   
36543  BENE114458  PRV51919                   30000                9   
37445  BENE114784  PRV51412                     200                3   
38263  BENE115087  PRV55606                   24000                8   
39019  BENE115360  PRV55506                   26000                4   
39801  BENE115651  PRV55754                     200                0   
40266  BENE115837  PRV51755                     200                1   
41643  BENE116313  PRV55928                     200             

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sample['Anomaly'] = lof.fit_predict(data_sample[0:450].drop(['BeneID','Provider'],axis=1))


In [98]:
anomalies

Unnamed: 0,BeneID,Provider,InscClaimAmtReimbursed,Count_diag_code,Count_proc_code,Claim_period,Admit_Period,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,...,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,Is_admit,State,County,RenalDiseaseIndicator,Anomaly
31443,BENE112513,PRV51258,28000,9,2,7,7.0,0.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,4.0,510.0,1.0,-1
32780,BENE113054,PRV55601,200,2,2,0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,18.0,180.0,1.0,-1
34049,BENE113548,PRV57711,28000,9,1,9,9.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,45.0,700.0,1.0,-1
34845,BENE113825,PRV56199,200,3,2,0,0.0,0.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,54.0,560.0,1.0,-1
35619,BENE114124,PRV57399,57000,9,1,15,17.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,50.0,160.0,1.0,-1
36127,BENE114308,PRV55044,700,8,2,0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,54.0,999.0,1.0,-1
36543,BENE114458,PRV51919,30000,9,3,1,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,6.0,340.0,1.0,-1
37445,BENE114784,PRV51412,200,3,2,0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,5.0,660.0,1.0,-1
38263,BENE115087,PRV55606,24000,8,0,4,4.0,0.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,36.0,460.0,1.0,-1
39019,BENE115360,PRV55506,26000,4,1,1,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,36.0,750.0,1.0,-1
