In [1]:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
import src.util as util
import yaml
import copy
from tqdm import tqdm
import os
from datetime import datetime
import openpyxl
from transformers import BertTokenizer, BertModel
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config_data=util.config_load()

In [3]:
df_inp=util.pickle_load(config_data['raw_dataset_path_train_inp'])
df_outp=util.pickle_load(config_data['raw_dataset_path_train_outp'])
df_ben=util.pickle_load(config_data['raw_dataset_path_train_ben'])

In [4]:
df_inp_outp_ben=util.pickle_load( config_data["raw_dataset_path_test_ben"])

In [5]:
def standardize_conditions(df_ben_outp):
    for i in ['RenalDiseaseIndicator','ChronicCond_Alzheimer',
        'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
        'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
        'ChronicCond_Depression', 'ChronicCond_Diabetes',
        'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
        'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke','Gender']:
        df_ben_outp[i] = df_ben_outp[i].apply(lambda val: 0 if val =='2' else 1)
    

In [6]:
standardize_conditions(df_inp_outp_ben)

In [7]:
df_inp_outp_ben.columns

Index(['BeneID', 'ClaimID', 'Provider', 'InscClaimAmtReimbursed',
       'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
       'ClmAdmitDiagnosisCode', 'DeductibleAmtPaid', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'Claim_period', 'Beneficiary_cost', 'Count_diag_code',
       'Count_proc_code', 'Admit_Period', 'Is_admit', 'Gender', 'Race',
       'RenalDiseaseIndicator', 'State', 'County', 'NoOfMonths_PartACov',
       'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'Chro

In [8]:
df_inp_outp_ben.drop(columns=['IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt','OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt','TotalIPAnnualAmt', 'TotalOPAnnualAmt'],inplace=True)

In [9]:
def pad_code(code):
    if pd.isna(code):
        return code  # Keep NaN as is
    return str(code).zfill(4)  # Pad with zeros to make it 4 characters

In [10]:
df_inp_outp_ben.columns

Index(['BeneID', 'ClaimID', 'Provider', 'InscClaimAmtReimbursed',
       'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
       'ClmAdmitDiagnosisCode', 'DeductibleAmtPaid', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'Claim_period', 'Beneficiary_cost', 'Count_diag_code',
       'Count_proc_code', 'Admit_Period', 'Is_admit', 'Gender', 'Race',
       'RenalDiseaseIndicator', 'State', 'County', 'NoOfMonths_PartACov',
       'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'Chro

In [11]:
df_inp_outp_ben[df_inp_outp_ben.BeneID=='BENE99999']

Unnamed: 0,BeneID,ClaimID,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DiagnosisGroupCode,...,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,AGE,DOD_Flag
558205,BENE99999,CLM187739,PRV55510,60,PHY318212,,,,0,,...,1,0,0,1,1,1,0,0,73,0
558206,BENE99999,CLM359214,PRV55826,80,PHY341548,,,V7283,0,,...,1,0,0,1,1,1,0,0,73,0
558207,BENE99999,CLM475422,PRV54670,80,PHY351000,,,,0,,...,1,0,0,1,1,1,0,0,73,0
558208,BENE99999,CLM591853,PRV57336,30,PHY422310,PHY363262,,,0,,...,1,0,0,1,1,1,0,0,73,0
558209,BENE99999,CLM712394,PRV51690,700,PHY379564,,,78720,0,,...,1,0,0,1,1,1,0,0,73,0
558210,BENE99999,CLM750563,PRV54670,50,PHY359641,,,7224,0,,...,1,0,0,1,1,1,0,0,73,0


In [12]:
df_inp_outp_ben.groupby(['BeneID','Provider']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,ClaimID,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,...,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,AGE,DOD_Flag
BeneID,Provider,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BENE100000,PRV57172,2,2,2,0,1,0,2,0,2,0,...,2,2,2,2,2,2,2,2,2,2
BENE100001,PRV52145,1,1,1,0,1,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,1
BENE100001,PRV54683,1,1,1,0,0,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,1
BENE100001,PRV54890,1,1,1,1,1,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,1
BENE100001,PRV54966,4,4,4,2,2,2,4,0,4,3,...,4,4,4,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
BENE99999,PRV51690,1,1,1,0,0,1,1,0,1,0,...,1,1,1,1,1,1,1,1,1,1
BENE99999,PRV54670,2,2,2,0,0,1,2,0,2,0,...,2,2,2,2,2,2,2,2,2,2
BENE99999,PRV55510,1,1,1,0,0,0,1,0,1,0,...,1,1,1,1,1,1,1,1,1,1
BENE99999,PRV55826,1,1,1,0,0,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,1


In [12]:
df_inp_outp_ben[df_inp_outp_ben.Provider=='PRV52804']

Unnamed: 0,BeneID,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,...,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,Claim_period,Beneficiary_cost,Count_diag_code,Count_proc_code,Admit_Period,Is_admit
490195,BENE81899,1,1,1,14,141,12,12,0,0,...,3849.0,,,,13,34932,4,2,13.0,1


In [15]:
data_prilim=df_inp_outp_ben[['BeneID', 'Provider', 'InscClaimAmtReimbursed',
        'Admit_Period', 'Is_admit', 'Gender',
       'RenalDiseaseIndicator', 'State', 'County', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke', 'AGE',
       'DOD_Flag']]

In [14]:
data_prilim_outlier_detect=data_prilim[[ 'InscClaimAmtReimbursed',
        'Admit_Period', 'Is_admit', 'Gender',
       'RenalDiseaseIndicator', 'State', 'County', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depression', 'ChronicCond_Diabetes',
       'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis',
       'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke', 'AGE',
       'DOD_Flag']]

In [16]:
data_prilim['State']=data_prilim['State'].astype('int64')
data_prilim['County']=data_prilim['County'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prilim['State']=data_prilim['State'].astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prilim['County']=data_prilim['County'].astype('int64')


In [17]:
data_prilim.dtypes

BeneID                              object
Provider                            object
InscClaimAmtReimbursed               int64
Admit_Period                       float64
Is_admit                             int64
Gender                               int64
RenalDiseaseIndicator                int64
State                                int64
County                               int64
ChronicCond_Alzheimer                int64
ChronicCond_Heartfailure             int64
ChronicCond_KidneyDisease            int64
ChronicCond_Cancer                   int64
ChronicCond_ObstrPulmonary           int64
ChronicCond_Depression               int64
ChronicCond_Diabetes                 int64
ChronicCond_IschemicHeart            int64
ChronicCond_Osteoporasis             int64
ChronicCond_rheumatoidarthritis      int64
ChronicCond_stroke                   int64
AGE                                  int64
DOD_Flag                            object
dtype: object

In [18]:
data_prilim.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prilim.fillna(0,inplace=True)


In [19]:
data_grp_amt=data_prilim.groupby(['Provider','BeneID'])['InscClaimAmtReimbursed'].sum().reset_index()
#data_grp_cols=data_prilim.groupby(['Provider','BeneID'])['InscClaimAmtReimbursed'].sum().reset_index()

In [20]:
data_prilim_sorted=data_prilim.sort_values(by='Provider')

In [59]:
data_prilim_sorted.to_csv('data_spark.csv')

In [21]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [4]:
data_combined_Fe=util.pickle_load(config_data["raw_dataset_path_data_combined"])

In [22]:
# Convert 'Provider' to categorical for fixed effects
data_prilim['Provider'] = data_prilim['Provider'].astype('category')

# Fit fixed effects model
model = ols('InscClaimAmtReimbursed ~ C(Provider)  + Admit_Period+Is_admit + Gender + RenalDiseaseIndicator + '
            'C(State) + C(County) + ChronicCond_Alzheimer + ChronicCond_Heartfailure + '
            'ChronicCond_KidneyDisease + ChronicCond_Cancer + ChronicCond_ObstrPulmonary + '
            'ChronicCond_Depression + ChronicCond_Diabetes + ChronicCond_IschemicHeart + '
            'ChronicCond_Osteoporasis + ChronicCond_rheumatoidarthritis + ChronicCond_stroke + AGE',
            data=data_prilim_sorted).fit()

# Summary of the model
print(model.summary())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_prilim['Provider'] = data_prilim['Provider'].astype('category')


KeyboardInterrupt: 

In [23]:
data_batch1=data_prilim[0:100000]
data_batch2=data_prilim[100001:200000]
data_batch3=data_prilim[200001:300000]

In [24]:
first_500_prov=data_prilim['Provider'].unique().tolist()[0:2000]
second_500_prov=data_prilim['Provider'].unique().tolist()[2001:4000]
third_500_prov=data_prilim['Provider'].unique().tolist()[4001:6000]
# forth_500_prov=data_prilim['Provider'].unique().tolist()[3001:4000]
# fifth_1000_prov=data_prilim['Provider'].unique().tolist()[4001:6000]
# # sixth_1000_prov=data_prilim['Provider'].unique().tolist()[3001:4000]
# # seventh_2000_prov=data_prilim['Provider'].unique().tolist()[4001:6000]

In [25]:
data_batch1=data_prilim[data_prilim.Provider.isin(first_500_prov)]
data_batch2=data_prilim[data_prilim.Provider.isin(second_500_prov)]
data_batch3=data_prilim[data_prilim.Provider.isin(third_500_prov)]
# data_batch4=data_prilim[data_prilim.Provider.isin(forth_500_prov)]
# data_batch5=data_prilim[data_prilim.Provider.isin(fifth_1000_prov)]
# data_batch6=data_prilim[data_prilim.Provider.isin(sixth_1000_prov)]
# data_batch7=data_prilim[data_prilim.Provider.isin(seventh_2000_prov)]

In [27]:
import pandas as pd
from statsmodels.formula.api import ols

# Example list of DataFrames (batches)
data_batches = [data_batch1, data_batch2, data_batch3]  # Replace with your actual data batches

# Initialize a list to store coefficients for each batch
coefficients_list = []

# Iterate over each batch of data
for data_prilim in data_batches:
    # Convert 'Provider' to categorical for fixed effects
    data_prilim['Provider'] = data_prilim['Provider'].astype('category')
    
    # Fit fixed effects model
    model = ols('InscClaimAmtReimbursed ~ C(Provider) + Admit_Period + Is_admit + Gender + RenalDiseaseIndicator + '
                ' ChronicCond_Alzheimer + ChronicCond_Heartfailure + '
                'ChronicCond_KidneyDisease + ChronicCond_Cancer + ChronicCond_ObstrPulmonary + '
                'ChronicCond_Depression + ChronicCond_Diabetes + ChronicCond_IschemicHeart + '
                'ChronicCond_Osteoporasis + ChronicCond_rheumatoidarthritis + ChronicCond_stroke ',
                data=data_prilim).fit()
    
    # Store coefficients in the list
    coefficients_list.append(model.params)
    print(model.summary())

# Combine coefficients into a single DataFrame
coefficients_df = pd.concat(coefficients_list, axis=1)



# Create a DataFrame for better visualization
mean_coefficients_df = pd.DataFrame(coefficients_df)

# Print the mean coefficients
print(mean_coefficients_df)

In [76]:
from sklearn.neighbors import LocalOutlierFactor

# Fit LOF
lof = LocalOutlierFactor()
data_prilim['Anomaly'] = lof.fit_predict(data_prilim_outlier_detect)

# Identify anomalies (Anomalies labeled as -1)
anomalies = data_prilim[data_prilim['Anomaly'] == -1]
print(anomalies)

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Python(65470) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
