In [1]:
import pandas as pd
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, StandardScaler
import src.util as util
import yaml
import copy
from tqdm import tqdm
import os
from datetime import datetime

In [2]:
config_data=util.config_load()

1. Function to Read data from the folder

In [3]:
def read_raw_data(config: dict,filetype) -> pd.DataFrame:
    # Create variable to store raw dataset
    raw_dataset = pd.DataFrame()

    # Raw Dataset Dir
    raw_dataset_dir = config["raw_dataset_dir"]

     # List files in the directory and filter those with filetype in the filename
    files = [f for f in os.listdir(raw_dataset_dir) if filetype in f and f.endswith('.csv')]

    # Process and concatenate each filtered file
    for i in tqdm(files):
        file_path = os.path.join(raw_dataset_dir, i)
        raw_dataset = pd.concat([raw_dataset, pd.read_csv(file_path)], ignore_index=True)


    # Return the concatenated DataFrame
    return raw_dataset

In [5]:
df_ben=read_raw_data(config_data,'Beneficiary')
df_inp=read_raw_data(config_data,'Inpatient')
df_outp=read_raw_data(config_data,'Outpatient')

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.67it/s]
100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
100%|██████████| 1/1 [00:02<00:00,  2.33s/it]


In [6]:
print(len(df_ben))
print(len(df_inp))
print(len(df_outp))

138556
40474
517737


**Beneficiary File**

In [6]:
df_ben.dtypes

BeneID                             object
DOB                                object
DOD                                object
Gender                              int64
Race                                int64
RenalDiseaseIndicator              object
State                               int64
County                              int64
NoOfMonths_PartACov                 int64
NoOfMonths_PartBCov                 int64
ChronicCond_Alzheimer               int64
ChronicCond_Heartfailure            int64
ChronicCond_KidneyDisease           int64
ChronicCond_Cancer                  int64
ChronicCond_ObstrPulmonary          int64
ChronicCond_Depression              int64
ChronicCond_Diabetes                int64
ChronicCond_IschemicHeart           int64
ChronicCond_Osteoporasis            int64
ChronicCond_rheumatoidarthritis     int64
ChronicCond_stroke                  int64
IPAnnualReimbursementAmt            int64
IPAnnualDeductibleAmt               int64
OPAnnualReimbursementAmt          

In [7]:
df_ben.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
BeneID,138556.0,138556.0,BENE159198,1.0,,,,,,,
DOB,138556.0,900.0,1939-10-01,540.0,,,,,,,
DOD,1421.0,11.0,2009-12-01,182.0,,,,,,,
Gender,138556.0,,,,1.570932,0.494945,1.0,1.0,2.0,2.0,2.0
Race,138556.0,,,,1.254511,0.717007,1.0,1.0,1.0,1.0,5.0
RenalDiseaseIndicator,138556.0,2.0,0,118978.0,,,,,,,
State,138556.0,,,,25.666734,15.223443,1.0,11.0,25.0,39.0,54.0
County,138556.0,,,,374.424745,266.277581,0.0,141.0,340.0,570.0,999.0
NoOfMonths_PartACov,138556.0,,,,11.907727,1.032332,0.0,12.0,12.0,12.0,12.0
NoOfMonths_PartBCov,138556.0,,,,11.910145,0.936893,0.0,12.0,12.0,12.0,12.0


In [8]:
df_ben.isna().sum()

BeneID                                  0
DOB                                     0
DOD                                137135
Gender                                  0
Race                                    0
RenalDiseaseIndicator                   0
State                                   0
County                                  0
NoOfMonths_PartACov                     0
NoOfMonths_PartBCov                     0
ChronicCond_Alzheimer                   0
ChronicCond_Heartfailure                0
ChronicCond_KidneyDisease               0
ChronicCond_Cancer                      0
ChronicCond_ObstrPulmonary              0
ChronicCond_Depression                  0
ChronicCond_Diabetes                    0
ChronicCond_IschemicHeart               0
ChronicCond_Osteoporasis                0
ChronicCond_rheumatoidarthritis         0
ChronicCond_stroke                      0
IPAnnualReimbursementAmt                0
IPAnnualDeductibleAmt                   0
OPAnnualReimbursementAmt          

In [9]:
df_ben_copy=df_ben.copy()

2. Convert the datatype of fields appropriately. The fields are mentioned in config file.

In [10]:
def type_conv(set_data, config_data,datetime_columns,obj_columns,int_columns):
  
    
    # --- Convert columns to datetime format --- #
    for col in config_data[datetime_columns]:
        if set_data[col].dtype != 'datetime64[ns]':
            set_data[col] = pd.to_datetime(set_data[col])

    # --- Convert specified columns to object type --- #
    for col in config_data[obj_columns]:
        if set_data[col].dtype != 'object':
                set_data[col] = set_data[col].astype(str)

    # --- Convert specified float columns to integer format --- #
    for col in config_data[int_columns]:
        set_data[col]=set_data[col].fillna(0)
        if set_data[col].dtype != 'int64':
            set_data[col] = set_data[col].astype(int)

    # --- Return the modified DataFrame --- #
    return set_data

  


In [11]:
df_ben_conv=type_conv(df_ben,config_data,"datetime_columns_ben","obj_columns_ben","int_columns_ben")

In [12]:
df_ben_conv.dtypes

BeneID                                     object
DOB                                datetime64[ns]
DOD                                datetime64[ns]
Gender                                     object
Race                                       object
RenalDiseaseIndicator                      object
State                                       int64
County                                      int64
NoOfMonths_PartACov                         int64
NoOfMonths_PartBCov                         int64
ChronicCond_Alzheimer                       int64
ChronicCond_Heartfailure                    int64
ChronicCond_KidneyDisease                   int64
ChronicCond_Cancer                          int64
ChronicCond_ObstrPulmonary                  int64
ChronicCond_Depression                      int64
ChronicCond_Diabetes                        int64
ChronicCond_IschemicHeart                   int64
ChronicCond_Osteoporasis                    int64
ChronicCond_rheumatoidarthritis             int64


In [13]:
df_ben_conv.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
BeneID,138556.0,138556.0,BENE159198,1.0,,,,,,,
DOB,138556.0,,,,1936-04-30 16:46:00.772539520,1909-01-01 00:00:00,1928-01-01 00:00:00,1935-09-01 00:00:00,1941-12-01 00:00:00,1983-12-01 00:00:00,
DOD,1421.0,,,,2009-07-27 22:27:46.995073792,2009-02-01 00:00:00,2009-06-01 00:00:00,2009-08-01 00:00:00,2009-10-01 00:00:00,2009-12-01 00:00:00,
Gender,138556.0,2.0,2,79106.0,,,,,,,
Race,138556.0,4.0,1,117057.0,,,,,,,
RenalDiseaseIndicator,138556.0,2.0,0,118978.0,,,,,,,
State,138556.0,,,,25.666734,1.0,11.0,25.0,39.0,54.0,15.223443
County,138556.0,,,,374.424745,0.0,141.0,340.0,570.0,999.0,266.277581
NoOfMonths_PartACov,138556.0,,,,11.907727,0.0,12.0,12.0,12.0,12.0,1.032332
NoOfMonths_PartBCov,138556.0,,,,11.910145,0.0,12.0,12.0,12.0,12.0,0.936893


3. Adding Features in beneficiary Table

In [14]:
def feature_addition_ben(dataset_conv):
    #Age from max value from set
    max_bene_DOD = max(dataset_conv['DOD'].dropna().unique()[1:])
    dataset_conv['DOD_imputed']=dataset_conv['DOD']
    dataset_conv['DOD_imputed']=dataset_conv['DOD_imputed'].apply(lambda i: i if pd.notna(i)  else max_bene_DOD )
    dataset_conv['AGE'] = np.round(((dataset_conv['DOD_imputed'] - dataset_conv['DOB']).dt.days)/365.0,1)
    dataset_conv['AGE']= dataset_conv['AGE'].astype('int64')
    dataset_conv['DOD_Flag']=dataset_conv['DOD'].apply(lambda i: 1 if pd.notna(i)  else 0 )
    dataset_conv['DOD_Flag']= dataset_conv['DOD_Flag'].astype('object')
    dataset_conv['TotalIPAnnualAmt']=dataset_conv['IPAnnualReimbursementAmt']+dataset_conv['IPAnnualReimbursementAmt']
    dataset_conv['TotalOPAnnualAmt']= dataset_conv['OPAnnualReimbursementAmt']+dataset_conv['OPAnnualDeductibleAmt']
    dataset_conv.drop(columns=['DOB','DOD','DOD_imputed'],axis=1,inplace=True)
    



In [15]:
feature_addition_ben(df_ben_conv)

In [16]:
def col_separation(dataset):
    cat_cols=dataset.select_dtypes('object').columns.to_list()
    int_cols=dataset.select_dtypes('int64').columns.to_list()
    time_cols=dataset.select_dtypes('datetime64[ns]').columns.to_list()
    return cat_cols,int_cols,time_cols

In [17]:
cat_cols=col_separation(df_ben_conv)[0]
int_cols=col_separation(df_ben_conv)[1]
tim_cols=col_separation(df_ben_conv)[2]

5. Univariate Analysis

In [18]:
#Check the distinct value in each field and count of each of them.
for i in cat_cols:
  print("-------",i,"--------")
  print(df_ben_conv[i].value_counts(dropna=False))
        


------- BeneID --------
BeneID
BENE159198    1
BENE11001     1
BENE11002     1
BENE11003     1
BENE11004     1
             ..
BENE11034     1
BENE11036     1
BENE11037     1
BENE11038     1
BENE11039     1
Name: count, Length: 138556, dtype: int64
------- Gender --------
Gender
2    79106
1    59450
Name: count, dtype: int64
------- Race --------
Race
1    117057
2     13538
3      5059
5      2902
Name: count, dtype: int64
------- RenalDiseaseIndicator --------
RenalDiseaseIndicator
0    118978
Y     19578
Name: count, dtype: int64
------- ChronicCond_Alzheimer --------
ChronicCond_Alzheimer
2    92530
1    46026
Name: count, dtype: int64
------- ChronicCond_Heartfailure --------
ChronicCond_Heartfailure
2    70154
1    68402
Name: count, dtype: int64
------- ChronicCond_KidneyDisease --------
ChronicCond_KidneyDisease
2    95277
1    43279
Name: count, dtype: int64
------- ChronicCond_Cancer --------
ChronicCond_Cancer
2    121935
1     16621
Name: count, dtype: int64
------- Chroni

In [19]:
def cat_columns_univariate (df):
    sns.set_theme(style="whitegrid")
    # Loop through categorical columns
    for i in cat_cols:
        if df[i].nunique() < 10:
            fig, ax = plt.subplots(figsize=(10, 8))
            ax = sns.countplot(data=df, x=i)

            # Calculate percentages
            total = float(len(df))
            for p in ax.patches:
                percentage = '{:.1f}%'.format(100 * p.get_height() / total)
                x = p.get_x() + p.get_width() / 2 - 0.05
                y = p.get_height()
                ax.annotate(percentage, (x, y), ha='center', va='bottom')

            plt.show()
        elif 10 < df[i].nunique() < 30:
        # Get the sorted value counts
            value_counts = df[i].value_counts().sort_values()
            sorted_df = pd.DataFrame({i: value_counts.index, 'count': value_counts.values})

            fig, ax = plt.subplots(figsize=(10, 8))
            sns.countplot(data=df, y=i, order=sorted_df[i], ax=ax)

        # Calculate percentages
            total = float(len(df))
            for p in ax.patches:
                percentage = '{:.1f}%'.format(100 * p.get_width() / total)
                x = p.get_width() + 2.5
                y = p.get_y() + p.get_height() / 2
                ax.annotate(percentage, (x, y), ha='center', va='center')

            plt.show()
    else:
        pass


In [None]:
cat_columns_univariate(df_ben_conv)

In [95]:
def num_univariate(df):
    for i in int_cols:
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.histplot(data=df, x=i, bins=5,  kde=True)


In [None]:
num_univariate(df_ben_conv)

6. Outlier Analysis

In [22]:
def outlier_analysis(df):
    for i in int_cols:
        sns.boxplot(data=df[i])
        plt.show()

In [None]:
outlier_analysis(df_ben_conv)

**In Patient File**

In [18]:
df_inp.dtypes

BeneID                     object
ClaimID                    object
ClaimStartDt               object
ClaimEndDt                 object
Provider                   object
InscClaimAmtReimbursed      int64
AttendingPhysician         object
OperatingPhysician         object
OtherPhysician             object
AdmissionDt                object
ClmAdmitDiagnosisCode      object
DeductibleAmtPaid         float64
DischargeDt                object
DiagnosisGroupCode         object
ClmDiagnosisCode_1         object
ClmDiagnosisCode_2         object
ClmDiagnosisCode_3         object
ClmDiagnosisCode_4         object
ClmDiagnosisCode_5         object
ClmDiagnosisCode_6         object
ClmDiagnosisCode_7         object
ClmDiagnosisCode_8         object
ClmDiagnosisCode_9         object
ClmDiagnosisCode_10        object
ClmProcedureCode_1        float64
ClmProcedureCode_2        float64
ClmProcedureCode_3        float64
ClmProcedureCode_4        float64
ClmProcedureCode_5        float64
ClmProcedureCo

In [180]:
df_inp.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
BeneID,40474.0,31289.0,BENE134170,8.0,,,,,,,
ClaimID,40474.0,40474.0,CLM69948,1.0,,,,,,,
ClaimStartDt,40474.0,398.0,2009-02-10,145.0,,,,,,,
ClaimEndDt,40474.0,365.0,2009-02-11,153.0,,,,,,,
Provider,40474.0,2092.0,PRV52019,516.0,,,,,,,
InscClaimAmtReimbursed,40474.0,,,,10087.884074,10303.099402,0.0,4000.0,7000.0,12000.0,125000.0
AttendingPhysician,40362.0,11604.0,PHY422134,386.0,,,,,,,
OperatingPhysician,23830.0,8287.0,PHY429430,225.0,,,,,,,
OtherPhysician,4690.0,2877.0,PHY416093,81.0,,,,,,,
AdmissionDt,40474.0,398.0,2009-02-10,144.0,,,,,,,


In [181]:
df_inp.isna().sum()

BeneID                        0
ClaimID                       0
ClaimStartDt                  0
ClaimEndDt                    0
Provider                      0
InscClaimAmtReimbursed        0
AttendingPhysician          112
OperatingPhysician        16644
OtherPhysician            35784
AdmissionDt                   0
ClmAdmitDiagnosisCode         0
DeductibleAmtPaid           899
DischargeDt                   0
DiagnosisGroupCode            0
ClmDiagnosisCode_1            0
ClmDiagnosisCode_2          226
ClmDiagnosisCode_3          676
ClmDiagnosisCode_4         1534
ClmDiagnosisCode_5         2894
ClmDiagnosisCode_6         4838
ClmDiagnosisCode_7         7258
ClmDiagnosisCode_8         9942
ClmDiagnosisCode_9        13497
ClmDiagnosisCode_10       36547
ClmProcedureCode_1        17326
ClmProcedureCode_2        35020
ClmProcedureCode_3        39509
ClmProcedureCode_4        40358
ClmProcedureCode_5        40465
ClmProcedureCode_6        40474
dtype: int64

1. Convert the file to pappropriate datatypes.

In [19]:
df_inp_conv=type_conv(df_inp,config_data,"datetime_columns_inp","obj_columns_inp","int_columns_inp")

In [65]:
df_inp_conv.dtypes

BeneID                            object
ClaimID                           object
ClaimStartDt              datetime64[ns]
ClaimEndDt                datetime64[ns]
Provider                          object
InscClaimAmtReimbursed             int64
AttendingPhysician                object
OperatingPhysician                object
OtherPhysician                    object
AdmissionDt               datetime64[ns]
ClmAdmitDiagnosisCode             object
DeductibleAmtPaid                  int64
DischargeDt               datetime64[ns]
DiagnosisGroupCode                object
ClmDiagnosisCode_1                object
ClmDiagnosisCode_2                object
ClmDiagnosisCode_3                object
ClmDiagnosisCode_4                object
ClmDiagnosisCode_5                object
ClmDiagnosisCode_6                object
ClmDiagnosisCode_7                object
ClmDiagnosisCode_8                object
ClmDiagnosisCode_9                object
ClmDiagnosisCode_10               object
ClmProcedureCode

In [146]:
df_inp_conv.isna().sum()

BeneID                        0
ClaimID                       0
ClaimStartDt                  0
ClaimEndDt                    0
Provider                      0
InscClaimAmtReimbursed        0
AttendingPhysician          112
OperatingPhysician        16644
OtherPhysician            35784
AdmissionDt                   0
ClmAdmitDiagnosisCode         0
DeductibleAmtPaid             0
DischargeDt                   0
DiagnosisGroupCode            0
ClmDiagnosisCode_1            0
ClmDiagnosisCode_2          226
ClmDiagnosisCode_3          676
ClmDiagnosisCode_4         1534
ClmDiagnosisCode_5         2894
ClmDiagnosisCode_6         4838
ClmDiagnosisCode_7         7258
ClmDiagnosisCode_8         9942
ClmDiagnosisCode_9        13497
ClmDiagnosisCode_10       36547
ClmProcedureCode_1            0
ClmProcedureCode_2            0
ClmProcedureCode_3            0
ClmProcedureCode_4            0
ClmProcedureCode_5            0
ClmProcedureCode_6            0
dtype: int64

In [147]:
df_inp_conv.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
BeneID,40474.0,31289.0,BENE134170,8.0,,,,,,,
ClaimID,40474.0,40474.0,CLM69948,1.0,,,,,,,
ClaimStartDt,40474.0,,,,2009-06-19 17:46:36.284034048,2008-11-27 00:00:00,2009-03-20 00:00:00,2009-06-16 00:00:00,2009-09-17 00:00:00,2009-12-31 00:00:00,
ClaimEndDt,40474.0,,,,2009-06-25 09:37:21.972624384,2009-01-01 00:00:00,2009-03-26 00:00:00,2009-06-22 00:00:00,2009-09-23 00:00:00,2009-12-31 00:00:00,
Provider,40474.0,2092.0,PRV52019,516.0,,,,,,,
InscClaimAmtReimbursed,40474.0,,,,10087.884074,0.0,4000.0,7000.0,12000.0,125000.0,10303.099402
AttendingPhysician,40362.0,11604.0,PHY422134,386.0,,,,,,,
OperatingPhysician,23830.0,8287.0,PHY429430,225.0,,,,,,,
OtherPhysician,4690.0,2877.0,PHY416093,81.0,,,,,,,
AdmissionDt,40474.0,,,,2009-06-19 17:38:12.493946880,2008-11-27 00:00:00,2009-03-20 00:00:00,2009-06-16 00:00:00,2009-09-17 00:00:00,2009-12-31 00:00:00,


In [148]:
for i in df_inp_conv.columns:
    print("++++++"+i+"+++++++++++++")
    print(df_inp_conv[i].value_counts())

++++++BeneID+++++++++++++
BeneID
BENE134170    8
BENE62091     7
BENE117116    7
BENE64791     7
BENE121796    7
             ..
BENE158955    1
BENE158959    1
BENE158962    1
BENE158963    1
BENE158968    1
Name: count, Length: 31289, dtype: int64
++++++ClaimID+++++++++++++
ClaimID
CLM69948    1
CLM46614    1
CLM66048    1
CLM68358    1
CLM38412    1
           ..
CLM60118    1
CLM56288    1
CLM63127    1
CLM42596    1
CLM35816    1
Name: count, Length: 40474, dtype: int64
++++++ClaimStartDt+++++++++++++
ClaimStartDt
2009-02-10    145
2009-02-26    144
2009-01-31    143
2009-01-06    139
2009-02-02    138
             ... 
2008-11-28      2
2008-12-03      1
2008-12-06      1
2009-12-31      1
2008-12-10      1
Name: count, Length: 398, dtype: int64
++++++ClaimEndDt+++++++++++++
ClaimEndDt
2009-02-11    153
2009-01-10    146
2009-03-03    144
2009-04-11    141
2009-05-31    138
             ... 
2009-11-29     82
2009-11-03     82
2009-10-19     79
2009-11-11     77
2009-12-25     75

2. Feature addition for Inpatient file

In [20]:
def Feature_addition_inp_outp(df,type_of_data):
    df['Claim_period']=np.round(((df['ClaimEndDt'] - df['ClaimStartDt']).dt.days),1)
    df['Beneficiary_cost']=(df['InscClaimAmtReimbursed'] - df['DeductibleAmtPaid'])
    if type_of_data.lower()=='inpatient':
        diagnosis_code_columns = ['ClmDiagnosisCode_1','ClmDiagnosisCode_2','ClmDiagnosisCode_3','ClmDiagnosisCode_4',
                                'ClmDiagnosisCode_5','ClmDiagnosisCode_6','ClmDiagnosisCode_7','ClmDiagnosisCode_8','ClmDiagnosisCode_9','ClmDiagnosisCode_10']
        diagnosis_proc_columns = ['ClmProcedureCode_1','ClmProcedureCode_2','ClmProcedureCode_3']
                            
        df['Count_diag_code']=df[diagnosis_code_columns].notna().sum(axis=1)
        for i in diagnosis_proc_columns:
            df[i]=df[i].replace('nan', np.nan)
        df['Count_proc_code']=df[diagnosis_proc_columns].notna().sum(axis=1)
        df['Admit_Period']=np.round(((df['DischargeDt'] - df['AdmissionDt']).dt.days),1)
        df=df.drop(columns=['ClaimEndDt','ClaimStartDt','DischargeDt','AdmissionDt','ClmProcedureCode_6','ClmProcedureCode_4',
                                'ClmProcedureCode_5'],axis=1)
        df['Is_admit']=1
    elif type_of_data.lower().strip()=='outpatient':
        df['Claim_period']=np.round(((df['ClaimEndDt'] - df['ClaimStartDt']).dt.days),1)
        df['Beneficiary_cost']=(df['InscClaimAmtReimbursed'] - df['DeductibleAmtPaid'])
        diagnosis_code_columns = ['ClmDiagnosisCode_1','ClmDiagnosisCode_2','ClmDiagnosisCode_3','ClmDiagnosisCode_4',
                                    'ClmDiagnosisCode_5','ClmDiagnosisCode_6','ClmDiagnosisCode_7','ClmDiagnosisCode_8','ClmDiagnosisCode_9','ClmDiagnosisCode_10']
        diagnosis_proc_columns = ['ClmProcedureCode_1','ClmProcedureCode_2','ClmProcedureCode_3']
                                
        df['Count_diag_code']=df[diagnosis_code_columns].notna().sum(axis=1)
        for i in diagnosis_proc_columns:
                df[i]=df[i].replace('nan', np.nan)
        df['Count_proc_code']=df[diagnosis_proc_columns].notna().sum(axis=1)
        df=df.drop(columns=['ClaimEndDt','ClaimStartDt','ClmProcedureCode_6','ClmProcedureCode_4',
                                'ClmProcedureCode_5'],axis=1)
        df['Is_admit']=0
    return df
    
    

In [21]:
df_inp_conv=Feature_addition_inp_outp(df_inp_conv,'inpatient')


3. Standardizing the code values

In [34]:
def pad_code(code):
    if pd.isna(code):
        return code  # Keep NaN as is
    return str(code).zfill(4)  # Pad with zeros to make it 4 characters

In [35]:
def diag_proccode_stnd(df):
    for i in [ 'ClmAdmitDiagnosisCode', 'DiagnosisGroupCode',
        'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
        'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
        'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
        'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
        'ClmProcedureCode_3']:
        df[i].astype(str)
        df[i]=df[i].replace(r'\.0$', '', regex=True)
        df[i]=df[i].replace(r'nan', np.nan, regex=True)
        df[i]=df[i].apply(pad_code)
        return df

In [36]:
df_inp_conv=diag_proccode_stnd(df_inp_conv)

In [20]:
df_inp_conv.columns

Index(['BeneID', 'ClaimID', 'Provider', 'InscClaimAmtReimbursed',
       'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician',
       'ClmAdmitDiagnosisCode', 'DeductibleAmtPaid', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'Claim_period', 'Beneficiary_cost', 'Count_diag_code',
       'Count_proc_code', 'Admit_Period', 'Is_admit'],
      dtype='object')

In [21]:
df_inp_conv[['Claim_period','Admit_Period','Beneficiary_cost','Count_diag_code','Count_proc_code']].nunique()

Claim_period         37
Admit_Period         36
Beneficiary_cost    206
Count_diag_code      10
Count_proc_code       6
dtype: int64

4. Separating columns for analysis

In [22]:
def col_separation(dataset):
    cat_cols=dataset.select_dtypes('object').columns.to_list()
    int_cols=dataset.select_dtypes('int64').columns.to_list()
    time_cols=dataset.select_dtypes('datetime64[ns]').columns.to_list()
    return cat_cols,int_cols,time_cols

In [30]:
cat_cols=col_separation(df_inp_conv)[0]
int_cols=col_separation(df_inp_conv)[1]
tim_cols=col_separation(df_inp_conv)[2]

5. Univariate Analysis

In [None]:
num_univariate(df_inp_conv)

In [None]:
cat_columns_univariate(df_inp_conv)

6. Outlier Analysis

In [None]:
outlier_analysis(df_inp_conv)

**Out patient**

In [26]:
df_outp.dtypes

BeneID                     object
ClaimID                    object
ClaimStartDt               object
ClaimEndDt                 object
Provider                   object
InscClaimAmtReimbursed      int64
AttendingPhysician         object
OperatingPhysician         object
OtherPhysician             object
ClmDiagnosisCode_1         object
ClmDiagnosisCode_2         object
ClmDiagnosisCode_3         object
ClmDiagnosisCode_4         object
ClmDiagnosisCode_5         object
ClmDiagnosisCode_6         object
ClmDiagnosisCode_7         object
ClmDiagnosisCode_8         object
ClmDiagnosisCode_9         object
ClmDiagnosisCode_10        object
ClmProcedureCode_1        float64
ClmProcedureCode_2        float64
ClmProcedureCode_3        float64
ClmProcedureCode_4        float64
ClmProcedureCode_5        float64
ClmProcedureCode_6        float64
DeductibleAmtPaid           int64
ClmAdmitDiagnosisCode      object
dtype: object

In [24]:
df_outp.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
BeneID,517737.0,133980.0,BENE42721,29.0,,,,,,,
ClaimID,517737.0,517737.0,CLM686139,1.0,,,,,,,
ClaimStartDt,517737.0,385.0,2009-03-03,1574.0,,,,,,,
ClaimEndDt,517737.0,366.0,2009-03-03,1563.0,,,,,,,
Provider,517737.0,5012.0,PRV51459,8240.0,,,,,,,
InscClaimAmtReimbursed,517737.0,,,,286.334799,694.034343,0.0,40.0,80.0,200.0,102500.0
AttendingPhysician,516341.0,74109.0,PHY330576,2534.0,,,,,,,
OperatingPhysician,90617.0,28532.0,PHY330576,424.0,,,,,,,
OtherPhysician,195046.0,44388.0,PHY412132,1247.0,,,,,,,
ClmDiagnosisCode_1,507284.0,10354.0,4019,13803.0,,,,,,,


In [25]:
df_outp.dtypes

BeneID                     object
ClaimID                    object
ClaimStartDt               object
ClaimEndDt                 object
Provider                   object
InscClaimAmtReimbursed      int64
AttendingPhysician         object
OperatingPhysician         object
OtherPhysician             object
ClmDiagnosisCode_1         object
ClmDiagnosisCode_2         object
ClmDiagnosisCode_3         object
ClmDiagnosisCode_4         object
ClmDiagnosisCode_5         object
ClmDiagnosisCode_6         object
ClmDiagnosisCode_7         object
ClmDiagnosisCode_8         object
ClmDiagnosisCode_9         object
ClmDiagnosisCode_10        object
ClmProcedureCode_1        float64
ClmProcedureCode_2        float64
ClmProcedureCode_3        float64
ClmProcedureCode_4        float64
ClmProcedureCode_5        float64
ClmProcedureCode_6        float64
DeductibleAmtPaid           int64
ClmAdmitDiagnosisCode      object
dtype: object

In [27]:
df_outp_conv=type_conv(df_outp,config_data,"datetime_columns_outp","obj_columns_outp","int_columns_outp")

1. Feature addition for outpatient file

In [28]:
df_outp_conv=Feature_addition_inp_outp(df_outp_conv,'outpatient')

In [29]:
df_outp_conv=diag_proccode_stnd(df_outp_conv)

In [28]:
len(df_outp)

517737

In [7]:
df_outp_conv.isna().sum()

NameError: name 'df_outp_conv' is not defined

In [30]:
df_outp_conv['ClmProcedureCode_6'].nunique

<bound method IndexOpsMixin.nunique of 0         nan
1         nan
2         nan
3         nan
4         nan
         ... 
517732    nan
517733    nan
517734    nan
517735    nan
517736    nan
Name: ClmProcedureCode_6, Length: 517737, dtype: object>

In [34]:
cat_cols=col_separation(df_outp_conv)[0]
int_cols=col_separation(df_outp_conv)[1]
tim_cols=col_separation(df_outp_conv)[2]

In [None]:
num_univariate(df_outp_conv)

In [None]:
outlier_analysis(df_outp_conv)

In [None]:
cat_columns_univariate(df_outp_conv)

**Merging Files - Inpatient and outpatient**

In [167]:
util.pickle_dump(df_ben_conv, config_data["raw_dataset_path_train_ben"])
util.pickle_dump(df_inp_conv, config_data["raw_dataset_path_train_inp"])
util.pickle_dump(df_outp_conv, config_data["raw_dataset_path_train_outp"])