In [15]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import pickle
from joblib import dump, load
from sklearn.metrics import r2_score


In [22]:
train_beneficiary_data = pd.read_csv('./HealthClaims/Train_Beneficiarydata-1542865627584.csv')
train_inpatient_data = pd.read_csv('./HealthClaims/Train_Inpatientdata-1542865627584.csv')
train_outpatient_data = pd.read_csv('./HealthClaims/Train_Outpatientdata-1542865627584.csv')
train_fraud_data = pd.read_csv('./HealthClaims/Train-1542865627584.csv')

print("Train Beneficiary Data Info:")
print(train_beneficiary_data.info())

print("\nTrain Inpatient Data Info:")
print(train_inpatient_data.info())

print("\nTrain Outpatient Data Info:")
print(train_outpatient_data.info())

print("\nTrain Fraud Data Info:")
print(train_fraud_data.info())


Train Beneficiary Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138556 entries, 0 to 138555
Data columns (total 25 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   BeneID                           138556 non-null  object
 1   DOB                              138556 non-null  object
 2   DOD                              1421 non-null    object
 3   Gender                           138556 non-null  int64 
 4   Race                             138556 non-null  int64 
 5   RenalDiseaseIndicator            138556 non-null  object
 6   State                            138556 non-null  int64 
 7   County                           138556 non-null  int64 
 8   NoOfMonths_PartACov              138556 non-null  int64 
 9   NoOfMonths_PartBCov              138556 non-null  int64 
 10  ChronicCond_Alzheimer            138556 non-null  int64 
 11  ChronicCond_Heartfailure         138556 non-null 

In [20]:
columns_to_drop = ['AdmissionDt', 'DischargeDt', 'DiagnosisGroupCode']
cleaned_inpatient_data = train_inpatient_data.drop(columns=columns_to_drop)
cleaned_inpatient_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40474 entries, 0 to 40473
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   BeneID                  40474 non-null  object 
 1   ClaimID                 40474 non-null  object 
 2   ClaimStartDt            40474 non-null  object 
 3   ClaimEndDt              40474 non-null  object 
 4   Provider                40474 non-null  object 
 5   InscClaimAmtReimbursed  40474 non-null  int64  
 6   AttendingPhysician      40362 non-null  object 
 7   OperatingPhysician      23830 non-null  object 
 8   OtherPhysician          4690 non-null   object 
 9   ClmAdmitDiagnosisCode   40474 non-null  object 
 10  DeductibleAmtPaid       39575 non-null  float64
 11  ClmDiagnosisCode_1      40474 non-null  object 
 12  ClmDiagnosisCode_2      40248 non-null  object 
 13  ClmDiagnosisCode_3      39798 non-null  object 
 14  ClmDiagnosisCode_4      38940 non-null

In [23]:
merged_data = pd.concat([cleaned_inpatient_data, train_outpatient_data], axis=0)

merged_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 558211 entries, 0 to 517736
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   BeneID                  558211 non-null  object 
 1   ClaimID                 558211 non-null  object 
 2   ClaimStartDt            558211 non-null  object 
 3   ClaimEndDt              558211 non-null  object 
 4   Provider                558211 non-null  object 
 5   InscClaimAmtReimbursed  558211 non-null  int64  
 6   AttendingPhysician      556703 non-null  object 
 7   OperatingPhysician      114447 non-null  object 
 8   OtherPhysician          199736 non-null  object 
 9   ClmAdmitDiagnosisCode   145899 non-null  object 
 10  DeductibleAmtPaid       557312 non-null  float64
 11  ClmDiagnosisCode_1      547758 non-null  object 
 12  ClmDiagnosisCode_2      362605 non-null  object 
 13  ClmDiagnosisCode_3      243055 non-null  object 
 14  ClmDiagnosisCode_4      1

In [24]:
merged_data_with_fraud = pd.merge(merged_data, train_fraud_data, on='Provider', how='left')
merged_data_with_fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558211 entries, 0 to 558210
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   BeneID                  558211 non-null  object 
 1   ClaimID                 558211 non-null  object 
 2   ClaimStartDt            558211 non-null  object 
 3   ClaimEndDt              558211 non-null  object 
 4   Provider                558211 non-null  object 
 5   InscClaimAmtReimbursed  558211 non-null  int64  
 6   AttendingPhysician      556703 non-null  object 
 7   OperatingPhysician      114447 non-null  object 
 8   OtherPhysician          199736 non-null  object 
 9   ClmAdmitDiagnosisCode   145899 non-null  object 
 10  DeductibleAmtPaid       557312 non-null  float64
 11  ClmDiagnosisCode_1      547758 non-null  object 
 12  ClmDiagnosisCode_2      362605 non-null  object 
 13  ClmDiagnosisCode_3      243055 non-null  object 
 14  ClmDiagnosisCode_4  

In [30]:
merged_data_with_beneficiary = pd.merge(train_beneficiary_data, merged_data_with_fraud, on='BeneID', how='left')

merged_data_with_beneficiary.isnull().sum()
merged_data_with_beneficiary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558211 entries, 0 to 558210
Data columns (total 52 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   BeneID                           558211 non-null  object 
 1   DOB                              558211 non-null  object 
 2   DOD                              4131 non-null    object 
 3   Gender                           558211 non-null  int64  
 4   Race                             558211 non-null  int64  
 5   RenalDiseaseIndicator            558211 non-null  object 
 6   State                            558211 non-null  int64  
 7   County                           558211 non-null  int64  
 8   NoOfMonths_PartACov              558211 non-null  int64  
 9   NoOfMonths_PartBCov              558211 non-null  int64  
 10  ChronicCond_Alzheimer            558211 non-null  int64  
 11  ChronicCond_Heartfailure         558211 non-null  int64  
 12  Ch

In [31]:
cleaned_data = merged_data_with_beneficiary.dropna(thresh=len(merged_data_with_beneficiary) * 0.5, axis=1)
cleaned_data.isnull().sum()
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558211 entries, 0 to 558210
Data columns (total 34 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   BeneID                           558211 non-null  object 
 1   DOB                              558211 non-null  object 
 2   Gender                           558211 non-null  int64  
 3   Race                             558211 non-null  int64  
 4   RenalDiseaseIndicator            558211 non-null  object 
 5   State                            558211 non-null  int64  
 6   County                           558211 non-null  int64  
 7   NoOfMonths_PartACov              558211 non-null  int64  
 8   NoOfMonths_PartBCov              558211 non-null  int64  
 9   ChronicCond_Alzheimer            558211 non-null  int64  
 10  ChronicCond_Heartfailure         558211 non-null  int64  
 11  ChronicCond_KidneyDisease        558211 non-null  int64  
 12  Ch

In [12]:
# Separate features and target variable
X = cleaned_data.drop(columns=['InscClaimAmtReimbursed', 'Provider'])
y = cleaned_data['InscClaimAmtReimbursed']

# Define preprocessing steps for numerical and categorical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps for numerical and categorical columns using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X.select_dtypes(include=['int', 'float']).columns),
        ('cat', categorical_transformer, X.select_dtypes(include=['object']).columns)
    ])

# Preprocess the data
X_preprocessed = preprocessor.fit_transform(X)

# Split the preprocessed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Define the model
model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)

# Train the model
model.fit(X_train, y_train)

In [13]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))


MSE: 2902652.3807902755
RMSE: 1703.7172244214341


In [16]:
# Calculate R^2 score
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

# Calculate Adjusted R^2 score
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print("Adjusted R^2 Score:", adjusted_r2)


R^2 Score: 0.7962289420360686
Adjusted R^2 Score: 1.0332288607144453


In [17]:
dump(preprocessor, './healthcare_preprocessor.joblib')
with open('./RfPickelModelx1.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the saved model
model_filename = './RfPickelModelx1.pkl'
with open(model_filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)


In [18]:
# Get the first row of the DataFrame
first_row = cleaned_data.iloc[[0]]

# Preprocess the first row
X_first_row_preprocessed = preprocessor.transform(first_row)

# Predict the value
predicted_value = loaded_model.predict(X_first_row_preprocessed)

# Get the actual value
actual_value = cleaned_data.iloc[0]['InscClaimAmtReimbursed']

print("Predicted Value:", predicted_value)
print("Actual Value:", actual_value)

Predicted Value: [11026.532]
Actual Value: 26000


In [19]:

# Calculate mode for each column
modes = cleaned_data.mode()
modes_dict = {}
for col in modes.columns:
    mode_val = modes[col][0]
    modes_dict[col] = [mode_val]
modes_df = pd.DataFrame(modes_dict)
modes_df.to_csv('./newmode1.csv', index=False)

# Print columns before and after dropping
print("Columns before dropping:")
print(cleaned_data.columns)
dropped_columns = cleaned_data.columns[cleaned_data.isnull().mean() > 0.5]
print("\nColumns dropped:")
print(dropped_columns)


Columns before dropping:
Index(['BeneID', 'DOB', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State',
       'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt',
       'Provider', 'InscClaimAmtReimbursed', 'AttendingPhysician',
       'DeductibleAmtPaid', 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
       'PotentialFraud'],
      dtype='object')

Columns dropped:
Index([], dtype='object')


In [15]:
import pandas as pd

# Load your dataset
df0=dfN

# Print the columns before dropping
print("Columns before dropping:")
print(df0.columns)

# Drop columns with more than 50% missing values
dropped_columns = df0.columns[df0.isnull().mean() > 0.5]


# Print the columns after dropping


# Print the columns that were dropped
print("\nColumns dropped:")
print(dropped_columns)

Columns before dropping:
Index(['BeneID', 'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator',
       'State', 'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt',
       'Provider', 'InscClaimAmtReimbursed', 'AttendingPhysician',
       'OperatingPhysician', 'OtherPhysician', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
       'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5',
       'ClmDiagnosisCode_6', 'ClmDiagnosisCo

In [32]:
cleaned_data.to_csv('./retrain.csv')