# Machine Learning Processing

### Contents

In [2]:
# Import packages etc.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import pickle

## Data Import and Processing

In [11]:
# Load the three pickled inputs. Each file is opened in a `with` block so its
# handle is closed as soon as the object has been read, rather than being left
# open for the lifetime of the kernel.
# NOTE(security): pickle.load can execute arbitrary code while unpickling --
# only load files from ./Pickle_Jar that come from a trusted source.
# NOTE(review): variable names and file names do not line up (provtarg is read
# from Xtest.pickle, provgroup from Xdata.pickle) -- confirm each file holds
# the object its variable name implies.
with open('./Pickle_Jar/Xtrain.pickle', 'rb') as pickle_file:
    provdat = pickle.load(pickle_file)
with open('./Pickle_Jar/Xtest.pickle', 'rb') as pickle_file:
    provtarg = pickle.load(pickle_file)
with open('./Pickle_Jar/Xdata.pickle', 'rb') as pickle_file:
    provgroup = pickle.load(pickle_file)

In [12]:
# Quick look at the pre-aggregated per-provider data:
# print its dimensions, then display the first five rows.
print(f'Data Shape: {provdat.shape}')
provdat.head(n=5)

Data Shape: (5410, 70)


Unnamed: 0_level_0,Number_of_Claims,Num_Unique_Patients,Num_Unique_Docs,Num_Unique_Diag_Codes,Num_Unique_Proc_Codes,Mean_InscClaimAmtReimbursed,Mean_DeductibleAmtPaid,Mean_Reimbursement_per_Day,Mean_Length_of_Stay,Mean_Length_of_Claim,...,Perc_ClaimStartDt_March,Perc_ClaimStartDt_April,Perc_ClaimStartDt_May,Perc_ClaimStartDt_June,Perc_ClaimStartDt_July,Perc_ClaimStartDt_August,Perc_ClaimStartDt_September,Perc_ClaimStartDt_October,Perc_ClaimStartDt_November,Perc_ClaimStartDt_December
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PRV51001,25,24,19,72,3,4185.6,213.6,3700.0,6.0,2.44,...,0.12,0.08,0.2,0.12,0.08,0.04,0.04,0.0,0.0,0.08
PRV51003,132,117,51,365,39,4588.41,502.17,2137.42,6.16,4.67,...,0.08,0.08,0.06,0.11,0.08,0.1,0.07,0.08,0.08,0.05
PRV51004,149,138,48,267,0,350.13,2.08,0.0,0.0,2.43,...,0.12,0.07,0.05,0.07,0.09,0.07,0.08,0.07,0.05,0.11
PRV51005,1165,495,7,1294,0,241.12,3.18,0.0,0.0,2.09,...,0.1,0.08,0.07,0.09,0.08,0.08,0.09,0.1,0.06,0.07
PRV51007,72,58,11,165,1,468.19,45.33,1080.56,6.33,1.96,...,0.12,0.06,0.12,0.12,0.04,0.08,0.1,0.03,0.08,0.08


In [22]:
# Quick look at the per-provider fraud labels:
# print the dimensions, then display the first five entries.
print(f'Data Shape: {provtarg.shape}')
provtarg.head(n=5)

Data Shape: (5410,)


Provider
PRV51001    0
PRV51003    1
PRV51004    0
PRV51005    1
PRV51007    0
Name: PotentialFraud, dtype: int64

In [21]:
# Build the supervised-learning dataset: attach the target to the provider data
# by matching on the 'Provider' key (pandas' default inner join), then restore
# 'Provider' as the index. The target ends up as the right-most column.
xdata = (
    provdat.reset_index()
    .merge(provtarg.reset_index(), on='Provider')
    .set_index('Provider')
)
xdata.head(n=5)

Unnamed: 0_level_0,Number_of_Claims,Num_Unique_Patients,Num_Unique_Docs,Num_Unique_Diag_Codes,Num_Unique_Proc_Codes,Mean_InscClaimAmtReimbursed,Mean_DeductibleAmtPaid,Mean_Reimbursement_per_Day,Mean_Length_of_Stay,Mean_Length_of_Claim,...,Perc_ClaimStartDt_April,Perc_ClaimStartDt_May,Perc_ClaimStartDt_June,Perc_ClaimStartDt_July,Perc_ClaimStartDt_August,Perc_ClaimStartDt_September,Perc_ClaimStartDt_October,Perc_ClaimStartDt_November,Perc_ClaimStartDt_December,PotentialFraud
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PRV51001,25,24,19,72,3,4185.6,213.6,3700.0,6.0,2.44,...,0.08,0.2,0.12,0.08,0.04,0.04,0.0,0.0,0.08,0
PRV51003,132,117,51,365,39,4588.41,502.17,2137.42,6.16,4.67,...,0.08,0.06,0.11,0.08,0.1,0.07,0.08,0.08,0.05,1
PRV51004,149,138,48,267,0,350.13,2.08,0.0,0.0,2.43,...,0.07,0.05,0.07,0.09,0.07,0.08,0.07,0.05,0.11,0
PRV51005,1165,495,7,1294,0,241.12,3.18,0.0,0.0,2.09,...,0.08,0.07,0.09,0.08,0.08,0.09,0.1,0.06,0.07,1
PRV51007,72,58,11,165,1,468.19,45.33,1080.56,6.33,1.96,...,0.06,0.12,0.12,0.04,0.08,0.1,0.03,0.08,0.08,0


In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
# Features are every column except the last. The label is the last column,
# sliced with `-1:` so it stays a one-column DataFrame rather than a Series.
xtrain, xtest, ytrain, ytest = train_test_split(
    xdata.iloc[:, :-1],
    xdata.iloc[:, -1:],
    test_size=0.2,
    random_state=430,
)

# Report the size of each feature split
print(f'Training Data shape: {xtrain.shape}')
print(f'Test Data Shape: {xtest.shape}')

Training Data shape: (4328, 70)
Test Data Shape: (1082, 70)


## Training & Strategy