# ML Modeling: Epoch 1
> 1. Specialty
> 2. State

In [26]:
import pandas as pd
import numpy as np

In [11]:
# load id_df from efs
id_df = pd.read_csv('~/SageMaker/efs/DrFraud/data/id_df.csv', index_col=0)
id_df.head()

  mask |= (ar1 == a)


Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_city,nppes_provider_state,specialty_description,description_flag,drug_name,generic_name,bene_count,...,total_day_supply,total_drug_cost,bene_count_ge65,bene_count_ge65_suppress_flag,total_claim_count_ge65,ge65_suppress_flag,total_30_day_fill_count_ge65,total_day_supply_ge65,total_drug_cost_ge65,year
0,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,MD,Internal Medicine,S,ISOSORBIDE MONONITRATE ER,ISOSORBIDE MONONITRATE,,...,307,171.59,,*,,*,,,,2013
1,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,MD,Internal Medicine,S,LEVOFLOXACIN,LEVOFLOXACIN,26.0,...,165,227.1,15.0,,15.0,,15.0,106.0,159.72,2013
2,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,MD,Internal Medicine,S,LISINOPRIL,LISINOPRIL,17.0,...,570,100.37,,#,,#,,,,2013
3,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,MD,Internal Medicine,S,METOPROLOL TARTRATE,METOPROLOL TARTRATE,28.0,...,916,154.65,,#,,#,,,,2013
4,1003000126,ENKESHAFI,ARDALAN,CUMBERLAND,MD,Internal Medicine,S,PREDNISONE,PREDNISONE,14.0,...,133,44.72,,*,,*,,,,2013


In [12]:
# load LEIE from efs
LEIE_df = pd.read_csv('~/SageMaker/efs/DrFraud/data/LEIE.csv', index_col=0)
LEIE_df.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,...,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,year,month,key_present
0,,,,184TH STREET PHARMACY CORP,OTHER BUSINESS,PHARMACY,,1922348218,,69 E 184TH ST,...,NY,10468,1128a1,20180419,0,0,,2018,4,True
1,,,,"A & Y MEDICAL SUPPLY, INC",DME COMPANY,DME - GENERAL,,1942476080,,"6310 108TH STREET, APT 6J",...,NY,11375,1128b8,20170518,0,0,,2017,5,True
2,,,,"A FAIR DEAL PHARMACY, INC",OTHER BUSINESS,PHARMACY,,1891731758,,"C/O P O BOX 329014, #69709-05",...,NY,11232,1128b8,20170518,0,0,,2017,5,True
3,,,,ACCELERATED BEHAVIOR MANAGEMEN,OTHER BUSINESS,COMM MNTL HLTH CNTR,,1902198435,,6148 W SAHARA AVENUE,...,NV,89146,1128a1,20160120,0,0,,2016,1,True
4,,,,ADAMS LOVING PRO HEALTH CARE,OTHER BUSINESS,HOME HEALTH AGENCY,,1073682936,,"1277 HIGHWAY 82 W, STE 218",...,GA,31763,1128b7,20150409,0,0,,2015,4,True


In [54]:
df = id_df[['year',
            'npi',
            'specialty_description',
            'nppes_provider_state',
            'nppes_provider_city',]]
df.head()

Unnamed: 0,year,npi,specialty_description,nppes_provider_state,nppes_provider_city
0,2013,1003000126,Internal Medicine,MD,CUMBERLAND
1,2013,1003000126,Internal Medicine,MD,CUMBERLAND
2,2013,1003000126,Internal Medicine,MD,CUMBERLAND
3,2013,1003000126,Internal Medicine,MD,CUMBERLAND
4,2013,1003000126,Internal Medicine,MD,CUMBERLAND


In [55]:
df1 = df.drop_duplicates().reset_index(drop=True)
df1.head()

Unnamed: 0,year,npi,specialty_description,nppes_provider_state,nppes_provider_city
0,2013,1003000126,Internal Medicine,MD,CUMBERLAND
1,2013,1003000142,Anesthesiology,OH,TOLEDO
2,2013,1003000167,Dentist,NV,DAYTON
3,2013,1003000282,Nurse Practitioner,TN,NASHVILLE
4,2013,1003000407,Family Practice,PA,PATTON


In [56]:
df1.shape

(808603, 5)

In [57]:
target_df = LEIE_df[['year','NPI']].drop_duplicates()
target_df.head()

Unnamed: 0,year,NPI
0,2018,1922348218
1,2017,1942476080
2,2017,1891731758
3,2016,1902198435
4,2015,1073682936


target_df.shape

In [59]:
a = set(df1['npi'].values)
b = set(target_df['NPI'].values)
print('unique npis in Medicare Part D: ', len(a))
print('unique npis in LEIE: ', len(b))
print('npis common to both Medicare Part D and LEIE: ',len(a.intersection(b)))

unique npis in Medicare Part D:  808065
unique npis in LEIE:  2843
npis common to both Medicare Part D and LEIE:  1024


In [138]:
# transform dataframes into `year:npi` format
X = list(map(lambda x: ':'.join(x.astype(str)), df1[['year','npi']].values)) 
y = list(map(lambda x: ':'.join(x.astype(str)), target_df[['year','NPI']].values))

In [86]:
# check if an NPI from Medicare Part D was in the Excluded list, LEIE
df1['exlcluded'] = list(map(lambda x: x in y, X))
df1['year:npi'] = X

## ML Model - Specialty
---

In [118]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix

In [119]:
# build feature matrix
specialty_df = pd.get_dummies(df1[['year:npi', 'specialty_description']], 
                                  columns=['specialty_description']).set_index('year:npi', drop=True)

In [120]:
X = specialty_df.values
y = df1['exlcluded'].values

In [131]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5)
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[404302]])

In [132]:
rf_regressor = RandomForestRegressor()
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)

{False}