In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import boto3
%matplotlib inline

In [2]:
def load_LEIE(f_path):
    '''
    Input: relative file path of the .csv file for LEIE(List of Excluded Individuals/Entities)
    Input(Optional): order by month and/or by day
    Process: Takes in a csv converts it into a pandas dataframe. Also, adds a "year" and "month" column.
    Output: pd.DataFrame
    '''
    df = pd.read_csv(f_path, low_memory=False)
    df["year"] = pd.to_datetime(df["EXCLDATE"], format="%Y%m%d").dt.year
    df["month"] = pd.to_datetime(df["EXCLDATE"], format="%Y%m%d").dt.month
    return df

def key_by_npi(df):
    '''
    input: LEIE pd.DataFrame
    output: LEIE pd.DataFrame with added boolean "key_present" column
    '''
    df["key_present"] = df["NPI"] != 0
    return df

def filter_years(df, min_year):
    '''
    Input: LEIE pd.DataFrame, min_year(inclusive)
    Output: LEIE pd.DataFrame for years after min_year
    '''
    return df[df["year"]>=min_year].reset_index(drop=True)

def filter_keys(df):
    '''
    Input: pd.DataFrame of LEIE report with "key_present" columns
    Output: pd.DataFrame of LEIE report where key is present
    '''
    return df[df["key_present"]].reset_index(drop=True)

---
# A deeper look into the LEIE data

In [3]:
# Load file from S3
s3 = boto3.client('s3')
bucket='dast1healthcare' # Or whatever you called your bucket
data_key='06-2018 LEIE.csv'
path='s3://{}/{}'.format(bucket, data_key)
LEIE_df = load_LEIE(path)
LEIE_df.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,year,month
0,,,,14 LAWRENCE AVE PHARMACY,PHARMACY,,,0,,14 LAWRENCE AVENUE,SMITHTOWN,NY,11787,1128a1,19880830,0,0,,1988,8
1,,,,143 MEDICAL EQUIPMENT CO,DME COMPANY,DME - OXYGEN,,0,,701 NW 36 AVENUE,MIAMI,FL,33125,1128b7,19970620,0,0,,1997,6
2,,,,184TH STREET PHARMACY CORP,OTHER BUSINESS,PHARMACY,,1922348218,,69 E 184TH ST,BRONX,NY,10468,1128a1,20180419,0,0,,2018,4
3,,,,1951 FLATBUSH AVENUE PHARMACY,PHARMACY,,,0,,1951 FLATBUSH AVE,BROOKLYN,NY,11234,1128b5,20090319,0,0,,2009,3
4,,,,"1ST COMMUNITY HEALTH CTR, LTD",CLINIC,,,0,,3138 W CERMAK ROAD,CHICAGO,IL,60623,1128a1,19940524,0,0,,1994,5


### What features in the LEIE dataset can we use as a key to connect with the Medicare Part D dataset?
> - NPI
> - Address
> - Individual/Entity Name

In [4]:
# Create a LEIE_keys dictionary to capture all potential common keys
LEIE_keys = {}

# add unique NPIs to LEIE_keys dictionary
npi = LEIE_df["NPI"]
LEIE_keys["NPI"] = np.unique(npi[npi != 0].values)

# create and update key_present column in the LEIE df
LEIE_df = key_by_npi(LEIE_df)
LEIE_df.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,...,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,year,month,key_present
0,,,,14 LAWRENCE AVE PHARMACY,PHARMACY,,,0,,14 LAWRENCE AVENUE,...,NY,11787,1128a1,19880830,0,0,,1988,8,False
1,,,,143 MEDICAL EQUIPMENT CO,DME COMPANY,DME - OXYGEN,,0,,701 NW 36 AVENUE,...,FL,33125,1128b7,19970620,0,0,,1997,6,False
2,,,,184TH STREET PHARMACY CORP,OTHER BUSINESS,PHARMACY,,1922348218,,69 E 184TH ST,...,NY,10468,1128a1,20180419,0,0,,2018,4,True
3,,,,1951 FLATBUSH AVENUE PHARMACY,PHARMACY,,,0,,1951 FLATBUSH AVE,...,NY,11234,1128b5,20090319,0,0,,2009,3,False
4,,,,"1ST COMMUNITY HEALTH CTR, LTD",CLINIC,,,0,,3138 W CERMAK ROAD,...,IL,60623,1128a1,19940524,0,0,,1994,5,False


---
### Let's load previous month's LEIE report and see if it has more unique NPIs?

In [5]:
# Load previous LEIE report
data_key='05-2018 LEIE.csv'
path='s3://{}/{}'.format(bucket, data_key)
df_052018 = load_LEIE(path)
print("number of records in the June LEIE report: ", LEIE_df.shape[0]) 
print("number of records in the May LEIE report: ", df_052018.shape[0]) 

number of records in the June LEIE report:  70056
number of records in the May LEIE report:  69933


In [6]:
# Compute differences between June and May LEIE reports
npis_062018 = set(LEIE_keys["NPI"])
npis_052018 = set(df_052018["NPI"])
added_npis = npis_062018.difference(npis_052018)
removed_npis = npis_052018.difference(npis_062018)
print("Number of unique NPIs in the the June LEIE report: {}".format(len(npis_062018)))
print("Number of unique NPIs in the the May LEIE report: {}".format(len(npis_052018)))
print("NPIs added to the June LEIE report: {}".format(len(added_npis)))
print("NPIs removed from the May LEIE report: {}".format(len(removed_npis)))

Number of unique NPIs in the the June LEIE report: 4682
Number of unique NPIs in the the May LEIE report: 4669
NPIs added to the June LEIE report: 22
NPIs removed from the May LEIE report: 9


**_Based on `NPI` alone, the May LEIE report doesn't help us with getting significantly more unique `NPI`s._**

---
### Use the NPIs to build the `target ` to train the Machine Learning model. 
> **Step 1:** Filter out old data<br>
> **Step 2:** Use `NPI` as key

In [7]:
# Filter out old data and data without keys
df1 = filter_years(LEIE_df, 2014)

In [8]:
df2 = filter_keys(df1)
print("Total number of records in the LEIE dataset: {}".format(LEIE_df.shape[0]))
print("Number of records after filtering out everything prior to 2014 (recent data): {}".format(df1.shape[0]))
print("Number of records with NPIs present in the recent data: {}".format(df2.shape[0]))

# calculate npi ratio
npi_ratio = df2.shape[0]/df1.shape[0]
print("Percentage of targets accounted for by NPIs alone in the recent data: {}".format(str(round(npi_ratio, 3)*100))+"%")

Total number of records in the LEIE dataset: 70056
Number of records after filtering out everything prior to 2014 (recent data): 16096
Number of records with NPIs present in the recent data: 2879
Percentage of targets accounted for by NPIs alone in the recent data: 17.9%


> **_Less than 1/5 of `target` list can be accounted for by using `NPI` as key._** <br>
>
> **_We will need to circle back later to find other ways to account for more of the `target` list by using:_**
- `individual_name`
- `business_name`
- `address`
- or `combination` of the any of the above

---
### Circling back with Fuzzy Wuzzy

In [9]:
import boto3
import s3fs

In [10]:
# load file from s3
s3 = boto3.client('s3')
data_key = 'identification_part_d.csv'

obj = s3.get_object(Bucket=bucket, Key=data_key)
df_id = pd.read_csv(obj['Body'])
df_id.head()

Unnamed: 0.1,Unnamed: 0,npi,last/org name,first name,specialty,city,state
0,0,1891738407,HEIL,ELIZABETH,Internal Medicine,LONGWOOD,FL
1,1,1104199355,DEFORNO,PATRICK,Dentist,THORNDALE,PA
2,2,1306896931,SMITH,KEVIN,Dermatology,GAINESVILLE,GA
3,3,1194794776,SWEENEY,GORDON,Internal Medicine,VALLEJO,CA
4,4,1679849855,ELDRIDGE,JENNIFER,Student in an Organized Health Care Education/...,HERSHEY,PA


In [11]:
df1.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,...,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,year,month,key_present
0,,,,184TH STREET PHARMACY CORP,OTHER BUSINESS,PHARMACY,,1922348218,,69 E 184TH ST,...,NY,10468,1128a1,20180419,0,0,,2018,4,True
1,,,,A & H VITAMINS SUPPLY CORP,OTHER BUSINESS,PHARMACY,,0,,592 PALISADE AVENUE,...,NJ,7666,1128a1,20180419,0,0,,2018,4,False
2,,,,"A & Y MEDICAL SUPPLY, INC",DME COMPANY,DME - GENERAL,,1942476080,,"6310 108TH STREET, APT 6J",...,NY,11375,1128b8,20170518,0,0,,2017,5,True
3,,,,"A FAIR DEAL PHARMACY, INC",OTHER BUSINESS,PHARMACY,,1891731758,,"C/O P O BOX 329014, #69709-05",...,NY,11232,1128b8,20170518,0,0,,2017,5,True
4,,,,"A SERVICE CAB CO, INC",OTHER BUSINESS,TRANSPORTATION CO,,0,,3704 AIRLINE DRIVE,...,LA,70001,1128a1,20170720,0,0,,2017,7,False


In [13]:
df_noNPI = df1[df1['NPI']==0].reset_index(drop=True)
df_id[df_id['last/org name']=='ADLIFE HEALTH CARE']
df_noNPI_indiv = df_noNPI[pd.notnull(df_noNPI['LASTNAME'])].reset_index(drop=True)
df_a = df_noNPI_indiv[['LASTNAME','FIRSTNAME','SPECIALTY','CITY','STATE']]
df_a.head(20)

Unnamed: 0,LASTNAME,FIRSTNAME,SPECIALTY,CITY,STATE
0,AAMIR,MUHAMMAD,HOME HEALTH AGENCY,BLOOMFIELD HILLS,MI
1,AARON,ALINA,NURSE/NURSES AIDE,PHOENIX,AZ
2,AARON,CHRISTOPHER,NURSE/NURSES AIDE,WEST NEWTON,PA
3,AASEN,ANDREA,HEALTH CARE AIDE,SAINT PAUL,MN
4,ABAD,EDUARDO,DME - GENERAL,SAN FRANCISCO,CA
5,ABAD,PEDRO,NURSE/NURSES AIDE,ALTAMONTE SPRINGS,FL
6,ABAGA,AUGUSTIN,NO KNOWN AFFILIATION,PENSACOLA,FL
7,ABALOS,CARLO,NURSE/NURSES AIDE,CARSON,CA
8,ABARQUEZ,BRENDA,NURSE/NURSES AIDE,S SAN FRANCISCO,CA
9,ABASHKIN,ROMAN,PERSONAL CARE PROVID,WAYNE,NJ


In [14]:
choices_array = df_id[['last/org name',
                 'first name',
                 'specialty',
                 'city',
                 'state']].as_matrix()
choices = []
error_idx = []
for i, choice in enumerate(choices_array):
    try:
        choices.append(' '.join(choice))
    except:
        error_idx.append(i)

print('number of available choices :', len(choices))
print('number of errors in choices :', len(error_idx))

number of available choices : 1295371
number of errors in choices : 799


In [15]:
# get ids with missing npis from LEIE
missing_ids = [' '.join(x) for x in df_a.values if all(isinstance(y, str) for y in x)]
print('number of ids with missing npi #:', len(missing_ids))
print('number of ids with missing npi with errors :', len(df_a) - len(missing_ids))

number of ids with missing npi #: 13129
number of ids with missing npi with errors : 1


In [16]:
# get fuzzywuzzy  scores:
import time
from fuzzywuzzy import fuzz, process

fuzz_scores = {}
start_time = time.time()
for each in missing_ids[:2]:
    fuzz_scores[each] = process.extract(each, choices, limit=10, scorer=fuzz.ratio)
end_time = time.time()
print('time :', end_time - start_time)

time : 30.91515827178955


In [18]:
process.extract("ABDI MOHAMMED HEALTH CARE AIDE COLUMBUS OH", choices, limit=10, scorer=fuzz.ratio)

[('HASHI MOHAMED Family Practice COLUMBUS OH', 72),
 ('AHMED MOHAMMED Internal Medicine COLUMBUS OH', 72),
 ('AHMAD MOHAMMAD Internal Medicine COLUMBUS OH', 70),
 ('NAEM MOHAMED Internal Medicine COLUMBUS OH', 69),
 ('AMIR MUHAMMAD Internal Medicine COLUMBUS OH', 68),
 ('HAQUE MOHAMMAD Internal Medicine COLUMBUS OH', 67),
 ('NIKTASH MOHAMMEDREZA Family Practice COLUMBUS OH', 67),
 ('MOHAMED MOHAMUD Internal Medicine COLUMBUS OH', 67),
 ('KHAN MUHAMMAD Internal Medicine COLUMBUS OH', 66),
 ('BROM HEATHER Nurse Practitioner COLUMBUS OH', 66)]

In [24]:
print("Number of available CPU cores: ")
!cat /proc/cpuinfo | grep processor | wc -l

Number of available CPU cores: 
64
