In [1]:
import boto3
import io
import pandas as pd
import time
from io import StringIO
import numpy as np
import src.scripts as src
%matplotlib inline

---
# A deeper look into the LEIE data

In [2]:
LEIE = src.read_from_efs('raw_LEIE.csv')

In [6]:
LEIE.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,exclusion year
0,,,,14 LAWRENCE AVE PHARMACY,PHARMACY,,,0,,14 LAWRENCE AVENUE,SMITHTOWN,NY,11787,1128a1,19880830,0,0,,1988
1,,,,143 MEDICAL EQUIPMENT CO,DME COMPANY,DME - OXYGEN,,0,,701 NW 36 AVENUE,MIAMI,FL,33125,1128b7,19970620,0,0,,1997
2,,,,184TH STREET PHARMACY CORP,OTHER BUSINESS,PHARMACY,,1922348218,,69 E 184TH ST,BRONX,NY,10468,1128a1,20180419,0,0,,2018
3,,,,1951 FLATBUSH AVENUE PHARMACY,PHARMACY,,,0,,1951 FLATBUSH AVE,BROOKLYN,NY,11234,1128b5,20090319,0,0,,2009
4,,,,"1ST COMMUNITY HEALTH CTR, LTD",CLINIC,,,0,,3138 W CERMAK ROAD,CHICAGO,IL,60623,1128a1,19940524,0,0,,1994


In [5]:
# Add 'exclusion year' column
LEIE["exclusion year"] = pd.to_datetime(LEIE["EXCLDATE"], format="%Y%m%d").dt.year

### What features in the LEIE dataset can we use as a key to connect with the Medicare Part D dataset?
> - NPI
> - Address
> - Individual/Entity Name

---
### Let's load previous month's LEIE report and see if it has more unique NPIs?

In [10]:
# Load previous LEIE report
df_052018 = src.read_s3("05-2018 LEIE.csv")
npi_052018 = df_052018["NPI"][df_052018["NPI"] != 0]
print("number of records in the June LEIE report: ", LEIE.shape[0]) 
print("number of records in the May LEIE report: ", df_052018.shape[0]) 

number of records in the June LEIE report:  70056
number of records in the May LEIE report:  69933


In [11]:
# Compute differences between June and May LEIE reports
npis_062018 = set(LEIE["NPI"][LEIE["NPI"] != 0])
npis_052018 = set(npi_052018)
added_npis = npis_062018.difference(npis_052018)
removed_npis = npis_052018.difference(npis_062018)
print("Number of unique NPIs in the the June LEIE report: {}".format(len(npis_062018)))
print("Number of unique NPIs in the the May LEIE report: {}".format(len(npis_052018)))
print("NPIs added to the June LEIE report: {}".format(len(added_npis)))
print("NPIs removed from the May LEIE report: {}".format(len(removed_npis)))

Number of unique NPIs in the the June LEIE report: 4682
Number of unique NPIs in the the May LEIE report: 4668
NPIs added to the June LEIE report: 22
NPIs removed from the May LEIE report: 8


**_Based on `NPI` alone, the May LEIE report doesn't help us with getting significantly more unique `NPI`s._**

---
### Use the NPIs to build the `target ` to train the Machine Learning model. 
> **Step 1:** Filter out old data<br>
> **Step 2:** Use `NPI` as key

In [19]:
# Filter out old data and data without keys
df1 = LEIE[LEIE["exclusion year"]>=2014].reset_index(drop=True)
df2 = LEIE[(LEIE['NPI'] != 0) & (LEIE["exclusion year"]>=2014)].reset_index(drop=True)
print("Total number of records in the LEIE dataset: {}".format(LEIE.shape[0]))
print("Number of records after filtering out everything prior to 2014 (recent data): {}".format(df1.shape[0]))
print("Number of records with NPIs present in the recent data: {}".format(df2.shape[0]))

# calculate npi ratio
npi_ratio = df2.shape[0]/df1.shape[0]
print("Percentage of targets accounted for by NPIs alone in the recent data: {}".format(str(round(npi_ratio, 3)*100))+"%")

Total number of records in the LEIE dataset: 70056
Number of records after filtering out everything prior to 2014 (recent data): 16096
Number of records with NPIs present in the recent data: 2879
Percentage of targets accounted for by NPIs alone in the recent data: 17.9%


> **_Less than 1/5 of `target` list can be accounted for by using `NPI` as key._** <br>
>
> **_We will need to circle back later to find other ways to account for more of the `target` list by using:_**
- `individual_name`
- `business_name`
- `address`
- or `combination` of the any of the above

In [20]:
df2.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,exclusion year
0,,,,184TH STREET PHARMACY CORP,OTHER BUSINESS,PHARMACY,,1922348218,,69 E 184TH ST,BRONX,NY,10468,1128a1,20180419,0,0,,2018
1,,,,"A & Y MEDICAL SUPPLY, INC",DME COMPANY,DME - GENERAL,,1942476080,,"6310 108TH STREET, APT 6J",FOREST HILLS,NY,11375,1128b8,20170518,0,0,,2017
2,,,,"A FAIR DEAL PHARMACY, INC",OTHER BUSINESS,PHARMACY,,1891731758,,"C/O P O BOX 329014, #69709-05",BROOKLYN,NY,11232,1128b8,20170518,0,0,,2017
3,,,,ACCELERATED BEHAVIOR MANAGEMEN,OTHER BUSINESS,COMM MNTL HLTH CNTR,,1902198435,,6148 W SAHARA AVENUE,LAS VEGAS,NV,89146,1128a1,20160120,0,0,,2016
4,,,,ADAMS LOVING PRO HEALTH CARE,OTHER BUSINESS,HOME HEALTH AGENCY,,1073682936,,"1277 HIGHWAY 82 W, STE 218",LEESBURG,GA,31763,1128b7,20150409,0,0,,2015


In [22]:
src.save_to_efs(df2, 'LEIE_2014+_NPI')

---
### Circling back with Fuzzy Wuzzy

In [52]:
from fuzzywuzzy import fuzz, process
import boto3
import s3fs

In [10]:
# load file from s3
s3 = boto3.client('s3')
bucket='dast1healthcare' # Or whatever you called your bucket
data_key = 'identification_part_d.csv'

obj = s3.get_object(Bucket=bucket, Key=data_key)
df_id = pd.read_csv(obj['Body'])

In [11]:
df_id.head()

Unnamed: 0.1,Unnamed: 0,npi,last/org name,first name,specialty,city,state
0,0,1891738407,HEIL,ELIZABETH,Internal Medicine,LONGWOOD,FL
1,1,1104199355,DEFORNO,PATRICK,Dentist,THORNDALE,PA
2,2,1306896931,SMITH,KEVIN,Dermatology,GAINESVILLE,GA
3,3,1194794776,SWEENEY,GORDON,Internal Medicine,VALLEJO,CA
4,4,1679849855,ELDRIDGE,JENNIFER,Student in an Organized Health Care Education/...,HERSHEY,PA


In [23]:
df1.head()

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,...,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,year,month,key_present
0,,,,184TH STREET PHARMACY CORP,OTHER BUSINESS,PHARMACY,,1922348218,,69 E 184TH ST,...,NY,10468,1128a1,20180419,0,0,,2018,4,True
1,,,,A & H VITAMINS SUPPLY CORP,OTHER BUSINESS,PHARMACY,,0,,592 PALISADE AVENUE,...,NJ,7666,1128a1,20180419,0,0,,2018,4,False
2,,,,"A & Y MEDICAL SUPPLY, INC",DME COMPANY,DME - GENERAL,,1942476080,,"6310 108TH STREET, APT 6J",...,NY,11375,1128b8,20170518,0,0,,2017,5,True
3,,,,"A FAIR DEAL PHARMACY, INC",OTHER BUSINESS,PHARMACY,,1891731758,,"C/O P O BOX 329014, #69709-05",...,NY,11232,1128b8,20170518,0,0,,2017,5,True
4,,,,"A SERVICE CAB CO, INC",OTHER BUSINESS,TRANSPORTATION CO,,0,,3704 AIRLINE DRIVE,...,LA,70001,1128a1,20170720,0,0,,2017,7,False


In [40]:
df_noNPI = df1[df1['NPI']==0].reset_index(drop=True)
df_id[df_id['last/org name']=='ADLIFE HEALTH CARE']

Unnamed: 0.1,Unnamed: 0,npi,last/org name,first name,specialty,city,state


In [94]:
df_noNPI_indiv = df_noNPI[pd.notnull(df_noNPI['LASTNAME'])].reset_index(drop=True)
df_a = df_noNPI_indiv[['LASTNAME','FIRSTNAME','SPECIALTY','CITY','STATE']]
df_a.head(20)

Unnamed: 0,LASTNAME,FIRSTNAME,SPECIALTY,CITY,STATE
0,AAMIR,MUHAMMAD,HOME HEALTH AGENCY,BLOOMFIELD HILLS,MI
1,AARON,ALINA,NURSE/NURSES AIDE,PHOENIX,AZ
2,AARON,CHRISTOPHER,NURSE/NURSES AIDE,WEST NEWTON,PA
3,AASEN,ANDREA,HEALTH CARE AIDE,SAINT PAUL,MN
4,ABAD,EDUARDO,DME - GENERAL,SAN FRANCISCO,CA
5,ABAD,PEDRO,NURSE/NURSES AIDE,ALTAMONTE SPRINGS,FL
6,ABAGA,AUGUSTIN,NO KNOWN AFFILIATION,PENSACOLA,FL
7,ABALOS,CARLO,NURSE/NURSES AIDE,CARSON,CA
8,ABARQUEZ,BRENDA,NURSE/NURSES AIDE,S SAN FRANCISCO,CA
9,ABASHKIN,ROMAN,PERSONAL CARE PROVID,WAYNE,NJ


In [95]:
choices_array = df_id[['last/org name',
                 'first name',
                 'specialty',
                 'city',
                 'state']].as_matrix()
choices = []
error_idx = []
for i, choice in enumerate(choices_array):
    try:
        choices.append(' '.join(choice))
    except:
        error_idx.append(i)

print('number of available choices :', len(choices))
print('number of errors in choices :', len(error_idx))

In [127]:
# get ids with missing npis from LEIE
missing_ids = [' '.join(x) for x in df_a.values if all(isinstance(y, str) for y in x)]
print('number of ids with missing npi #:', len(missing_ids))
print('number of ids with missing npi with errors :', len(df_a) - len(missing_ids))

number of ids with missing npi #: 13129
number of ids with missing npi with errors : 1


In [None]:
# get fuzzywuzzy  scores:
import time

fuzz_scores = {}
start_time = time.time()
for each in missing_ids[:50]:
    fuzz_scores[each] = process.extract(each, choices, limit=10, scorer=fuzz.ratio)
end_time = time.time()
print('time :', end_time - start_time)

13129

In [96]:
process.extract("ABDI MOHAMMED HEALTH CARE AIDE COLUMBUS OH", choices, limit=30, scorer=fuzz.ratio)

[('HASHI MOHAMED Family Practice COLUMBUS OH', 72),
 ('AHMED MOHAMMED Internal Medicine COLUMBUS OH', 72),
 ('AHMAD MOHAMMAD Internal Medicine COLUMBUS OH', 70),
 ('NAEM MOHAMED Internal Medicine COLUMBUS OH', 69),
 ('AMIR MUHAMMAD Internal Medicine COLUMBUS OH', 68),
 ('HAQUE MOHAMMAD Internal Medicine COLUMBUS OH', 67),
 ('NIKTASH MOHAMMEDREZA Family Practice COLUMBUS OH', 67),
 ('MOHAMED MOHAMUD Internal Medicine COLUMBUS OH', 67),
 ('KHAN MUHAMMAD Internal Medicine COLUMBUS OH', 66),
 ('BROM HEATHER Nurse Practitioner COLUMBUS OH', 66),
 ('SHAREEF MOHAMMED Pulmonary Disease COLUMBUS OH', 66),
 ('AHMED MOHAMMED Internal Medicine COLUMBIA MO', 65),
 ('OBI PAMELA Family Practice COLUMBUS OH', 65),
 ('NIKTASH MOHAMMEDREZA Emergency Medicine COLUMBUS OH', 65),
 ('HAMMOND CHARLES Dentist COLUMBUS OH', 65),
 ('MOHAMMAD MOHAMMAD Podiatry COLUMBUS OH', 65),
 ('ADEPALLI AMANDA Family Practice COLUMBUS OH', 64),
 ('DAVIS MICHAEL Internal Medicine COLUMBUS OH', 64),
 ('PATHAN MOHAMMAD Family P

In [55]:
# fuzzy wuzzy 
str_a = 'hello world'
choices = df
fuzz.token_sort_ratio(str_a, str_b)

85

In [13]:
bus_state = []
for busname in busnames:
    bus_state.append(df3[df3['BUSNAME'] == busname]['STATE'].values[0])

list(zip(busnames, bus_state))

[('OMEGA PSYCHOLOGICAL, P C', 'NJ'),
 ('ATLANTA MEDICAL CENTER', 'GA'),
 ('NORTH FULTON MEDICAL CENTER, I', 'GA'),
 ('GRANT COUNTY WELLNESS CENTER', 'KY'),
 ('JEUNEDERM COSMETIC SURGERY CEN', 'CA'),
 ('CHAPARRAL SERVICES LTD', 'NY'),
 ('CSS NORTH, LLC', 'MN'),
 ('WARNER CHILCOTT SALES (U S) L', 'NJ'),
 ('VEE TRANSPORTATION COMPANY', 'IL'),
 ('PALM BEACH LIFE EXTENSION', 'FL'),
 ('FAMILY DERMATOLOGY OF DELAWARE', 'GA'),
 ('HUNTER BURNS CONSTRUCTION, LLC', 'MT'),
 ('BABY BOOMERS IN HOME CARE LC', 'MO'),
 ('CENTRAL KENTUCKY FAMILY PHARMA', 'KY'),
 ('CSHM, LLC', 'TN'),
 ('FLAMING FIRE', 'IL'),
 ('MEDEX AMBULANCE INC', 'PA'),
 ('MEHRAN ZAMANI, LLC', 'CT'),
 ('CAREAVAN INC OF WISCONSIN', 'IL'),
 ('NORTH AMERICA DRUGS CORP', 'NJ'),
 ('NEW PLAZA GROUP, INC', 'NY'),
 ('TRI-BOROUGH LIMO INC', 'NY'),
 ('HHCH HEALTH CARE INC', 'WV'),
 ('AAA PLUS HOME HEALTH CARE, LLC', 'MN'),
 ('DENTAL GROUP OF NEW BRITIAN, P', 'CT'),
 ('S & R MEDICAL ENTERPRISES, LLC', 'KY'),
 ('UMA HOME HEALTH CARE SYSTEM, L', '

In [14]:
print(df2['NPI'].shape)
print(len(set(df2['NPI'])))

(2879,)
2843


In [15]:
# add EXCLYR Column
df_LEIE["EXCLYR"] = df_LEIE["EXCLDATE"].astype(str).str[0:4].astype(int)

# remove all data prior to 2013
df_LEIE1 = df_LEIE[df_LEIE["EXCLYR"]>=2013].reset_index().copy()

NameError: name 'df_LEIE' is not defined

In [None]:
# remove white spaces from EXCLTYPE
isNameNull = df_LEIE1.isnull()[["LASTNAME","FIRSTNAME"]]
isNPInotNull = df_LEIE1["NPI"]!=0
isLocatableByNPIorName = []
for i in range(len(isNameNull)):
    if not isNameNull.iloc[i].values.any() or isNPInotNull.iloc[i]:
        isLocatableByNPIorName.append(True)
    else:
        isLocatableByNPIorName.append(False)

In [None]:
LEIE = df_LEIE1[isLocatableByNPIorName].reset_index(drop=True)

In [None]:
excluded = np.unique(df_LEIE1["NPI"][df_LEIE1["NPI"]!=0])
np.savetxt('excluded.txt', excluded)

In [None]:
LEIE[[
    "LASTNAME",
    "FIRSTNAME",
    "MIDNAME",
    "BUSNAME",
    "GENERAL",
    "SPECIALTY",
    "NPI",
    "DOB",
    "CITY",
    "STATE",
    "ZIP",
    "EXCLTYPE",
    "EXCLDATE",
    "EXCLYR"]].to_csv("LEIE.txt",index=False, 
                      header=["LASTNAME",
                                "FIRSTNAME",
                                "MIDNAME",
                                "BUSNAME",
                                "GENERAL",
                                "SPECIALTY",
                                "NPI",
                                "DOB",
                                "CITY",
                                "STATE",
                                "ZIP",
                                "EXCLTYPE",
                                "EXCLDATE",
                                "EXCLYR"])

In [None]:
LEIE_df = LEIE.groupby("EXCLTYPE").count().sort_values("index", ascending = False).reset_index()[["EXCLTYPE","index"]]
LEIE_df.columns = ["Code","Count"]
LEIE_df.head()

In [None]:
ExclType_Dict = {"1128Aa":"???",
                 "1128a1":"Conviction of program-related crimes. Minimum Period: 5 years",
                 "1128a2":"Conviction relating to patient abuse or neglect. Minimum Period: 5 years",
                 "1128a3":"Felony conviction relating to health care fraud. Minimum Period: 5 years",
                 "1128a4":"Felony conviction relating to controlled substance. Minimum Period: 5 years",
                 "1128b1":"Misdemeanor conviction relating to health care fraud. Baseline Period: 3 years",
                 "1128b14":"Default on health education loan or scholarship obligations. Minimum Period: Until default or obligation has been resolved.",
                 "1128b15":"Individuals controlling a sanctioned entity. Minimum Period: Same as length of entity's exclusion.",
                 "1128b16":"Making false statement or misrepresentations of material fact. Minimum period: None.",
                 "1128b2":"Conviction relating to obstruction of an investigation or audit. Baseline Period: 3 years",
                 "1128b3":"Misdemeanor conviction relating to controlled substance. Baseline Period: 3 years",
                 "1128b4":"License revocation, suspension, or surrender. Minimum Period: Period imposed by the state licensing authority.",
                 "1128b5":"Exclusion or suspension under federal or state health care program. Minimum Period: No less than the period imposed by federal or state health care program.",
                 "1128b6":"Claims for excessive charges, unnecessary services or services which fail to meet professionally recognized standards of health care, or failure of an HMO to furnish medically necessary services. Minimum Period: 1 year",
                 "1128b7":"Fraud, kickbacks, and other prohibited activities. Minimum Period: None",
                 "1128b8":"Entities controlled by a sanctioned individual. Minimum Period: Same as length of individual's exclusion.",
                 "1156":"Failure to meet statutory obligations of practitioners and providers to provide medically necessary services meeting professionally recognized standards of health care (Quality Improvement Organization (QIO) findings). Minimum Period: 1 year",
                 "1160":"???",
                 "BRCH CIA":"???",
                 "BRCH SA":"???"
                }

In [None]:
# convert ExclType_Dict to Pandas DataFrame
ExclTypeSummary_df = pd.DataFrame().from_dict(ExclType_Dict, orient="index").reset_index()
ExclTypeSummary_df.columns = ["Code","Description"]
ExclTypeSummary_df.head()

In [None]:
# Build LEIE_Summary by merging ExclTypeSummary_df and LEIE_df on Count
LEIE_Summary = pd.merge(ExclTypeSummary_df, LEIE_df, on="Code")\
                    .sort_values("Count", ascending=False)
pd.set_option('display.max_colwidth', -1)
LEIE_body=LEIE_Summary.head(12).reset_index(drop=True)

In [None]:
# Collapse LEIE_Summary Tail
tail = pd.DataFrame()
tail_dict = {"Code":"Misc.",
             "Description":"Other",
             "Count": LEIE_Summary.tail(7).sum(numeric_only=True)
            }
LEIE_tail = tail.from_dict(tail_dict)[["Code","Description","Count"]].reset_index(drop=True)

In [None]:
LEIE_summary = LEIE_body.append(LEIE_tail, ignore_index=True)
LEIE_summary

In [None]:
LEIE_w_total = LEIE_summary.append(LEIE_summary.sum(numeric_only=True), ignore_index=True)
LEIE_w_total

In [None]:
# Convert LEIE_Summary Summary to HTML
HTML = LEIE_w_total.to_html(index=False, )
HTML.replace("\n","")