In [1]:
import psycopg2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

%matplotlib inline
sns.set(style="white")

In [59]:
## load the 2015 California alerts/actions CSV (path is relative to the working directory)
alerts = pd.read_csv("../data/alerts_actions_California_2015.csv")

In [60]:
## peek at the first two rows to see the raw column layout
alerts.head(2)

Unnamed: 0,ALERT ISSUE DATE,NAME,CITY/STATE,LICENSE NUMBER,TYPE OF ACTION,DATE OF ACTION
0,1/2/2015,"Heikali, Moosa","Los Angeles, CA",A 40559,Revoked,1/2/2015
1,1/2/2015,"Highman, Lawrence Marshall","Colusa, CA",G 40201,Public Reprimand,1/2/2015


In [4]:
## per-column summary (count / unique / top / freq)
alerts.describe()

Unnamed: 0,ALERT ISSUE DATE,NAME,CITY/STATE,LICENSE NUMBER,TYPE OF ACTION,DATE OF ACTION
count,797,797,797,797,797,797
unique,163,651,329,644,21,239
top,5/8/2015,"Butel, Aimee Lorine","Los Angeles, CA",A 87162,Accusation Filed,11/20/2015
freq,35,4,47,4,317,9


In [5]:
## column dtypes and non-null counts
alerts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797 entries, 0 to 796
Data columns (total 6 columns):
ALERT ISSUE DATE    797 non-null object
NAME                797 non-null object
CITY/STATE          797 non-null object
LICENSE NUMBER      797 non-null object
TYPE OF ACTION      797 non-null object
DATE OF ACTION      797 non-null object
dtypes: object(6)
memory usage: 37.4+ KB


In [6]:
## which types of action occur, and how often does each one appear?
alerts['TYPE OF ACTION'].value_counts()

Accusation Filed                             317
Decision                                     123
Surrendered                                   75
Public Reprimand                              51
First Amended Accusation Filed                47
Revoked                                       37
Public Letter of Reprimand                    36
Suspended                                     28
Restrictions                                  21
Acc & Pet to Revoke Probation Filed           17
Cease Practice Order                           9
Second Amended Accusation Filed                8
Petition to Revoke Probation Filed             6
Amended Accusation Filed                       4
Accusation Withdrawn                           4
First Amnd Acc & Pet to Revoke Prob Filed      4
Accusation Dismissed                           4
Third Amended Accusation Filed                 3
Cancelled                                      1
First Amended Accusation Dismissed             1
Second Amnd Accusati

In [61]:
## separate the CITY/STATE and NAME columns into individual columns
## (both are split on the first comma only) and switch to snake_case names
city_state = (alerts['CITY/STATE'].str.split(',', expand=True, n=1)
              .rename(columns={0: 'city', 1: 'state'}))
full_name = (alerts['NAME'].str.split(',', expand=True, n=1)
             .rename(columns={0: 'last_name', 1: 'first_name'}))

## drop(columns=...) rather than a positional axis argument: the positional
## form is deprecated in pandas and rejected by newer releases
alerts = (pd.concat([alerts.drop(columns=['CITY/STATE', 'NAME']),
                     city_state,
                     full_name],
                    axis=1)
          .rename(columns={'ALERT ISSUE DATE': 'alert_issue_date',
                           'LICENSE NUMBER': 'license_number',
                           'TYPE OF ACTION': 'type_of_action',
                           'DATE OF ACTION': 'date_of_action'}))

## trim the whitespace left around each split piece (e.g. the space after the comma)
for col in ['city', 'state', 'last_name', 'first_name']:
    alerts[col] = alerts[col].str.strip()

In [62]:
## check the renamed columns and the new city/state/last_name/first_name columns
alerts.head()

Unnamed: 0,alert_issue_date,license_number,type_of_action,date_of_action,city,state,last_name,first_name
0,1/2/2015,A 40559,Revoked,1/2/2015,Los Angeles,CA,Heikali,Moosa
1,1/2/2015,G 40201,Public Reprimand,1/2/2015,Colusa,CA,Highman,Lawrence Marshall
2,1/5/2015,A 75590,Suspended,12/31/2014,Hobbs,NM,Driskill,Christopher Shay
3,1/6/2015,A 37049,Surrendered,12/31/2014,Indio,CA,Alegria,Rudolpho Jaramillo
4,1/6/2015,A 88304,Accusation Filed,12/19/2014,Los Angeles,CA,Armin,Sean Shahdad


## Create dictionary of doctor names in alerts database for searching

In [63]:
## Of the 797 rows, how many remain when keeping one row per distinct
## (last_name, first_name) pair?
alerts.drop_duplicates(subset=['last_name', 'first_name']).shape

(650, 8)

In [111]:
## key: last name (upper-cased)
## value: list of upper-cased strings, each holding a first name and
## possibly a middle name as well
## example: alerts_names['JONES'] = ['MARY LYNN', 'HAROLD']

## one row per distinct (last_name, first_name) pair, re-indexed from 0
fullnames2015 = (alerts[['last_name', 'first_name']]
                 .drop_duplicates()
                 .reset_index(drop=True))

alerts_names = {}
irregular_names = []

for last_name, first_name in fullnames2015.itertuples(index=False, name=None):
    ## a name without a comma leaves one part missing after the split; set it
    ## aside explicitly instead of relying on a bare except to catch the failure
    if not (isinstance(last_name, str) and isinstance(first_name, str)):
        irregular_names.append(last_name)
        continue
    alerts_names.setdefault(last_name.upper(), []).append(first_name.upper())

In [112]:
## last_name values of the rows that could not be added to alerts_names
irregular_names

['Habezghi Hagos']

In [114]:
## 650 distinct names minus the 1 irregular name leaves 649 entries spread over
## this many last-name keys, so 50 entries share a last name with an earlier one
len(alerts_names)

599

## Read in the Medicare NPPES summary table

In [99]:
## open a connection to the local PostgreSQL database used by the cells below

con = psycopg2.connect(dbname='doctordb', user='cathy')

In [100]:
## fetch every column of the summary table for rows whose provider_type is
## 'Orthopedic Surgery' and whose nppes_provider_state is 'CA'
q = """SELECT * FROM summary WHERE provider_type = 'Orthopedic Surgery' 
     AND nppes_provider_state = 'CA';"""
summary_df = pd.read_sql_query(q, con=con)

In [101]:
## peek at the first rows returned by the query
summary_df.head()

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,...,beneficiary_cc_depr_percent,beneficiary_cc_diab_percent,beneficiary_cc_hyperl_percent,beneficiary_cc_hypert_percent,beneficiary_cc_ihd_percent,beneficiary_cc_ost_percent,beneficiary_cc_raoa_percent,beneficiary_cc_schiot_percent,beneficiary_cc_strk_percent,beneficiary_average_risk_score
0,1003017971,TWEET,MATTHEW,L,MD,M,I,2725 CAPITOL AVE,SUITE 302,SACRAMENTO,...,24.0,29.0,56.0,60.0,30.0,10.0,65.0,,,1.0698
1,1003126830,VAN DYK,GRIETJE,,MD,F,I,3771 KATELLA AVE,SUITE 209,LOS ALAMITOS,...,30.0,33.0,65.0,73.0,39.0,28.0,75.0,,,1.426
2,1003802646,PERCIVAL,HERBERT,W,M.D.,M,I,2486 N PONDEROSA DR,D-114,CAMARILLO,...,18.0,28.0,58.0,67.0,41.0,15.0,67.0,,7.0,1.2052
3,1003817693,WEISSTEIN,JASON,S,"M.D.,M.P.H.",M,I,39000 BOB HOPE DR,HARRY & DIANE RINKER BUILDING,RANCHO MIRAGE,...,16.0,20.0,68.0,68.0,36.0,14.0,75.0,1.0,4.0,1.0605
4,1003828989,GAINOR,JOHN,,M.D.,M,I,215 PESETAS LN,,SANTA BARBARA,...,24.0,21.0,53.0,50.0,21.0,11.0,74.0,,2.0,1.004


In [117]:
## loop through names in summary and check if any alerts exist per orthopedic surgeon

## list of (npi, LAST NAME, FIRST NAME) tuples, one per surgeon with a candidate alert
alert_matches = []

name_cols = ['npi', 'nppes_provider_last_org_name', 'nppes_provider_first_name']
for npi, last, first in summary_df[name_cols].itertuples(index=False, name=None):
    ## a row with a missing last or first name cannot be matched by name
    if not (isinstance(last, str) and isinstance(first, str)):
        continue
    last, first = last.upper(), first.upper()

    ## alerts_names values look like 'FIRST' or 'FIRST MIDDLE', while this summary
    ## column holds the first name only, so compare the summary first name against
    ## the leading part of each alert name.  Plain string comparison is used on
    ## purpose: names are data, not regex patterns, and searching for the whole
    ## alert name inside the summary first name can never match an alert name
    ## that carries a middle name.
    for n in alerts_names.get(last, []):
        if n == first or n.startswith(first + ' '):
            ## name agreement only makes this a candidate; it still needs a manual check
            alert_matches.append((npi, last, first))
            break

In [118]:
## number of summary rows with a name match in the alerts dictionary
len(alert_matches)

1

In [119]:
## (npi, last name, first name) for each matched summary row
alert_matches

[('1487884607', 'LEE', 'JASON')]

## check Jason Lee's entry in alerts (is it an actual match?)

In [134]:
## alert rows whose upper-cased last and first names are exactly LEE / JASON
alerts[(alerts['last_name'].str.upper() == "LEE")
       & (alerts['first_name'].str.upper() == "JASON")]

Unnamed: 0,alert_issue_date,license_number,type_of_action,date_of_action,city,state,last_name,first_name
639,10/21/2015,G 81729,Accusation Filed,10/2/2015,Bakersfield,CA,Lee,Jason


In [141]:
## city, state and middle initial of the matched provider in the summary table
summary_df[summary_df['npi'] == '1487884607'][
    ['nppes_provider_city', 'nppes_provider_state', 'nppes_provider_mi']]

Unnamed: 0,nppes_provider_city,nppes_provider_state,nppes_provider_mi
980,ORANGE,CA,H


This is not a real match. The Jason Lee in the alerts data (Bakersfield) is an anesthesiologist, while the Jason Lee in the Medicare summary (Orange) is Jason Chia Lee, an orthopedic surgeon with no alert record.