# LDA model on orthopedic surgery

- dropped top 15 cols
- dropped cols with < 4 claims
- dropped npi 1376541979
- dropped npi's with < 10 claims
- number of topics = 10
- bene_unique_cnt as value

In [1]:
import psycopg2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import numpy as np
import time
import re

from gensim import matutils, models, corpora

%matplotlib inline
sns.set(style="white")

In [2]:
## connect to database
con = psycopg2.connect("dbname='doctordb' user='cathy'")

In [64]:
q = """SELECT npi,
        hcpcs_code, 
        hcpcs_description, 
        bene_unique_cnt,
        nppes_provider_last_org_name, 
        nppes_provider_first_name, 
        nppes_provider_state,
        place_of_service
FROM payments 
WHERE provider_type='Orthopedic Surgery'"""
payments = pd.read_sql_query(q, con=con)

In [65]:
payments.shape

(312006, 8)

## Read in data matrix generated in ipynb 17

In [5]:
## read in reduced data set
by_npi = pd.read_csv("17_by_npi_reduced_ortho.csv", index_col=0)
by_npi.index = by_npi.index.astype(str)
by_npi.shape

(11242, 795)

## Load LDA model (on filtered data frame using 10 topics)

In [6]:
## Load lda model
model_fname = "18_lda_10topics_ortho.model"
ldamodel = models.LdaModel.load(model_fname)

## Interpret the 10 topics in the model

In [7]:
corpus = matutils.Dense2Corpus(by_npi.as_matrix(), documents_columns=False)

In [8]:
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [9]:
## top 30 words in each topic
topics_matrix = ldamodel.show_topics(formatted=False, num_words=30)

In [10]:
## for each topic, store indices of top 30 hcpcs_codes in a list
topic_idx_dict = {}

for i in topics_matrix:
    topic_idx_dict[i[0]] = [int(word[0]) for word in i[1]]

In [11]:
## for each topic's list of indexes, get the list of hcpcs_codes
topic_dict = {}

for k, v in topic_idx_dict.items():
    topic_dict[k] = [by_npi.columns.values[i] for i in v]

In [12]:
## some words / hcpcs_codes are appended with '_O' or '_F'.
## remove these suffixes so we can map them back to the hcpcs descriptions

def remove_suffix(s):
    code = re.search(r'([a-zA-Z0-9]+)(_O)*(_F)*', s)
    if code.group(2) is not None:
        return code.group(1), 'O'
    elif code.group(3) is not None:
        return code.group(1), 'F'
    else:
        return code.group(1), None

In [13]:
topic_dict_nosuffix = {}
place_of_service_cnt = {}

for t, hcpcs_list in topic_dict.items():
    t_list = []
    O_cnt = 0
    F_cnt = 0
    for i in hcpcs_list:
        code, facility = remove_suffix(i)
        t_list.append(code)
        if facility == 'O':
            O_cnt += 1
        elif facility == 'F':
            F_cnt += 1        

    topic_dict_nosuffix[t] = t_list
    place_of_service_cnt[t] = [O_cnt, F_cnt]

In [129]:
## this could be improved by including a place_of_service column as a modifier to the hcpcs_code
codes_per_topic = (pd.melt(pd.DataFrame(topic_dict_nosuffix).reset_index(), 
                     id_vars='index', var_name='topic', value_name='hcpcs_code').
                     rename(columns={'index':'rank'}))
codes_per_topic.head()

Unnamed: 0,rank,topic,hcpcs_code
0,0,0,72120
1,1,0,77003
2,2,0,62311
3,3,0,62311
4,4,0,64483


In [14]:
pd.DataFrame([v for k,v in place_of_service_cnt.items()]).rename(columns={0:'out_of_facility_cnt', 1:'in_facility_cnt'})

Unnamed: 0,out_of_facility_cnt,in_facility_cnt
0,16,8
1,18,5
2,12,2
3,7,2
4,20,2
5,20,4
6,17,3
7,0,27
8,16,0
9,20,0


In [15]:
for t in range(len(place_of_service_cnt)):
    print(t, ': ', place_of_service_cnt[t])

0 :  [16, 8]
1 :  [18, 5]
2 :  [12, 2]
3 :  [7, 2]
4 :  [20, 2]
5 :  [20, 4]
6 :  [17, 3]
7 :  [0, 27]
8 :  [16, 0]
9 :  [20, 0]


### Most of the topics consist of top codes that take place out of facility, except for topic 7.

In [30]:
pd.set_option('max_colwidth',100)
## Topic 0 - spine, lower back
t = 0
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
5,99204,"New patient office or other outpatient visit, typically 45 minutes"
79,99215,"Established patient office or other outpatient, visit typically 40 minutes"
129,72100,"X-ray of lower and sacral spine, 2 or 3 views"
276,22214,Incision of spine to correct deformity at lower spinal column
303,72040,"X-ray of spine of neck, 2 or 3 views"


In [31]:
## Topic 1 - lower leg/foot, ankle
t = 1
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
5,99204,"New patient office or other outpatient visit, typically 45 minutes"
8,J3301,"Injection, triamcinolone acetonide, not otherwise specified, 10 mg"
24,73610,"X-ray of ankle, minimum of 3 views"
25,99212,"Established patient office or other outpatient visit, typically 10 minutes"
27,99222,"Initial hospital inpatient care, typically 50 minutes per day"


In [32]:
## Topic 2 - spine bones
t = 2
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
5,99204,"New patient office or other outpatient visit, typically 45 minutes"
25,99212,"Established patient office or other outpatient visit, typically 10 minutes"
79,99215,"Established patient office or other outpatient, visit typically 40 minutes"
129,72100,"X-ray of lower and sacral spine, 2 or 3 views"
248,22851,Insertion of spinal instrumentation for spinal stabilization


In [33]:
## Topic 3 - ER visits, general trauma, more random
t = 3
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
594,99283,"Emergency department visit, moderately severe problem"
595,J1100,"Injection, dexamethasone sodium phosphate, 1mg"
719,29085,Application of cast to hand and lower forearm
820,99211,"Established patient office or other outpatient visit, typically 5 minutes"
1161,36415,Insertion of needle into vein for collection of blood sample


In [34]:
## Topic 4 - upper leg: knee, pelvis, hip
t = 4
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
2,73510,"X-ray of ribs of one side of body, minimum of 2 views"
3,73562,"X-ray of knee, 3 views"
4,73564,"X-ray of knee, 4 or more views"
5,99204,"New patient office or other outpatient visit, typically 45 minutes"
8,J3301,"Injection, triamcinolone acetonide, not otherwise specified, 10 mg"


In [35]:
## Topic 5 - arthritis/carpal tunnel/hand, wrist, fingers
## in office joint pain meds - triamcinolone acetonide, methylprednisolone, betamethasone acetate
t = 5
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
5,99204,"New patient office or other outpatient visit, typically 45 minutes"
8,J3301,"Injection, triamcinolone acetonide, not otherwise specified, 10 mg"
25,99212,"Established patient office or other outpatient visit, typically 10 minutes"
44,J1030,"Injection, methylprednisolone acetate, 40 mg"
81,J0702,"Injection, betamethasone acetate 3mg and betamethasone sodium phosphate 3mg"


In [36]:
## Topic 6 - knee and shoulder
t = 6
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
2,73510,"X-ray of ribs of one side of body, minimum of 2 views"
3,73562,"X-ray of knee, 3 views"
4,73564,"X-ray of knee, 4 or more views"
5,99204,"New patient office or other outpatient visit, typically 45 minutes"
8,J3301,"Injection, triamcinolone acetonide, not otherwise specified, 10 mg"


In [37]:
## Topic 7 - Hospital
t = 7
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
0,20610,Aspiration and/or injection of large joint or joint capsule
1,73030,"X-ray of shoulder, minimum of 2 views"
2,73510,"X-ray of ribs of one side of body, minimum of 2 views"
3,73562,"X-ray of knee, 3 views"
4,73564,"X-ray of knee, 4 or more views"


In [38]:
## Topic 8 - therapy
t = 8
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
2,73510,"X-ray of ribs of one side of body, minimum of 2 views"
3,73562,"X-ray of knee, 3 views"
4,73564,"X-ray of knee, 4 or more views"
5,99204,"New patient office or other outpatient visit, typically 45 minutes"
8,J3301,"Injection, triamcinolone acetonide, not otherwise specified, 10 mg"


In [39]:
## Topic 9 - hospital, xray
t = 9
(payments.loc[payments.hcpcs_code.isin(topic_dict_nosuffix[t]), ['hcpcs_code','hcpcs_description']].
 drop_duplicates('hcpcs_code')).head()

Unnamed: 0,hcpcs_code,hcpcs_description
2,73510,"X-ray of ribs of one side of body, minimum of 2 views"
3,73562,"X-ray of knee, 3 views"
4,73564,"X-ray of knee, 4 or more views"
5,99204,"New patient office or other outpatient visit, typically 45 minutes"
8,J3301,"Injection, triamcinolone acetonide, not otherwise specified, 10 mg"


## Plot distribution of providers per topic

In [16]:
## per document (provider), assign topic based on highest-probability topic out of the 10
topic_per_doc = []
for i in corpus_tfidf:
    topic_per_doc.append(np.argmax(np.array(ldamodel.get_document_topics(i))[:,1]))

In [17]:
topic_per_doc_df = pd.DataFrame(topic_per_doc).rename(columns={0:'topic'})
topic_per_doc_df.index = by_npi.index

## For each doctor, match the claims by the doctor that are relevant to his/her assigned topic

In [131]:
topic_per_doc_df.head()

Unnamed: 0_level_0,topic
npi,Unnamed: 1_level_1
1003010919,9
1003017682,4
1003018086,9
1003023276,4
1003024894,4


In [130]:
codes_per_topic.head()

Unnamed: 0,rank,topic,hcpcs_code
0,0,0,72120
1,1,0,77003
2,2,0,62311
3,3,0,62311
4,4,0,64483


In [149]:
payments[['npi','hcpcs_code','hcpcs_description','bene_unique_cnt','place_of_service']].head()

Unnamed: 0,npi,hcpcs_code,hcpcs_description,bene_unique_cnt,place_of_service
0,1003001785,20610,Aspiration and/or injection of large joint or joint capsule,81.0,O
1,1003001785,73030,"X-ray of shoulder, minimum of 2 views",47.0,O
2,1003001785,73510,"X-ray of ribs of one side of body, minimum of 2 views",32.0,O
3,1003001785,73562,"X-ray of knee, 3 views",40.0,O
4,1003001785,73564,"X-ray of knee, 4 or more views",43.0,O


In [152]:
topic_per_doc_df.reset_index().merge(codes_per_topic, on='topic').head()

Unnamed: 0,npi,topic,rank,hcpcs_code
0,1003010919,9,0,J0702
1,1003010919,9,1,73562
2,1003010919,9,2,73564
3,1003010919,9,3,73560
4,1003010919,9,4,99212


In [154]:
top_claims_df = (topic_per_doc_df.reset_index().
                 merge(codes_per_topic, on='topic').
                 merge(payments[['npi','hcpcs_code','hcpcs_description','bene_unique_cnt','place_of_service']],
                    on=['npi','hcpcs_code'], how='inner'))

In [155]:
top_claims_df.shape

(123097, 7)

In [156]:
top_claims_df.head(2)

Unnamed: 0,npi,topic,rank,hcpcs_code,hcpcs_description,bene_unique_cnt,place_of_service
0,1003010919,9,3,73560,"X-ray of knee, 1 or 2 views",11.0,O
1,1003010919,9,4,99212,"Established patient office or other outpatient visit, typically 10 minutes",32.0,O


### Send table to postgres database

In [157]:
from sqlalchemy import create_engine
engine = create_engine('postgresql://%s@localhost/%s'%('cathy','doctordb'))
top_claims_df.to_sql("doctor_claims_for_topic", engine)

In [158]:
engine.dispose()