In [1]:
import psycopg2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
from sklearn.manifold import TSNE
import re

%matplotlib inline
sns.set(style="white")

In [4]:
## connect to database

con = psycopg2.connect("dbname='doctordb' user='cathy'")

## Read in medicare payments table

In [5]:
q = "SELECT * FROM payments WHERE provider_type = 'Medical Oncology'"
payments = pd.read_sql_query(q, con=con)

In [6]:
## grab all distinct hcpcs_codes that correspond to drugs
medicare_drug_claims = (payments.query("hcpcs_drug_indicator=='Y'")
                       .drop_duplicates('hcpcs_code')[['hcpcs_code','hcpcs_description']])

print(medicare_drug_claims.shape)

medicare_drug_claims.head()

(143, 2)


Unnamed: 0,hcpcs_code,hcpcs_description
17,J1100,"Injection, dexamethasone sodium phosphate, 1mg"
18,J1200,"Injection, diphenhydramine hcl, up to 50 mg"
19,J1626,"Injection, granisetron hydrochloride, 100 mcg"
95,J0641,"Injection, levoleucovorin calcium, 0.5 mg"
96,J0885,"Injection, epoetin alfa, (for non-esrd use), 1..."


In [7]:
def preprocess_drug_name(drug):
    """Given a drug name, pre-process before matching it in Medicare database by:
    converting to all lowercase, convert multiple spaces to single space, and 
    remove certain uninteresting strings like 'mg' and 'Injection' that are common
    to the hcpcs_descriptions
    
    INPUT:
    drug - a string
    
    OUTPUT:
    drug - string
    """
    """  process each item in the list
    for d in drug_list:      
        # lower case
        d = d.lower()

        # substitute (USP) with empty string
        d = d.replace('(usp)', '')
        
        # convert multiple spaces to single space
        d = re.sub(r'\s\s+', r' ', d)
        
        # strip leading/trailing whitespace
        d = d.strip()
        
        processed_drug_list.append(d)
        
    return processed_drug_list"""
    
    return drug.lower()

In [8]:
## apply drug name processing to hcpcs drug descriptions

medicare_drug_claims['hcpcs_description'] = medicare_drug_claims['hcpcs_description'].apply(preprocess_drug_name)

## Read in drug names/rxcui extracted from medi_indication table

In [9]:
rxcui_drugs = pd.read_csv("03_rxcui_drug_name.csv")

rxcui_drugs.head()

Unnamed: 0,rxcui_in,drug_name,keywords_indication_desc
0,448,ethanol,breast
1,632,mitomycin,breast
2,677,aminoglutethimide,breast
3,1437,berberine,breast
4,1562,bioflavonoids,breast


In [10]:
rxcui_drugs.shape

(185, 3)

## Per drug name in rxcui_drugs, search in the hcpcs_description for a match

In [11]:
matches = []

for (rxcui, drug, keyword) in rxcui_drugs.itertuples(index=False):
    
    for (hcpcs_code, hcpcs_desc) in medicare_drug_claims.itertuples(index=False):
        foundMatch = re.search(drug, hcpcs_desc)
        
        if foundMatch:
            matches.append((rxcui, drug, keyword, hcpcs_code, hcpcs_desc))

In [12]:
# number of breast cancer drugs found in claims data
len(matches)

35

In [13]:
for i in matches:
    print(i)

(632, 'mitomycin', 'breast', 'J9280', 'injection, mitomycin, 5 mg')
(2555, 'cisplatin', 'breast', 'J9060', 'injection, cisplatin, powder or solution, 10 mg')
(3002, 'cyclophosphamide', 'breast', 'J9070', 'cyclophosphamide, 100 mg')
(3639, 'doxorubicin', 'breast', 'J9000', 'injection, doxorubicin hydrochloride, 10 mg')
(3639, 'doxorubicin', 'breast', 'Q2050', 'injection, doxorubicin hydrochloride, liposomal, not otherwise specified, 10mg')
(4492, 'fluorouracil', 'breast', 'J9190', 'injection, fluorouracil, 500 mg')
(6851, 'methotrexate', 'breast', 'J9260', 'methotrexate sodium, 50 mg')
(10379, 'testosterone', 'breast', 'J1080', 'injection, testosterone cypionate, 1 cc, 200 mg')
(11202, 'vincristine', 'breast', 'J9370', 'vincristine sulfate, 1 mg')
(11295, 'water', 'breast', 'J7060', '5% dextrose/water (500 ml = 1 unit)')
(11473, 'pamidronate', 'breast', 'J2430', 'injection, pamidronate disodium, per 30 mg')
(12574, 'gemcitabine', 'breast', 'J9201', 'injection, gemcitabine hydrochloride,

In [14]:
## percentage of drugs identified as breast cancer drugs in medi_indication table that are also in payments table
len(matches) / rxcui_drugs.shape[0]

0.1891891891891892

In [15]:
## create data frame from matches list of tuples
rxcui_hcpcs_df = pd.DataFrame(matches, index=None, columns=['rxcui_in','drug_name','keywords_indication_desc',
                                                           'hcpcs_code','hcpcs_description'])

rxcui_hcpcs_df.head()

Unnamed: 0,rxcui_in,drug_name,keywords_indication_desc,hcpcs_code,hcpcs_description
0,632,mitomycin,breast,J9280,"injection, mitomycin, 5 mg"
1,2555,cisplatin,breast,J9060,"injection, cisplatin, powder or solution, 10 mg"
2,3002,cyclophosphamide,breast,J9070,"cyclophosphamide, 100 mg"
3,3639,doxorubicin,breast,J9000,"injection, doxorubicin hydrochloride, 10 mg"
4,3639,doxorubicin,breast,Q2050,"injection, doxorubicin hydrochloride, liposoma..."


In [31]:
## write map out to csv
rxcui_hcpcs_df.to_csv("05_rxcui_hcpcs_map_breast.csv", index=False)