In [1]:
import psycopg2
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
from sklearn.manifold import TSNE
import re

%matplotlib inline
sns.set(style="white")

In [4]:
## connect to the PostgreSQL database 'doctordb' as user 'cathy'
# `con` is pre-set to None, presumably so the close-connection cell's guard
# still has a defined name to test if connect() raises.
con = None
con = psycopg2.connect("dbname='doctordb' user='cathy'")

In [7]:
## Select every row of medi_indication whose description mentions the keyword
## (case-insensitive substring match).
# The keyword is bound as a query parameter instead of being embedded in the
# SQL text, so it can be changed in one place without editing the statement.
indication_keyword = 'breast'
q = "SELECT * FROM medi_indication WHERE indication_description ILIKE %s"
medi_ind = pd.read_sql_query(q, con=con, params=('%{0}%'.format(indication_keyword),))

In [8]:
## number of rows whose indication_description contains 'breast'
## (one row per query result, so a drug can be counted more than once)
len(medi_ind)

248

In [9]:
## preview the first rows of the query result
medi_ind.head()

Unnamed: 0,rxcui_in,drug_desc,icd9,indication_description,mentionedbyresources,highprecisionsubset,possible_label_use
0,448,Ethanol,760.71,Alcohol affecting fetus or newborn via placent...,1,0,0
1,632,Mitomycin,174.9,Malignant neoplasm of breast (female); unspeci...,1,0,0
2,677,Aminoglutethimide,239.3,Neoplasm of unspecified nature of breast,1,1,0
3,1437,Berberine,174.9,Malignant neoplasm of breast (female); unspeci...,1,0,0
4,1562,Bioflavonoids,174.9,Malignant neoplasm of breast (female); unspeci...,1,0,0


In [35]:
## one row per drug: keep the identifying columns (rxcui_in, drug_desc),
## then keep only the first row seen for each rxcui_in
medi_ind_drugs = medi_ind[['rxcui_in', 'drug_desc']].drop_duplicates(subset='rxcui_in')

In [36]:
## number of unique drugs (distinct rxcui_in values)
len(medi_ind_drugs)

182

In [38]:
## preview the first rows of the de-duplicated drug table
medi_ind_drugs.head()

Unnamed: 0,rxcui_in,drug_desc
0,448,Ethanol
1,632,Mitomycin
2,677,Aminoglutethimide
3,1437,Berberine
4,1562,Bioflavonoids


In [39]:
## show the names of all unique drugs linked to indications mentioning 'breast'
## (the Series is displayed as the cell's last expression; the index labels are
## the row labels kept from medi_ind, so gaps mark dropped duplicate rows)
medi_ind_drugs.drug_desc

0                                 Ethanol
1                               Mitomycin
2                       Aminoglutethimide
3                               Berberine
4                           Bioflavonoids
5                           Bromocriptine
6                                Busulfan
7                            Chlorambucil
8                                 Choline
9                      Choline Bitartrate
10                              Cisplatin
12                             Clomiphene
13                               Curcumin
14                       Cyclophosphamide
16                                Danazol
19                            Desipramine
20                          Dicloxacillin
21                     Diethylstilbestrol
23                               Dopamine
24                            Doxorubicin
26                             Epirubicin
28                              Estradiol
31                                Estriol
32            Estrogens  Conjugate

## Drug name processing
- convert to lower case
- collapse multiple spaces into a single space
- remove the "(USP)" marker
- split compound names that contain slashes, so that each alternative name is searched separately

In [27]:
def preprocess_drug_name(drug):
    """Given a drug name, pre-process it before searching for it in the Medicare database.

    The name is split on '/' into alternative compound names, and each alternative is
    converted to lower case, has the characters '(USP)' removed, has every run of
    whitespace (spaces, tabs, newlines) collapsed to a single space, and is stripped of
    leading/trailing whitespace. Alternatives that end up empty (e.g. from a trailing
    or doubled slash) are dropped, since an empty string is not a usable search term.

    INPUT:
    drug - a string; anything that is not a string (e.g. None or NaN coming from a
           NULL database value) is treated as a missing name

    OUTPUT:
    processed_drug_list - list of strings (most often a list of length 1; empty when
                          the input is missing or contains no usable name)
    """

    # a missing name has nothing to search for; return no candidates instead of
    # failing on drug.split
    if not isinstance(drug, str):
        return []

    processed_drug_list = []
    # process each alternative name separated by a slash
    for name in drug.split('/'):
        # lower case first, so that '(USP)' is matched regardless of its case
        name = name.lower().replace('(usp)', '')

        # collapse any whitespace run to one space; this also closes the gap left
        # behind by the removed '(usp)' and normalises single tabs/newlines
        name = re.sub(r'\s+', ' ', name).strip()

        # skip fragments that are empty after cleaning
        if name:
            processed_drug_list.append(name)

    return processed_drug_list

In [42]:
## build the list of (rxcui_in, processed_drug_name) pairs;
## a drug whose name holds several alternatives contributes one pair per alternative
processed_drug_list = [
    (rxcui_code, clean_name)
    for rxcui_code, raw_name in medi_ind_drugs.itertuples(index=False)
    for clean_name in preprocess_drug_name(raw_name)
]

print('There are {0} tuples'.format(len(processed_drug_list)))

There are 185 tuples


In [43]:
## turn the (rxcui_in, drug_name) pairs into a two-column data frame
## with the default integer row index
rxcui_drugs_df = pd.DataFrame(
    data=processed_drug_list,
    columns=['rxcui_in', 'drug_name'],
)

In [44]:
rxcui_drugs_df.head()

Unnamed: 0,rxcui_in,drug_name
0,448,ethanol
1,632,mitomycin
2,677,aminoglutethimide
3,1437,berberine
4,1562,bioflavonoids


In [64]:
## Add a column recording the keyword(s)/descriptor from 'indication_description'
## that was used to select the drugs
query_from_indication_descr = 'breast'

# assign() returns a new frame with the scalar broadcast to every row. The
# existing column names are kept as they are, so nothing has to be restored
# by integer position afterwards.
df = rxcui_drugs_df.assign(keywords_indication_desc=query_from_indication_descr)

df.head()

Unnamed: 0,rxcui_in,drug_name,keywords_indication_desc
0,448,ethanol,breast
1,632,mitomycin,breast
2,677,aminoglutethimide,breast
3,1437,berberine,breast
4,1562,bioflavonoids,breast


In [68]:
## write the processed drug name data frame to csv in the working directory
## (index=False leaves the row index out of the file)

df.to_csv("03_rxcui_drug_name.csv", index=False)

In [67]:
## release the database connection, if one was opened
if con is not None:
    con.close()