In [1]:
import pandas as pd
import sqlite3 as lite
from NNClassifier import Translator
import os.path
import numpy as np
import feather
import itertools

In [2]:
# Input locations, relative to the notebook's working directory.
# Feather file holding the drugs together with their class labels.
druguse_file = 'data/drugnames_withclasses.feather'
# Local SQLite database holding the drug-class strings scraped from Google.
database = 'data/googDrugData.db'

We start by preparing some training data. We have a database of properly classified drugs, `drugnames_withclasses`, and the (much larger) database `df_goog` of drugs labeled by strings of *drug class information* which we've scraped from Google. We want to train a neural network to read the strings and output the correct drug classes, so we need to align these databases to get training data...

First, we load a database which connects drugs with their uses:

In [3]:
# Load the drug/class reference table from its Feather file.
# Fail fast when the file is missing: previously this branch only printed a
# hint, which left `df_uses` undefined and made the next cell die with an
# unrelated-looking NameError.
if not os.path.isfile(druguse_file):
    raise FileNotFoundError(
        "Missing {!r}. Run the D4D script to generate the database.".format(druguse_file)
    )
df_uses = feather.read_dataframe(druguse_file)

In [4]:
# Preview the first ten rows of the reference table.
df_uses.head(n=10)

Unnamed: 0,drugname_brand,drugname_generic,rxnorm_rxcui,drug_major_class,dmc_name,drug_class,dc_name
0,10 wash,sulfacetamide sodium,10169,0|DE000|OP000,unknown/missing,0|DE101|OP210,unknown/missing
1,1st tier unifine pentips,"pen needle, diabetic",0.0,0,unknown/missing,0,unknown/missing
2,1st tier unifine pentips plus,"pen needle, diabetic",0.0,0,unknown/missing,0,unknown/missing
3,60pse-400gfn-20dm,guaifenesin/dm/pseudoephedrine,5032|8896,RE000,,RE200|RE302,
4,8-mop,methoxsalen,6854|227713,DE000,,DE810,
5,a-b otic,antipyrine/benzocaine,1399,NT000|OT000,,NT300|OT400,
6,abacavir,abacavir sulfate,190521|190521,AM000,,AM800,
7,abacavir-lamivudine-zidovudine,abacavir/lamivudine/zidovudine,190521|68244|11413,AM000,,AM800,
8,abelcet,amphotericin b lipid complex,81507,0,0,0,0
9,abilify,aripiprazole,89013|352393,CN000,,CN709,


Set the column we're interested in predicting:

In [5]:
# Name of the `df_uses` column used as the prediction target below.
column_of_interest = "drug_major_class"

Restrict to those entries for which we have a target entry:

In [6]:
# Keep only rows whose target is not exactly the string '0'. Pipe-delimited
# targets that merely contain a '0' component (e.g. '0|DE000|OP000') are kept.
df_uses_known = df_uses.loc[df_uses[column_of_interest].ne('0')]
df_uses_known.head(n=10)

Unnamed: 0,drugname_brand,drugname_generic,rxnorm_rxcui,drug_major_class,dmc_name,drug_class,dc_name
0,10 wash,sulfacetamide sodium,10169,0|DE000|OP000,unknown/missing,0|DE101|OP210,unknown/missing
3,60pse-400gfn-20dm,guaifenesin/dm/pseudoephedrine,5032|8896,RE000,,RE200|RE302,
4,8-mop,methoxsalen,6854|227713,DE000,,DE810,
5,a-b otic,antipyrine/benzocaine,1399,NT000|OT000,,NT300|OT400,
6,abacavir,abacavir sulfate,190521|190521,AM000,,AM800,
7,abacavir-lamivudine-zidovudine,abacavir/lamivudine/zidovudine,190521|68244|11413,AM000,,AM800,
9,abilify,aripiprazole,89013|352393,CN000,,CN709,
10,abilify discmelt,aripiprazole,89013|352393,CN000,,CN709,
11,abilify maintena,aripiprazole,89013|352393,CN000,,CN709,
12,abraxane,paclitaxel protein-bound,56946|589511,AN000,,AN900,


Next, we correlate this with our local database of drug uses scraped from the internet. Let's load that database:

In [7]:
# Read the scraped (generic name, drug-class text) pairs from SQLite.
# sqlite3's `with connection:` block only commits or rolls back the open
# transaction; it does not close the connection, so close it explicitly.
con = lite.connect(database)
try:
    df_goog = pd.read_sql(
        'select Generic_Name, Drug_Class_Data_clean from goog_drug_class_data;', con)
finally:
    con.close()

# Do a bit of cleaning: trim surrounding whitespace and lower-case the name
# used as the join key. The .str accessor leaves missing names as NaN instead
# of raising AttributeError the way a per-element lambda would.
df_goog['Generic_Name'] = df_goog['Generic_Name'].str.strip().str.lower()
df_goog

Unnamed: 0,Generic_Name,Drug_Class_Data_clean
0,sulfacetamide sodium,\n\n sulfa antibiotics\n \n\n
1,sulfacetamide sodium,\n\n Other\n \n\n sulfa antibiotics\n \...
2,sulfacetamide sodium,\n\n sulfa antibiotics\n \n\n
3,sulfacetamide sodium,\n\n Other\n \n\n sulfa antibiotics\n \...
4,sulfacetamide sodium,\n\n sulfa antibiotic / corticosteroid combin...
5,sulfacetamide sodium,\n\n Other\n \n\n sulfa antibiotic / cort...
6,sulfacetamide sodium,\n\n sulfa antibiotics\n \n\n
7,sulfacetamide sodium,\n\n Other\n \n\n sulfa antibiotics\n \...
8,sulfacetamide sodium,\n\n Sulfonamide Antibacterial [EPC]\n \n
9,sulfacetamide sodium,\n\n Sulfonamides [Chemical/Ingredient]\n \n


In [8]:
# Align the two sources: inner-join the known targets with the de-duplicated
# scraped phrases on the generic drug name, then drop the redundant join key.
# NOTE(review): `df_uses_known` can list one generic name under several brand
# names, which would repeat the same (phrase, target) pair here — confirm that
# this is intended for training.
df = (
    pd.merge(
        df_uses_known[['drugname_generic', column_of_interest]],
        df_goog.drop_duplicates(keep='first'),
        left_on='drugname_generic',
        right_on='Generic_Name',
        how='inner',
    )
    # `axis` must be given by keyword: passing it positionally was deprecated
    # in pandas 1.3 and raises TypeError from pandas 2.0 on.
    .drop('Generic_Name', axis=1)
    .reset_index(drop=True)
)
df

Unnamed: 0,drugname_generic,drug_major_class,Drug_Class_Data_clean
0,sulfacetamide sodium,0|DE000|OP000,\n\n sulfa antibiotics\n \n\n
1,sulfacetamide sodium,0|DE000|OP000,\n\n Other\n \n\n sulfa antibiotics\n \...
2,sulfacetamide sodium,0|DE000|OP000,\n\n Other\n \n\n sulfa antibiotics\n \...
3,sulfacetamide sodium,0|DE000|OP000,\n\n sulfa antibiotic / corticosteroid combin...
4,sulfacetamide sodium,0|DE000|OP000,\n\n Other\n \n\n sulfa antibiotic / cort...
5,sulfacetamide sodium,0|DE000|OP000,\n\n Other\n \n\n sulfa antibiotics\n \...
6,sulfacetamide sodium,0|DE000|OP000,\n\n Sulfonamide Antibacterial [EPC]\n \n
7,sulfacetamide sodium,0|DE000|OP000,\n\n Sulfonamides [Chemical/Ingredient]\n \n
8,sulfacetamide sodium,0|DE000|OP000,"\n\n Sulfonamide Antibacterial [EPC],Sulfonam..."
9,sulfacetamide sodium,0|DE000|OP000,\n\n Corticosteroid [EPC]\n \n


Now let's prepare a vocabulary list for the drug reference phrases we found via Google, and a dictionary mapping words to ids...

In [9]:
# Build the Translator from the aligned phrases and their targets. As the cell
# output shows, construction reports the most common words and how many
# uninformative training entries were dropped.
# NOTE(review): `mult_targets` / `unk_target` presumably tell the Translator
# that targets may hold several '|'-separated classes with '0' meaning
# unknown — confirm in NNClassifier.
translatorObj = Translator(
    input_phrases=df['Drug_Class_Data_clean'],
    input_targets=df[column_of_interest],
    mult_targets=True,
    unk_target='0',
)

Most common words are: 
[('<eof>', 29132), ('(', 6011), (')', 6011), ('-', 4796), (',', 4015), ('/', 3301), ('other', 3161), ('epc', 2360), ('medicine', 2052), ('sodium', 1823)]
We dropped 4051 uninformative training entries, 13.906% of the input set


Save the Translator...

In [10]:
# Write the translator to disk through its own `save` method; what exactly is
# stored is defined by NNClassifier.Translator, which is not visible here.
translatorObj.save('data/training_data.npz')

Translate the unknown drugs to be processed by the RNN...

In [11]:
# Encode every scraped phrase as a sequence of word ids, then replace the raw
# text with its cleaned form.
# NOTE(review): the second assignment overwrites `Drug_Class_Data_clean` in
# place, so re-running this cell feeds already-cleaned text back through the
# translator — confirm that is harmless, or restart the kernel before re-running.
df_goog['Drug_Class_Data_ids'] = df_goog['Drug_Class_Data_clean'].apply(translatorObj.translate_phrase)
df_goog['Drug_Class_Data_clean'] = df_goog['Drug_Class_Data_clean'].apply(translatorObj.clean_phrase)

# Drop uninformative entries: rows whose ids are all '<eof>' or '<unk>'.
# The two special ids are looked up once and whole id arrays are compared at
# C speed. The previous np.vectorize(lambda ...) ran a Python call with two
# dict lookups per id and raised ValueError on an empty id sequence.
eof_id = translatorObj.word_to_id['<eof>']
unk_id = translatorObj.word_to_id['<unk>']


def _informative_mask(ids):
    """Return a boolean array marking the ids that are neither '<eof>' nor '<unk>'."""
    ids = np.asarray(ids)
    return (ids != eof_id) & (ids != unk_id)


informative_words = df_goog['Drug_Class_Data_ids'].apply(_informative_mask)
# A row is informative when at least one of its ids survives the mask
# (np.any of an empty mask is False, so empty sequences are dropped too).
informative_entries = informative_words.apply(np.any)
df_goog[informative_entries]



Unnamed: 0,Generic_Name,Drug_Class_Data_clean,Drug_Class_Data_ids
0,sulfacetamide sodium,sulfa antibiotics <eof>,"[202, 1116, 2028, 2028, 2028, 2028, 2028, 2028..."
1,sulfacetamide sodium,other sulfa antibiotics accutane aczone atrali...,"[333, 202, 1116, 2254, 1204, 1728, 446, 80, 13..."
2,sulfacetamide sodium,sulfa antibiotics <eof>,"[202, 1116, 2028, 2028, 2028, 2028, 2028, 2028..."
3,sulfacetamide sodium,other sulfa antibiotics amoxil augmentin bactr...,"[333, 202, 1116, 1002, 2094, 355, 1839, 308, 8..."
4,sulfacetamide sodium,sulfa antibiotic / corticosteroid combinations...,"[202, 1730, 1457, 1182, 111, 2028, 2028, 2028,..."
5,sulfacetamide sodium,other sulfa antibiotic / corticosteroid combin...,"[333, 202, 1730, 1457, 1182, 111, 1681, 76, 19..."
6,sulfacetamide sodium,sulfa antibiotics <eof>,"[202, 1116, 2028, 2028, 2028, 2028, 2028, 2028..."
7,sulfacetamide sodium,other sulfa antibiotics amoxil augmentin bactr...,"[333, 202, 1116, 1002, 2094, 355, 1839, 308, 8..."
8,sulfacetamide sodium,sulfonamide antibacterial ( epc ) <eof>,"[1815, 1785, 671, 2232, 2178, 2028, 2028, 2028..."
9,sulfacetamide sodium,sulfonamides ( chemical / ingredient ) <eof>,"[1811, 671, 518, 1457, 780, 2178, 2028, 2028, ..."


Save the results...

In [12]:
# Persist the informative rows (column names plus cell values) as an .npz archive.
# `DataFrame.as_matrix()` was deprecated in pandas 0.23 and removed in 1.0;
# `.values` yields the same ndarray on every pandas version.
df_to_process = df_goog[informative_entries]  # filter once, reuse for both arrays
# NOTE: the frame mixes strings and id sequences, so NumPy stores these as
# pickled object arrays inside the archive.
with open('data/data_to_process.npz', 'wb') as file:
    np.savez(file, columns=df_to_process.columns, data=df_to_process.values)

Check that it loads properly:

In [13]:
# Round-trip check: rebuild a DataFrame from the saved archive.
# The stored arrays hold Python objects (strings and id sequences), which
# NumPy >= 1.16.3 refuses to load unless pickling is explicitly allowed.
# SECURITY: unpickling can execute arbitrary code — only enable it for trusted
# files such as this locally generated one.
with open('data/data_to_process.npz', 'rb') as file:
    npzfile = np.load(file, allow_pickle=True)
    # Index the archive while the file is still open: members are read lazily.
    df_test = pd.DataFrame(columns=npzfile['columns'], data=npzfile['data'])

In [14]:
# Display the reloaded frame so it can be compared by eye with the saved one.
df_test

Unnamed: 0,Generic_Name,Drug_Class_Data_clean,Drug_Class_Data_ids
0,sulfacetamide sodium,sulfa antibiotics <eof>,"[202, 1116, 2028, 2028, 2028, 2028, 2028, 2028..."
1,sulfacetamide sodium,other sulfa antibiotics accutane aczone atrali...,"[333, 202, 1116, 2254, 1204, 1728, 446, 80, 13..."
2,sulfacetamide sodium,sulfa antibiotics <eof>,"[202, 1116, 2028, 2028, 2028, 2028, 2028, 2028..."
3,sulfacetamide sodium,other sulfa antibiotics amoxil augmentin bactr...,"[333, 202, 1116, 1002, 2094, 355, 1839, 308, 8..."
4,sulfacetamide sodium,sulfa antibiotic / corticosteroid combinations...,"[202, 1730, 1457, 1182, 111, 2028, 2028, 2028,..."
5,sulfacetamide sodium,other sulfa antibiotic / corticosteroid combin...,"[333, 202, 1730, 1457, 1182, 111, 1681, 76, 19..."
6,sulfacetamide sodium,sulfa antibiotics <eof>,"[202, 1116, 2028, 2028, 2028, 2028, 2028, 2028..."
7,sulfacetamide sodium,other sulfa antibiotics amoxil augmentin bactr...,"[333, 202, 1116, 1002, 2094, 355, 1839, 308, 8..."
8,sulfacetamide sodium,sulfonamide antibacterial ( epc ) <eof>,"[1815, 1785, 671, 2232, 2178, 2028, 2028, 2028..."
9,sulfacetamide sodium,sulfonamides ( chemical / ingredient ) <eof>,"[1811, 671, 518, 1457, 780, 2178, 2028, 2028, ..."
