In [32]:
import pandas as pd
import sqlite3 as lite
import re
from collections import Counter
import itertools
from NNClassifier import translator

In [2]:
#local database:
database = 'googDrugData1.db'

We start by preparing some training data. We have a (small) database of properly classifed drugs, `df_uses`, and the (much larger) database `df_goog` of drugs labeled by strings of *drug class information* which we've scraped from google. We want to train a neural network to read the strings and output the correct drug classes. So we need to align these databases to get training data...

First, we download a database which connects drugs with their uses:

In [3]:
df_uses = pd.read_csv('https://query.data.world/s/81zz5jv9bpf4n70eoaef1lvjw',index_col=0,nrows = 500)
df_uses

Unnamed: 0,drugname_brand,drugname_generic,anatomical,therapeutic,pharmacologic,chemical,substance,name
0,8-MOP,METHOXSALEN,DERMATOLOGICALS,ANTIPSORIATICS,ANTIPSORIATICS FOR TOPICAL USE,PSORALENS FOR TOPICAL USE,METHOXSALEN,METHOXSALEN
1,8-MOP,METHOXSALEN,DERMATOLOGICALS,ANTIPSORIATICS,ANTIPSORIATICS FOR SYSTEMIC USE,PSORALENS FOR SYSTEMIC USE,METHOXSALEN,METHOXSALEN
2,OXSORALEN,METHOXSALEN,DERMATOLOGICALS,ANTIPSORIATICS,ANTIPSORIATICS FOR TOPICAL USE,PSORALENS FOR TOPICAL USE,METHOXSALEN,METHOXSALEN
3,OXSORALEN,METHOXSALEN,DERMATOLOGICALS,ANTIPSORIATICS,ANTIPSORIATICS FOR SYSTEMIC USE,PSORALENS FOR SYSTEMIC USE,METHOXSALEN,METHOXSALEN
4,ABILIFY,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE
5,ABILIFY,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE CAVOXIL
6,ABILIFY,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE LAUROXIL
7,ABILIFY,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE HYDRATE
8,ABILIFY DISCMELT,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE
9,ABILIFY DISCMELT,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE CAVOXIL


Next, we correlate this with our local database of drug uses scraped from the internet. Let's load that database:

In [4]:
with lite.connect(database) as con:
    df_goog = pd.read_sql('select Generic_Name, Drug_Class_Data_clean from goog_drug_class_data;',con)
#Do a bit of cleaning:
df_goog['Generic_Name'] = df_goog['Generic_Name'].apply(lambda x: x.strip())
df_goog

Unnamed: 0,Generic_Name,Drug_Class_Data_clean
0,SULFACETAMIDE SODIUM,\n\n sulfa antibiotics\n \n\n
1,SULFACETAMIDE SODIUM,\n\n Other\n \n\n sulfa antibiotics\n \...
2,SULFACETAMIDE SODIUM,\n\n sulfa antibiotics\n \n\n
3,SULFACETAMIDE SODIUM,\n\n Other\n \n\n sulfa antibiotics\n \...
4,SULFACETAMIDE SODIUM,\n\n sulfa antibiotic / corticosteroid combin...
5,SULFACETAMIDE SODIUM,\n\n Other\n \n\n sulfa antibiotic / cort...
6,SULFACETAMIDE SODIUM,\n\n sulfa antibiotics\n \n\n
7,SULFACETAMIDE SODIUM,\n\n Other\n \n\n sulfa antibiotics\n \...
8,SULFACETAMIDE SODIUM,\n\n Sulfonamide Antibacterial [EPC]\n \n
9,SULFACETAMIDE SODIUM,\n\n Sulfonamides [Chemical/Ingredient]\n \n


In [5]:
df = pd.merge(df_uses[['drugname_generic','anatomical','therapeutic','pharmacologic']]
                      ,df_goog,
              left_on='drugname_generic',
              right_on='Generic_Name',
              how='inner').drop('Generic_Name',1).drop_duplicates(keep='first').reset_index(drop=True)
df

Unnamed: 0,drugname_generic,anatomical,therapeutic,pharmacologic,Drug_Class_Data_clean
0,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Type of medicine\n \n\n An antipsychoti...
1,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Antipsychotic\n \n\n
2,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n atypical antipsychotics\n \n\n
3,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Other\n \n\n atypical antipsychotics...
4,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Atypical Antipsychotic\n \n\n
5,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Antidepressant\n \n\n
6,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Antipsychotics\n \n\n
7,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Antipsychotic Medication\n \n\n
8,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Neurologie-psychiatrie\n \n\n
9,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n\n Antipsychotic\n \n\n\n


Now let's prepare a list of vocabulary for the drug reference phrases we found via google, and a dictionary between vocabulary and words...

In [6]:
def clean_my_string(my_string):
    my_string = my_string.replace('\n',' ').lower()
    my_string = re.sub('[^A-Za-z0-9\-,;./\s\[\]\(\)]+', ' ', my_string)
    my_string = my_string.replace(',',' , ')\
             .replace('.',' , ')\
             .replace(';',' , ')\
             .replace('(',' ( ')\
             .replace('[',' ( ')\
             .replace(')',' ) ')\
             .replace(']',' ) ')\
             .replace('-',' - ')\
             .replace('/',' / ')
    my_string=my_string+' <eof> '
    my_string = ' '.join(my_string.split()).strip()
    #If the string is too long, it isn't likely to be very specific
    if len(my_string) > 200:
        return ''
    return my_string
    

In [7]:
df_goog['Drug_Class_Data_clean'] = df_goog['Drug_Class_Data_clean'].apply(clean_my_string)

df_goog



Unnamed: 0,Generic_Name,Drug_Class_Data_clean
0,SULFACETAMIDE SODIUM,sulfa antibiotics <eof>
1,SULFACETAMIDE SODIUM,
2,SULFACETAMIDE SODIUM,sulfa antibiotics <eof>
3,SULFACETAMIDE SODIUM,other sulfa antibiotics amoxil augmentin bactr...
4,SULFACETAMIDE SODIUM,sulfa antibiotic / corticosteroid combinations...
5,SULFACETAMIDE SODIUM,
6,SULFACETAMIDE SODIUM,sulfa antibiotics <eof>
7,SULFACETAMIDE SODIUM,other sulfa antibiotics amoxil augmentin bactr...
8,SULFACETAMIDE SODIUM,sulfonamide antibacterial ( epc ) <eof>
9,SULFACETAMIDE SODIUM,sulfonamides ( chemical / ingredient ) <eof>


In [8]:
vocab = Counter(itertools.chain.from_iterable(x.split(' ') for x in df_goog['Drug_Class_Data_clean']))
vocab.most_common(50)

[('<eof>', 16653),
 (')', 3681),
 ('(', 3681),
 ('', 2995),
 ('-', 2306),
 (',', 2101),
 ('/', 1970),
 ('epc', 1487),
 ('other', 1149),
 ('sodium', 1138),
 ('medicine', 1114),
 ('fluorure', 1110),
 ('of', 930),
 ('type', 928),
 ('moa', 819),
 ('agent', 737),
 ('combinations', 696),
 ('ingredient', 680),
 ('chemical', 680),
 ('inhibitors', 634),
 ('et', 607),
 ('a', 513),
 ('vitamin', 452),
 ('anti', 448),
 ('inhibitor', 444),
 ('association', 425),
 ('en', 413),
 ('associations', 388),
 ('opioid', 376),
 ('olaflur', 370),
 ('monofluorophosphate', 370),
 ('etain', 370),
 ('agonists', 362),
 ('corticosteroid', 342),
 ('antibiotics', 333),
 ('de', 327),
 ('receptor', 324),
 ('d', 318),
 ('pe', 307),
 ('an', 299),
 ('contraceptive', 285),
 ('estrogen', 279),
 ('s', 274),
 ('antagonists', 270),
 ('combination', 267),
 ('medicaments', 260),
 ('central', 257),
 ('system', 256),
 ('nervous', 242),
 ('progestin', 239)]

In [9]:
len(vocab)

2713

In [10]:
uncommon_words = [w for w in vocab if vocab[w]<5]
len(uncommon_words)

1068

In [11]:
def replace_uncommon_words(my_string):
    #for w in uncommon_words:
    my_list = my_string.split(' ')
    my_new_list = [w if w not in uncommon_words else '<unk>' for w in my_list]
    #my_string= my_string.replace(' '+w+' ',' <unk> ')
    return ' '.join(my_new_list).strip()

In [12]:
df_goog['Drug_Class_Data_clean'] = df_goog['Drug_Class_Data_clean'].apply(replace_uncommon_words)
vocab = Counter(itertools.chain.from_iterable(x.split(' ') for x in df_goog['Drug_Class_Data_clean']))
len(vocab)

1646

In [13]:
lengths = Counter(len(x.split(' ')) for x in df_goog['Drug_Class_Data_clean'])
lengths

Counter({1: 3214,
         2: 3294,
         3: 2757,
         4: 1226,
         5: 2212,
         6: 1449,
         7: 1649,
         8: 1018,
         9: 489,
         10: 171,
         11: 266,
         12: 190,
         13: 207,
         14: 156,
         15: 80,
         16: 85,
         17: 209,
         18: 35,
         19: 79,
         20: 60,
         21: 90,
         22: 44,
         23: 136,
         24: 145,
         25: 186,
         26: 88,
         27: 19,
         28: 34,
         29: 32,
         30: 5,
         31: 2,
         32: 7,
         33: 5,
         35: 9})

In [14]:
word_to_id={word:id for id,word in enumerate(vocab)}
id_to_word={id:word for id,word in enumerate(vocab)}
word_to_id

{'': 0,
 'lmicos': 1,
 'impulse': 3,
 'glucose': 4,
 'striant': 5,
 'argatroban': 6,
 'iron': 7,
 'tenormin': 8,
 'injector': 9,
 'fenoglide': 10,
 'imidazol': 11,
 'vasopressin': 12,
 'diuresis': 13,
 'hct': 14,
 'neuroleptiques': 15,
 'sphingosine': 16,
 'acidosis': 17,
 'tranxene': 18,
 'folic': 19,
 'g': 20,
 'revlimid': 21,
 'voltaren': 22,
 'epileptics': 23,
 'associations': 24,
 'procrit': 25,
 'benzothiazepine': 26,
 'density': 548,
 'estroprogestatifs': 28,
 'cold': 29,
 'tiazac': 30,
 'silenor': 31,
 'rhumatologie': 32,
 'bisphosphonates': 33,
 'namenda': 414,
 'system': 35,
 'calcineurin': 36,
 'production': 37,
 'gaba': 38,
 'devices': 39,
 'atropine': 40,
 'medication': 41,
 'septra': 42,
 'niramine': 44,
 'midrin': 45,
 'quadrivalent': 46,
 'sulfonamide': 47,
 'intradermal': 48,
 'ergic': 49,
 'proscar': 50,
 'inhibitor': 829,
 'a01aa': 52,
 'pde5': 53,
 'eryc': 54,
 'antiglaucomateux': 55,
 'lamisil': 56,
 'virus': 1092,
 'neurologie': 58,
 'agents': 59,
 'antidepressant

In [15]:
target_column = 'anatomical'
targets = list(set(df_uses[target_column]))
target_to_id={word:id for id,word in enumerate(targets)}
id_to_target={id:word for id,word in enumerate(targets)}
target_to_id

{'ALIMENTARY TRACT AND METABOLISM': 2,
 'ANTIINFECTIVES FOR SYSTEMIC USE': 12,
 'ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS': 9,
 'ANTIPARASITIC PRODUCTS, INSECTICIDES AND REPELLENTS': 3,
 'BLOOD AND BLOOD FORMING ORGANS': 4,
 'CARDIOVASCULAR SYSTEM': 6,
 'DERMATOLOGICALS': 11,
 'GENITO URINARY SYSTEM AND SEX HORMONES': 0,
 'NERVOUS SYSTEM': 5,
 'RESPIRATORY SYSTEM': 7,
 'SENSORY ORGANS': 10,
 'SYSTEMIC HORMONAL PREPARATIONS, EXCL. SEX HORMONES AND INSULINS': 8,
 'VARIOUS': 1}

In [18]:
max_length = 32
def translate_my_string(my_string):
    numerical_list = [word_to_id[w] for w in my_string.split(' ') if w != '']
    if len(numerical_list) > max_length:
        numerical_list = numerical_list[:max_length-1] 
    padded_list=numerical_list+[word_to_id['<eof>']]*(max_length-len(numerical_list))
    return padded_list

In [19]:
df_goog['numerical_drug_data'] = df_goog['Drug_Class_Data_clean'].apply(translate_my_string)

Unnamed: 0,Generic_Name,Drug_Class_Data_clean,numerical_drug_data
0,SULFACETAMIDE SODIUM,sulfa antibiotics <eof>,"[1209, 1020, 1240, 1240, 1240, 1240, 1240, 124..."
1,SULFACETAMIDE SODIUM,,"[1240, 1240, 1240, 1240, 1240, 1240, 1240, 124..."
2,SULFACETAMIDE SODIUM,sulfa antibiotics <eof>,"[1209, 1020, 1240, 1240, 1240, 1240, 1240, 124..."
3,SULFACETAMIDE SODIUM,other sulfa antibiotics amoxil augmentin bactr...,"[1155, 1209, 1020, 1481, 1065, 674, 742, 1635,..."
4,SULFACETAMIDE SODIUM,sulfa antibiotic / corticosteroid combinations...,"[1209, 1365, 389, 1032, 446, 1240, 1240, 1240,..."
5,SULFACETAMIDE SODIUM,,"[1240, 1240, 1240, 1240, 1240, 1240, 1240, 124..."
6,SULFACETAMIDE SODIUM,sulfa antibiotics <eof>,"[1209, 1020, 1240, 1240, 1240, 1240, 1240, 124..."
7,SULFACETAMIDE SODIUM,other sulfa antibiotics amoxil augmentin bactr...,"[1155, 1209, 1020, 1481, 1065, 674, 742, 1635,..."
8,SULFACETAMIDE SODIUM,sulfonamide antibacterial ( epc ) <eof>,"[47, 1166, 176, 1100, 108, 1240, 1240, 1240, 1..."
9,SULFACETAMIDE SODIUM,sulfonamides ( chemical / ingredient ) <eof>,"[1098, 176, 686, 389, 75, 108, 1240, 1240, 124..."


In [20]:
df['numerical_drug_data'] = df['Drug_Class_Data_clean'].apply(translate_my_string)
df['target_id'] = df[target_column].apply(lambda x: target_to_id[x])
df

KeyError: '\n\n'

In [58]:
word_to_id['<unk>']

493

In [59]:
vocab.most_common()

[('<unk>', 86730),
 ('<e', 14083),
 ('>', 14083),
 ('s', 4476),
 ('r', 3569),
 (')', 3247),
 ('(', 3247),
 ('an', 2942),
 ('', 2540),
 ('d', 2338),
 ('e', 2031),
 ('-', 1933),
 ('t', 1905),
 (',', 1821),
 ('n', 1762),
 ('/', 1709),
 ('l', 1634),
 ('u', 1569),
 ('a', 1505),
 ('o', 1464),
 ('epc', 1309),
 ('pe', 1074),
 ('al', 1063),
 ('m', 1058),
 ('sodium', 990),
 ('inhibi', 982),
 ('ure', 972),
 ('ions', 954),
 ('c', 920),
 ('h', 895),
 ('ion', 825),
 ('mbin', 814),
 ('in', 793),
 ('of', 787),
 ('et', 763),
 ('associ', 724),
 ('ant', 701),
 ('ep', 698),
 ('moa', 687),
 ('en', 665),
 ('de', 658),
 ('agent', 646),
 ('b', 631),
 ('chem', 623),
 ('ingredient', 623),
 ('min', 598),
 ('rt', 589),
 ('rs', 579),
 ('os', 552),
 ('agoni', 495),
 ('biot', 399),
 ('ro', 369),
 ('ur', 365),
 ('on', 361),
 ('roid', 357),
 ('am', 337),
 ('p', 337),
 ('hosph', 332),
 ('ve', 331),
 ('ps', 329),
 ('ioid', 328),
 ('re', 328),
 ('mono', 324),
 ('ola', 324),
 ('goni', 314),
 ('ntr', 305),
 ('ge', 299),
 (

In [67]:
replace_uncommon_words('hi there <eop>')

'hi there <eop>'

In [16]:
a = [1,2,3,4]
a

[1, 2, 3]

In [31]:
tranObj = translator.Translator(df[target_column],df['Drug_Class_Data_clean'])

NameError: name 'clean_phrase' is not defined