In [5]:
import pandas as pd
import sqlite3 as lite

In [4]:
# Path to the local SQLite file that holds the scraped drug-class data.
database = "googDrugData.db"

We start by preparing some training data. We have a (small) database of properly classified drugs, `df_uses`, and the (much larger) database `df_goog` of drugs labeled by strings of *drug class information* which we've scraped from Google. We want to train a neural network to read the strings and output the correct drug classes. So we need to align these databases to get training data...

First, we download a database which connects drugs with their uses:

In [11]:
# Fetch the drug -> use classification table from data.world.
# The CSV's first column becomes the index and only the first 500 data rows are read.
df_uses = pd.read_csv(
    'https://query.data.world/s/81zz5jv9bpf4n70eoaef1lvjw',
    index_col=0,
    nrows=500,
)
df_uses

Unnamed: 0,drugname_brand,drugname_generic,anatomical,therapeutic,pharmacologic,chemical,substance,name
0,8-MOP,METHOXSALEN,DERMATOLOGICALS,ANTIPSORIATICS,ANTIPSORIATICS FOR TOPICAL USE,PSORALENS FOR TOPICAL USE,METHOXSALEN,METHOXSALEN
1,8-MOP,METHOXSALEN,DERMATOLOGICALS,ANTIPSORIATICS,ANTIPSORIATICS FOR SYSTEMIC USE,PSORALENS FOR SYSTEMIC USE,METHOXSALEN,METHOXSALEN
2,OXSORALEN,METHOXSALEN,DERMATOLOGICALS,ANTIPSORIATICS,ANTIPSORIATICS FOR TOPICAL USE,PSORALENS FOR TOPICAL USE,METHOXSALEN,METHOXSALEN
3,OXSORALEN,METHOXSALEN,DERMATOLOGICALS,ANTIPSORIATICS,ANTIPSORIATICS FOR SYSTEMIC USE,PSORALENS FOR SYSTEMIC USE,METHOXSALEN,METHOXSALEN
4,ABILIFY,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE
5,ABILIFY,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE CAVOXIL
6,ABILIFY,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE LAUROXIL
7,ABILIFY,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE HYDRATE
8,ABILIFY DISCMELT,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE
9,ABILIFY DISCMELT,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,OTHER ANTIPSYCHOTICS,ARIPIPRAZOLE,ARIPIPRAZOLE CAVOXIL


Next, we correlate this with our local database of drug uses scraped from the internet. Let's load that database:

In [24]:
# Load the scraped drug-class strings from the local SQLite database.
# Note: sqlite3's `with connection:` block only commits / rolls back the
# transaction -- it does NOT close the connection -- so the connection is
# closed explicitly here once the read has finished.
con = lite.connect(database)
try:
    df_goog = pd.read_sql(
        'select Generic_Name, Drug_Class_Data_clean from goog_drug_class_data;',
        con,
    )
finally:
    con.close()

# Do a bit of cleaning: strip leading/trailing whitespace from the generic
# names so they can be compared exactly against other name columns.
# The vectorised .str accessor leaves missing values as NaN instead of raising
# AttributeError the way `lambda x: x.strip()` would.
df_goog['Generic_Name'] = df_goog['Generic_Name'].str.strip()
df_goog

Unnamed: 0,Generic_Name,Drug_Class_Data_clean
0,SULFACETAMIDE SODIUM,\n\n sulfa antibiotics\n \n\n
1,SULFACETAMIDE SODIUM,\n\n Other\n \n\n sulfa antibiotics\n \...
2,SULFACETAMIDE SODIUM,\n\n sulfa antibiotics\n \n\n
3,SULFACETAMIDE SODIUM,\n\n Other\n \n\n sulfa antibiotics\n \...
4,SULFACETAMIDE SODIUM,\n\n sulfa antibiotic / corticosteroid combin...
5,SULFACETAMIDE SODIUM,\n\n Other\n \n\n sulfa antibiotic / cort...
6,SULFACETAMIDE SODIUM,\n\n sulfa antibiotics\n \n\n
7,SULFACETAMIDE SODIUM,\n\n Other\n \n\n sulfa antibiotics\n \...
8,SULFACETAMIDE SODIUM,\n\n Sulfonamide Antibacterial [EPC]\n \n
9,SULFACETAMIDE SODIUM,\n\n Sulfonamides [Chemical/Ingredient]\n \n


In [28]:
# Build the aligned table: attach each scraped class string in df_goog to the
# curated anatomical / therapeutic / pharmacologic labels of the drug with the
# same generic name. The inner join keeps only names present in both frames.
df = (
    pd.merge(
        df_uses[['drugname_generic', 'anatomical', 'therapeutic', 'pharmacologic']],
        df_goog,
        left_on='drugname_generic',
        right_on='Generic_Name',
        how='inner',
    )
    # After the join Generic_Name is a copy of drugname_generic, so drop it.
    # The axis is given by keyword: the positional form drop(label, 1) was
    # deprecated in pandas 1.3 and removed in pandas 2.0.
    .drop(columns='Generic_Name')
    # Collapse fully identical rows, keeping the first occurrence, and renumber.
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
df

Unnamed: 0,drugname_generic,anatomical,therapeutic,pharmacologic,Drug_Class_Data_clean
0,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Type of medicine\n \n\n An antipsychoti...
1,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Antipsychotic\n \n\n
2,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n atypical antipsychotics\n \n\n
3,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Other\n \n\n atypical antipsychotics...
4,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Atypical Antipsychotic\n \n\n
5,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Antidepressant\n \n\n
6,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Antipsychotics\n \n\n
7,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Antipsychotic Medication\n \n\n
8,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n Neurologie-psychiatrie\n \n\n
9,ARIPIPRAZOLE,NERVOUS SYSTEM,PSYCHOLEPTICS,ANTIPSYCHOTICS,\n\n\n Antipsychotic\n \n\n\n


In [20]:
# Peek at the generic name stored under index label 0 of the curated table.
df_uses.loc[0, 'drugname_generic']

'METHOXSALEN'

In [22]:
# Peek at the generic name stored under index label 0 of the scraped table
# (handy for spotting stray whitespace in the raw strings).
df_goog.loc[0, 'Generic_Name']

'SULFACETAMIDE SODIUM '