In [1]:
import pickle
from copy import deepcopy
import pandas as pd
import numpy as np
import re

In [2]:
def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# pandas display settings: show long text cells (up to 1000 characters)
# and print floats with three decimal places.
display_options = {
    'display.max_colwidth': 1000,
    'display.float_format': _three_decimals,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# load the dataset scraped from drugbank.ca into a new dataframe.
scraped_csv_path = '../Data Sources/drug_bank_scraped.csv'
db_df = pd.read_csv(scraped_csv_path)

In [4]:
# columns removed from the scraped data, and why:
#   - absorption and protein binding: many cells are paragraphs long with inconsistent formatting.
#   - synthesis reference: was going to be used for synthesis year, but some cells describe
#     the synthesis and others the first patent.
#   - CAS number and accession number: neither enables joining with the other tables.
#   - url fields: only needed during the scraping process.
columns_to_drop = [
    'db_absorbtion',
    'db_protein_binding',
    'db_synth_ref',
    'db_cas_num',
    'db_acc_num',
    'db_brand_names_table_url',
    'db_drug_interactions_url',
    'db_generic_names_table_url',
]
db_df.drop(columns=columns_to_drop, inplace=True)

In [5]:
# NOTE: an earlier version of this cell dropped every row whose db_brand_names was 'set()'
# (those entries were reported to be over-the-counter only, and the focus was on
# prescription medications). That filter is disabled, so those rows stay in db_df.
#
# give the frame a fresh 0..n-1 index. drop=True discards the old index directly instead of
# materialising it as an 'index' column and deleting it afterwards (which would delete a
# real data column if the CSV ever contained one named 'index'). reset_index already
# returns a new frame, so no extra deepcopy is needed.
db_df = db_df.reset_index(drop=True)

In [6]:
# chemicals with no brands or no generics are stored as the text 'set()';
# give those cells a readable label instead.
empty_set_labels = (
    ('db_brand_names', 'No Brands'),
    ('db_generic_names', 'No Generic Products'),
)
for column_name, label in empty_set_labels:
    is_empty_set = db_df[column_name] == 'set()'
    db_df.loc[is_empty_set, column_name] = label

In [7]:
# null values in these fields were either not available on DrugBank or not yet classified;
# all of them are labelled 'Not Available'.
# db_molecular_weight gets the same label so that every value in it is a string: the
# molecular-weight clean-up further down calls string methods on each value and later
# turns this placeholder into np.nan.
not_available_columns = ['db_chem_state', 'db_kingdom', 'db_class',
                         'db_super_class', 'db_sub_class', 'db_molecular_weight']
for column_name in not_available_columns:
    db_df.loc[db_df[column_name].isnull(), column_name] = 'Not Available'

# db_num_targets: missing values are intentionally left as NaN. The previous statement
# here assigned np.nan to cells that were already null, which changed nothing, so it
# has been removed.

In [8]:
# fixed formatting in db_brand_names column:
# remove the braces, brackets and quote characters left over from the scraped text.
cleaned_brand_names = []
for raw_names in db_df['db_brand_names']:
    for symbol in ('{', '}', '[', ']', '"', "'"):
        raw_names = raw_names.replace(symbol, '')
    cleaned_brand_names.append(raw_names)
db_df['db_brand_names'] = cleaned_brand_names

In [9]:
# fixed formatting in db_class column.
# values are presumably either an anchor tag ('<a ...>Label</a>') or plain text - confirm
# against the scraped CSV. The label is taken as the text after the first '>'.
class_labels = []
for raw_value in db_df['db_class']:
    # make sure 'Not ...' placeholders and the untagged 'Carboxylic Acids and Derivatives'
    # value are each preceded by exactly one '>' so the split below finds them.
    text = raw_value.replace('>Not', 'Not').replace('Not', '>Not')
    text = text.replace('Carboxylic Acids and Derivatives', '>Carboxylic Acids and Derivatives')
    label = text.split('>')[1]
    # remove the leftover closing-tag fragment. str.strip('</a') would instead strip any
    # leading/trailing '<', '/' and 'a' characters, truncating labels that start or end
    # with the letter 'a'.
    if label.endswith('</a'):
        label = label[:-len('</a')]
    class_labels.append(label)
db_df['db_class'] = class_labels
# same normalisation the db_sub_class / db_super_class cells apply to span placeholders
# and empty labels.
db_df.loc[db_df['db_class'].isin(['Not Available</span', '']), 'db_class'] = 'Not Available'

In [10]:
# fixed formatting in db_sub_class column.
# values are presumably either an anchor tag ('<a ...>Label</a>') or plain text - confirm
# against the scraped CSV. The label is taken as the text after the first '>'.
sub_class_labels = []
for raw_value in db_df['db_sub_class']:
    # make sure 'Not ...' placeholders and the untagged 'Amino Acids, Peptides, and Analogues'
    # value are each preceded by exactly one '>' so the split below finds them.
    text = raw_value.replace('>Not Available', 'Not Available').replace('Not', '>Not')
    text = text.replace('Amino Acids, Peptides, and Analogues', '>Amino Acids, Peptides, and Analogues')
    label = text.split('>')[1]
    # remove the leftover closing-tag fragment. str.strip('</a') would instead strip any
    # leading/trailing '<', '/' and 'a' characters, truncating labels that start or end
    # with the letter 'a'.
    if label.endswith('</a'):
        label = label[:-len('</a')]
    sub_class_labels.append(label)
db_df['db_sub_class'] = sub_class_labels
# span placeholders and empty labels both mean the sub class is not available.
db_df.loc[db_df['db_sub_class'] == 'Not Available</span', 'db_sub_class'] = 'Not Available'
db_df.loc[db_df['db_sub_class'] == '', 'db_sub_class'] = 'Not Available'

In [11]:
# fixed formatting in db_super_class column.
# values are presumably either an anchor tag ('<a ...>Label</a>') or plain text - confirm
# against the scraped CSV. The label is taken as the text after the first '>'.
super_class_labels = []
for raw_value in db_df['db_super_class']:
    # make sure 'Not ...' placeholders and untagged 'Organic Acids' values are each
    # preceded by exactly one '>' so the split below finds them.
    text = raw_value.replace('>Not Available', 'Not Available').replace('Not', '>Not')
    text = text.replace('Organic Acids', '>Organic Acids')
    label = text.split('>')[1]
    # remove the leftover closing-tag fragment. str.strip('</a') would instead strip any
    # leading/trailing '<', '/' and 'a' characters, truncating labels that start or end
    # with the letter 'a'.
    if label.endswith('</a'):
        label = label[:-len('</a')]
    super_class_labels.append(label)
db_df['db_super_class'] = super_class_labels
# span placeholders and empty labels both mean the super class is not available.
db_df.loc[db_df['db_super_class'] == 'Not Available</span', 'db_super_class'] = 'Not Available'
db_df.loc[db_df['db_super_class'] == '', 'db_super_class'] = 'Not Available'

In [12]:
# fixed formatting in db_affected_organisms column.
# raw values are presumably HTML fragments: a '<ul class="list-unstyled table-list">' list
# of '<li>' items, or a '<span class="not-available">' placeholder - confirm against the CSV.
# str.strip() treats its argument as a set of characters rather than a prefix/suffix, so
# stripping with whole tag strings can also remove leading or trailing letters of an
# organism name. The tags are removed explicitly instead.
affected_organisms = []
for raw_html in db_df['db_affected_organisms']:
    # adjacent list items become a comma-separated list. The pattern matches a literal
    # backslash followed by 'n' between the items, not a newline character.
    text = raw_html.replace('</li>\\n<li>', ', ')
    # drop opening and closing <ul>, <li> and <span> tags, keeping only the text inside.
    text = re.sub(r'</?(?:ul|li|span)\b[^>]*>', '', text)
    affected_organisms.append(text.strip())
db_df['db_affected_organisms'] = affected_organisms

In [13]:
# fixed formatting in db_molecular_weight column.
# removed Monoisotopic molecular weights but kept averages.
# 'Not Available' placeholders and blank values become np.nan.
avg_weights = []
for raw_weight in db_df['db_molecular_weight']:
    text = raw_weight.replace('Average: ', '')
    # drop the ' <br>Label:' separator so the two weights are separated by a single space.
    text = re.sub(r' <br>\w+\:', '', text)
    # remove the placeholder's opening tag as a literal string; str.strip() with the tag
    # text would treat it as a set of characters to trim from both ends.
    text = text.replace('<span class="not-available">', '').strip()
    text = text.replace('</', ' ')
    # the first space-separated token is the average weight ('Not' for placeholders).
    avg_weights.append(text.split(' ')[0])
db_df['db_molecular_weight'] = avg_weights

# 'Not' is what remains of a 'Not Available' placeholder; '' is a blank value.
db_df.loc[db_df['db_molecular_weight'].isin(['Not', '']), 'db_molecular_weight'] = np.nan

# rows of the current scrape whose weight was blank but not recorded as np.nan.
# NOTE(review): these labels are specific to this CSV - confirm after any re-scrape.
# The membership check stops .loc from appending a new row when a label is absent.
for row_label in (3765, 4605, 8979):
    if row_label in db_df.index:
        db_df.loc[row_label, 'db_molecular_weight'] = None

db_df.rename(columns={'db_molecular_weight': 'db_avg_molecular_weight'}, inplace=True)

In [14]:
# replace the HTML 'Not Available' placeholder in db_molecular_framework with plain text.
not_available_html = '<span class="not-available">Not Available</span>'
is_html_placeholder = db_df['db_molecular_framework'] == not_available_html
db_df.loc[is_html_placeholder, 'db_molecular_framework'] = 'Not Available'

In [15]:
# final column names, chosen so they read well after the future join.
column_renames = {
    'db_drug_name': 'chem_name',
    'db_drug_type': 'drug_type',
    'db_drug_group': 'drug_group',
    'db_affected_organisms': 'affected_organisms',
    'db_chem_state': 'chem_state',
    'db_kingdom': 'chem_kingdom',
    'db_super_class': 'chem_super_class',
    'db_class': 'chem_class',
    'db_sub_class': 'chem_sub_class',
    'db_molecular_framework': 'molecular_framework',
    'db_avg_molecular_weight': 'avg_molecular_weight',
    'db_num_clin_trials': 'num_clinical_trials',
    'db_num_dosage_forms': 'num_dosage_forms',
    'db_num_drug_interactions': 'num_drug_interactions',
    'db_num_food_interactions': 'num_food_interactions',
    'db_num_manufacturers': 'num_manufacturers_for_chem',
    'db_num_packagers': 'num_packagers_for_chem',
    'db_num_patents': 'num_patents_on_chem',
    'db_num_targets': 'num_targets',
    'db_brand_names': 'brand_names',
    'db_generic_names': 'generic_names',
}
db_df.rename(columns=column_renames, inplace=True)

In [16]:
# cast the weight and count columns to float.
float_columns = [
    'avg_molecular_weight',
    'num_clinical_trials',
    'num_dosage_forms',
    'num_drug_interactions',
    'num_food_interactions',
    'num_manufacturers_for_chem',
    'num_packagers_for_chem',
    'num_patents_on_chem',
    'num_targets',
]
for column_name in float_columns:
    db_df[column_name] = db_df[column_name].astype(float)

In [17]:
# save the cleaned dataframe as a pickle file.
pickle_path = '../Pickles/db_df.pkl'
db_df.to_pickle(pickle_path)

In [18]:
# print a summary of the cleaned dataframe: index range, column names,
# non-null counts and dtypes.
db_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11850 entries, 0 to 11849
Data columns (total 22 columns):
affected_organisms            11850 non-null object
brand_names                   11850 non-null object
chem_state                    11850 non-null object
chem_class                    11850 non-null object
drug_group                    11850 non-null object
chem_name                     11850 non-null object
drug_type                     11850 non-null object
generic_names                 11850 non-null object
chem_kingdom                  11850 non-null object
molecular_framework           9453 non-null object
avg_molecular_weight          9535 non-null float64
num_clinical_trials           11850 non-null float64
num_dosage_forms              11850 non-null float64
num_drug_interactions         11850 non-null float64
num_food_interactions         11850 non-null float64
num_manufacturers_for_chem    11850 non-null float64
num_packagers_for_chem        11850 non-null float64
nu