In [11]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [12]:
# Render floats with three decimal places in displayed frames.
pd.set_option('display.float_format', lambda value: '%.3f' % value)
# Allow up to 1000 characters per cell before pandas truncates the text.
pd.set_option('display.max_colwidth', 1000)

In [13]:
# Load the merged dataframe from its pickle file.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
df = pd.read_pickle('../Pickles/merged_df.pkl')

# Report the starting row count before any filtering.
print(df.shape[0])

1780


In [14]:
# Show the column labels of the loaded dataframe.
df.columns

Index(['drug_name', 'generic_name', 'grx_lowest_price', 'grx_pharmacy',
       'grx_purchase', 'grx_lowest_price_adj', 'chem_name',
       'affected_organisms', 'chem_state', 'chem_class', 'drug_group',
       'drug_type', 'chem_kingdom', 'molecular_framework',
       'avg_molecular_weight', 'num_clinical_trials', 'num_dosage_forms',
       'num_drug_interactions', 'num_food_interactions',
       'num_manufacturers_for_chem', 'num_packagers_for_chem',
       'num_patents_on_chem', 'num_targets', 'chem_sub_class',
       'chem_super_class', 'over_the_counter', 'nadac_cost_per_unit'],
      dtype='object')

In [15]:
# Keep only the rows where nadac_cost_per_unit is strictly lower than
# grx_lowest_price; rows where it is equal or higher are dropped.
# Rows with a missing value in either column are dropped too, because a
# comparison against NaN evaluates to False.
df = df[df['nadac_cost_per_unit'] < df['grx_lowest_price']].copy()

# Row count after the price-comparison filter.
print(len(df))

1646


In [16]:
# Keep only the rows whose over_the_counter flag is exactly 'N';
# every other value (including missing flags) is filtered out.
is_not_otc = df['over_the_counter'].eq('N')
df = df.loc[is_not_otc].copy()

# Row count after dropping the over-the-counter rows.
print(df.shape[0])

1559


In [17]:
# Keep only the rows whose grx_purchase text mentions one of the accepted
# forms: tablet, capsule, ampule, caplet or pill (case-insensitive substring
# match, so plurals such as "tablets" also match).
# The vectorized str.contains replaces a per-element `in` test that raised
# TypeError on a missing grx_purchase value; with na=False such rows are
# simply treated as non-matching and dropped.
df = df[
    df['grx_purchase']
    .str.lower()
    .str.contains('tablet|capsule|ampule|caplet|pill', na=False)
].copy()

# Row count after keeping only the accepted dosage forms.
print(len(df))

1087


In [18]:
# Drop rows that lack usable values in the columns checked below, so the data
# can be used for linear regression with sklearn. Numeric columns are tested
# for nulls; the categorical columns use the string 'Not Available' as their
# placeholder for a missing value.
has_numeric_values = (
    df['num_targets'].notna()
    & df['avg_molecular_weight'].notna()
)
has_category_values = (
    df['molecular_framework'].ne('Not Available')
    & df['chem_kingdom'].ne('Not Available')
    & df['chem_state'].ne('Not Available')
)
df = df.loc[has_numeric_values & has_category_values].copy()

# Row count after removing the rows with missing values.
print(df.shape[0])

1034


In [19]:
# Save the filtered dataframe to its own pickle file (modeling_df.pkl).
# This writes a new file; the merged_df.pkl input read earlier is not overwritten.
df.to_pickle('../Pickles/modeling_df.pkl')

In [20]:
# Summarize the filtered dataframe: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1034 entries, 0 to 1771
Data columns (total 27 columns):
drug_name                     1034 non-null object
generic_name                  1034 non-null object
grx_lowest_price              1034 non-null float64
grx_pharmacy                  1034 non-null object
grx_purchase                  1034 non-null object
grx_lowest_price_adj          1034 non-null float64
chem_name                     1034 non-null object
affected_organisms            1034 non-null object
chem_state                    1034 non-null object
chem_class                    1034 non-null object
drug_group                    1034 non-null object
drug_type                     1034 non-null object
chem_kingdom                  1034 non-null object
molecular_framework           1034 non-null object
avg_molecular_weight          1034 non-null float64
num_clinical_trials           1034 non-null float64
num_dosage_forms              1034 non-null float64
num_drug_interactions