In [1]:
# The data for this notebook were downloaded from https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory
# using the "NDC Database File - Excel Version (Zip Format)" download.

In [2]:
# Only the non-proprietary names of the drugs in this database are used. The files were trimmed significantly
# in Excel to reduce the file size for uploading and to make the dataset generally more manageable.

In [3]:
import numpy as np
import pandas as pd
import random 

In [4]:
# load the trimmed NDC product spreadsheet into a DataFrame
product_xlsx = '/Users/chrishuhn/product.xlsx'
df_product = pd.read_excel(product_xlsx)

In [10]:
# preview the first five rows of the dataframe
df_product.head(n=5)

Unnamed: 0,NDC_Code,PRODUCTTYPENAME,Drug_Name,DOSAGEFORMNAME,ROUTENAME,SUBSTANCENAME,ACTIVE_NUMERATOR_STRENGTH,ACTIVE_INGRED_UNIT,PHARM_CLASSES
0,0002-1200,HUMAN PRESCRIPTION DRUG,Florbetapir F 18,"INJECTION, SOLUTION",INTRAVENOUS,FLORBETAPIR F-18,51.0,mCi/mL,"Radioactive Diagnostic Agent [EPC],Positron Em..."
1,0002-1433,HUMAN PRESCRIPTION DRUG,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,DULAGLUTIDE,0.75,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep..."
2,0002-1434,HUMAN PRESCRIPTION DRUG,Dulaglutide,"INJECTION, SOLUTION",SUBCUTANEOUS,DULAGLUTIDE,1.5,mg/.5mL,"GLP-1 Receptor Agonist [EPC],Glucagon-Like Pep..."
3,0002-1445,HUMAN PRESCRIPTION DRUG,ixekizumab,"INJECTION, SOLUTION",SUBCUTANEOUS,IXEKIZUMAB,80.0,mg/mL,"Interleukin-17A Antagonist [EPC],Interleukin-1..."
4,0002-4112,HUMAN PRESCRIPTION DRUG,Olanzapine,TABLET,ORAL,OLANZAPINE,2.5,mg/1,Atypical Antipsychotic [EPC]


In [6]:
# drop unwanted columns
# The leading column is presumably the row index that an earlier to_excel()
# call wrote into the spreadsheet; pandas labels such header-less columns
# "Unnamed: <n>" when reading the file back. It is removed only when it
# carries that auto-generated name, so running this cell a second time is a
# no-op instead of silently dropping the next column (NDC_Code).
stale_index_cols = [
    col for col in df_product.columns[:1] if str(col).startswith('Unnamed')
]
df_product = df_product.drop(columns=stale_index_cols)

In [7]:
# discard every row that contains at least one missing value
df_product = df_product.dropna(axis=0, how='any')

In [8]:
# relabel the PRODUCTNDC column as "NDC_Code" (no effect if that column is absent)
df_product = df_product.rename({'PRODUCTNDC': 'NDC_Code'}, axis='columns')

In [9]:
# relabel the NONPROPRIETARYNAME column as "Drug_Name" (no effect if that column is absent)
df_product = df_product.rename({'NONPROPRIETARYNAME': 'Drug_Name'}, axis='columns')

In [11]:
# create price column - These will be arbitrary values. They are NOT intended to reflect actual product
# retail prices, copays, etc. This is merely an exercise to attach an arbitrary price to each drug.

# Candidate prices: every whole number from 1 to 99, then 100 to 975 in steps of 25.
price_pool = list(range(1, 100)) + list(range(100, 1000, 25))

# A dedicated, seeded generator makes the generated prices identical on every
# run (so the exported table is reproducible) without altering the state of
# the module-level `random` generator.
price_rng = random.Random(0)

# One independent draw (with replacement) per row of df_product.
prices = [price_rng.choice(price_pool) for _ in range(len(df_product))]

In [12]:
# attach the generated prices as the "Price" column (one value per row)
df_product = df_product.assign(Price=prices)

In [13]:
# SQL connection
from sqlalchemy import create_engine
import pymysql

# establish connection: build the engine from a MySQL URL that selects the
# PyMySQL driver (USER, PASSWORD and HOST are placeholders)
mysql_url = 'mysql+pymysql://USER:PASSWORD@HOST'
engine = create_engine(mysql_url)

In [14]:
# switch to the health_company database
use_database_sql = "USE health_company;"
engine.execute(use_database_sql)

<sqlalchemy.engine.result.ResultProxy at 0x11c230898>

In [15]:
# write df to MySQL table 'drugs'
# The target database is named explicitly through `schema`: the earlier
# "USE health_company;" statement only affects the pooled connection it ran
# on, so a freshly opened connection would otherwise have no database selected.
# The DataFrame index is not written (index=False).
# Note: with the default if_exists='fail', this raises ValueError when the
# table already exists, e.g. when the cell is run a second time.
df_product.to_sql('drugs', con=engine, schema='health_company', index=False)

In [19]:
# close the connection
# dispose() is called on the engine created earlier in this notebook;
# presumably this releases the engine's pooled database connections —
# confirm against the SQLAlchemy documentation.
engine.dispose()

In [None]:
# completed steps
# NOTE(review): this cell appears to be a record of preprocessing that was
# already applied to the spreadsheet; it writes to the same path the notebook
# reads from at the top — confirm before re-running.

# keep only the drugs whose non-proprietary name is shorter than 20 characters
name_lengths = df_product['NONPROPRIETARYNAME'].map(len)
df_product = df_product.loc[name_lengths < 20]

# move NDC_Code to the first column position
ndc = df_product.pop('NDC_Code')
df_product.insert(0, 'NDC_Code', ndc)

# save the result; the row index is written too (no index=False)
df_product.to_excel('/Users/chrishuhn/product.xlsx')