# Drug Pages QA Generator
Generate HTML pages for QA of drug vendor monthly update

Melanie Huston<br>
August 2017

In [149]:
from os.path import basename
from bs4 import BeautifulSoup
import re
import datetime
import calendar
import io

## Report source filenames and QA environment
Change the file paths and names here if the current month's files have different names or locations.<br>
Change the QA environment number (`qa1` in the URL prefixes) if it is different.

In [150]:
# Report source filenames
drugreportfile = 'MonthlyUpdate.htm'  # from drug monthly vendor email
connectreportfile = 'M+Connect_drug_update_report.htm'  # from OCCS

# QA environment: the single value to edit when checking another QA instance.
# It used to be repeated inside both URL prefixes, which made it easy to
# update one language and forget the other.
qaenvironment = 'qa1'
qabase = 'http://qa.medlineplus.gov/' + qaenvironment + '/'

# QA environment prefixes; a monograph number plus ".html" is appended to
# these to build each checkable link
qaenglish = qabase + 'druginfo/meds/'
qaspanish = qabase + 'spanish/druginfo/meds/'

In [151]:
# Get month and year for report date and output filename
def report_period(today):
    """Return the (year, month) of the monthly update that *today* reports on.

    The report for a month may be run up to the 15th of the following month,
    so any day before the 16th belongs to the previous month.  In January
    that previous month is December of the previous year; without the
    rollover the month would become 0 (an empty month name) and the year
    would be wrong.

    today -- a datetime.date or datetime.datetime
    """
    if today.day >= 16:
        return today.year, today.month
    if today.month == 1:
        return today.year - 1, 12
    return today.year, today.month - 1


nowdate = datetime.datetime.now()
nowyear, nowmonth = report_period(nowdate)
monthname = calendar.month_name[nowmonth]

# Human-readable date for the page title/heading, and the output filename
reportdate = monthname + " " + str(nowyear)
reportfilename = "DrugPages_QAReport_" + monthname + str(nowyear) + ".html"

In [152]:
# Create the HTML report used for reporting and link checking, and emit the
# document head plus the top-level heading. The file stays open so that later
# cells can keep appending to it.
outputfile = open(reportfilename, 'w')

pagehead = "".join([
    "<!doctype html><html><head><meta charset=\"utf-8\"><title>Drug Pages QA for ",
    reportdate,
    "</title></head><body><h1>Drug Pages QA Report for ",
    reportdate,
    " Update</h1>",
])
outputfile.write(pagehead)

In [153]:
# Get and parse HTML from the drug's monthly report

# The with-statement closes the file when the block exits, so no explicit
# f.close() is needed afterwards
with open(drugreportfile, "r") as f:
    report = f.read()

# Parse with Python's built-in HTML parser
soup = BeautifulSoup(report, 'html.parser')

In [154]:
# Section headings to look for in the drug's monthly report
headinglist = [
    "New English Monographs",
    "Revised English Monographs",
    "New Spanish Monographs",
    "Revised Spanish Monographs",
    "Discontinued Monographs",
]

In [155]:
# Walk every <p> element of the drug's monthly report and translate it into
# QA report output: section headings become <h2> headings, monograph lines
# become numbered, checkable links to the QA site, and "none" lines become
# "None reported."
HTcrawl = soup.find_all('p')
enctr = 0
spctr = 0

# Patterns are compiled once, outside the loop, and written as raw strings so
# the regex escapes (\d, \-) are not read as invalid Python string escapes
linebreaks = re.compile(r"[\r\n]+")
spanishstart = re.compile(r'"a\d{6}\-es".*')
spanishentry = re.compile(r'"(?P<monnbr>a\d{6}\-es)","(?P<monname>.*)",.*')
englishstart = re.compile(r'"a\d{6}".*')
englishentry = re.compile(r'"(?P<monnbr>a\d{6})","(?P<monname>.*)",.*')
nonereported = re.compile(r".*[Nn]one.*")


def parsemonograph(entrypattern, line):
    """Return (monograph number, monograph name) parsed from a report line.

    Raises ValueError quoting the line when it starts like a monograph entry
    but does not follow the full  "number","name",...  layout, instead of
    failing later with an AttributeError on None.
    """
    entry = entrypattern.match(line)
    if entry is None:
        raise ValueError("Cannot parse monograph line: %r" % (line,))
    return entry.group('monnbr'), entry.group('monname')


def monographlink(position, qaprefix, monnbr, monname):
    """Return one numbered report line linking to a monograph page on QA."""
    return (str(position) + ". " + "<a href=\"" + qaprefix + monnbr + ".html\">"
            + monnbr + " | " + monname + "</a><br>")


for paragraph in HTcrawl:
    # remove line breaks from the text
    ptext = linebreaks.sub(" ", paragraph.text)
    # Python 2: the text is a unicode object, so encode it to a UTF-8 byte
    # string before concatenating and writing. Python 3: it is already str and
    # must stay str, otherwise the heading comparison never matches and the
    # str regex patterns raise TypeError on bytes.
    if not isinstance(ptext, str):
        ptext = ptext.encode('utf-8')

    # if it equals the heading, write the heading to the html file and
    # restart the numbering for the new section
    if ptext in headinglist:
        outputfile.write("<h2>" + ptext + "</h2><p>")
        enctr = 0
        spctr = 0

    # else if it matches a spanish monograph entry, write a checkable link to the html file
    # (tested before the english form because of the "-es" suffix)
    elif spanishstart.match(ptext):
        spctr += 1
        spmonnbr, spmonname = parsemonograph(spanishentry, ptext)
        outputfile.write(monographlink(spctr, qaspanish, spmonnbr, spmonname))

    # else if it matches an english monograph entry, write a checkable link to the html file
    elif englishstart.match(ptext):
        enctr += 1
        engmonnbr, engmonname = parsemonograph(englishentry, ptext)
        outputfile.write(monographlink(enctr, qaenglish, engmonnbr, engmonname))

    # else if it matches "none", write that to the file
    elif nonereported.match(ptext):
        outputfile.write("None reported.<br>")

# close the paragraph opened by the last heading
outputfile.write("</p>")

In [156]:
# Terminate the HTML document, then flush and release the report file
closingtags = "</body></html>"
outputfile.write(closingtags)
outputfile.close()