In [1]:
#!/usr/bin/env python3

# sri_lanka.py: takes SRI pretests and organizes them for submission to the graders.

# INPUTS
# - pretests.csv: file with test/section info

# DEPENDENCIES
# - pdftk
# - pdfimages (comes with poppler)
# - tesseract

debug = False

from csv import reader
from datetime import date
import glob
import multiprocessing as mp
import os
import pandas as pd
import subprocess

# today's date
today = str(date.today())

# pretest directory
pretestDir = 'sri_pretests_' + today

# survey directory
surveyDir = 'sri_pretest_surveys_' + today

In [2]:
def processTest(fileName, section):
    '''
    processes one pretest
    '''
    
    # check that the file exists
    if not os.path.isfile(fileName):
        print("[WARNING]: %s does not exist" % fileName)
        return
    
    pretestPrefix = pretestDir + '/' + section + '/'
    surveyPrefix = surveyDir + '/' + section + '/'
    fBase = os.path.splitext(fileName)[0]

    # check that the .pdf has 20 pages total
    c1 = ['pdftk', fileName, 'dump_data'] # get pdf metadata
    c2 = ['grep', 'NumberOfPages']        # find line with # pages
    c3 = ['awk', '{print $2}']          # get number of pages
    # pdftk $$$ dump_data | grep NumberOfPages | awk '{print $2}'
    p1 = subprocess.Popen(c1, stdout=subprocess.PIPE)
    p2 = subprocess.Popen(c2, stdin=p1.stdout, stdout=subprocess.PIPE)
    p3 = subprocess.Popen(c3, stdin=p2.stdout, stdout=subprocess.PIPE)
    nPages = p3.stdout.read().decode('utf-8').strip()
    
    if (nPages != '20'):
        return "[FATAL ERROR]: %s does not have 20 pages [%s pages]" % (fileName, nPages)

    # extract images from .pdf
    # pdfimages -png fileName.pdf prefix/fileName
    subprocess.call(['pdfimages', '-png', fileName, fBase])

    # make sure the right text is on the right pages
    # (not a fatal error since the .pdf should have 20 pages and split properly)
    # page 1 should have "astro"
    # tesseract img.png stdout | grep astro
    c1 = ['tesseract', fBase + '-001.png', 'stdout'] # basic tesseract with output to stdout
    c2 = ['grep', 'astro']
    p1 = subprocess.Popen(c1, stdout=subprocess.PIPE)
    p2 = subprocess.Popen(c2, stdin=p1.stdout, stdout=subprocess.PIPE)
    astro = p2.stdout.read().decode('utf-8').strip()
    if astro == '':
        print("[ERROR]: %s page 01 does not have text 'astro'" % fileName)

    # page 17 should have "Mostly"
    c1 = ['tesseract', fBase + '-017.png', 'stdout']
    c2 = ['grep', 'Mostly']
    p1 = subprocess.Popen(c1, stdout=subprocess.PIPE)
    p2 = subprocess.Popen(c2, stdin=p1.stdout, stdout=subprocess.PIPE)
    astro = p2.stdout.read().decode('utf-8').strip()
    if astro == '':
        print("[ERROR]: %s page 17 does not have text 'Mostly'" % fileName)

    # remove images extracted from .pdf
    for img in glob.glob(fBase + '*.png'):
        os.remove(img)

    # we made it this far! Time to split up the .pdf
    # pages 2-17 are the assessment, 18-19 are the survey
    # pdftk A=input.pdf cat A2-17
    subprocess.call(['pdftk', fileName, 'cat', '2-17', 'output', pretestPrefix + 'pretest_' + fileName])
    subprocess.call(['pdftk', fileName, 'cat', '18-19', 'output', surveyPrefix + 'survey_' + fileName])

    return "done with test %s..." % fileName

In [6]:
def main():

    # create pool
    pool = mp.Pool(mp.cpu_count())

    # read in .csv
    pretests = pd.read_csv("pretests.csv")

    # Drop rows without section information, and skip those already processed.
    # ^ is bitwise xor, it works fine here because the objects behave as expected
    # (and because we won't mark tests as processed that don't have a section label).
    pretests = pretests[(pretests['Processed'].isnull()) ^ (pretests['Section'].isnull())]

    # for each section, create a folder for pretests and surveys
    for section in set(pretests.Section):
        sectionPretestDir = pretestDir + '/' + section
        if not os.path.exists(sectionPretestDir):
            os.makedirs(sectionPretestDir)

        sectionSurveyDir = surveyDir + '/' + section
        if not os.path.exists(sectionSurveyDir):
            os.makedirs(sectionSurveyDir)
    
    results = [pool.apply_async(processTest, (fname, section,)) for fname, section in zip(pretests.Filename, pretests.Section)]
    for res in results:
        print(res.get())
    # print([res.get() for res in results])
    # get list of file names

    pool.close()
    
if __name__ == "__main__":
    main()

done with test 001730.pdf...
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None


In [5]:
# wrap things up
pool.close()

NameError: name 'pool' is not defined

In [21]:
pretests = pd.read_csv("pretests.csv")

In [22]:
pretests[(pretests['Processed'].isnull()) ^ (pretests['Section'].isnull())]

Unnamed: 0,Test ID,Section,First Name,Last Name,Test Date,Filename,Missing pages?,Processed,Notes
101,102,vmsorescu_1,Deandrew,Viloa,9/19/16,000102.pdf,,,
123,124,vmsorescu_7,Daniel,Castellanos,9/16/16,000124.pdf,,,
126,127,vmsorescu_7,Dominic,Saez Jr.,9/16/16,000127.pdf,,,
129,130,vmsorescu_7,Kimberly,Huerta,9/16/16,000130.pdf,,,
203,204,eesoto_1,Omar,Zentend,9/19/16,000204.pdf,,,
211,212,ecarter_6,Julio,H,9/19/16,000212.pdf,,,
215,216,ecarter_6,Ruth,Alcauter,9/19/16,000216.pdf,,,
315,316,rcgora_4,Dasha,Lopez,9/16/16,000316.pdf,,,
442,443,vrwoods_1,Damari,Kellie,9/19/16,000443.pdf,,,
444,445,vrwoods_1,Angelina,Vasquez,9/19/16,000445.pdf,,,
