# Objectives

-	Increase dpi and redo conversion CHECK 
-	Require a publisher (will avoid hand-drawn) CHECK
-	Top 50 composers for now in toy dataset
-	Elim orchestral scores
-	Make sure there aren’t cover pages, etc.
-	Make sure data in grayscale—standardize
-	Do as much of this as possible in an automated fashion. Publisher requirement will require some iteration and testing. 
-	ALSO re-do cell phone pictures, taking into account these constraints.


In [1]:
import glob
import os
import subprocess
import numpy as np
from scipy import ndimage
from scipy import signal
from scipy.misc import imsave
from scipy.signal import convolve2d
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from skimage.transform import rotate, resize
import time
import pylab as pl
from IPython import display
import librosa
import pickle
import pandas as pd
from bs4 import BeautifulSoup as bsoup
import shutil
from PyPDF2 import PdfFileWriter, PdfFileReader

## Convert w 300 dpi

In [None]:
root_dir = 'score_data'
pdf_dir = root_dir + '/prepped_pdf' # directory containing pdfs of scores
png_dir = root_dir + '/prepped_png' # root output directory for png files

In [None]:
#os.mkdir(png_dir)
pdf_files = glob(pdf_dir + '/*.pdf')
for pdf_file in pdf_files:
    basename = os.path.splitext(os.path.basename(pdf_file))[0]
    pngout = png_dir + '/' + basename + '.png'
    subprocess.call(['convert', '-density', '300', pdf_file, pngout]) # 72 dpi is the default value

## Rotate

In [None]:
def importImage(pngfile):
    img = ndimage.imread(pngfile, flatten=True)
    img = 1 - img/255.0 # make white -> 0, black -> 1
    return img

In [None]:
def showImage(X, sz = (12,12)):
    plt.figure(figsize=sz)
    plt.imshow(1-X, cmap='gray', vmin=0, vmax=1)

In [None]:
def rotateToHorizontal(img, lb=-2, ub=2, incr=.01, topN=40):
    bestscore = -np.inf
    bestTheta = 0
    for theta in np.arange(lb, ub, incr):
        imgRot = rotate(img,theta)
        rsums = np.sum(imgRot, axis=1)
        rsums_sorted = sorted(rsums)[::-1]
        curscore = np.sum(rsums_sorted[0:topN])
        if curscore > bestscore:
            bestscore = curscore
            bestTheta = theta
    result = rotate(img,bestTheta)
    return result, bestTheta

In [None]:
I = importImage('score_data/prepped_png/score17.png')
Irot, theta = rotateToHorizontal(I,-1,1)
showImage(Irot)

In [None]:
def rotateAllImages(indir, outdir):
    png_files = glob.glob(indir + '/*.png')
    for png_file in png_files:
        I = importImage(png_file)
        Irot, theta = rotateToHorizontal(I)
        img = 255 - np.uint8(Irot*255)
        basename = os.path.basename(png_file)
        pngout = outdir + '/' + basename
        imsave(pngout,img)

In [None]:
pngrot_dir = root_dir + '/toAnnotate' # rotated png files

In [None]:
os.mkdir(pngrot_dir)
rotateAllImages(png_dir, pngrot_dir)

Here's an example of something w/o publisher: https://imslp.org/wiki/Acis_and_Galatea%2C_HWV_49_(Handel%2C_George_Frideric)
just go to the page parts and it's the first thing w/ id 106106

## Get bad (no publisher) names

We'll assume that we first loaded our text file as html text so we can parse with beautifulsoup. soup is the name of the object containing html info

In [None]:
# open html text file as a beautifulsoup object
with open('html.txt') as f:
    soup = BeautifulSoup(f, "html.parser")

In [None]:
for div in soup.find_all('div', class_ = 'we'):
    # we will contain both an ID and the publisher info for a piece so we can make sure it's not handwritten
    div = str(div)
    if 'Manuscript' in div or 'manuscript' in div:
        value_on = div[div.find('indexes='):] # this is where the ID should show up on a page
        bad_name = value_on[8:14] # just get the id, assuming 6 digits
        if bad_name[-1] = '/':
            # if it's a slash, there's something else after it
            bad_name_two = value_on[14:20]
            # check if last thing is actually a digit, in case of 5-digit ID
            if not bad_name_two[-1].isdigit():
                bad_name_two = bad_name_two[:-1]
            bad_pdf_two = bad_name_two + '.pdf'
            bad_names.append(bad_pdf_two)
        if not bad_name[-1].isdigit():
            # make could be a 5-digit ID
            bad_name = bad_name[:-1]
        bad_pdf = bad_name + '.pdf' # add to .pdf so we have the actual filename we want to avoid/delete if it's in our thing
        bad_names.append(bad_pdf) # append this to a list of bad names

## Check if Orchestral Score

Let's see if we can automate checking if smth is an orchestral score or just a part. Let's do a Beethoven Symphony :D

In [None]:
divList = []
score_names = []

In [None]:
for div in soup.find_all('div', class_ = 'we'):
    # we will contain both an ID and the publisher info for a piece so we can make sure it's not handwritten
    div = str(div)
    if 'Score' in div or 'score' in div:
        divList.append(div)

In [None]:
for div in divList:
    value_on = div[div.find('indexes='):]
    score_name = value_on[8:14]
    if score_name[-1] = '/':
            # if it's a slash, there's something else after it
            score_name_two = value_on[14:20]
            # check if last thing is actually a digit, in case of 5-digit ID
            if not score_name_two[-1].isdigit():
                score_name_two = score_name_two[:-1]
            score_pdf_two = score_name_two + '.pdf'
            score_names.append(score_pdf_two)
    if not score_name[-1].isdigit():
        # make could be a 5-digit ID
        score_name = score_name[:-1]
    score_pdf = score_name + '.pdf' # add to .pdf so we have the actual filename we want to avoid/delete if it's in our thing
    score_names.append(score_pdf) # append this to a list of bad names

# Re-create Dataset

In [2]:
current_dir = '/data1/dbashir/Project/Summer2018/'
topFolder = 'pdf'
resultsFolder = 'results_top50'
intermedFolder = 'results_intermediate'
pdfMiniFolder = 'mini_dataset_pdf'
pngMiniFolder = 'mini_dataset_png'

dataSetSize = 70

resultsPath = os.path.join(topFolder,resultsFolder)
intermedPath = os.path.join(topFolder, intermedFolder)
intermedFromTop = os.path.join(current_dir, intermedPath)

pdfDir = os.path.join(current_dir, topFolder)
miniPDFDataset = os.path.join(topFolder, pdfMiniFolder)
miniPNGDataset = os.path.join(pdfDir, pngMiniFolder)
pngFromTop = os.path.join(topFolder, pngMiniFolder)
pdfFromTop = os.path.join(pdfDir, pdfMiniFolder)
rotPngFromTop = os.path.join(topFolder, 'mini_dataset_png_rot')

fileList = []

print(pdfDir)

/data1/dbashir/Project/Summer2018/pdf


In [3]:
if not os.path.exists(intermedPath):
    os.makedirs(intermedPath)

In [4]:
if not os.path.exists(miniPDFDataset):
    os.makedirs(miniPDFDataset)

In [5]:
if not os.path.exists(miniPNGDataset):
    os.makedirs(miniPNGDataset)

In [6]:
if not os.path.exists(rotPngFromTop):
    os.makedirs(rotPngFromTop)

In [3]:
def make_bad_pdfs(html_path):
    bad_names = []
    with open(html_path) as f:
        soup = bsoup(f, "html.parser")

    for div in soup.find_all('div', class_ = 'we'):
        # we will contain both an ID and the publisher info for a piece so we can make sure it's not handwritten
        trs = div.find_all('tr') # also get no publisher
        for tr in trs:
            tr = str(tr)
            try:
                if 'Pub' not in tr:
                    value_on = tr[tr.find('indexes='):]
                    bad_name = value_on[8:14]
                    if bad_name[-1] == '/':
                        # if it's a slash, there's something else after it
                        bad_name_two = value_on[14:20]
                        # check if last thing is actually a digit, in case of 5-digit ID
                        if not bad_name_two[-1].isdigit():
                            bad_name_two = bad_name_two[:-1]
                        bad_pdf_two = bad_name_two + '.pdf'
                        bad_names.append(bad_pdf_two)
                    if not bad_name[-1].isdigit():
                        # make could be a 5-digit ID
                        bad_name = bad_name[:-1]
                    bad_pdf = bad_name + '.pdf' # add to .pdf so we have the actual filename we want to avoid/delete if it's in our thing
                    bad_names.append(bad_pdf) # append this to a list of bad name
            except:
                continue
        div = str(div)
        try:
            if 'Manuscript' in div or 'manuscript' in div:
                value_on = div[div.find('indexes='):] # this is where the ID should show up on a page
                bad_name_loc = value_on[8:] # just get the id, assuming 6 digits
                location_letter = 0 # so we can  go from beginning of bad_name_loc to the first letter and get all IDs
                while not bad_name_loc[location_letter].isalpha():
                    location_letter += 1
                # now check for a quote or smth
                names_location = bad_name_loc[0:location_letter]
                # get last number, going from the back
                last_num = next(i for i,j in list(enumerate(names_location,1))[::-1] if j.isdigit())
                if not names_location[-1].isnumeric():
                    names_location = names_location[:-1]
                names_location = names_location[:last_num]
                # now split by slashes to get the numbers
                IDs = names_location.split('/')
                IDs = [ID + '.pdf' for ID in IDs]
                for bad_name in IDs:
                    bad_names.append(bad_name)
                
                # now find and remove slashes
#                 if bad_name[-1] == '/':
#                     # if it's a slash, there's something else after it
#                     bad_name_two = value_on[14:20]
#                     # check if last thing is actually a digit, in case of 5-digit ID
#                     if not bad_name_two[-1].isdigit():
#                         bad_name_two = bad_name_two[:-1]
#                     bad_pdf_two = bad_name_two + '.pdf'
#                     bad_names.append(bad_pdf_two)
#                 if not bad_name[-1].isdigit():
#                     # make could be a 5-digit ID
#                     bad_name = bad_name[:-1]
                #bad_pdf = bad_name + '.pdf' # add to .pdf so we have the actual filename we want to avoid/delete if it's in our thing
                #bad_names.append(bad_pdf) # append this to a list of bad names
        except:
            continue
            
    return bad_names

In [7]:
def make_score_pdfs(html_path):
    score_names = []
    divList = []
    with open(html_path) as f:
        soup = bsoup(f, "html.parser")
    for div in soup.find_all('div', class_ = 'we'):
        # we will contain both an ID and the publisher info for a piece so we can make sure it's not handwritten
        div = str(div)
        if 'Score' in div or 'score' in div:
            divList.append(div)
    for div in divList:
        try:
            value_on = div[div.find('indexes='):] # this is where the ID should show up on a page
            bad_name_loc = value_on[8:] # just get the id, assuming 6 digits
            location_letter = 0 # so we can  go from beginning of bad_name_loc to the first letter and get all IDs
            while not bad_name_loc[location_letter].isalpha():
                location_letter += 1
            # now check for a quote or smth
            names_location = bad_name_loc[0:location_letter]
            # get last number, going from the back
            last_num = next(i for i,j in list(enumerate(names_location,1))[::-1] if j.isdigit())
            if not names_location[-1].isnumeric():
                names_location = names_location[:-1]
            names_location = names_location[:last_num]
            # now split by slashes to get the numbers
            IDs = names_location.split('/')
            IDs = [ID + '.pdf' for ID in IDs]
            for bad_name in IDs:
                bad_names.append(bad_name)
        except:
            continue
    return score_names

In [7]:
#using os.walk, move all the .pdf files to an intermediate folder for EZ access
all_bad_names = []
all_score_names = []
for subdir, dirs, files in os.walk(resultsPath):
    for pdf_file in files:
        if 'pdf' in pdf_file:
            #move it to intermediate file for shuffling
#             print(pdf_file)
#             print("subdirectory: " + subdir)
            html_path = os.path.join(subdir, 'html.txt')
            bad_names = make_bad_pdfs(html_path)
            score_names = make_score_pdfs(html_path)
            for bad_name in bad_names:
                all_bad_names.append(bad_name)
            for score_name in score_names:
                all_score_names.append(score_name)
            if pdf_file in bad_names or pdf_file in score_names:
                print("skipping file: " + str(pdf_file))
                continue # skip if it's a handwritten or orchestral score
            if pdf_file not in all_bad_names or pdf_file not in all_score_names:
                pdf_file_path = os.path.join(subdir, pdf_file)
                final_path = os.path.join(intermedPath, pdf_file)
                shutil.copy(pdf_file_path, final_path)

skipping file: 270629.pdf
skipping file: 159741.pdf
skipping file: 42540.pdf
skipping file: 20515.pdf
skipping file: 87055.pdf
skipping file: 270796.pdf
skipping file: 234591.pdf
skipping file: 143296.pdf
skipping file: 272417.pdf
skipping file: 58813.pdf
skipping file: 68459.pdf
skipping file: 270430.pdf
skipping file: 105269.pdf
skipping file: 452822.pdf
skipping file: 42541.pdf
skipping file: 15191.pdf
skipping file: 270632.pdf
skipping file: 385573.pdf
skipping file: 411838.pdf
skipping file: 87056.pdf
skipping file: 240429.pdf
skipping file: 16144.pdf
skipping file: 14555.pdf
skipping file: 504187.pdf
skipping file: 270423.pdf
skipping file: 322115.pdf
skipping file: 385686.pdf
skipping file: 385730.pdf
skipping file: 385691.pdf
skipping file: 337078.pdf
skipping file: 180677.pdf
skipping file: 385711.pdf
skipping file: 383664.pdf
skipping file: 520316.pdf
skipping file: 505600.pdf
skipping file: 291632.pdf
skipping file: 291628.pdf
skipping file: 420951.pdf
skipping file: 102874.

skipping file: 385684.pdf
skipping file: 291635.pdf
skipping file: 420949.pdf
skipping file: 162119.pdf
skipping file: 162084.pdf
skipping file: 117300.pdf
skipping file: 291630.pdf
skipping file: 413960.pdf
skipping file: 57966.pdf
skipping file: 453649.pdf
skipping file: 93976.pdf
skipping file: 93975.pdf
skipping file: 130233.pdf
skipping file: 505630.pdf
skipping file: 184632.pdf
skipping file: 163509.pdf
skipping file: 234233.pdf
skipping file: 520314.pdf
skipping file: 21738.pdf
skipping file: 288708.pdf
skipping file: 228156.pdf
skipping file: 105449.pdf
skipping file: 244394.pdf
skipping file: 21739.pdf
skipping file: 445410.pdf
skipping file: 59243.pdf
skipping file: 24725.pdf
skipping file: 389006.pdf
skipping file: 322157.pdf
skipping file: 381909.pdf
skipping file: 147291.pdf
skipping file: 513937.pdf
skipping file: 385569.pdf
skipping file: 31791.pdf
skipping file: 115015.pdf
skipping file: 111897.pdf
skipping file: 383663.pdf
skipping file: 109840.pdf
skipping file: 32565

skipping file: 423923.pdf
skipping file: 139560.pdf
skipping file: 214305.pdf
skipping file: 114797.pdf
skipping file: 13304.pdf
skipping file: 73176.pdf
skipping file: 84319.pdf
skipping file: 58603.pdf
skipping file: 109379.pdf
skipping file: 338494.pdf
skipping file: 326669.pdf
skipping file: 131135.pdf
skipping file: 253352.pdf
skipping file: 253345.pdf
skipping file: 200901.pdf
skipping file: 253343.pdf
skipping file: 253349.pdf
skipping file: 115659.pdf
skipping file: 476776.pdf
skipping file: 464518.pdf
skipping file: 404683.pdf
skipping file: 445422.pdf
skipping file: 111609.pdf
skipping file: 404685.pdf
skipping file: 73181.pdf
skipping file: 58604.pdf
skipping file: 116266.pdf
skipping file: 475247.pdf
skipping file: 253353.pdf
skipping file: 430524.pdf
skipping file: 113041.pdf
skipping file: 409318.pdf
skipping file: 90538.pdf
skipping file: 112902.pdf
skipping file: 58606.pdf
skipping file: 377338.pdf
skipping file: 140524.pdf
skipping file: 253318.pdf
skipping file: 11290

skipping file: 323197.pdf
skipping file: 334118.pdf
skipping file: 334112.pdf
skipping file: 252876.pdf
skipping file: 323186.pdf
skipping file: 295105.pdf
skipping file: 16776.pdf
skipping file: 323169.pdf
skipping file: 186524.pdf
skipping file: 356213.pdf
skipping file: 260796.pdf
skipping file: 75869.pdf
skipping file: 496364.pdf
skipping file: 350486.pdf
skipping file: 356251.pdf
skipping file: 444472.pdf
skipping file: 253257.pdf
skipping file: 250777.pdf
skipping file: 247231.pdf
skipping file: 250781.pdf
skipping file: 356219.pdf
skipping file: 247193.pdf
skipping file: 323293.pdf
skipping file: 380212.pdf
skipping file: 383182.pdf
skipping file: 311220.pdf
skipping file: 250501.pdf
skipping file: 261384.pdf
skipping file: 114339.pdf
skipping file: 323176.pdf
skipping file: 323247.pdf
skipping file: 257072.pdf
skipping file: 252007.pdf
skipping file: 451336.pdf
skipping file: 306207.pdf
skipping file: 299453.pdf
skipping file: 363028.pdf
skipping file: 255106.pdf
skipping file:

skipping file: 261371.pdf
skipping file: 250526.pdf
skipping file: 250955.pdf
skipping file: 250679.pdf
skipping file: 250794.pdf
skipping file: 114355.pdf
skipping file: 323242.pdf
skipping file: 16789.pdf
skipping file: 250941.pdf
skipping file: 255110.pdf
skipping file: 356196.pdf
skipping file: 253255.pdf
skipping file: 260794.pdf
skipping file: 356198.pdf
skipping file: 252019.pdf
skipping file: 251839.pdf
skipping file: 295045.pdf
skipping file: 418523.pdf
skipping file: 261377.pdf
skipping file: 380210.pdf
skipping file: 323288.pdf
skipping file: 356246.pdf
skipping file: 166243.pdf
skipping file: 356267.pdf
skipping file: 250954.pdf
skipping file: 247160.pdf
skipping file: 261389.pdf
skipping file: 418515.pdf
skipping file: 312735.pdf
skipping file: 323178.pdf
skipping file: 247233.pdf
skipping file: 16799.pdf
skipping file: 356069.pdf
skipping file: 363035.pdf
skipping file: 356195.pdf
skipping file: 323239.pdf
skipping file: 16786.pdf
skipping file: 250795.pdf
skipping file: 

skipping file: 17187.pdf
skipping file: 17188.pdf
skipping file: 25884.pdf
skipping file: 396255.pdf
skipping file: 13988.pdf
skipping file: 51364.pdf
skipping file: 395602.pdf
skipping file: 23766.pdf
skipping file: 28163.pdf
skipping file: 419636.pdf
skipping file: 429963.pdf
skipping file: 51040.pdf
skipping file: 68733.pdf
skipping file: 51039.pdf
skipping file: 268666.pdf
skipping file: 237873.pdf
skipping file: 90498.pdf
skipping file: 502258.pdf
skipping file: 58010.pdf
skipping file: 313376.pdf
skipping file: 51350.pdf
skipping file: 58131.pdf
skipping file: 372717.pdf
skipping file: 495977.pdf
skipping file: 51351.pdf
skipping file: 281827.pdf
skipping file: 51269.pdf
skipping file: 58129.pdf
skipping file: 350969.pdf
skipping file: 350972.pdf
skipping file: 319011.pdf
skipping file: 74029.pdf
skipping file: 51444.pdf
skipping file: 53055.pdf
skipping file: 217743.pdf
skipping file: 26294.pdf
skipping file: 52520.pdf
skipping file: 58084.pdf
skipping file: 182395.pdf
skipping 

skipping file: 318729.pdf
skipping file: 36935.pdf
skipping file: 36933.pdf
skipping file: 36936.pdf
skipping file: 475468.pdf
skipping file: 36934.pdf
skipping file: 318867.pdf
skipping file: 59046.pdf
skipping file: 48101.pdf
skipping file: 245715.pdf
skipping file: 59000.pdf
skipping file: 47765.pdf
skipping file: 318573.pdf
skipping file: 333134.pdf
skipping file: 57529.pdf
skipping file: 408927.pdf
skipping file: 146853.pdf
skipping file: 12833.pdf
skipping file: 77403.pdf
skipping file: 51448.pdf
skipping file: 77404.pdf
skipping file: 318762.pdf
skipping file: 292903.pdf
skipping file: 311264.pdf
skipping file: 12817.pdf
skipping file: 313099.pdf
skipping file: 57939.pdf
skipping file: 318628.pdf
skipping file: 59219.pdf
skipping file: 51346.pdf
skipping file: 251504.pdf
skipping file: 51452.pdf
skipping file: 318638.pdf
skipping file: 58120.pdf
skipping file: 309351.pdf
skipping file: 51118.pdf
skipping file: 21521.pdf
skipping file: 274828.pdf
skipping file: 182379.pdf
skippin

skipping file: 265590.pdf
skipping file: 21356.pdf
skipping file: 34707.pdf
skipping file: 319018.pdf
skipping file: 51480.pdf
skipping file: 74045.pdf
skipping file: 70346.pdf
skipping file: 51965.pdf
skipping file: 36945.pdf
skipping file: 70345.pdf
skipping file: 52412.pdf
skipping file: 69003.pdf
skipping file: 36943.pdf
skipping file: 307123.pdf
skipping file: 51348.pdf
skipping file: 314961.pdf
skipping file: 20786.pdf
skipping file: 73977.pdf
skipping file: 11445.pdf
skipping file: 107383.pdf
skipping file: 307064.pdf
skipping file: 11611.pdf
skipping file: 11936.pdf
skipping file: 460084.pdf
skipping file: 474802.pdf
skipping file: 516486.pdf
skipping file: 161929.pdf
skipping file: 437210.pdf
skipping file: 51456.pdf
skipping file: 242946.pdf
skipping file: 516488.pdf
skipping file: 513711.pdf
skipping file: 516487.pdf
skipping file: 163257.pdf
skipping file: 107262.pdf
skipping file: 33493.pdf
skipping file: 484500.pdf
skipping file: 19389.pdf
skipping file: 460087.pdf
skippi

skipping file: 318572.pdf
skipping file: 51422.pdf
skipping file: 59048.pdf
skipping file: 51465.pdf
skipping file: 48278.pdf
skipping file: 318869.pdf
skipping file: 51423.pdf
skipping file: 12823.pdf
skipping file: 74044.pdf
skipping file: 319017.pdf
skipping file: 57530.pdf
skipping file: 90442.pdf
skipping file: 396267.pdf
skipping file: 25878.pdf
skipping file: 22095.pdf
skipping file: 13983.pdf
skipping file: 381781.pdf
skipping file: 83252.pdf
skipping file: 51349.pdf
skipping file: 146860.pdf
skipping file: 12717.pdf
skipping file: 43515.pdf
skipping file: 33973.pdf
skipping file: 63759.pdf
skipping file: 327949.pdf
skipping file: 300158.pdf
skipping file: 27457.pdf
skipping file: 505854.pdf
skipping file: 43511.pdf
skipping file: 371825.pdf
skipping file: 273096.pdf
skipping file: 396230.pdf
skipping file: 28159.pdf
skipping file: 90439.pdf
skipping file: 12318.pdf
skipping file: 25875.pdf
skipping file: 13861.pdf
skipping file: 21922.pdf
skipping file: 59005.pdf
skipping file

skipping file: 90489.pdf
skipping file: 318733.pdf
skipping file: 48099.pdf
skipping file: 59045.pdf
skipping file: 85850.pdf
skipping file: 261612.pdf
skipping file: 225272.pdf
skipping file: 493405.pdf
skipping file: 26932.pdf
skipping file: 336780.pdf
skipping file: 516665.pdf
skipping file: 43513.pdf
skipping file: 505958.pdf
skipping file: 10640.pdf
skipping file: 103999.pdf
skipping file: 13626.pdf
skipping file: 249842.pdf
skipping file: 51414.pdf
skipping file: 314316.pdf
skipping file: 42539.pdf
skipping file: 52509.pdf
skipping file: 42538.pdf
skipping file: 314232.pdf
skipping file: 90473.pdf
skipping file: 42235.pdf
skipping file: 52507.pdf
skipping file: 271190.pdf
skipping file: 505705.pdf
skipping file: 42241.pdf
skipping file: 90564.pdf
skipping file: 91285.pdf
skipping file: 51970.pdf
skipping file: 328877.pdf
skipping file: 19667.pdf
skipping file: 161285.pdf
skipping file: 381467.pdf
skipping file: 51707.pdf
skipping file: 90486.pdf
skipping file: 77992.pdf
skipping 

skipping file: 317099.pdf
skipping file: 51244.pdf
skipping file: 47382.pdf
skipping file: 328670.pdf
skipping file: 26453.pdf
skipping file: 100548.pdf
skipping file: 69586.pdf
skipping file: 51796.pdf
skipping file: 51254.pdf
skipping file: 51253.pdf
skipping file: 502707.pdf
skipping file: 379726.pdf
skipping file: 377818.pdf
skipping file: 51717.pdf
skipping file: 512502.pdf
skipping file: 501484.pdf
skipping file: 173648.pdf
skipping file: 90495.pdf
skipping file: 243126.pdf
skipping file: 68720.pdf
skipping file: 50957.pdf
skipping file: 413407.pdf
skipping file: 370928.pdf
skipping file: 26070.pdf
skipping file: 97203.pdf
skipping file: 46532.pdf
skipping file: 161918.pdf
skipping file: 245915.pdf
skipping file: 163219.pdf
skipping file: 108016.pdf
skipping file: 473268.pdf
skipping file: 250559.pdf
skipping file: 90746.pdf
skipping file: 46061.pdf
skipping file: 28877.pdf
skipping file: 46533.pdf
skipping file: 515347.pdf
skipping file: 90472.pdf
skipping file: 499706.pdf
skipp

skipping file: 310291.pdf
skipping file: 326297.pdf
skipping file: 326492.pdf
skipping file: 428702.pdf
skipping file: 333909.pdf
skipping file: 461691.pdf
skipping file: 424482.pdf
skipping file: 310215.pdf
skipping file: 445622.pdf
skipping file: 329019.pdf
skipping file: 343601.pdf
skipping file: 328996.pdf
skipping file: 342166.pdf
skipping file: 334464.pdf
skipping file: 352540.pdf
skipping file: 311736.pdf
skipping file: 326661.pdf
skipping file: 509001.pdf
skipping file: 311458.pdf
skipping file: 310409.pdf
skipping file: 509005.pdf
skipping file: 449932.pdf
skipping file: 407269.pdf
skipping file: 456540.pdf
skipping file: 461693.pdf
skipping file: 379163.pdf
skipping file: 452416.pdf
skipping file: 337436.pdf
skipping file: 334268.pdf
skipping file: 461692.pdf
skipping file: 379283.pdf
skipping file: 461700.pdf
skipping file: 310195.pdf
skipping file: 445629.pdf
skipping file: 310266.pdf
skipping file: 325476.pdf
skipping file: 351491.pdf
skipping file: 310415.pdf
skipping fil

skipping file: 365456.pdf
skipping file: 220285.pdf
skipping file: 346797.pdf
skipping file: 495841.pdf
skipping file: 494721.pdf
skipping file: 497227.pdf
skipping file: 438714.pdf
skipping file: 129592.pdf
skipping file: 109384.pdf
skipping file: 81762.pdf
skipping file: 317250.pdf
skipping file: 495965.pdf
skipping file: 496448.pdf
skipping file: 428221.pdf
skipping file: 83953.pdf
skipping file: 495966.pdf
skipping file: 263137.pdf
skipping file: 175123.pdf
skipping file: 196059.pdf
skipping file: 94898.pdf
skipping file: 18993.pdf
skipping file: 129578.pdf
skipping file: 18991.pdf
skipping file: 496431.pdf
skipping file: 89931.pdf
skipping file: 406201.pdf
skipping file: 84360.pdf
skipping file: 493277.pdf
skipping file: 258050.pdf
skipping file: 460385.pdf
skipping file: 493469.pdf
skipping file: 493282.pdf
skipping file: 493281.pdf
skipping file: 267703.pdf
skipping file: 364957.pdf
skipping file: 460906.pdf
skipping file: 460908.pdf
skipping file: 460909.pdf
skipping file: 1612

skipping file: 93613.pdf
skipping file: 454956.pdf
skipping file: 345452.pdf
skipping file: 120846.pdf
skipping file: 93614.pdf
skipping file: 454957.pdf
skipping file: 502746.pdf
skipping file: 121420.pdf
skipping file: 455033.pdf
skipping file: 455027.pdf
skipping file: 501775.pdf
skipping file: 501777.pdf
skipping file: 424430.pdf
skipping file: 502341.pdf
skipping file: 358402.pdf
skipping file: 501779.pdf
skipping file: 288160.pdf
skipping file: 506150.pdf
skipping file: 221417.pdf
skipping file: 296172.pdf
skipping file: 387394.pdf
skipping file: 303350.pdf
skipping file: 472150.pdf
skipping file: 303198.pdf
skipping file: 221455.pdf
skipping file: 221436.pdf
skipping file: 288778.pdf
skipping file: 428236.pdf
skipping file: 321045.pdf
skipping file: 288741.pdf
skipping file: 351458.pdf
skipping file: 135863.pdf
skipping file: 227827.pdf
skipping file: 13074.pdf
skipping file: 13075.pdf
skipping file: 221147.pdf
skipping file: 23031.pdf
skipping file: 10042.pdf
skipping file: 459

skipping file: 142044.pdf
skipping file: 367690.pdf
skipping file: 129319.pdf
skipping file: 428228.pdf
skipping file: 425472.pdf
skipping file: 15785.pdf
skipping file: 15786.pdf
skipping file: 416027.pdf
skipping file: 497747.pdf
skipping file: 417545.pdf
skipping file: 65195.pdf
skipping file: 31929.pdf
skipping file: 24494.pdf
skipping file: 25252.pdf
skipping file: 468679.pdf
skipping file: 247189.pdf
skipping file: 344640.pdf
skipping file: 260919.pdf
skipping file: 457279.pdf
skipping file: 117459.pdf
skipping file: 391079.pdf
skipping file: 161758.pdf
skipping file: 313155.pdf
skipping file: 91110.pdf
skipping file: 24360.pdf
skipping file: 495892.pdf
skipping file: 470149.pdf
skipping file: 495894.pdf
skipping file: 495893.pdf
skipping file: 394112.pdf
skipping file: 24276.pdf
skipping file: 494170.pdf
skipping file: 409145.pdf
skipping file: 467339.pdf
skipping file: 494169.pdf
skipping file: 494167.pdf
skipping file: 495551.pdf
skipping file: 494168.pdf
skipping file: 496637

skipping file: 496346.pdf
skipping file: 131932.pdf
skipping file: 496342.pdf
skipping file: 496341.pdf
skipping file: 496695.pdf
skipping file: 492206.pdf
skipping file: 492204.pdf
skipping file: 24242.pdf
skipping file: 492205.pdf
skipping file: 209391.pdf
skipping file: 361428.pdf
skipping file: 492207.pdf
skipping file: 401118.pdf
skipping file: 490706.pdf
skipping file: 490703.pdf
skipping file: 490707.pdf
skipping file: 490705.pdf
skipping file: 490708.pdf
skipping file: 93070.pdf
skipping file: 24233.pdf
skipping file: 455914.pdf
skipping file: 490709.pdf
skipping file: 131456.pdf
skipping file: 484318.pdf
skipping file: 238214.pdf
skipping file: 91781.pdf
skipping file: 484319.pdf
skipping file: 398177.pdf
skipping file: 470148.pdf
skipping file: 24207.pdf
skipping file: 484317.pdf
skipping file: 484316.pdf
skipping file: 91780.pdf
skipping file: 496174.pdf
skipping file: 91052.pdf
skipping file: 386442.pdf
skipping file: 91108.pdf
skipping file: 496567.pdf
skipping file: 44578

skipping file: 362417.pdf
skipping file: 497420.pdf
skipping file: 503970.pdf
skipping file: 491166.pdf
skipping file: 298288.pdf
skipping file: 68765.pdf
skipping file: 384726.pdf
skipping file: 299808.pdf
skipping file: 68764.pdf
skipping file: 406991.pdf
skipping file: 493305.pdf
skipping file: 392912.pdf
skipping file: 495629.pdf
skipping file: 493302.pdf
skipping file: 493307.pdf
skipping file: 24258.pdf
skipping file: 493303.pdf
skipping file: 493306.pdf
skipping file: 493308.pdf
skipping file: 470146.pdf
skipping file: 493304.pdf
skipping file: 457590.pdf
skipping file: 105468.pdf
skipping file: 223029.pdf
skipping file: 407729.pdf
skipping file: 493970.pdf
skipping file: 131913.pdf
skipping file: 506458.pdf
skipping file: 24268.pdf
skipping file: 495137.pdf
skipping file: 333169.pdf
skipping file: 90526.pdf
skipping file: 289707.pdf
skipping file: 503323.pdf
skipping file: 214716.pdf
skipping file: 199708.pdf
skipping file: 90803.pdf
skipping file: 24231.pdf
skipping file: 4157

skipping file: 65570.pdf
skipping file: 65569.pdf
skipping file: 331337.pdf
skipping file: 500588.pdf
skipping file: 500589.pdf
skipping file: 500586.pdf
skipping file: 422764.pdf
skipping file: 500585.pdf
skipping file: 25254.pdf
skipping file: 161240.pdf
skipping file: 106232.pdf
skipping file: 494762.pdf
skipping file: 75568.pdf
skipping file: 470574.pdf
skipping file: 496644.pdf
skipping file: 128302.pdf
skipping file: 282481.pdf
skipping file: 346794.pdf
skipping file: 93858.pdf
skipping file: 24243.pdf
skipping file: 403553.pdf
skipping file: 492208.pdf
skipping file: 492209.pdf
skipping file: 429915.pdf
skipping file: 303324.pdf
skipping file: 497266.pdf
skipping file: 177709.pdf
skipping file: 81799.pdf
skipping file: 267421.pdf
skipping file: 101397.pdf
skipping file: 507434.pdf
skipping file: 101399.pdf
skipping file: 496023.pdf
skipping file: 413384.pdf
skipping file: 24362.pdf
skipping file: 515444.pdf
skipping file: 496019.pdf
skipping file: 513070.pdf
skipping file: 49511

skipping file: 505441.pdf
skipping file: 86294.pdf
skipping file: 132437.pdf
skipping file: 230217.pdf
skipping file: 505439.pdf
skipping file: 102028.pdf
skipping file: 95529.pdf
skipping file: 91051.pdf
skipping file: 495762.pdf
skipping file: 412033.pdf
skipping file: 495761.pdf
skipping file: 495764.pdf
skipping file: 24353.pdf
skipping file: 495765.pdf
skipping file: 459515.pdf
skipping file: 84699.pdf
skipping file: 422845.pdf
skipping file: 500590.pdf
skipping file: 500591.pdf
skipping file: 500592.pdf
skipping file: 450450.pdf
skipping file: 141911.pdf
skipping file: 129512.pdf
skipping file: 87952.pdf
skipping file: 100906.pdf
skipping file: 24194.pdf
skipping file: 491619.pdf
skipping file: 397723.pdf
skipping file: 483952.pdf
skipping file: 458711.pdf
skipping file: 101456.pdf
skipping file: 449462.pdf
skipping file: 444901.pdf
skipping file: 26752.pdf
skipping file: 397187.pdf
skipping file: 483616.pdf
skipping file: 240714.pdf
skipping file: 458682.pdf
skipping file: 13128

skipping file: 495756.pdf
skipping file: 411675.pdf
skipping file: 496940.pdf
skipping file: 496937.pdf
skipping file: 24421.pdf
skipping file: 231523.pdf
skipping file: 245551.pdf
skipping file: 293652.pdf
skipping file: 293665.pdf
skipping file: 415905.pdf
skipping file: 374643.pdf
skipping file: 496938.pdf
skipping file: 496939.pdf
skipping file: 490721.pdf
skipping file: 254252.pdf
skipping file: 24236.pdf
skipping file: 490718.pdf
skipping file: 463093.pdf
skipping file: 367317.pdf
skipping file: 402656.pdf
skipping file: 490723.pdf
skipping file: 495028.pdf
skipping file: 490722.pdf
skipping file: 424536.pdf
skipping file: 83925.pdf
skipping file: 494808.pdf
skipping file: 240150.pdf
skipping file: 57550.pdf
skipping file: 305274.pdf
skipping file: 90976.pdf
skipping file: 19272.pdf
skipping file: 161224.pdf
skipping file: 192707.pdf
skipping file: 97185.pdf
skipping file: 81861.pdf
skipping file: 464800.pdf
skipping file: 494996.pdf
skipping file: 463974.pdf
skipping file: 63281

skipping file: 495579.pdf
skipping file: 404713.pdf
skipping file: 411186.pdf
skipping file: 495578.pdf
skipping file: 262309.pdf
skipping file: 153352.pdf
skipping file: 364026.pdf
skipping file: 453854.pdf
skipping file: 24388.pdf
skipping file: 254264.pdf
skipping file: 207049.pdf
skipping file: 496205.pdf
skipping file: 349244.pdf
skipping file: 83214.pdf
skipping file: 496204.pdf
skipping file: 457525.pdf
skipping file: 207042.pdf
skipping file: 167640.pdf
skipping file: 496203.pdf
skipping file: 512157.pdf
skipping file: 350799.pdf
skipping file: 413195.pdf
skipping file: 492030.pdf
skipping file: 414431.pdf
skipping file: 188675.pdf
skipping file: 358725.pdf
skipping file: 496202.pdf
skipping file: 128938.pdf
skipping file: 83213.pdf
skipping file: 130246.pdf
skipping file: 481343.pdf
skipping file: 90652.pdf
skipping file: 206684.pdf
skipping file: 24169.pdf
skipping file: 385490.pdf
skipping file: 481483.pdf
skipping file: 96602.pdf
skipping file: 18116.pdf
skipping file: 1294

skipping file: 496071.pdf
skipping file: 81151.pdf
skipping file: 497282.pdf
skipping file: 91046.pdf
skipping file: 497256.pdf
skipping file: 81787.pdf
skipping file: 254392.pdf
skipping file: 128980.pdf
skipping file: 453588.pdf
skipping file: 111283.pdf
skipping file: 214668.pdf
skipping file: 494649.pdf
skipping file: 485794.pdf
skipping file: 392913.pdf
skipping file: 470150.pdf
skipping file: 485793.pdf
skipping file: 485789.pdf
skipping file: 24483.pdf
skipping file: 495633.pdf
skipping file: 24485.pdf
skipping file: 485792.pdf
skipping file: 24484.pdf
skipping file: 24213.pdf
skipping file: 485791.pdf
skipping file: 485790.pdf
skipping file: 83151.pdf
skipping file: 99701.pdf
skipping file: 494197.pdf
skipping file: 202620.pdf
skipping file: 65819.pdf
skipping file: 409577.pdf
skipping file: 497735.pdf
skipping file: 123427.pdf
skipping file: 462793.pdf
skipping file: 417545.pdf
skipping file: 475532.pdf
skipping file: 499010.pdf
skipping file: 381501.pdf
skipping file: 499009.

skipping file: 520226.pdf
skipping file: 492974.pdf
skipping file: 492969.pdf
skipping file: 493557.pdf
skipping file: 464380.pdf
skipping file: 492975.pdf
skipping file: 378159.pdf
skipping file: 520303.pdf
skipping file: 24250.pdf
skipping file: 520263.pdf
skipping file: 520304.pdf
skipping file: 404737.pdf
skipping file: 492971.pdf
skipping file: 492973.pdf
skipping file: 427614.pdf
skipping file: 491490.pdf
skipping file: 491491.pdf
skipping file: 265253.pdf
skipping file: 11000.pdf
skipping file: 474396.pdf
skipping file: 502664.pdf
skipping file: 497336.pdf
skipping file: 355009.pdf
skipping file: 359522.pdf
skipping file: 505563.pdf
skipping file: 515987.pdf
skipping file: 60970.pdf
skipping file: 96330.pdf
skipping file: 365424.pdf
skipping file: 24349.pdf
skipping file: 503095.pdf
skipping file: 172740.pdf
skipping file: 485387.pdf
skipping file: 62906.pdf
skipping file: 22958.pdf
skipping file: 371396.pdf
skipping file: 271083.pdf
skipping file: 376516.pdf
skipping file: 2248

skipping file: 100885.pdf
skipping file: 423876.pdf
skipping file: 123684.pdf
skipping file: 84039.pdf
skipping file: 179853.pdf
skipping file: 500264.pdf
skipping file: 500266.pdf
skipping file: 500261.pdf
skipping file: 500263.pdf
skipping file: 395183.pdf
skipping file: 24467.pdf
skipping file: 454880.pdf
skipping file: 500262.pdf
skipping file: 96025.pdf
skipping file: 216489.pdf
skipping file: 480312.pdf
skipping file: 83965.pdf
skipping file: 480311.pdf
skipping file: 83161.pdf
skipping file: 83160.pdf
skipping file: 268082.pdf
skipping file: 101320.pdf
skipping file: 55515.pdf
skipping file: 434945.pdf
skipping file: 23805.pdf
skipping file: 93676.pdf
skipping file: 513376.pdf
skipping file: 260224.pdf
skipping file: 492077.pdf
skipping file: 415028.pdf
skipping file: 268736.pdf
skipping file: 253600.pdf
skipping file: 81774.pdf
skipping file: 434104.pdf
skipping file: 422493.pdf
skipping file: 500477.pdf
skipping file: 500476.pdf
skipping file: 233215.pdf
skipping file: 494468.

skipping file: 268841.pdf
skipping file: 497250.pdf
skipping file: 497308.pdf
skipping file: 216333.pdf
skipping file: 268747.pdf
skipping file: 133800.pdf
skipping file: 222806.pdf
skipping file: 94932.pdf
skipping file: 415030.pdf
skipping file: 23805.pdf
skipping file: 289704.pdf
skipping file: 62063.pdf
skipping file: 133801.pdf
skipping file: 493119.pdf
skipping file: 250304.pdf
skipping file: 516681.pdf
skipping file: 129145.pdf
skipping file: 96024.pdf
skipping file: 90806.pdf
skipping file: 457607.pdf
skipping file: 492200.pdf
skipping file: 457595.pdf
skipping file: 492202.pdf
skipping file: 492199.pdf
skipping file: 403301.pdf
skipping file: 24240.pdf
skipping file: 450144.pdf
skipping file: 512155.pdf
skipping file: 499709.pdf
skipping file: 167496.pdf
skipping file: 423872.pdf
skipping file: 172612.pdf
skipping file: 84032.pdf
skipping file: 90656.pdf
skipping file: 459534.pdf
skipping file: 432332.pdf
skipping file: 397091.pdf
skipping file: 483610.pdf
skipping file: 80382

skipping file: 24178.pdf
skipping file: 482642.pdf
skipping file: 482644.pdf
skipping file: 482641.pdf
skipping file: 482647.pdf
skipping file: 394624.pdf
skipping file: 482640.pdf
skipping file: 482645.pdf
skipping file: 395440.pdf
skipping file: 400153.pdf
skipping file: 449659.pdf
skipping file: 167639.pdf
skipping file: 116929.pdf
skipping file: 494816.pdf
skipping file: 453846.pdf
skipping file: 424757.pdf
skipping file: 34217.pdf
skipping file: 129110.pdf
skipping file: 70850.pdf
skipping file: 255413.pdf
skipping file: 121688.pdf
skipping file: 462111.pdf
skipping file: 94132.pdf
skipping file: 75796.pdf
skipping file: 215391.pdf
skipping file: 75797.pdf
skipping file: 254354.pdf
skipping file: 105620.pdf
skipping file: 268573.pdf
skipping file: 121689.pdf
skipping file: 107648.pdf
skipping file: 241796.pdf
skipping file: 121690.pdf
skipping file: 108700.pdf
skipping file: 29356.pdf
skipping file: 70857.pdf
skipping file: 108698.pdf
skipping file: 12165.pdf
skipping file: 36902.

skipping file: 65629.pdf
skipping file: 122518.pdf
skipping file: 328288.pdf
skipping file: 77965.pdf
skipping file: 227838.pdf
skipping file: 504882.pdf
skipping file: 115012.pdf
skipping file: 231725.pdf
skipping file: 252093.pdf
skipping file: 134638.pdf
skipping file: 249811.pdf
skipping file: 36788.pdf
skipping file: 160949.pdf
skipping file: 60763.pdf
skipping file: 219496.pdf
skipping file: 98565.pdf
skipping file: 111636.pdf
skipping file: 212278.pdf
skipping file: 29749.pdf
skipping file: 257255.pdf
skipping file: 421137.pdf
skipping file: 511447.pdf
skipping file: 434063.pdf
skipping file: 100020.pdf
skipping file: 77909.pdf
skipping file: 21721.pdf
skipping file: 371015.pdf
skipping file: 520604.pdf
skipping file: 62635.pdf
skipping file: 22015.pdf
skipping file: 151006.pdf
skipping file: 94634.pdf
skipping file: 13330.pdf
skipping file: 13069.pdf
skipping file: 13990.pdf
skipping file: 36786.pdf
skipping file: 21722.pdf
skipping file: 38445.pdf
skipping file: 308123.pdf
ski

skipping file: 14622.pdf
skipping file: 230316.pdf
skipping file: 385992.pdf
skipping file: 109498.pdf
skipping file: 500624.pdf
skipping file: 458467.pdf
skipping file: 346824.pdf
skipping file: 423661.pdf
skipping file: 517704.pdf
skipping file: 333433.pdf
skipping file: 423647.pdf
skipping file: 55565.pdf
skipping file: 55566.pdf
skipping file: 324655.pdf
skipping file: 51845.pdf
skipping file: 51846.pdf
skipping file: 347269.pdf
skipping file: 72384.pdf
skipping file: 72385.pdf
skipping file: 341952.pdf
skipping file: 310539.pdf
skipping file: 423650.pdf
skipping file: 173078.pdf
skipping file: 382272.pdf
skipping file: 310544.pdf
skipping file: 355138.pdf
skipping file: 310403.pdf
skipping file: 463031.pdf
skipping file: 225409.pdf
skipping file: 246693.pdf
skipping file: 138645.pdf
skipping file: 314086.pdf
skipping file: 296992.pdf
skipping file: 301526.pdf
skipping file: 463695.pdf
skipping file: 423644.pdf
skipping file: 503973.pdf
skipping file: 503721.pdf
skipping file: 5037

skipping file: 21622.pdf
skipping file: 163675.pdf
skipping file: 336075.pdf
skipping file: 467497.pdf
skipping file: 75840.pdf
skipping file: 21621.pdf
skipping file: 386751.pdf
skipping file: 115355.pdf
skipping file: 331916.pdf
skipping file: 163674.pdf
skipping file: 513448.pdf
skipping file: 518823.pdf
skipping file: 49226.pdf
skipping file: 270016.pdf
skipping file: 246440.pdf
skipping file: 22538.pdf
skipping file: 53958.pdf
skipping file: 244031.pdf
skipping file: 310555.pdf
skipping file: 103382.pdf
skipping file: 256192.pdf
skipping file: 241554.pdf
skipping file: 23913.pdf
skipping file: 518383.pdf
skipping file: 332997.pdf
skipping file: 402593.pdf
skipping file: 508765.pdf
skipping file: 129974.pdf
skipping file: 53950.pdf
skipping file: 109374.pdf
skipping file: 198736.pdf
skipping file: 146654.pdf
skipping file: 450733.pdf
skipping file: 109667.pdf
skipping file: 141987.pdf
skipping file: 51783.pdf
skipping file: 309467.pdf
skipping file: 10978.pdf
skipping file: 78154.p

skipping file: 36924.pdf
skipping file: 268308.pdf
skipping file: 52129.pdf
skipping file: 31561.pdf
skipping file: 67622.pdf
skipping file: 67621.pdf
skipping file: 455370.pdf
skipping file: 455290.pdf
skipping file: 51784.pdf
skipping file: 259284.pdf
skipping file: 62255.pdf
skipping file: 10976.pdf
skipping file: 273916.pdf
skipping file: 162954.pdf
skipping file: 49305.pdf
skipping file: 365773.pdf
skipping file: 51537.pdf
skipping file: 101087.pdf
skipping file: 251061.pdf
skipping file: 28819.pdf
skipping file: 52515.pdf
skipping file: 454412.pdf
skipping file: 301569.pdf
skipping file: 103224.pdf
skipping file: 21626.pdf
skipping file: 75852.pdf
skipping file: 234238.pdf
skipping file: 474305.pdf
skipping file: 20287.pdf
skipping file: 331925.pdf
skipping file: 286997.pdf
skipping file: 519543.pdf
skipping file: 36366.pdf
skipping file: 519542.pdf
skipping file: 513450.pdf
skipping file: 22533.pdf
skipping file: 237237.pdf
skipping file: 162881.pdf
skipping file: 372700.pdf
ski

skipping file: 81546.pdf
skipping file: 25892.pdf
skipping file: 115010.pdf
skipping file: 23026.pdf
skipping file: 338464.pdf
skipping file: 271834.pdf
skipping file: 27201.pdf
skipping file: 23028.pdf
skipping file: 24137.pdf
skipping file: 168649.pdf
skipping file: 27802.pdf
skipping file: 27028.pdf
skipping file: 89504.pdf
skipping file: 89498.pdf
skipping file: 144735.pdf
skipping file: 474956.pdf
skipping file: 73313.pdf
skipping file: 399404.pdf
skipping file: 73252.pdf
skipping file: 356494.pdf
skipping file: 26821.pdf
skipping file: 27773.pdf
skipping file: 100177.pdf
skipping file: 26764.pdf
skipping file: 26765.pdf
skipping file: 27440.pdf
skipping file: 141240.pdf
skipping file: 60819.pdf
skipping file: 77038.pdf
skipping file: 399253.pdf
skipping file: 26812.pdf
skipping file: 338462.pdf
skipping file: 60820.pdf
skipping file: 464445.pdf
skipping file: 39994.pdf
skipping file: 25751.pdf
skipping file: 36512.pdf
skipping file: 485023.pdf
skipping file: 10711.pdf
skipping fi

skipping file: 37829.pdf
skipping file: 29151.pdf
skipping file: 253978.pdf
skipping file: 479662.pdf
skipping file: 106532.pdf
skipping file: 107379.pdf
skipping file: 356404.pdf
skipping file: 496230.pdf
skipping file: 106533.pdf
skipping file: 78596.pdf
skipping file: 63400.pdf
skipping file: 113280.pdf
skipping file: 107690.pdf
skipping file: 517987.pdf
skipping file: 517986.pdf
skipping file: 111903.pdf
skipping file: 29358.pdf
skipping file: 29359.pdf
skipping file: 341833.pdf
skipping file: 45933.pdf
skipping file: 267291.pdf
skipping file: 33361.pdf
skipping file: 483066.pdf
skipping file: 111966.pdf
skipping file: 108042.pdf
skipping file: 378304.pdf
skipping file: 517982.pdf
skipping file: 444741.pdf
skipping file: 425524.pdf
skipping file: 260586.pdf
skipping file: 441577.pdf
skipping file: 387466.pdf
skipping file: 107689.pdf
skipping file: 50693.pdf
skipping file: 108031.pdf
skipping file: 405907.pdf
skipping file: 107863.pdf
skipping file: 106408.pdf
skipping file: 107716

skipping file: 53232.pdf
skipping file: 330459.pdf
skipping file: 269048.pdf
skipping file: 151846.pdf
skipping file: 151855.pdf
skipping file: 437923.pdf
skipping file: 333872.pdf
skipping file: 60823.pdf
skipping file: 60824.pdf
skipping file: 74809.pdf
skipping file: 394863.pdf
skipping file: 469040.pdf
skipping file: 96685.pdf
skipping file: 512811.pdf
skipping file: 375114.pdf
skipping file: 56121.pdf
skipping file: 151830.pdf
skipping file: 151821.pdf
skipping file: 151806.pdf
skipping file: 341648.pdf
skipping file: 28258.pdf
skipping file: 375067.pdf
skipping file: 468655.pdf
skipping file: 151798.pdf
skipping file: 494761.pdf
skipping file: 151649.pdf
skipping file: 152843.pdf
skipping file: 28261.pdf
skipping file: 131724.pdf
skipping file: 469172.pdf
skipping file: 403094.pdf
skipping file: 311371.pdf
skipping file: 491336.pdf
skipping file: 151660.pdf
skipping file: 57242.pdf
skipping file: 119339.pdf
skipping file: 152835.pdf
skipping file: 142775.pdf
skipping file: 28260.

skipping file: 13804.pdf
skipping file: 176306.pdf
skipping file: 13725.pdf
skipping file: 80284.pdf
skipping file: 21205.pdf
skipping file: 80283.pdf
skipping file: 49168.pdf
skipping file: 64930.pdf
skipping file: 514875.pdf
skipping file: 403620.pdf
skipping file: 21451.pdf
skipping file: 98731.pdf
skipping file: 96858.pdf
skipping file: 413799.pdf
skipping file: 59752.pdf
skipping file: 40087.pdf
skipping file: 75947.pdf
skipping file: 29942.pdf
skipping file: 30930.pdf
skipping file: 39302.pdf
skipping file: 98727.pdf
skipping file: 68599.pdf
skipping file: 68600.pdf
skipping file: 24759.pdf
skipping file: 66300.pdf
skipping file: 24760.pdf
skipping file: 488831.pdf
skipping file: 64935.pdf
skipping file: 272257.pdf
skipping file: 66288.pdf
skipping file: 30841.pdf
skipping file: 71420.pdf
skipping file: 13092.pdf
skipping file: 30116.pdf
skipping file: 414007.pdf
skipping file: 412304.pdf
skipping file: 64929.pdf
skipping file: 412296.pdf
skipping file: 179600.pdf
skipping file: 

skipping file: 361733.pdf
skipping file: 62900.pdf
skipping file: 106487.pdf
skipping file: 45771.pdf
skipping file: 62901.pdf
skipping file: 30575.pdf
skipping file: 217731.pdf
skipping file: 64928.pdf
skipping file: 64873.pdf
skipping file: 60073.pdf
skipping file: 66297.pdf
skipping file: 60069.pdf
skipping file: 30563.pdf
skipping file: 33384.pdf
skipping file: 69059.pdf
skipping file: 44755.pdf
skipping file: 257840.pdf
skipping file: 76788.pdf
skipping file: 119834.pdf
skipping file: 435070.pdf
skipping file: 75950.pdf
skipping file: 440511.pdf
skipping file: 291609.pdf
skipping file: 79621.pdf
skipping file: 272914.pdf
skipping file: 211924.pdf
skipping file: 33277.pdf
skipping file: 21204.pdf
skipping file: 272915.pdf
skipping file: 512537.pdf
skipping file: 52914.pdf
skipping file: 393712.pdf
skipping file: 412291.pdf
skipping file: 30600.pdf
skipping file: 140869.pdf
skipping file: 375939.pdf
skipping file: 31074.pdf
skipping file: 31410.pdf
skipping file: 147279.pdf
skipping

skipping file: 269116.pdf
skipping file: 93258.pdf
skipping file: 14737.pdf
skipping file: 33140.pdf
skipping file: 168022.pdf
skipping file: 33141.pdf
skipping file: 109986.pdf
skipping file: 507725.pdf
skipping file: 61429.pdf
skipping file: 246282.pdf
skipping file: 15421.pdf
skipping file: 16158.pdf
skipping file: 425438.pdf
skipping file: 33283.pdf
skipping file: 16136.pdf
skipping file: 14240.pdf
skipping file: 36706.pdf
skipping file: 36705.pdf
skipping file: 34184.pdf
skipping file: 410848.pdf
skipping file: 14960.pdf
skipping file: 34183.pdf
skipping file: 419438.pdf
skipping file: 42479.pdf
skipping file: 36057.pdf
skipping file: 36056.pdf
skipping file: 284840.pdf
skipping file: 13295.pdf
skipping file: 490302.pdf
skipping file: 225776.pdf
skipping file: 84127.pdf
skipping file: 14741.pdf
skipping file: 84126.pdf
skipping file: 14011.pdf
skipping file: 14239.pdf
skipping file: 14010.pdf
skipping file: 468739.pdf
skipping file: 143668.pdf
skipping file: 13404.pdf
skipping fil

skipping file: 20566.pdf
skipping file: 33694.pdf
skipping file: 211850.pdf
skipping file: 151615.pdf
skipping file: 24779.pdf
skipping file: 79135.pdf
skipping file: 79259.pdf
skipping file: 115026.pdf
skipping file: 15187.pdf
skipping file: 79337.pdf
skipping file: 56657.pdf
skipping file: 76250.pdf
skipping file: 162701.pdf
skipping file: 111947.pdf
skipping file: 73288.pdf
skipping file: 130154.pdf
skipping file: 127087.pdf
skipping file: 141041.pdf
skipping file: 79235.pdf
skipping file: 59395.pdf
skipping file: 434260.pdf
skipping file: 11147.pdf
skipping file: 180686.pdf
skipping file: 382877.pdf
skipping file: 11146.pdf
skipping file: 110914.pdf
skipping file: 22438.pdf
skipping file: 22695.pdf
skipping file: 42941.pdf
skipping file: 135551.pdf
skipping file: 19168.pdf
skipping file: 315549.pdf
skipping file: 12078.pdf
skipping file: 135583.pdf
skipping file: 42945.pdf
skipping file: 135163.pdf
skipping file: 290815.pdf
skipping file: 13691.pdf
skipping file: 82838.pdf
skipping

skipping file: 15895.pdf
skipping file: 38444.pdf
skipping file: 16027.pdf
skipping file: 33385.pdf
skipping file: 15867.pdf
skipping file: 15888.pdf
skipping file: 15869.pdf
skipping file: 16038.pdf
skipping file: 16037.pdf
skipping file: 19931.pdf
skipping file: 15939.pdf
skipping file: 15849.pdf
skipping file: 15936.pdf
skipping file: 15886.pdf
skipping file: 15887.pdf
skipping file: 15913.pdf
skipping file: 383573.pdf
skipping file: 473744.pdf
skipping file: 15894.pdf
skipping file: 15959.pdf
skipping file: 10678.pdf
skipping file: 473172.pdf
skipping file: 15856.pdf
skipping file: 15921.pdf
skipping file: 15937.pdf
skipping file: 249920.pdf
skipping file: 15915.pdf
skipping file: 281675.pdf
skipping file: 15916.pdf
skipping file: 15874.pdf
skipping file: 15907.pdf
skipping file: 15960.pdf
skipping file: 57332.pdf
skipping file: 15899.pdf
skipping file: 57331.pdf
skipping file: 15851.pdf
skipping file: 16026.pdf
skipping file: 16012.pdf
skipping file: 15923.pdf
skipping file: 15884

skipping file: 322556.pdf
skipping file: 161347.pdf
skipping file: 325424.pdf
skipping file: 181322.pdf
skipping file: 211952.pdf
skipping file: 285710.pdf
skipping file: 267972.pdf
skipping file: 178685.pdf
skipping file: 505093.pdf
skipping file: 312686.pdf
skipping file: 482400.pdf
skipping file: 10604.pdf
skipping file: 27400.pdf
skipping file: 344793.pdf
skipping file: 27046.pdf
skipping file: 343753.pdf
skipping file: 27387.pdf
skipping file: 96689.pdf
skipping file: 90295.pdf
skipping file: 306305.pdf
skipping file: 10569.pdf
skipping file: 36388.pdf
skipping file: 215298.pdf
skipping file: 88260.pdf
skipping file: 244878.pdf
skipping file: 88261.pdf
skipping file: 36389.pdf
skipping file: 306293.pdf
skipping file: 306358.pdf
skipping file: 10804.pdf
skipping file: 27384.pdf
skipping file: 27072.pdf
skipping file: 178867.pdf
skipping file: 88989.pdf
skipping file: 57596.pdf
skipping file: 88990.pdf
skipping file: 57595.pdf
skipping file: 306354.pdf
skipping file: 28182.pdf
skipp

skipping file: 451452.pdf
skipping file: 16818.pdf
skipping file: 316546.pdf
skipping file: 16815.pdf
skipping file: 45324.pdf
skipping file: 13527.pdf
skipping file: 253868.pdf
skipping file: 14088.pdf
skipping file: 33735.pdf
skipping file: 77895.pdf
skipping file: 120730.pdf
skipping file: 14553.pdf
skipping file: 34181.pdf
skipping file: 515493.pdf
skipping file: 263892.pdf
skipping file: 13525.pdf
skipping file: 16817.pdf
skipping file: 77894.pdf
skipping file: 207051.pdf
skipping file: 173645.pdf
skipping file: 234988.pdf
skipping file: 21416.pdf
skipping file: 340574.pdf
skipping file: 190533.pdf
skipping file: 35992.pdf
skipping file: 35991.pdf
skipping file: 21941.pdf
skipping file: 62823.pdf
skipping file: 66079.pdf
skipping file: 409177.pdf
skipping file: 437435.pdf
skipping file: 76014.pdf
skipping file: 21460.pdf
skipping file: 322345.pdf
skipping file: 160987.pdf
skipping file: 47958.pdf
skipping file: 25756.pdf
skipping file: 515569.pdf
skipping file: 237108.pdf
skipping

skipping file: 498813.pdf
skipping file: 398360.pdf
skipping file: 40037.pdf
skipping file: 74818.pdf
skipping file: 436871.pdf
skipping file: 436877.pdf
skipping file: 44640.pdf
skipping file: 33839.pdf
skipping file: 16446.pdf
skipping file: 33840.pdf
skipping file: 82598.pdf
skipping file: 34180.pdf
skipping file: 13818.pdf
skipping file: 130310.pdf
skipping file: 336044.pdf
skipping file: 13524.pdf
skipping file: 336042.pdf
skipping file: 515492.pdf
skipping file: 62432.pdf
skipping file: 458271.pdf
skipping file: 279012.pdf
skipping file: 43585.pdf
skipping file: 248151.pdf
skipping file: 44642.pdf
skipping file: 429060.pdf
skipping file: 252931.pdf
skipping file: 456057.pdf
skipping file: 63564.pdf
skipping file: 321971.pdf
skipping file: 58743.pdf
skipping file: 515499.pdf
skipping file: 459878.pdf
skipping file: 338417.pdf
skipping file: 333243.pdf
skipping file: 35604.pdf
skipping file: 71127.pdf
skipping file: 161006.pdf
skipping file: 85861.pdf
skipping file: 63565.pdf
skipp

skipping file: 434846.pdf
skipping file: 425770.pdf
skipping file: 425799.pdf
skipping file: 434843.pdf
skipping file: 436621.pdf
skipping file: 436620.pdf
skipping file: 439699.pdf
skipping file: 428678.pdf
skipping file: 365537.pdf
skipping file: 83101.pdf
skipping file: 427428.pdf
skipping file: 428680.pdf
skipping file: 439702.pdf
skipping file: 133685.pdf
skipping file: 425755.pdf
skipping file: 428677.pdf
skipping file: 428685.pdf
skipping file: 435502.pdf
skipping file: 435504.pdf
skipping file: 428282.pdf
skipping file: 435501.pdf
skipping file: 428279.pdf
skipping file: 435498.pdf
skipping file: 428281.pdf
skipping file: 436630.pdf
skipping file: 425497.pdf
skipping file: 425517.pdf
skipping file: 436625.pdf
skipping file: 71889.pdf
skipping file: 429324.pdf
skipping file: 427451.pdf
skipping file: 426506.pdf
skipping file: 427449.pdf
skipping file: 426468.pdf
skipping file: 427448.pdf
skipping file: 436072.pdf
skipping file: 426473.pdf
skipping file: 426471.pdf
skipping file:

skipping file: 433453.pdf
skipping file: 433439.pdf
skipping file: 433449.pdf
skipping file: 433445.pdf
skipping file: 433441.pdf
skipping file: 433440.pdf
skipping file: 433455.pdf
skipping file: 433454.pdf
skipping file: 426533.pdf
skipping file: 426532.pdf
skipping file: 379069.pdf
skipping file: 430659.pdf
skipping file: 431884.pdf
skipping file: 430658.pdf
skipping file: 376531.pdf
skipping file: 431497.pdf
skipping file: 431495.pdf
skipping file: 431498.pdf
skipping file: 429137.pdf
skipping file: 271885.pdf
skipping file: 426343.pdf
skipping file: 433725.pdf
skipping file: 426486.pdf
skipping file: 426346.pdf
skipping file: 426347.pdf
skipping file: 432831.pdf
skipping file: 425613.pdf
skipping file: 432830.pdf
skipping file: 427423.pdf
skipping file: 427429.pdf
skipping file: 40093.pdf
skipping file: 427426.pdf
skipping file: 431531.pdf
skipping file: 432057.pdf
skipping file: 322780.pdf
skipping file: 431663.pdf
skipping file: 431656.pdf
skipping file: 431669.pdf
skipping file

skipping file: 425609.pdf
skipping file: 430155.pdf
skipping file: 315954.pdf
skipping file: 426464.pdf
skipping file: 426465.pdf
skipping file: 442707.pdf
skipping file: 426466.pdf
skipping file: 429247.pdf
skipping file: 46862.pdf
skipping file: 429248.pdf
skipping file: 429230.pdf
skipping file: 425607.pdf
skipping file: 430041.pdf
skipping file: 430154.pdf
skipping file: 432424.pdf
skipping file: 433008.pdf
skipping file: 425516.pdf
skipping file: 432425.pdf
skipping file: 433007.pdf
skipping file: 432418.pdf
skipping file: 432423.pdf
skipping file: 427670.pdf
skipping file: 427671.pdf
skipping file: 428076.pdf
skipping file: 428008.pdf
skipping file: 428011.pdf
skipping file: 428619.pdf
skipping file: 430152.pdf
skipping file: 437023.pdf
skipping file: 430151.pdf
skipping file: 430157.pdf
skipping file: 430153.pdf
skipping file: 436621.pdf
skipping file: 436620.pdf
skipping file: 439699.pdf
skipping file: 428678.pdf
skipping file: 365537.pdf
skipping file: 83101.pdf
skipping file:

skipping file: 56792.pdf
skipping file: 100156.pdf
skipping file: 473382.pdf
skipping file: 100155.pdf
skipping file: 62136.pdf
skipping file: 147593.pdf
skipping file: 39779.pdf
skipping file: 442609.pdf
skipping file: 473563.pdf
skipping file: 62137.pdf
skipping file: 18795.pdf
skipping file: 474172.pdf
skipping file: 56793.pdf
skipping file: 473901.pdf
skipping file: 442396.pdf
skipping file: 333384.pdf
skipping file: 474504.pdf
skipping file: 18738.pdf
skipping file: 130598.pdf
skipping file: 397891.pdf
skipping file: 219383.pdf
skipping file: 418521.pdf
skipping file: 420969.pdf
skipping file: 460301.pdf
skipping file: 110120.pdf
skipping file: 402664.pdf
skipping file: 427344.pdf
skipping file: 423096.pdf
skipping file: 282005.pdf
skipping file: 179680.pdf
skipping file: 320594.pdf
skipping file: 85870.pdf
skipping file: 323035.pdf
skipping file: 413093.pdf
skipping file: 400779.pdf
skipping file: 296590.pdf
skipping file: 325101.pdf
skipping file: 296509.pdf
skipping file: 33868

skipping file: 397701.pdf
skipping file: 215644.pdf
skipping file: 479227.pdf
skipping file: 282119.pdf
skipping file: 65590.pdf
skipping file: 90938.pdf
skipping file: 90937.pdf
skipping file: 276949.pdf
skipping file: 98068.pdf
skipping file: 11691.pdf
skipping file: 65589.pdf
skipping file: 323149.pdf
skipping file: 98069.pdf
skipping file: 296254.pdf
skipping file: 402674.pdf
skipping file: 12237.pdf
skipping file: 419362.pdf
skipping file: 421777.pdf
skipping file: 115547.pdf
skipping file: 18987.pdf
skipping file: 434539.pdf
skipping file: 257187.pdf
skipping file: 147583.pdf
skipping file: 141984.pdf
skipping file: 108235.pdf
skipping file: 198294.pdf
skipping file: 296519.pdf
skipping file: 430621.pdf
skipping file: 296840.pdf
skipping file: 446298.pdf
skipping file: 282200.pdf
skipping file: 420133.pdf
skipping file: 419944.pdf
skipping file: 422042.pdf
skipping file: 419364.pdf
skipping file: 419902.pdf
skipping file: 418500.pdf
skipping file: 476529.pdf
skipping file: 282118

skipping file: 404450.pdf
skipping file: 366716.pdf
skipping file: 303317.pdf
skipping file: 76951.pdf
skipping file: 419376.pdf
skipping file: 419713.pdf
skipping file: 179683.pdf
skipping file: 418516.pdf
skipping file: 420855.pdf
skipping file: 281859.pdf
skipping file: 382282.pdf
skipping file: 396537.pdf
skipping file: 83774.pdf
skipping file: 521690.pdf
skipping file: 18753.pdf
skipping file: 473711.pdf
skipping file: 444718.pdf
skipping file: 303321.pdf
skipping file: 323147.pdf
skipping file: 275652.pdf
skipping file: 296830.pdf
skipping file: 407979.pdf
skipping file: 449514.pdf
skipping file: 282314.pdf
skipping file: 513984.pdf
skipping file: 418504.pdf
skipping file: 420250.pdf
skipping file: 18866.pdf
skipping file: 335349.pdf
skipping file: 335768.pdf
skipping file: 296827.pdf
skipping file: 402669.pdf
skipping file: 428326.pdf
skipping file: 18990.pdf
skipping file: 435436.pdf
skipping file: 108193.pdf
skipping file: 416859.pdf
skipping file: 281754.pdf
skipping file: 45

skipping file: 62138.pdf
skipping file: 18854.pdf
skipping file: 442260.pdf
skipping file: 39721.pdf
skipping file: 449105.pdf
skipping file: 247772.pdf
skipping file: 141977.pdf
skipping file: 434701.pdf
skipping file: 400175.pdf
skipping file: 18989.pdf
skipping file: 434699.pdf
skipping file: 435242.pdf
skipping file: 43650.pdf
skipping file: 450918.pdf
skipping file: 376197.pdf
skipping file: 281751.pdf
skipping file: 238654.pdf
skipping file: 305012.pdf
skipping file: 281793.pdf
skipping file: 458300.pdf
skipping file: 163988.pdf
skipping file: 482030.pdf
skipping file: 306117.pdf
skipping file: 352921.pdf
skipping file: 460866.pdf
skipping file: 282334.pdf
skipping file: 405482.pdf
skipping file: 405483.pdf
skipping file: 198396.pdf
skipping file: 18756.pdf
skipping file: 255159.pdf
skipping file: 117862.pdf
skipping file: 414449.pdf
skipping file: 414716.pdf
skipping file: 282343.pdf
skipping file: 467701.pdf
skipping file: 404415.pdf
skipping file: 491631.pdf
skipping file: 397

skipping file: 459913.pdf
skipping file: 395983.pdf
skipping file: 18868.pdf
skipping file: 420061.pdf
skipping file: 418502.pdf
skipping file: 315460.pdf
skipping file: 296822.pdf
skipping file: 432329.pdf
skipping file: 474034.pdf
skipping file: 385052.pdf
skipping file: 18957.pdf
skipping file: 445381.pdf
skipping file: 428310.pdf
skipping file: 281724.pdf
skipping file: 446202.pdf
skipping file: 513807.pdf
skipping file: 512815.pdf
skipping file: 354834.pdf
skipping file: 423076.pdf
skipping file: 404263.pdf
skipping file: 255155.pdf
skipping file: 423751.pdf
skipping file: 308104.pdf
skipping file: 439573.pdf
skipping file: 296249.pdf
skipping file: 429354.pdf
skipping file: 428634.pdf
skipping file: 296257.pdf
skipping file: 296845.pdf
skipping file: 429063.pdf
skipping file: 258317.pdf
skipping file: 281840.pdf
skipping file: 463117.pdf
skipping file: 419366.pdf
skipping file: 422039.pdf
skipping file: 282103.pdf
skipping file: 296591.pdf
skipping file: 429131.pdf
skipping file:

skipping file: 445751.pdf
skipping file: 249544.pdf
skipping file: 92492.pdf
skipping file: 110517.pdf
skipping file: 432553.pdf
skipping file: 390466.pdf
skipping file: 398711.pdf
skipping file: 514181.pdf
skipping file: 72418.pdf
skipping file: 65019.pdf
skipping file: 92072.pdf
skipping file: 305036.pdf
skipping file: 72416.pdf
skipping file: 427837.pdf
skipping file: 427830.pdf
skipping file: 418594.pdf
skipping file: 400674.pdf
skipping file: 517029.pdf
skipping file: 39778.pdf
skipping file: 280110.pdf
skipping file: 80130.pdf
skipping file: 445761.pdf
skipping file: 106276.pdf
skipping file: 258868.pdf
skipping file: 431463.pdf
skipping file: 280111.pdf
skipping file: 431457.pdf
skipping file: 80132.pdf
skipping file: 467362.pdf
skipping file: 39777.pdf
skipping file: 331440.pdf
skipping file: 13762.pdf
skipping file: 13648.pdf
skipping file: 36584.pdf
skipping file: 80091.pdf
skipping file: 39704.pdf
skipping file: 92125.pdf
skipping file: 30534.pdf
skipping file: 39726.pdf
ski

skipping file: 63025.pdf
skipping file: 63026.pdf
skipping file: 278736.pdf
skipping file: 167782.pdf
skipping file: 90987.pdf
skipping file: 315887.pdf
skipping file: 345347.pdf
skipping file: 115667.pdf
skipping file: 90988.pdf
skipping file: 78964.pdf
skipping file: 321947.pdf
skipping file: 231776.pdf
skipping file: 67908.pdf
skipping file: 25845.pdf
skipping file: 340091.pdf
skipping file: 421755.pdf
skipping file: 67807.pdf
skipping file: 212440.pdf
skipping file: 230102.pdf
skipping file: 46772.pdf
skipping file: 63850.pdf
skipping file: 115749.pdf
skipping file: 340628.pdf
skipping file: 78956.pdf
skipping file: 509970.pdf
skipping file: 422017.pdf
skipping file: 66946.pdf
skipping file: 88074.pdf
skipping file: 425553.pdf
skipping file: 53412.pdf
skipping file: 481970.pdf
skipping file: 428007.pdf
skipping file: 63706.pdf
skipping file: 115754.pdf
skipping file: 48332.pdf
skipping file: 428053.pdf
skipping file: 498593.pdf
skipping file: 20956.pdf
skipping file: 20957.pdf
skip

skipping file: 498349.pdf
skipping file: 78462.pdf
skipping file: 422759.pdf
skipping file: 62518.pdf
skipping file: 62517.pdf
skipping file: 86875.pdf
skipping file: 497721.pdf
skipping file: 498421.pdf
skipping file: 468350.pdf
skipping file: 173086.pdf
skipping file: 88866.pdf
skipping file: 51014.pdf
skipping file: 51013.pdf
skipping file: 260822.pdf
skipping file: 115529.pdf
skipping file: 398828.pdf
skipping file: 62048.pdf
skipping file: 107352.pdf
skipping file: 340324.pdf
skipping file: 25852.pdf
skipping file: 492631.pdf
skipping file: 167780.pdf
skipping file: 89750.pdf
skipping file: 25084.pdf
skipping file: 34662.pdf
skipping file: 421840.pdf
skipping file: 278729.pdf
skipping file: 86730.pdf
skipping file: 188969.pdf
skipping file: 56830.pdf
skipping file: 24886.pdf
skipping file: 32769.pdf
skipping file: 422020.pdf
skipping file: 516518.pdf
skipping file: 256098.pdf
skipping file: 359996.pdf
skipping file: 66244.pdf
skipping file: 35147.pdf
skipping file: 110130.pdf
skip

skipping file: 80738.pdf
skipping file: 46233.pdf
skipping file: 115688.pdf
skipping file: 63213.pdf
skipping file: 64152.pdf
skipping file: 425551.pdf
skipping file: 175765.pdf
skipping file: 26203.pdf
skipping file: 43655.pdf
skipping file: 363584.pdf
skipping file: 81492.pdf
skipping file: 30797.pdf
skipping file: 78454.pdf
skipping file: 422096.pdf
skipping file: 319740.pdf
skipping file: 67116.pdf
skipping file: 425599.pdf
skipping file: 496797.pdf
skipping file: 277886.pdf
skipping file: 422095.pdf
skipping file: 66985.pdf
skipping file: 277889.pdf
skipping file: 62508.pdf
skipping file: 62507.pdf
skipping file: 78463.pdf
skipping file: 515109.pdf
skipping file: 60436.pdf
skipping file: 252753.pdf
skipping file: 80204.pdf
skipping file: 80203.pdf
skipping file: 474250.pdf
skipping file: 78965.pdf
skipping file: 167784.pdf
skipping file: 95198.pdf
skipping file: 95196.pdf
skipping file: 278720.pdf
skipping file: 74933.pdf
skipping file: 510672.pdf
skipping file: 190212.pdf
skippin

skipping file: 481973.pdf
skipping file: 53428.pdf
skipping file: 387511.pdf
skipping file: 364187.pdf
skipping file: 70216.pdf
skipping file: 456604.pdf
skipping file: 56445.pdf
skipping file: 90996.pdf
skipping file: 86892.pdf
skipping file: 63119.pdf
skipping file: 468330.pdf
skipping file: 63120.pdf
skipping file: 91033.pdf
skipping file: 171457.pdf
skipping file: 115734.pdf
skipping file: 63705.pdf
skipping file: 48336.pdf
skipping file: 90423.pdf
skipping file: 433851.pdf
skipping file: 86896.pdf
skipping file: 373015.pdf
skipping file: 296698.pdf
skipping file: 297153.pdf
skipping file: 515101.pdf
skipping file: 255996.pdf
skipping file: 100556.pdf
skipping file: 67895.pdf
skipping file: 230097.pdf
skipping file: 78484.pdf
skipping file: 51008.pdf
skipping file: 275411.pdf
skipping file: 398829.pdf
skipping file: 62061.pdf
skipping file: 51009.pdf
skipping file: 173092.pdf
skipping file: 423910.pdf
skipping file: 418830.pdf
skipping file: 78488.pdf
skipping file: 330666.pdf
skip

skipping file: 474101.pdf
skipping file: 56447.pdf
skipping file: 468345.pdf
skipping file: 74947.pdf
skipping file: 512987.pdf
skipping file: 174715.pdf
skipping file: 95028.pdf
skipping file: 223262.pdf
skipping file: 109830.pdf
skipping file: 167213.pdf
skipping file: 474246.pdf
skipping file: 56701.pdf
skipping file: 512127.pdf
skipping file: 86717.pdf
skipping file: 97421.pdf
skipping file: 14242.pdf
skipping file: 510814.pdf
skipping file: 501752.pdf
skipping file: 387576.pdf
skipping file: 252222.pdf
skipping file: 97420.pdf
skipping file: 14241.pdf
skipping file: 250302.pdf
skipping file: 16912.pdf
skipping file: 80763.pdf
skipping file: 509958.pdf
skipping file: 67830.pdf
skipping file: 214005.pdf
skipping file: 40191.pdf
skipping file: 106040.pdf
skipping file: 433130.pdf
skipping file: 40188.pdf
skipping file: 163703.pdf
skipping file: 58408.pdf
skipping file: 293001.pdf
skipping file: 48045.pdf
skipping file: 496773.pdf
skipping file: 35378.pdf
skipping file: 110133.pdf
ski

skipping file: 111138.pdf
skipping file: 57710.pdf
skipping file: 110068.pdf
skipping file: 265150.pdf
skipping file: 29018.pdf
skipping file: 443596.pdf
skipping file: 57711.pdf
skipping file: 29017.pdf
skipping file: 474818.pdf
skipping file: 249586.pdf
skipping file: 115752.pdf
skipping file: 477371.pdf
skipping file: 88488.pdf
skipping file: 47727.pdf
skipping file: 98823.pdf
skipping file: 25865.pdf
skipping file: 376066.pdf
skipping file: 64617.pdf
skipping file: 28647.pdf
skipping file: 339905.pdf
skipping file: 107366.pdf
skipping file: 88489.pdf
skipping file: 47728.pdf
skipping file: 154986.pdf
skipping file: 108072.pdf
skipping file: 326479.pdf
skipping file: 64616.pdf
skipping file: 244729.pdf
skipping file: 305446.pdf
skipping file: 22062.pdf
skipping file: 500870.pdf
skipping file: 62044.pdf
skipping file: 260820.pdf
skipping file: 398822.pdf
skipping file: 50928.pdf
skipping file: 442195.pdf
skipping file: 50927.pdf
skipping file: 245922.pdf
skipping file: 302610.pdf
ski

skipping file: 353665.pdf
skipping file: 516490.pdf
skipping file: 38687.pdf
skipping file: 84122.pdf
skipping file: 16904.pdf
skipping file: 340614.pdf
skipping file: 210081.pdf
skipping file: 155010.pdf
skipping file: 370748.pdf
skipping file: 65666.pdf
skipping file: 158664.pdf
skipping file: 498627.pdf
skipping file: 86808.pdf
skipping file: 20432.pdf
skipping file: 25829.pdf
skipping file: 26859.pdf
skipping file: 498667.pdf
skipping file: 167219.pdf
skipping file: 109831.pdf
skipping file: 226411.pdf
skipping file: 56685.pdf
skipping file: 478570.pdf
skipping file: 130020.pdf
skipping file: 34187.pdf
skipping file: 214456.pdf
skipping file: 135968.pdf
skipping file: 86905.pdf
skipping file: 364470.pdf
skipping file: 22467.pdf
skipping file: 34191.pdf
skipping file: 109098.pdf
skipping file: 425552.pdf
skipping file: 384704.pdf
skipping file: 259941.pdf
skipping file: 64156.pdf
skipping file: 497730.pdf
skipping file: 26717.pdf
skipping file: 415478.pdf
skipping file: 484916.pdf
s

skipping file: 56704.pdf
skipping file: 223279.pdf
skipping file: 100392.pdf
skipping file: 174710.pdf
skipping file: 173475.pdf
skipping file: 467479.pdf
skipping file: 63061.pdf
skipping file: 86885.pdf
skipping file: 90997.pdf
skipping file: 63062.pdf
skipping file: 91025.pdf
skipping file: 259006.pdf
skipping file: 164585.pdf
skipping file: 140963.pdf
skipping file: 376177.pdf
skipping file: 21929.pdf
skipping file: 58671.pdf
skipping file: 115737.pdf
skipping file: 30793.pdf
skipping file: 21927.pdf
skipping file: 363623.pdf
skipping file: 88344.pdf
skipping file: 340629.pdf
skipping file: 98709.pdf
skipping file: 21931.pdf
skipping file: 259270.pdf
skipping file: 20894.pdf
skipping file: 325189.pdf
skipping file: 58672.pdf
skipping file: 53790.pdf
skipping file: 243853.pdf
skipping file: 82027.pdf
skipping file: 296762.pdf
skipping file: 98697.pdf
skipping file: 15179.pdf
skipping file: 331474.pdf
skipping file: 464236.pdf
skipping file: 376180.pdf
skipping file: 291368.pdf
skipp

skipping file: 498336.pdf
skipping file: 22586.pdf
skipping file: 64639.pdf
skipping file: 242579.pdf
skipping file: 167409.pdf
skipping file: 167209.pdf
skipping file: 246275.pdf
skipping file: 387510.pdf
skipping file: 481971.pdf
skipping file: 53417.pdf
skipping file: 332618.pdf
skipping file: 31101.pdf
skipping file: 101620.pdf
skipping file: 101617.pdf
skipping file: 101622.pdf
skipping file: 498318.pdf
skipping file: 78959.pdf
skipping file: 135230.pdf
skipping file: 332622.pdf
skipping file: 31102.pdf
skipping file: 387901.pdf
skipping file: 101619.pdf
skipping file: 101621.pdf
skipping file: 193986.pdf
skipping file: 48970.pdf
skipping file: 85100.pdf
skipping file: 79687.pdf
skipping file: 325819.pdf
skipping file: 78469.pdf
skipping file: 456631.pdf
skipping file: 388376.pdf
skipping file: 25838.pdf
skipping file: 340069.pdf
skipping file: 91029.pdf
skipping file: 70215.pdf
skipping file: 90992.pdf
skipping file: 56443.pdf
skipping file: 86893.pdf
skipping file: 456603.pdf
sk

skipping file: 114803.pdf
skipping file: 62577.pdf
skipping file: 225822.pdf
skipping file: 418625.pdf
skipping file: 304408.pdf
skipping file: 110619.pdf
skipping file: 432963.pdf
skipping file: 76076.pdf
skipping file: 447133.pdf
skipping file: 213719.pdf
skipping file: 377779.pdf
skipping file: 108791.pdf
skipping file: 41795.pdf
skipping file: 170136.pdf
skipping file: 520651.pdf
skipping file: 41792.pdf
skipping file: 98879.pdf
skipping file: 449115.pdf
skipping file: 77512.pdf
skipping file: 163837.pdf
skipping file: 112107.pdf
skipping file: 108403.pdf
skipping file: 213515.pdf
skipping file: 151009.pdf
skipping file: 257620.pdf
skipping file: 273661.pdf
skipping file: 227892.pdf
skipping file: 408522.pdf
skipping file: 381850.pdf
skipping file: 381856.pdf
skipping file: 129164.pdf
skipping file: 134820.pdf
skipping file: 67901.pdf
skipping file: 67900.pdf
skipping file: 116786.pdf
skipping file: 130202.pdf
skipping file: 108796.pdf
skipping file: 441295.pdf
skipping file: 38962

skipping file: 114498.pdf
skipping file: 18068.pdf
skipping file: 402073.pdf
skipping file: 64213.pdf
skipping file: 21923.pdf
skipping file: 64218.pdf
skipping file: 53579.pdf
skipping file: 518674.pdf
skipping file: 518676.pdf
skipping file: 518658.pdf
skipping file: 472768.pdf
skipping file: 472771.pdf
skipping file: 518659.pdf
skipping file: 518657.pdf
skipping file: 518652.pdf
skipping file: 518675.pdf
skipping file: 518684.pdf
skipping file: 518678.pdf
skipping file: 430382.pdf
skipping file: 432728.pdf
skipping file: 518677.pdf
skipping file: 460503.pdf
skipping file: 403328.pdf
skipping file: 472495.pdf
skipping file: 472494.pdf
skipping file: 64306.pdf
skipping file: 21290.pdf
skipping file: 270649.pdf
skipping file: 100825.pdf
skipping file: 81479.pdf
skipping file: 81480.pdf
skipping file: 494527.pdf
skipping file: 389430.pdf
skipping file: 86212.pdf
skipping file: 86213.pdf
skipping file: 221061.pdf
skipping file: 415040.pdf
skipping file: 18700.pdf
skipping file: 107653.pd

skipping file: 105376.pdf
skipping file: 42224.pdf
skipping file: 12788.pdf
skipping file: 19480.pdf
skipping file: 62439.pdf
skipping file: 21285.pdf
skipping file: 75932.pdf
skipping file: 415060.pdf
skipping file: 398662.pdf
skipping file: 398582.pdf
skipping file: 19075.pdf
skipping file: 407720.pdf
skipping file: 89907.pdf
skipping file: 403718.pdf
skipping file: 18697.pdf
skipping file: 89908.pdf
skipping file: 112791.pdf
skipping file: 95419.pdf
skipping file: 19473.pdf
skipping file: 457256.pdf
skipping file: 519220.pdf
skipping file: 19517.pdf
skipping file: 19051.pdf
skipping file: 19471.pdf
skipping file: 64374.pdf
skipping file: 21294.pdf
skipping file: 509668.pdf
skipping file: 38396.pdf
skipping file: 30803.pdf
skipping file: 38395.pdf
skipping file: 440279.pdf
skipping file: 369135.pdf
skipping file: 439430.pdf
skipping file: 469105.pdf
skipping file: 321426.pdf
skipping file: 515595.pdf
skipping file: 411725.pdf
skipping file: 493614.pdf
skipping file: 19053.pdf
skippin

skipping file: 332813.pdf
skipping file: 399061.pdf
skipping file: 54845.pdf
skipping file: 260350.pdf
skipping file: 332615.pdf
skipping file: 333319.pdf
skipping file: 335240.pdf
skipping file: 419768.pdf
skipping file: 166245.pdf
skipping file: 367568.pdf
skipping file: 345255.pdf
skipping file: 266105.pdf
skipping file: 357613.pdf
skipping file: 339444.pdf
skipping file: 54483.pdf
skipping file: 69747.pdf
skipping file: 247883.pdf
skipping file: 385069.pdf
skipping file: 211648.pdf
skipping file: 66093.pdf
skipping file: 69746.pdf
skipping file: 266235.pdf
skipping file: 362574.pdf
skipping file: 338297.pdf
skipping file: 378140.pdf
skipping file: 428040.pdf
skipping file: 395374.pdf
skipping file: 478714.pdf
skipping file: 338025.pdf
skipping file: 495015.pdf
skipping file: 228152.pdf
skipping file: 262006.pdf
skipping file: 362560.pdf
skipping file: 362559.pdf
skipping file: 312124.pdf
skipping file: 331994.pdf
skipping file: 54404.pdf
skipping file: 387322.pdf
skipping file: 322

skipping file: 409639.pdf
skipping file: 414377.pdf
skipping file: 383614.pdf
skipping file: 378331.pdf
skipping file: 284735.pdf
skipping file: 378214.pdf
skipping file: 381163.pdf
skipping file: 338292.pdf
skipping file: 378215.pdf
skipping file: 176721.pdf
skipping file: 241323.pdf
skipping file: 53906.pdf
skipping file: 220235.pdf
skipping file: 338271.pdf
skipping file: 338312.pdf
skipping file: 435790.pdf
skipping file: 367515.pdf
skipping file: 341181.pdf
skipping file: 332611.pdf
skipping file: 345212.pdf
skipping file: 365083.pdf
skipping file: 153470.pdf
skipping file: 54181.pdf
skipping file: 23421.pdf
skipping file: 314388.pdf
skipping file: 68007.pdf
skipping file: 356222.pdf
skipping file: 314387.pdf
skipping file: 59958.pdf
skipping file: 332605.pdf
skipping file: 59959.pdf
skipping file: 330084.pdf
skipping file: 422219.pdf
skipping file: 386049.pdf
skipping file: 54255.pdf
skipping file: 54260.pdf
skipping file: 460234.pdf
skipping file: 203784.pdf
skipping file: 35669

skipping file: 345260.pdf
skipping file: 338510.pdf
skipping file: 338515.pdf
skipping file: 347019.pdf
skipping file: 347018.pdf
skipping file: 69740.pdf
skipping file: 69741.pdf
skipping file: 387082.pdf
skipping file: 345257.pdf
skipping file: 360986.pdf
skipping file: 338056.pdf
skipping file: 495234.pdf
skipping file: 59340.pdf
skipping file: 54439.pdf
skipping file: 345249.pdf
skipping file: 265131.pdf
skipping file: 59988.pdf
skipping file: 331549.pdf
skipping file: 380478.pdf
skipping file: 378212.pdf
skipping file: 54434.pdf
skipping file: 101987.pdf
skipping file: 374164.pdf
skipping file: 332616.pdf
skipping file: 343075.pdf
skipping file: 323851.pdf
skipping file: 489156.pdf
skipping file: 60032.pdf
skipping file: 378139.pdf
skipping file: 345259.pdf
skipping file: 495014.pdf
skipping file: 322450.pdf
skipping file: 386549.pdf
skipping file: 378137.pdf
skipping file: 18654.pdf
skipping file: 441210.pdf
skipping file: 495504.pdf
skipping file: 395344.pdf
skipping file: 33948

skipping file: 360740.pdf
skipping file: 32856.pdf
skipping file: 295033.pdf
skipping file: 266255.pdf
skipping file: 295062.pdf
skipping file: 32857.pdf
skipping file: 407469.pdf
skipping file: 295032.pdf
skipping file: 338270.pdf
skipping file: 368622.pdf
skipping file: 341176.pdf
skipping file: 331384.pdf
skipping file: 433849.pdf
skipping file: 386425.pdf
skipping file: 333317.pdf
skipping file: 335235.pdf
skipping file: 54374.pdf
skipping file: 326558.pdf
skipping file: 372162.pdf
skipping file: 343423.pdf
skipping file: 429087.pdf
skipping file: 380486.pdf
skipping file: 153471.pdf
skipping file: 54269.pdf
skipping file: 337118.pdf
skipping file: 357125.pdf
skipping file: 266103.pdf
skipping file: 388172.pdf
skipping file: 263105.pdf
skipping file: 332966.pdf
skipping file: 112576.pdf
skipping file: 340058.pdf
skipping file: 306559.pdf
skipping file: 338050.pdf
skipping file: 495227.pdf
skipping file: 338326.pdf
skipping file: 395350.pdf
skipping file: 495506.pdf
skipping file: 1

skipping file: 112430.pdf
skipping file: 203385.pdf
skipping file: 22824.pdf
skipping file: 107618.pdf
skipping file: 250277.pdf
skipping file: 138164.pdf
skipping file: 102705.pdf
skipping file: 108883.pdf
skipping file: 311060.pdf
skipping file: 108880.pdf
skipping file: 333873.pdf
skipping file: 317582.pdf
skipping file: 17180.pdf
skipping file: 345626.pdf
skipping file: 493377.pdf
skipping file: 22852.pdf
skipping file: 147277.pdf
skipping file: 516805.pdf
skipping file: 22893.pdf
skipping file: 138586.pdf
skipping file: 107694.pdf
skipping file: 374716.pdf
skipping file: 351208.pdf
skipping file: 107642.pdf
skipping file: 103883.pdf
skipping file: 120918.pdf
skipping file: 34589.pdf
skipping file: 23170.pdf
skipping file: 438454.pdf
skipping file: 471426.pdf
skipping file: 22845.pdf
skipping file: 110011.pdf
skipping file: 10985.pdf
skipping file: 109042.pdf
skipping file: 110010.pdf
skipping file: 97823.pdf
skipping file: 22891.pdf
skipping file: 97822.pdf
skipping file: 367180.p

skipping file: 84162.pdf
skipping file: 516786.pdf
skipping file: 22825.pdf
skipping file: 302004.pdf
skipping file: 352023.pdf
skipping file: 246833.pdf
skipping file: 41943.pdf
skipping file: 77174.pdf
skipping file: 41942.pdf
skipping file: 77173.pdf
skipping file: 23099.pdf
skipping file: 246134.pdf
skipping file: 22901.pdf
skipping file: 101428.pdf
skipping file: 106925.pdf
skipping file: 13154.pdf
skipping file: 22986.pdf
skipping file: 177612.pdf
skipping file: 154584.pdf
skipping file: 23199.pdf
skipping file: 494811.pdf
skipping file: 88081.pdf
skipping file: 88080.pdf
skipping file: 41549.pdf
skipping file: 309180.pdf
skipping file: 81993.pdf
skipping file: 23078.pdf
skipping file: 81994.pdf
skipping file: 41548.pdf
skipping file: 174153.pdf
skipping file: 22821.pdf
skipping file: 97689.pdf
skipping file: 82756.pdf
skipping file: 97690.pdf
skipping file: 82757.pdf
skipping file: 245548.pdf
skipping file: 13512.pdf
skipping file: 23198.pdf
skipping file: 347105.pdf
skipping fi

skipping file: 334901.pdf
skipping file: 117525.pdf
skipping file: 317803.pdf
skipping file: 12057.pdf
skipping file: 475809.pdf
skipping file: 516790.pdf
skipping file: 109219.pdf
skipping file: 34540.pdf
skipping file: 102641.pdf
skipping file: 167816.pdf
skipping file: 23081.pdf
skipping file: 108868.pdf
skipping file: 88084.pdf
skipping file: 88085.pdf
skipping file: 13158.pdf
skipping file: 507506.pdf
skipping file: 163728.pdf
skipping file: 12062.pdf
skipping file: 317596.pdf
skipping file: 466339.pdf
skipping file: 34543.pdf
skipping file: 411857.pdf
skipping file: 23141.pdf
skipping file: 23080.pdf
skipping file: 97829.pdf
skipping file: 97830.pdf
skipping file: 71097.pdf
skipping file: 18356.pdf
skipping file: 71096.pdf
skipping file: 71153.pdf
skipping file: 22992.pdf
skipping file: 517080.pdf
skipping file: 30065.pdf
skipping file: 56206.pdf
skipping file: 112439.pdf
skipping file: 84396.pdf
skipping file: 22810.pdf
skipping file: 208169.pdf
skipping file: 107614.pdf
skippin

skipping file: 22137.pdf
skipping file: 52969.pdf
skipping file: 240160.pdf
skipping file: 58926.pdf
skipping file: 521515.pdf
skipping file: 40234.pdf
skipping file: 52706.pdf
skipping file: 173180.pdf
skipping file: 47234.pdf
skipping file: 78069.pdf
skipping file: 458349.pdf
skipping file: 155103.pdf
skipping file: 78070.pdf
skipping file: 343916.pdf
skipping file: 262342.pdf
skipping file: 36912.pdf
skipping file: 36913.pdf
skipping file: 71415.pdf
skipping file: 11635.pdf
skipping file: 37445.pdf
skipping file: 170290.pdf
skipping file: 333019.pdf
skipping file: 423241.pdf
skipping file: 175902.pdf
skipping file: 11658.pdf
skipping file: 96276.pdf
skipping file: 316544.pdf
skipping file: 33523.pdf
skipping file: 59514.pdf
skipping file: 232149.pdf
skipping file: 58239.pdf
skipping file: 229739.pdf
skipping file: 23355.pdf
skipping file: 291379.pdf
skipping file: 232979.pdf
skipping file: 29965.pdf
skipping file: 12262.pdf
skipping file: 56667.pdf
skipping file: 103487.pdf
skipping

In [None]:
# remove bad files from intermed folder:
#using os.walk, move all the .pdf files to an intermediate folder for EZ access
all_bad_names = []
all_score_names = []
for subdir, dirs, files in os.walk(resultsPath):
    for pdf_file in files:
        if 'pdf' in pdf_file:
            html_path = os.path.join(subdir, 'html.txt')
            bad_names = make_bad_pdfs(html_path)
            score_names = make_score_pdfs(html_path)
            for bad_name in bad_names:
                all_bad_names.append(bad_name)
            for score_name in score_names:
                all_score_names.append(score_name)
            if pdf_file in bad_names or pdf_file in score_names:
                if pdf_file in os.listdir(intermedPath):
                    print("removing file: " + str(pdf_file))
                    os.remove(os.path.join(intermedPath, pdf_file))
                else:
                    continue
            if pdf_file in all_bad_names or pdf_file in all_score_names:
                if pdf_file in os.listdir(intermedPath):
                    rmv_path = os.path.join(intermedPath, pdf_file)
                    print("about to remove: " + str(rmv_path))
                    os.remove(rmv_path)
                else:
                    continue

removing file: 272418.pdf
removing file: 143297.pdf
removing file: 240430.pdf
removing file: 77568.pdf
removing file: 385712.pdf
removing file: 180678.pdf
removing file: 450215.pdf
removing file: 450401.pdf
removing file: 450407.pdf
removing file: 450402.pdf
removing file: 450404.pdf
removing file: 450408.pdf
removing file: 450216.pdf
removing file: 450405.pdf
removing file: 450409.pdf
removing file: 450403.pdf
removing file: 453848.pdf
removing file: 378201.pdf
removing file: 236847.pdf
removing file: 225689.pdf
removing file: 77563.pdf
removing file: 50975.pdf
removing file: 504189.pdf
removing file: 382034.pdf
removing file: 111894.pdf
removing file: 105452.pdf
removing file: 18578.pdf
removing file: 111620.pdf
removing file: 423983.pdf
removing file: 377820.pdf
removing file: 77564.pdf
removing file: 421428.pdf
removing file: 290401.pdf
removing file: 290400.pdf
removing file: 225449.pdf
removing file: 225450.pdf
removing file: 512603.pdf
removing file: 110881.pdf
removing file: 30

removing file: 190122.pdf
removing file: 190117.pdf
removing file: 190143.pdf
removing file: 383422.pdf
removing file: 418533.pdf
removing file: 383424.pdf
removing file: 114356.pdf
removing file: 141078.pdf
removing file: 363036.pdf
removing file: 114358.pdf
removing file: 408713.pdf
removing file: 348445.pdf
removing file: 348444.pdf
removing file: 348453.pdf
removing file: 348455.pdf
removing file: 348457.pdf
removing file: 439265.pdf
removing file: 348443.pdf
removing file: 16169.pdf
removing file: 348447.pdf
removing file: 348442.pdf
removing file: 348460.pdf
removing file: 348446.pdf
removing file: 294852.pdf
removing file: 348448.pdf
removing file: 95931.pdf
removing file: 348462.pdf
removing file: 348461.pdf
removing file: 294854.pdf
removing file: 348454.pdf
removing file: 348456.pdf
removing file: 348459.pdf
removing file: 348458.pdf
removing file: 348449.pdf
removing file: 348440.pdf
removing file: 348451.pdf
removing file: 348441.pdf
removing file: 348463.pdf
removing file:

removing file: 449193.pdf
removing file: 61851.pdf
removing file: 61853.pdf
removing file: 61850.pdf
removing file: 108611.pdf
removing file: 112353.pdf
removing file: 28872.pdf
removing file: 61527.pdf
removing file: 332353.pdf
removing file: 484504.pdf
removing file: 108610.pdf
removing file: 484503.pdf
removing file: 484501.pdf
removing file: 61852.pdf
removing file: 108614.pdf
removing file: 66417.pdf
removing file: 205176.pdf
removing file: 400437.pdf
removing file: 184884.pdf
removing file: 117593.pdf
removing file: 74135.pdf
removing file: 367701.pdf
removing file: 367702.pdf
removing file: 367703.pdf
removing file: 400442.pdf
removing file: 252896.pdf
removing file: 53188.pdf
removing file: 28596.pdf
removing file: 251996.pdf
removing file: 166822.pdf
removing file: 251995.pdf
removing file: 166819.pdf
removing file: 166820.pdf
removing file: 251992.pdf
removing file: 166821.pdf
removing file: 28597.pdf
removing file: 251994.pdf
removing file: 251997.pdf
removing file: 47674.pd

removing file: 466507.pdf
removing file: 318878.pdf
removing file: 310222.pdf
removing file: 445621.pdf
removing file: 343962.pdf
removing file: 453426.pdf
removing file: 332834.pdf
removing file: 466504.pdf
removing file: 392349.pdf
removing file: 349850.pdf
removing file: 349852.pdf
removing file: 392343.pdf
removing file: 392348.pdf
removing file: 392350.pdf
removing file: 392346.pdf
removing file: 349854.pdf
removing file: 349855.pdf
removing file: 349856.pdf
removing file: 392345.pdf
removing file: 349849.pdf
removing file: 349853.pdf
removing file: 392347.pdf
removing file: 392344.pdf
removing file: 349851.pdf
removing file: 450098.pdf
removing file: 450099.pdf
removing file: 334626.pdf
removing file: 334627.pdf
removing file: 338912.pdf
removing file: 335797.pdf
removing file: 350152.pdf
removing file: 342019.pdf
removing file: 338913.pdf
removing file: 311068.pdf
removing file: 319395.pdf
removing file: 319396.pdf
removing file: 319397.pdf
removing file: 311086.pdf
removing fil

removing file: 499111.pdf
removing file: 135560.pdf
removing file: 290941.pdf
removing file: 486857.pdf
removing file: 486864.pdf
removing file: 486865.pdf
removing file: 486861.pdf
removing file: 486859.pdf
removing file: 486866.pdf
removing file: 486863.pdf
removing file: 297947.pdf
removing file: 297948.pdf
removing file: 297946.pdf
removing file: 493927.pdf
removing file: 428078.pdf
removing file: 393038.pdf
removing file: 428077.pdf
removing file: 248393.pdf
removing file: 297758.pdf
removing file: 334790.pdf
removing file: 376713.pdf
removing file: 376714.pdf
removing file: 376715.pdf
removing file: 297759.pdf
removing file: 297760.pdf
removing file: 211195.pdf
removing file: 350073.pdf
removing file: 436198.pdf
removing file: 350076.pdf
removing file: 350077.pdf
removing file: 214466.pdf
removing file: 312600.pdf
removing file: 133745.pdf
removing file: 350072.pdf
removing file: 214465.pdf
removing file: 350071.pdf
removing file: 350075.pdf
removing file: 312599.pdf
removing fil

removing file: 362270.pdf
removing file: 464812.pdf
removing file: 297882.pdf
removing file: 358127.pdf
removing file: 362273.pdf
removing file: 16314.pdf
removing file: 272348.pdf
removing file: 135569.pdf
removing file: 500258.pdf
removing file: 210727.pdf
removing file: 210726.pdf
removing file: 502498.pdf
removing file: 482635.pdf
removing file: 502499.pdf
removing file: 134643.pdf
removing file: 502497.pdf
removing file: 225760.pdf
removing file: 225759.pdf
removing file: 225761.pdf
removing file: 460803.pdf
removing file: 487064.pdf
removing file: 485136.pdf
removing file: 485138.pdf
removing file: 485137.pdf
removing file: 483186.pdf
removing file: 131284.pdf
removing file: 134669.pdf
removing file: 240137.pdf
removing file: 240138.pdf
removing file: 500276.pdf
removing file: 240139.pdf
removing file: 309072.pdf
removing file: 309075.pdf
removing file: 309074.pdf
removing file: 309073.pdf
removing file: 436851.pdf
removing file: 494760.pdf
removing file: 426627.pdf
removing file

removing file: 250991.pdf
removing file: 168836.pdf
removing file: 221121.pdf
removing file: 221122.pdf
removing file: 135181.pdf
removing file: 495480.pdf
removing file: 432848.pdf
removing file: 260352.pdf
removing file: 260354.pdf
removing file: 248919.pdf
removing file: 260353.pdf
removing file: 279535.pdf
removing file: 485153.pdf
removing file: 485152.pdf
removing file: 161942.pdf
removing file: 502594.pdf
removing file: 161943.pdf
removing file: 122249.pdf
removing file: 129212.pdf
removing file: 162049.pdf
removing file: 502593.pdf
removing file: 338602.pdf
removing file: 429227.pdf
removing file: 502596.pdf
removing file: 504804.pdf
removing file: 502598.pdf
removing file: 122043.pdf
removing file: 338651.pdf
removing file: 338603.pdf
removing file: 502597.pdf
removing file: 122247.pdf
removing file: 122248.pdf
removing file: 338601.pdf
removing file: 338604.pdf
removing file: 429228.pdf
removing file: 161941.pdf
removing file: 161944.pdf
removing file: 128960.pdf
removing fil

In [34]:
%cd ttemp/Project/Summer2018/

/data1/dbashir/Project/Summer2018


In [36]:
print(miniPDFDataset)

pdf/mini_dataset_pdf


In [8]:
extra_dataset = 'pdf/extras_to_check'

In [16]:
#change directory to the intermediate folder
# if os.getcwd() != intermedFromTop:
#     os.chdir(intermedPath)
g = glob.glob(os.path.join(intermedPath,'*.pdf'))
shuf = np.random.permutation(g)
for i in range(1):
    name = shuf[i]
    name = name[25:]
    #os.rename(shuf[i], os.path.join(extra_dataset, name)) # also add so we can look at just these 5
    os.rename(shuf[i], os.path.join(miniPDFDataset, name))

Before splitting, I'll examine this small dataset manually to see how it looks.

In [11]:
def pdf_splitter(path):
    fname = os.path.splitext(os.path.basename(path))[0]
    
    #print(path)
    path = os.path.join(miniPDFDataset, path)
 
    pdf = PdfFileReader(open(path,"rb"))
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
 
        output_filename = '{}_page_{}.pdf'.format(
            fname, page+1)
    
        output_filename = os.path.join(miniPDFDataset, output_filename)
 
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
 
        print('Created: {}'.format(output_filename))

In [None]:
os.chdir(pdfFromTop)
os.getcwd()

In [13]:
for pdf_file in os.listdir(miniPDFDataset):
    try:
        pdf_splitter(pdf_file)
    except:
        continue



Created: pdf/mini_dataset_pdf/113063_page_1.pdf
Created: pdf/mini_dataset_pdf/113063_page_2.pdf
Created: pdf/mini_dataset_pdf/113063_page_3.pdf
Created: pdf/mini_dataset_pdf/113063_page_4.pdf
Created: pdf/mini_dataset_pdf/113063_page_5.pdf
Created: pdf/mini_dataset_pdf/113063_page_6.pdf
Created: pdf/mini_dataset_pdf/113063_page_7.pdf
Created: pdf/mini_dataset_pdf/113063_page_8.pdf
Created: pdf/mini_dataset_pdf/113063_page_9.pdf
Created: pdf/mini_dataset_pdf/113063_page_10.pdf
Created: pdf/mini_dataset_pdf/113063_page_11.pdf
Created: pdf/mini_dataset_pdf/113063_page_12.pdf
Created: pdf/mini_dataset_pdf/113063_page_13.pdf
Created: pdf/mini_dataset_pdf/113063_page_14.pdf
Created: pdf/mini_dataset_pdf/113063_page_15.pdf
Created: pdf/mini_dataset_pdf/113063_page_16.pdf
Created: pdf/mini_dataset_pdf/113063_page_17.pdf
Created: pdf/mini_dataset_pdf/113063_page_18.pdf
Created: pdf/mini_dataset_pdf/113063_page_19.pdf
Created: pdf/mini_dataset_pdf/113063_page_20.pdf
Created: pdf/mini_dataset_pdf

Created: pdf/mini_dataset_pdf/236213_page_5.pdf
Created: pdf/mini_dataset_pdf/236213_page_6.pdf
Created: pdf/mini_dataset_pdf/236213_page_7.pdf
Created: pdf/mini_dataset_pdf/236213_page_8.pdf
Created: pdf/mini_dataset_pdf/236213_page_9.pdf
Created: pdf/mini_dataset_pdf/236213_page_10.pdf
Created: pdf/mini_dataset_pdf/236213_page_11.pdf
Created: pdf/mini_dataset_pdf/236213_page_12.pdf
Created: pdf/mini_dataset_pdf/27233_page_1.pdf
Created: pdf/mini_dataset_pdf/27233_page_2.pdf
Created: pdf/mini_dataset_pdf/27233_page_3.pdf
Created: pdf/mini_dataset_pdf/27233_page_4.pdf
Created: pdf/mini_dataset_pdf/27233_page_5.pdf
Created: pdf/mini_dataset_pdf/441592_page_1.pdf
Created: pdf/mini_dataset_pdf/441592_page_2.pdf
Created: pdf/mini_dataset_pdf/246294_page_1.pdf
Created: pdf/mini_dataset_pdf/246294_page_2.pdf
Created: pdf/mini_dataset_pdf/246294_page_3.pdf
Created: pdf/mini_dataset_pdf/246294_page_4.pdf
Created: pdf/mini_dataset_pdf/246294_page_5.pdf
Created: pdf/mini_dataset_pdf/03287_page_1

Created: pdf/mini_dataset_pdf/407960_page_1.pdf
Created: pdf/mini_dataset_pdf/407960_page_2.pdf
Created: pdf/mini_dataset_pdf/407960_page_3.pdf
Created: pdf/mini_dataset_pdf/407960_page_4.pdf
Created: pdf/mini_dataset_pdf/407960_page_5.pdf
Created: pdf/mini_dataset_pdf/407960_page_6.pdf
Created: pdf/mini_dataset_pdf/407960_page_7.pdf
Created: pdf/mini_dataset_pdf/319376_page_1.pdf
Created: pdf/mini_dataset_pdf/319376_page_2.pdf
Created: pdf/mini_dataset_pdf/319376_page_3.pdf
Created: pdf/mini_dataset_pdf/319376_page_4.pdf
Created: pdf/mini_dataset_pdf/167982_page_1.pdf
Created: pdf/mini_dataset_pdf/167982_page_2.pdf
Created: pdf/mini_dataset_pdf/08796_page_1.pdf
Created: pdf/mini_dataset_pdf/08796_page_2.pdf
Created: pdf/mini_dataset_pdf/08796_page_3.pdf
Created: pdf/mini_dataset_pdf/49635_page_1.pdf
Created: pdf/mini_dataset_pdf/49635_page_2.pdf
Created: pdf/mini_dataset_pdf/49635_page_3.pdf
Created: pdf/mini_dataset_pdf/49635_page_4.pdf
Created: pdf/mini_dataset_pdf/49635_page_5.pdf




Created: pdf/mini_dataset_pdf/53481_page_1.pdf
Created: pdf/mini_dataset_pdf/53481_page_2.pdf
Created: pdf/mini_dataset_pdf/53481_page_3.pdf
Created: pdf/mini_dataset_pdf/53481_page_4.pdf
Created: pdf/mini_dataset_pdf/53481_page_5.pdf
Created: pdf/mini_dataset_pdf/53481_page_6.pdf
Created: pdf/mini_dataset_pdf/53481_page_7.pdf
Created: pdf/mini_dataset_pdf/53481_page_8.pdf
Created: pdf/mini_dataset_pdf/53481_page_9.pdf
Created: pdf/mini_dataset_pdf/53481_page_10.pdf
Created: pdf/mini_dataset_pdf/53481_page_11.pdf
Created: pdf/mini_dataset_pdf/53481_page_12.pdf
Created: pdf/mini_dataset_pdf/53481_page_13.pdf
Created: pdf/mini_dataset_pdf/111403_page_1.pdf
Created: pdf/mini_dataset_pdf/111403_page_2.pdf
Created: pdf/mini_dataset_pdf/53381_page_1.pdf
Created: pdf/mini_dataset_pdf/53381_page_2.pdf
Created: pdf/mini_dataset_pdf/53381_page_3.pdf
Created: pdf/mini_dataset_pdf/53381_page_4.pdf
Created: pdf/mini_dataset_pdf/53381_page_5.pdf
Created: pdf/mini_dataset_pdf/377645_page_1.pdf
Create

Created: pdf/mini_dataset_pdf/105172_page_1.pdf
Created: pdf/mini_dataset_pdf/105172_page_2.pdf
Created: pdf/mini_dataset_pdf/105172_page_3.pdf
Created: pdf/mini_dataset_pdf/105172_page_4.pdf
Created: pdf/mini_dataset_pdf/105172_page_5.pdf
Created: pdf/mini_dataset_pdf/105172_page_6.pdf
Created: pdf/mini_dataset_pdf/105172_page_7.pdf
Created: pdf/mini_dataset_pdf/105172_page_8.pdf
Created: pdf/mini_dataset_pdf/105172_page_9.pdf
Created: pdf/mini_dataset_pdf/105172_page_10.pdf
Created: pdf/mini_dataset_pdf/105172_page_11.pdf
Created: pdf/mini_dataset_pdf/105172_page_12.pdf
Created: pdf/mini_dataset_pdf/105172_page_13.pdf
Created: pdf/mini_dataset_pdf/105172_page_14.pdf
Created: pdf/mini_dataset_pdf/105172_page_15.pdf
Created: pdf/mini_dataset_pdf/105172_page_16.pdf
Created: pdf/mini_dataset_pdf/105172_page_17.pdf
Created: pdf/mini_dataset_pdf/105172_page_18.pdf
Created: pdf/mini_dataset_pdf/105172_page_19.pdf
Created: pdf/mini_dataset_pdf/105172_page_20.pdf
Created: pdf/mini_dataset_pdf

Created: pdf/mini_dataset_pdf/105172_page_179.pdf
Created: pdf/mini_dataset_pdf/105172_page_180.pdf
Created: pdf/mini_dataset_pdf/105172_page_181.pdf
Created: pdf/mini_dataset_pdf/105172_page_182.pdf
Created: pdf/mini_dataset_pdf/105172_page_183.pdf
Created: pdf/mini_dataset_pdf/105172_page_184.pdf
Created: pdf/mini_dataset_pdf/105172_page_185.pdf
Created: pdf/mini_dataset_pdf/105172_page_186.pdf
Created: pdf/mini_dataset_pdf/105172_page_187.pdf
Created: pdf/mini_dataset_pdf/105172_page_188.pdf
Created: pdf/mini_dataset_pdf/105172_page_189.pdf
Created: pdf/mini_dataset_pdf/105172_page_190.pdf
Created: pdf/mini_dataset_pdf/105172_page_191.pdf
Created: pdf/mini_dataset_pdf/105172_page_192.pdf
Created: pdf/mini_dataset_pdf/105172_page_193.pdf
Created: pdf/mini_dataset_pdf/105172_page_194.pdf
Created: pdf/mini_dataset_pdf/105172_page_195.pdf
Created: pdf/mini_dataset_pdf/105172_page_196.pdf
Created: pdf/mini_dataset_pdf/105172_page_197.pdf
Created: pdf/mini_dataset_pdf/105172_page_198.pdf




Created: pdf/mini_dataset_pdf/351735_page_21.pdf
Created: pdf/mini_dataset_pdf/351735_page_22.pdf
Created: pdf/mini_dataset_pdf/351735_page_23.pdf
Created: pdf/mini_dataset_pdf/351735_page_24.pdf
Created: pdf/mini_dataset_pdf/351735_page_25.pdf
Created: pdf/mini_dataset_pdf/351735_page_26.pdf
Created: pdf/mini_dataset_pdf/351735_page_27.pdf
Created: pdf/mini_dataset_pdf/351735_page_28.pdf
Created: pdf/mini_dataset_pdf/351735_page_29.pdf
Created: pdf/mini_dataset_pdf/351735_page_30.pdf
Created: pdf/mini_dataset_pdf/351735_page_31.pdf
Created: pdf/mini_dataset_pdf/351735_page_32.pdf
Created: pdf/mini_dataset_pdf/351735_page_33.pdf
Created: pdf/mini_dataset_pdf/351735_page_34.pdf
Created: pdf/mini_dataset_pdf/351735_page_35.pdf
Created: pdf/mini_dataset_pdf/351735_page_36.pdf
Created: pdf/mini_dataset_pdf/351735_page_37.pdf
Created: pdf/mini_dataset_pdf/351735_page_38.pdf
Created: pdf/mini_dataset_pdf/351735_page_39.pdf
Created: pdf/mini_dataset_pdf/351735_page_40.pdf
Created: pdf/mini_da

In [None]:
for pdf_file in os.listdir(miniPDFDataset):
    if 

In [18]:
for pdf_file in os.listdir(miniPDFDataset):
    if 'page' not in pdf_file:
        pdf_splitter(pdf_file)

Created: pdf/mini_dataset_pdf/209703_page_1.pdf


Now we'll do some conversions.

In [19]:
for pdf_file in os.listdir(miniPDFDataset):
    if 'page' not in pdf_file:
        os.remove(os.path.join(miniPDFDataset, pdf_file))

In [21]:
pdf_files = glob.glob(miniPDFDataset + '/*.pdf')
for pdf_file in pdf_files:
    basename = os.path.splitext(os.path.basename(pdf_file))[0]
    #print(basename)
    pngoutName = basename + '.png'
    #print(pngoutName)
    pngout = os.path.join(pngFromTop, pngoutName)
    print(pngout)
    subprocess.call(['convert', '-density', '300', pdf_file, pngout]) # 72 dpi is the default value

pdf/mini_dataset_png/134108_page_18.png
pdf/mini_dataset_png/134108_page_28.png
pdf/mini_dataset_png/236213_page_11.png
pdf/mini_dataset_png/351735_page_24.png
pdf/mini_dataset_png/351735_page_30.png
pdf/mini_dataset_png/08796_page_3.png
pdf/mini_dataset_png/35208_page_1.png
pdf/mini_dataset_png/66043_page_2.png
pdf/mini_dataset_png/134108_page_32.png
pdf/mini_dataset_png/134108_page_38.png
pdf/mini_dataset_png/105172_page_238.png
pdf/mini_dataset_png/289624_page_4.png
pdf/mini_dataset_png/351735_page_10.png
pdf/mini_dataset_png/04216_page_7.png
pdf/mini_dataset_png/390590_page_1.png
pdf/mini_dataset_png/55121_page_9.png
pdf/mini_dataset_png/105172_page_217.png
pdf/mini_dataset_png/105172_page_137.png
pdf/mini_dataset_png/221917_page_3.png
pdf/mini_dataset_png/39124_page_17.png
pdf/mini_dataset_png/105172_page_84.png
pdf/mini_dataset_png/105172_page_10.png
pdf/mini_dataset_png/407960_page_5.png
pdf/mini_dataset_png/221917_page_18.png
pdf/mini_dataset_png/105172_page_171.png
pdf/mini_da

pdf/mini_dataset_png/53481_page_1.png
pdf/mini_dataset_png/311080_page_1.png
pdf/mini_dataset_png/351735_page_39.png
pdf/mini_dataset_png/105172_page_48.png
pdf/mini_dataset_png/105172_page_141.png
pdf/mini_dataset_png/351735_page_57.png
pdf/mini_dataset_png/38776_page_14.png
pdf/mini_dataset_png/43276_page_65.png
pdf/mini_dataset_png/351735_page_20.png
pdf/mini_dataset_png/295034_page_3.png
pdf/mini_dataset_png/43519_page_9.png
pdf/mini_dataset_png/43276_page_17.png
pdf/mini_dataset_png/105172_page_188.png
pdf/mini_dataset_png/105172_page_34.png
pdf/mini_dataset_png/43276_page_33.png
pdf/mini_dataset_png/246273_page_1.png
pdf/mini_dataset_png/04216_page_5.png
pdf/mini_dataset_png/53481_page_4.png
pdf/mini_dataset_png/309804_page_5.png
pdf/mini_dataset_png/236213_page_9.png
pdf/mini_dataset_png/134108_page_13.png
pdf/mini_dataset_png/43276_page_64.png
pdf/mini_dataset_png/105172_page_110.png
pdf/mini_dataset_png/105172_page_42.png
pdf/mini_dataset_png/53481_page_13.png
pdf/mini_dataset

pdf/mini_dataset_png/105172_page_44.png
pdf/mini_dataset_png/351735_page_15.png
pdf/mini_dataset_png/134108_page_47.png
pdf/mini_dataset_png/105172_page_104.png
pdf/mini_dataset_png/134108_page_51.png
pdf/mini_dataset_png/351735_page_37.png
pdf/mini_dataset_png/105172_page_82.png
pdf/mini_dataset_png/134108_page_39.png
pdf/mini_dataset_png/105172_page_235.png
pdf/mini_dataset_png/203382_page_43.png
pdf/mini_dataset_png/351735_page_13.png
pdf/mini_dataset_png/43276_page_8.png
pdf/mini_dataset_png/29075_page_2.png
pdf/mini_dataset_png/407960_page_3.png
pdf/mini_dataset_png/43276_page_9.png
pdf/mini_dataset_png/29075_page_4.png
pdf/mini_dataset_png/04075_page_2.png
pdf/mini_dataset_png/55121_page_12.png
pdf/mini_dataset_png/111506_page_1.png
pdf/mini_dataset_png/203382_page_31.png
pdf/mini_dataset_png/105172_page_2.png
pdf/mini_dataset_png/105172_page_208.png
pdf/mini_dataset_png/57465_page_3.png
pdf/mini_dataset_png/351735_page_27.png
pdf/mini_dataset_png/351735_page_5.png
pdf/mini_datas

pdf/mini_dataset_png/100340_page_4.png
pdf/mini_dataset_png/49635_page_11.png
pdf/mini_dataset_png/50109_page_15.png
pdf/mini_dataset_png/246294_page_2.png
pdf/mini_dataset_png/203382_page_20.png
pdf/mini_dataset_png/55789_page_3.png
pdf/mini_dataset_png/55121_page_7.png
pdf/mini_dataset_png/39124_page_12.png
pdf/mini_dataset_png/100340_page_1.png
pdf/mini_dataset_png/105172_page_195.png
pdf/mini_dataset_png/38776_page_18.png
pdf/mini_dataset_png/66043_page_3.png
pdf/mini_dataset_png/43276_page_60.png
pdf/mini_dataset_png/105172_page_56.png
pdf/mini_dataset_png/105172_page_135.png
pdf/mini_dataset_png/105172_page_132.png
pdf/mini_dataset_png/134108_page_52.png
pdf/mini_dataset_png/111506_page_3.png
pdf/mini_dataset_png/105172_page_123.png
pdf/mini_dataset_png/146320_page_1.png
pdf/mini_dataset_png/105172_page_209.png
pdf/mini_dataset_png/43276_page_22.png
pdf/mini_dataset_png/221917_page_1.png
pdf/mini_dataset_png/105172_page_79.png
pdf/mini_dataset_png/55121_page_20.png
pdf/mini_datas

pdf/mini_dataset_png/50109_page_5.png
pdf/mini_dataset_png/38776_page_20.png
pdf/mini_dataset_png/236213_page_12.png
pdf/mini_dataset_png/43276_page_75.png
pdf/mini_dataset_png/377645_page_2.png
pdf/mini_dataset_png/121582_page_5.png
pdf/mini_dataset_png/338220_page_2.png
pdf/mini_dataset_png/105172_page_109.png
pdf/mini_dataset_png/351735_page_4.png
pdf/mini_dataset_png/134108_page_11.png
pdf/mini_dataset_png/146320_page_2.png
pdf/mini_dataset_png/351735_page_47.png
pdf/mini_dataset_png/105172_page_202.png
pdf/mini_dataset_png/221917_page_6.png
pdf/mini_dataset_png/105172_page_231.png
pdf/mini_dataset_png/39124_page_7.png
pdf/mini_dataset_png/65447_page_2.png
pdf/mini_dataset_png/29075_page_24.png
pdf/mini_dataset_png/43276_page_55.png
pdf/mini_dataset_png/66043_page_4.png
pdf/mini_dataset_png/145924_page_2.png
pdf/mini_dataset_png/43519_page_18.png
pdf/mini_dataset_png/55121_page_28.png
pdf/mini_dataset_png/39124_page_4.png
pdf/mini_dataset_png/55789_page_4.png


Converted!

# Make Larger Datasets

In [3]:
current_dir = '/data1/dbashir/Project/Summer2018/'
topFolder = 'pdf'
resultsFolder = 'results_top50'
intermedFolder = 'results_intermediate'
pdfSmallFolder = 'small_dataset_pdf'
pngSmallFolder = 'small_dataset_png'

dataSetSize = 700

resultsPath = os.path.join(topFolder,resultsFolder)
intermedPath = os.path.join(topFolder, intermedFolder)
intermedFromTop = os.path.join(current_dir, intermedPath)

pdfDir = os.path.join(current_dir, topFolder)
smallPDFDataset = os.path.join(topFolder, pdfSmallFolder)
smallPNGDataset = os.path.join(pdfDir, pngSmallFolder)
pngFromTop = os.path.join(topFolder, pngSmallFolder)
pdfFromTop = os.path.join(pdfDir, pdfSmallFolder)
rotPngFromTop = os.path.join(topFolder, 'small_dataset_png_rot')

fileList = []

print(pdfDir)

/data1/dbashir/Project/Summer2018/pdf


In [6]:
g = glob.glob(os.path.join(intermedPath,'*.pdf'))
shuf = np.random.permutation(g)
for i in range(dataSetSize):
    name = shuf[i]
    name = name[25:]
    #os.rename(shuf[i], os.path.join(extra_dataset, name)) # also add so we can look at just these 5
    os.rename(shuf[i], os.path.join(smallPDFDataset, name))

In [6]:
def pdf_splitter_small(path):
    fname = os.path.splitext(os.path.basename(path))[0]
    
    #print(path)
    path = os.path.join(smallPDFDataset, path)
    #print(path)
 
    pdf = PdfFileReader(open(path,"rb"))
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
 
        output_filename = '{}_page_{}.pdf'.format(
            fname, page+1)
    
        output_filename = os.path.join(smallPDFDataset, output_filename)
 
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
 
        print('Created: {}'.format(output_filename))

In [2]:
for pdf_file in os.listdir(smallPDFDataset):
    try:
        pdf_splitter_small(pdf_file)
    except:
        continue

NameError: name 'smallPDFDataset' is not defined

In [23]:
for pdf_file in os.listdir(smallPDFDataset):
    if 'page' not in pdf_file:
        os.remove(os.path.join(smallPDFDataset, pdf_file))

In [7]:
pdf_files = glob.glob(smallPDFDataset + '/*.pdf')
for pdf_file in pdf_files:
    basename = os.path.splitext(os.path.basename(pdf_file))[0]
    #print(basename)
    pngoutName = basename + '.png'
    #print(pngoutName)
    pngout = os.path.join(pngFromTop, pngoutName)
    print(pngout)
    if pngoutName not in os.listdir(pngFromTop):
        subprocess.call(['convert', '-density', '300', pdf_file, pngout]) # 72 dpi is the default value
    else:
        print("already converted")
        continue

pdf/small_dataset_png/279976_page_39.png
already converted
pdf/small_dataset_png/06999_page_11.png
already converted
pdf/small_dataset_png/240042_page_23.png
already converted
pdf/small_dataset_png/192497_page_2.png
already converted
pdf/small_dataset_png/00066_page_12.png
already converted
pdf/small_dataset_png/28605_page_1.png
already converted
pdf/small_dataset_png/497439_page_7.png
already converted
pdf/small_dataset_png/106406_page_112.png
already converted
pdf/small_dataset_png/106406_page_131.png
already converted
pdf/small_dataset_png/40843_page_10.png
already converted
pdf/small_dataset_png/26790_page_33.png
already converted
pdf/small_dataset_png/65145_page_18.png
already converted
pdf/small_dataset_png/297229_page_95.png
already converted
pdf/small_dataset_png/481267_page_5.png
already converted
pdf/small_dataset_png/26655_page_68.png
already converted
pdf/small_dataset_png/28896_page_8.png
already converted
pdf/small_dataset_png/427935_page_1.png
already converted
pdf/small

already converted
pdf/small_dataset_png/26655_page_105.png
already converted
pdf/small_dataset_png/268479_page_3.png
already converted
pdf/small_dataset_png/30415_page_115.png
already converted
pdf/small_dataset_png/51782_page_29.png
already converted
pdf/small_dataset_png/40843_page_20.png
already converted
pdf/small_dataset_png/311740_page_3.png
already converted
pdf/small_dataset_png/400052_page_4.png
already converted
pdf/small_dataset_png/344911_page_3.png
already converted
pdf/small_dataset_png/35997_page_6.png
already converted
pdf/small_dataset_png/30415_page_81.png
already converted
pdf/small_dataset_png/222198_page_3.png
already converted
pdf/small_dataset_png/106406_page_28.png
already converted
pdf/small_dataset_png/445835_page_46.png
already converted
pdf/small_dataset_png/106406_page_218.png
already converted
pdf/small_dataset_png/247511_page_4.png
already converted
pdf/small_dataset_png/65144_page_42.png
already converted
pdf/small_dataset_png/46592_page_1.png
already co

already converted
pdf/small_dataset_png/435137_page_17.png
already converted
pdf/small_dataset_png/51782_page_15.png
already converted
pdf/small_dataset_png/06999_page_2.png
already converted
pdf/small_dataset_png/26015_page_14.png
already converted
pdf/small_dataset_png/02231_page_4.png
already converted
pdf/small_dataset_png/241697_page_11.png
already converted
pdf/small_dataset_png/240042_page_52.png
already converted
pdf/small_dataset_png/38410_page_9.png
already converted
pdf/small_dataset_png/445712_page_72.png
already converted
pdf/small_dataset_png/114536_page_56.png
already converted
pdf/small_dataset_png/06357_page_6.png
already converted
pdf/small_dataset_png/442916_page_10.png
already converted
pdf/small_dataset_png/13598_page_4.png
already converted
pdf/small_dataset_png/12980_page_46.png
already converted
pdf/small_dataset_png/107236_page_58.png
already converted
pdf/small_dataset_png/293550_page_1.png
already converted
pdf/small_dataset_png/261900_page_18.png
already con

already converted
pdf/small_dataset_png/348450_page_12.png
already converted
pdf/small_dataset_png/106406_page_99.png
already converted
pdf/small_dataset_png/330059_page_14.png
already converted
pdf/small_dataset_png/33056_page_8.png
already converted
pdf/small_dataset_png/178105_page_12.png
already converted
pdf/small_dataset_png/00100_page_19.png
already converted
pdf/small_dataset_png/355461_page_7.png
already converted
pdf/small_dataset_png/433909_page_3.png
already converted
pdf/small_dataset_png/42119_page_16.png
already converted
pdf/small_dataset_png/261900_page_44.png
already converted
pdf/small_dataset_png/19386_page_43.png
already converted
pdf/small_dataset_png/426281_page_12.png
already converted
pdf/small_dataset_png/89141_page_111.png
already converted
pdf/small_dataset_png/02028_page_121.png
already converted
pdf/small_dataset_png/297229_page_50.png
already converted
pdf/small_dataset_png/19386_page_129.png
already converted
pdf/small_dataset_png/114536_page_9.png
alrea

pdf/small_dataset_png/02028_page_29.png
pdf/small_dataset_png/347782_page_15.png
pdf/small_dataset_png/44700_page_55.png
pdf/small_dataset_png/51033_page_10.png
pdf/small_dataset_png/107236_page_15.png
pdf/small_dataset_png/38410_page_50.png
pdf/small_dataset_png/25425_page_8.png
pdf/small_dataset_png/297229_page_24.png
pdf/small_dataset_png/04372_page_3.png
pdf/small_dataset_png/26994_page_1.png
pdf/small_dataset_png/348450_page_5.png
pdf/small_dataset_png/65218_page_17.png
pdf/small_dataset_png/27331_page_130.png
pdf/small_dataset_png/104004_page_4.png
pdf/small_dataset_png/01225_page_7.png
pdf/small_dataset_png/445712_page_7.png
pdf/small_dataset_png/61169_page_3.png
pdf/small_dataset_png/297229_page_191.png
pdf/small_dataset_png/454022_page_1.png
pdf/small_dataset_png/64778_page_3.png
pdf/small_dataset_png/445835_page_62.png
pdf/small_dataset_png/106406_page_159.png
pdf/small_dataset_png/01927_page_6.png
pdf/small_dataset_png/36038_page_4.png
pdf/small_dataset_png/106406_page_195.p

pdf/small_dataset_png/111610_page_27.png
pdf/small_dataset_png/27890_page_1.png
pdf/small_dataset_png/381919_page_17.png
pdf/small_dataset_png/269758_page_3.png
pdf/small_dataset_png/89141_page_71.png
pdf/small_dataset_png/495995_page_7.png
pdf/small_dataset_png/297229_page_125.png
pdf/small_dataset_png/241875_page_46.png
pdf/small_dataset_png/248471_page_1.png
pdf/small_dataset_png/02028_page_54.png
pdf/small_dataset_png/106406_page_69.png
pdf/small_dataset_png/03072_page_14.png
pdf/small_dataset_png/228493_page_29.png
pdf/small_dataset_png/45557_page_31.png
pdf/small_dataset_png/228493_page_40.png
pdf/small_dataset_png/00066_page_27.png
pdf/small_dataset_png/53802_page_3.png
pdf/small_dataset_png/445859_page_28.png
pdf/small_dataset_png/43372_page_66.png
pdf/small_dataset_png/420447_page_1.png
pdf/small_dataset_png/02022_page_14.png
pdf/small_dataset_png/104004_page_1.png
pdf/small_dataset_png/03892_page_18.png
pdf/small_dataset_png/43748_page_11.png
pdf/small_dataset_png/111610_page

pdf/small_dataset_png/238222_page_6.png
pdf/small_dataset_png/44698_page_4.png
pdf/small_dataset_png/89141_page_54.png
pdf/small_dataset_png/445712_page_82.png
pdf/small_dataset_png/106406_page_235.png
pdf/small_dataset_png/25248_page_3.png
pdf/small_dataset_png/377773_page_17.png
pdf/small_dataset_png/02231_page_5.png
pdf/small_dataset_png/445835_page_21.png
pdf/small_dataset_png/111610_page_238.png
pdf/small_dataset_png/111610_page_102.png
pdf/small_dataset_png/48172_page_2.png
pdf/small_dataset_png/01896_page_3.png
pdf/small_dataset_png/426281_page_16.png
pdf/small_dataset_png/269758_page_6.png
pdf/small_dataset_png/35155_page_5.png
pdf/small_dataset_png/301327_page_3.png
pdf/small_dataset_png/28605_page_10.png
pdf/small_dataset_png/42632_page_11.png
pdf/small_dataset_png/38410_page_24.png
pdf/small_dataset_png/228493_page_126.png
pdf/small_dataset_png/27331_page_88.png
pdf/small_dataset_png/12980_page_39.png
pdf/small_dataset_png/512964_page_1.png
pdf/small_dataset_png/328659_page_

pdf/small_dataset_png/54069_page_12.png
pdf/small_dataset_png/309814_page_1.png
pdf/small_dataset_png/43372_page_44.png
pdf/small_dataset_png/26655_page_24.png
pdf/small_dataset_png/440774_page_20.png
pdf/small_dataset_png/297229_page_103.png
pdf/small_dataset_png/42632_page_8.png
pdf/small_dataset_png/228493_page_111.png
pdf/small_dataset_png/33606_page_2.png
pdf/small_dataset_png/63436_page_13.png
pdf/small_dataset_png/114536_page_97.png
pdf/small_dataset_png/28605_page_15.png
pdf/small_dataset_png/228493_page_149.png
pdf/small_dataset_png/301327_page_41.png
pdf/small_dataset_png/317486_page_6.png
pdf/small_dataset_png/114536_page_51.png
pdf/small_dataset_png/173670_page_2.png
pdf/small_dataset_png/02028_page_20.png
pdf/small_dataset_png/178105_page_16.png
pdf/small_dataset_png/06103_page_29.png
pdf/small_dataset_png/01856_page_3.png
pdf/small_dataset_png/27887_page_4.png
pdf/small_dataset_png/106406_page_132.png
pdf/small_dataset_png/293209_page_12.png
pdf/small_dataset_png/27315_pa

pdf/small_dataset_png/26790_page_18.png
pdf/small_dataset_png/00189_page_8.png
pdf/small_dataset_png/09691_page_63.png
pdf/small_dataset_png/51179_page_28.png
pdf/small_dataset_png/63697_page_5.png
pdf/small_dataset_png/445712_page_40.png
pdf/small_dataset_png/65175_page_8.png
pdf/small_dataset_png/10069_page_15.png
pdf/small_dataset_png/445859_page_69.png
pdf/small_dataset_png/106406_page_43.png
pdf/small_dataset_png/51782_page_7.png
pdf/small_dataset_png/111610_page_225.png
pdf/small_dataset_png/395118_page_16.png
pdf/small_dataset_png/280401_page_10.png
pdf/small_dataset_png/114536_page_100.png
pdf/small_dataset_png/20997_page_21.png
pdf/small_dataset_png/06357_page_44.png
pdf/small_dataset_png/177333_page_19.png
pdf/small_dataset_png/323607_page_3.png
pdf/small_dataset_png/339466_page_1.png
pdf/small_dataset_png/02028_page_100.png
pdf/small_dataset_png/26907_page_6.png
pdf/small_dataset_png/70729_page_20.png
pdf/small_dataset_png/30415_page_33.png
pdf/small_dataset_png/240042_page_

pdf/small_dataset_png/114536_page_8.png
pdf/small_dataset_png/19386_page_19.png
pdf/small_dataset_png/107236_page_48.png
pdf/small_dataset_png/27331_page_35.png
pdf/small_dataset_png/37677_page_4.png
pdf/small_dataset_png/44700_page_166.png
pdf/small_dataset_png/50701_page_6.png
pdf/small_dataset_png/26994_page_6.png
pdf/small_dataset_png/01896_page_2.png
pdf/small_dataset_png/43372_page_43.png
pdf/small_dataset_png/27331_page_95.png
pdf/small_dataset_png/261900_page_43.png
pdf/small_dataset_png/44698_page_72.png
pdf/small_dataset_png/225237_page_5.png
pdf/small_dataset_png/54069_page_35.png
pdf/small_dataset_png/213856_page_8.png
pdf/small_dataset_png/19386_page_55.png
pdf/small_dataset_png/377773_page_115.png
pdf/small_dataset_png/52190_page_10.png
pdf/small_dataset_png/25425_page_18.png
pdf/small_dataset_png/27331_page_33.png
pdf/small_dataset_png/462065_page_1.png
pdf/small_dataset_png/256276_page_2.png
pdf/small_dataset_png/03854_page_2.png
pdf/small_dataset_png/02439_page_1.png
p

pdf/small_dataset_png/295254_page_1.png
pdf/small_dataset_png/178105_page_9.png
pdf/small_dataset_png/445835_page_7.png
pdf/small_dataset_png/20997_page_73.png
pdf/small_dataset_png/02028_page_82.png
pdf/small_dataset_png/297229_page_88.png
pdf/small_dataset_png/279976_page_47.png
pdf/small_dataset_png/33056_page_83.png
pdf/small_dataset_png/293209_page_9.png
pdf/small_dataset_png/393509_page_7.png
pdf/small_dataset_png/00066_page_35.png
pdf/small_dataset_png/228493_page_68.png
pdf/small_dataset_png/25425_page_38.png
pdf/small_dataset_png/279976_page_64.png
pdf/small_dataset_png/65218_page_31.png
pdf/small_dataset_png/44700_page_159.png
pdf/small_dataset_png/439681_page_9.png
pdf/small_dataset_png/03892_page_27.png
pdf/small_dataset_png/304048_page_61.png
pdf/small_dataset_png/347782_page_29.png
pdf/small_dataset_png/12980_page_4.png
pdf/small_dataset_png/89141_page_123.png
pdf/small_dataset_png/107236_page_22.png
pdf/small_dataset_png/355461_page_6.png
pdf/small_dataset_png/01659_page

pdf/small_dataset_png/445835_page_40.png
pdf/small_dataset_png/445712_page_71.png
pdf/small_dataset_png/51660_page_6.png
pdf/small_dataset_png/263314_page_11.png
pdf/small_dataset_png/09691_page_22.png
pdf/small_dataset_png/19386_page_69.png
pdf/small_dataset_png/298137_page_25.png
pdf/small_dataset_png/111610_page_160.png
pdf/small_dataset_png/27331_page_118.png
pdf/small_dataset_png/65144_page_53.png
pdf/small_dataset_png/253953_page_10.png
pdf/small_dataset_png/27706_page_87.png
pdf/small_dataset_png/304048_page_54.png
pdf/small_dataset_png/455910_page_29.png
pdf/small_dataset_png/65219_page_31.png
pdf/small_dataset_png/106406_page_206.png
pdf/small_dataset_png/44700_page_49.png
pdf/small_dataset_png/263458_page_4.png
pdf/small_dataset_png/30550_page_35.png
pdf/small_dataset_png/444134_page_13.png
pdf/small_dataset_png/60462_page_46.png
pdf/small_dataset_png/39966_page_11.png
pdf/small_dataset_png/76336_page_15.png
pdf/small_dataset_png/74207_page_7.png
pdf/small_dataset_png/38410_p

pdf/small_dataset_png/19240_page_51.png
pdf/small_dataset_png/21142_page_21.png
pdf/small_dataset_png/304048_page_7.png
pdf/small_dataset_png/347782_page_36.png
pdf/small_dataset_png/45557_page_43.png
pdf/small_dataset_png/290820_page_1.png
pdf/small_dataset_png/44700_page_20.png
pdf/small_dataset_png/53802_page_1.png
pdf/small_dataset_png/01507_page_49.png
pdf/small_dataset_png/440774_page_27.png
pdf/small_dataset_png/228493_page_185.png
pdf/small_dataset_png/297229_page_126.png
pdf/small_dataset_png/44700_page_9.png
pdf/small_dataset_png/20997_page_52.png
pdf/small_dataset_png/111610_page_21.png
pdf/small_dataset_png/51179_page_13.png
pdf/small_dataset_png/01507_page_17.png
pdf/small_dataset_png/48271_page_3.png
pdf/small_dataset_png/19386_page_95.png
pdf/small_dataset_png/29540_page_2.png
pdf/small_dataset_png/395118_page_74.png
pdf/small_dataset_png/01507_page_36.png
pdf/small_dataset_png/259849_page_2.png
pdf/small_dataset_png/89141_page_149.png
pdf/small_dataset_png/301327_page_4

pdf/small_dataset_png/44700_page_10.png
pdf/small_dataset_png/06999_page_6.png
pdf/small_dataset_png/51660_page_1.png
pdf/small_dataset_png/107236_page_64.png
pdf/small_dataset_png/38845_page_8.png
pdf/small_dataset_png/240042_page_2.png
pdf/small_dataset_png/33056_page_54.png
pdf/small_dataset_png/377773_page_37.png
pdf/small_dataset_png/38410_page_45.png
pdf/small_dataset_png/409643_page_2.png
pdf/small_dataset_png/298137_page_38.png
pdf/small_dataset_png/27706_page_66.png
pdf/small_dataset_png/177333_page_22.png
pdf/small_dataset_png/97631_page_10.png
pdf/small_dataset_png/177333_page_25.png
pdf/small_dataset_png/41270_page_2.png
pdf/small_dataset_png/298137_page_17.png
pdf/small_dataset_png/06357_page_70.png
pdf/small_dataset_png/266425_page_2.png
pdf/small_dataset_png/108737_page_10.png
pdf/small_dataset_png/03091_page_3.png
pdf/small_dataset_png/381919_page_22.png
pdf/small_dataset_png/38410_page_67.png
pdf/small_dataset_png/41059_page_1.png
pdf/small_dataset_png/259849_page_5.pn

pdf/small_dataset_png/297229_page_202.png
pdf/small_dataset_png/65144_page_24.png
pdf/small_dataset_png/44698_page_111.png
pdf/small_dataset_png/28896_page_20.png
pdf/small_dataset_png/54069_page_36.png
pdf/small_dataset_png/06357_page_11.png
pdf/small_dataset_png/65218_page_101.png
pdf/small_dataset_png/44700_page_139.png
pdf/small_dataset_png/347782_page_16.png
pdf/small_dataset_png/60462_page_22.png
pdf/small_dataset_png/65218_page_88.png
pdf/small_dataset_png/65145_page_16.png
pdf/small_dataset_png/01174_page_11.png
pdf/small_dataset_png/43255_page_6.png
pdf/small_dataset_png/298137_page_37.png
pdf/small_dataset_png/25156_page_19.png
pdf/small_dataset_png/298137_page_64.png
pdf/small_dataset_png/38410_page_8.png
pdf/small_dataset_png/347782_page_20.png
pdf/small_dataset_png/54069_page_17.png
pdf/small_dataset_png/51179_page_29.png
pdf/small_dataset_png/297229_page_254.png
pdf/small_dataset_png/01225_page_3.png
pdf/small_dataset_png/01659_page_24.png
pdf/small_dataset_png/395118_pag

pdf/small_dataset_png/44698_page_27.png
pdf/small_dataset_png/435137_page_5.png
pdf/small_dataset_png/02028_page_94.png
pdf/small_dataset_png/393509_page_13.png
pdf/small_dataset_png/328659_page_32.png
pdf/small_dataset_png/39119_page_2.png
pdf/small_dataset_png/266425_page_4.png
pdf/small_dataset_png/38356_page_6.png
pdf/small_dataset_png/505642_page_4.png
pdf/small_dataset_png/502046_page_11.png
pdf/small_dataset_png/28605_page_18.png
pdf/small_dataset_png/111610_page_36.png
pdf/small_dataset_png/26655_page_39.png
pdf/small_dataset_png/279976_page_42.png
pdf/small_dataset_png/33056_page_44.png
pdf/small_dataset_png/377773_page_84.png
pdf/small_dataset_png/275061_page_1.png
pdf/small_dataset_png/33056_page_78.png
pdf/small_dataset_png/347588_page_1.png
pdf/small_dataset_png/110287_page_1.png
pdf/small_dataset_png/09691_page_50.png
pdf/small_dataset_png/43372_page_23.png
pdf/small_dataset_png/85142_page_9.png
pdf/small_dataset_png/301327_page_16.png
pdf/small_dataset_png/297229_page_25

pdf/small_dataset_png/00222_page_7.png
pdf/small_dataset_png/44700_page_145.png
pdf/small_dataset_png/50642_page_3.png
pdf/small_dataset_png/02028_page_41.png
pdf/small_dataset_png/279976_page_70.png
pdf/small_dataset_png/446959_page_7.png
pdf/small_dataset_png/420447_page_2.png
pdf/small_dataset_png/26907_page_11.png
pdf/small_dataset_png/02028_page_34.png
pdf/small_dataset_png/09691_page_16.png
pdf/small_dataset_png/445859_page_53.png
pdf/small_dataset_png/26655_page_5.png
pdf/small_dataset_png/108737_page_6.png
pdf/small_dataset_png/244124_page_1.png
pdf/small_dataset_png/455910_page_12.png
pdf/small_dataset_png/445712_page_65.png
pdf/small_dataset_png/340197_page_17.png
pdf/small_dataset_png/37949_page_1.png
pdf/small_dataset_png/12980_page_33.png
pdf/small_dataset_png/65218_page_30.png
pdf/small_dataset_png/27706_page_68.png
pdf/small_dataset_png/01507_page_3.png
pdf/small_dataset_png/238222_page_3.png
pdf/small_dataset_png/435137_page_4.png
pdf/small_dataset_png/62709_page_1.png


pdf/small_dataset_png/30521_page_45.png
pdf/small_dataset_png/38410_page_66.png
pdf/small_dataset_png/55136_page_21.png
pdf/small_dataset_png/51660_page_4.png
pdf/small_dataset_png/19690_page_5.png
pdf/small_dataset_png/224557_page_1.png
pdf/small_dataset_png/25381_page_68.png
pdf/small_dataset_png/02028_page_128.png
pdf/small_dataset_png/228493_page_147.png
pdf/small_dataset_png/00066_page_36.png
pdf/small_dataset_png/252216_page_1.png
pdf/small_dataset_png/41270_page_9.png
pdf/small_dataset_png/211194_page_1.png
pdf/small_dataset_png/117285_page_31.png
pdf/small_dataset_png/65261_page_1.png
pdf/small_dataset_png/38356_page_4.png
pdf/small_dataset_png/51782_page_11.png
pdf/small_dataset_png/228493_page_28.png
pdf/small_dataset_png/66515_page_4.png
pdf/small_dataset_png/44698_page_94.png
pdf/small_dataset_png/111610_page_39.png
pdf/small_dataset_png/00853_page_30.png
pdf/small_dataset_png/107236_page_1.png
pdf/small_dataset_png/16319_page_5.png
pdf/small_dataset_png/297229_page_106.png

pdf/small_dataset_png/26994_page_5.png
pdf/small_dataset_png/111610_page_38.png
pdf/small_dataset_png/25248_page_53.png
pdf/small_dataset_png/263458_page_18.png
pdf/small_dataset_png/111610_page_34.png
pdf/small_dataset_png/28046_page_1.png
pdf/small_dataset_png/465707_page_5.png
pdf/small_dataset_png/27331_page_119.png
pdf/small_dataset_png/520654_page_7.png
pdf/small_dataset_png/297229_page_222.png
pdf/small_dataset_png/45557_page_61.png
pdf/small_dataset_png/19240_page_77.png
pdf/small_dataset_png/241697_page_6.png
pdf/small_dataset_png/40843_page_38.png
pdf/small_dataset_png/60462_page_34.png
pdf/small_dataset_png/50199_page_1.png
pdf/small_dataset_png/297229_page_60.png
pdf/small_dataset_png/06028_page_15.png
pdf/small_dataset_png/426281_page_8.png
pdf/small_dataset_png/228493_page_146.png
pdf/small_dataset_png/114536_page_25.png
pdf/small_dataset_png/427261_page_1.png
pdf/small_dataset_png/313634_page_1.png
pdf/small_dataset_png/16015_page_3.png
pdf/small_dataset_png/263458_page_

pdf/small_dataset_png/439681_page_2.png
pdf/small_dataset_png/26655_page_22.png
pdf/small_dataset_png/26655_page_32.png
pdf/small_dataset_png/178105_page_31.png
pdf/small_dataset_png/06357_page_45.png
pdf/small_dataset_png/01927_page_4.png
pdf/small_dataset_png/03219_page_2.png
pdf/small_dataset_png/03892_page_35.png
pdf/small_dataset_png/30415_page_130.png
pdf/small_dataset_png/347782_page_2.png
pdf/small_dataset_png/00853_page_9.png
pdf/small_dataset_png/219570_page_2.png
pdf/small_dataset_png/347782_page_50.png
pdf/small_dataset_png/09691_page_11.png
pdf/small_dataset_png/60462_page_41.png
pdf/small_dataset_png/06357_page_31.png
pdf/small_dataset_png/28039_page_2.png
pdf/small_dataset_png/263314_page_3.png
pdf/small_dataset_png/393509_page_18.png
pdf/small_dataset_png/377773_page_65.png
pdf/small_dataset_png/44698_page_60.png
pdf/small_dataset_png/498786_page_4.png
pdf/small_dataset_png/45557_page_2.png
pdf/small_dataset_png/27331_page_22.png
pdf/small_dataset_png/26905_page_1.png
p

pdf/small_dataset_png/279976_page_28.png
pdf/small_dataset_png/445859_page_79.png
pdf/small_dataset_png/269106_page_1.png
pdf/small_dataset_png/27331_page_39.png
pdf/small_dataset_png/57686_page_32.png
pdf/small_dataset_png/106406_page_185.png
pdf/small_dataset_png/445835_page_18.png
pdf/small_dataset_png/177333_page_17.png
pdf/small_dataset_png/304048_page_44.png
pdf/small_dataset_png/30550_page_14.png
pdf/small_dataset_png/02028_page_79.png
pdf/small_dataset_png/53432_page_3.png
pdf/small_dataset_png/44698_page_25.png
pdf/small_dataset_png/454784_page_3.png
pdf/small_dataset_png/10131_page_12.png
pdf/small_dataset_png/45557_page_81.png
pdf/small_dataset_png/288070_page_5.png
pdf/small_dataset_png/01659_page_15.png
pdf/small_dataset_png/19690_page_19.png
pdf/small_dataset_png/02028_page_99.png
pdf/small_dataset_png/240042_page_35.png
pdf/small_dataset_png/218474_page_1.png
pdf/small_dataset_png/111017_page_4.png
pdf/small_dataset_png/240042_page_13.png
pdf/small_dataset_png/51782_page

pdf/small_dataset_png/76336_page_26.png
pdf/small_dataset_png/45557_page_48.png
pdf/small_dataset_png/52190_page_2.png
pdf/small_dataset_png/347782_page_49.png
pdf/small_dataset_png/261900_page_36.png
pdf/small_dataset_png/01507_page_54.png
pdf/small_dataset_png/377773_page_1.png
pdf/small_dataset_png/240042_page_38.png
pdf/small_dataset_png/445712_page_28.png
pdf/small_dataset_png/27331_page_43.png
pdf/small_dataset_png/01507_page_19.png
pdf/small_dataset_png/213856_page_18.png
pdf/small_dataset_png/00230_page_1.png
pdf/small_dataset_png/261900_page_16.png
pdf/small_dataset_png/33056_page_48.png
pdf/small_dataset_png/428307_page_1.png
pdf/small_dataset_png/241697_page_7.png
pdf/small_dataset_png/298137_page_65.png
pdf/small_dataset_png/503175_page_1.png
pdf/small_dataset_png/297229_page_162.png
pdf/small_dataset_png/19386_page_85.png
pdf/small_dataset_png/427261_page_4.png
pdf/small_dataset_png/228493_page_167.png
pdf/small_dataset_png/30550_page_8.png
pdf/small_dataset_png/269758_pag

pdf/small_dataset_png/240042_page_37.png
pdf/small_dataset_png/44700_page_91.png
pdf/small_dataset_png/43372_page_34.png
pdf/small_dataset_png/500618_page_4.png
pdf/small_dataset_png/178105_page_1.png
pdf/small_dataset_png/27331_page_57.png
pdf/small_dataset_png/06357_page_42.png
pdf/small_dataset_png/19386_page_47.png
pdf/small_dataset_png/22671_page_2.png
pdf/small_dataset_png/301327_page_2.png
pdf/small_dataset_png/09691_page_65.png
pdf/small_dataset_png/297229_page_206.png
pdf/small_dataset_png/70729_page_24.png
pdf/small_dataset_png/72610_page_7.png
pdf/small_dataset_png/106406_page_122.png
pdf/small_dataset_png/38410_page_40.png
pdf/small_dataset_png/106406_page_183.png
pdf/small_dataset_png/09691_page_3.png
pdf/small_dataset_png/09431_page_4.png
pdf/small_dataset_png/01377_page_14.png
pdf/small_dataset_png/26655_page_48.png
pdf/small_dataset_png/38410_page_37.png
pdf/small_dataset_png/412886_page_1.png
pdf/small_dataset_png/469992_page_4.png
pdf/small_dataset_png/19240_page_40.p

pdf/small_dataset_png/12980_page_37.png
pdf/small_dataset_png/26655_page_6.png
pdf/small_dataset_png/395118_page_15.png
pdf/small_dataset_png/28896_page_16.png
pdf/small_dataset_png/44698_page_88.png
pdf/small_dataset_png/19386_page_64.png
pdf/small_dataset_png/19386_page_77.png
pdf/small_dataset_png/228493_page_108.png
pdf/small_dataset_png/25381_page_11.png
pdf/small_dataset_png/228493_page_177.png
pdf/small_dataset_png/134695_page_1.png
pdf/small_dataset_png/279976_page_7.png
pdf/small_dataset_png/106406_page_22.png
pdf/small_dataset_png/297229_page_132.png
pdf/small_dataset_png/44698_page_91.png
pdf/small_dataset_png/112431_page_23.png
pdf/small_dataset_png/30550_page_22.png
pdf/small_dataset_png/297229_page_85.png
pdf/small_dataset_png/44698_page_49.png
pdf/small_dataset_png/455910_page_32.png
pdf/small_dataset_png/512624_page_3.png
pdf/small_dataset_png/30521_page_102.png
pdf/small_dataset_png/03205_page_4.png
pdf/small_dataset_png/25425_page_29.png
pdf/small_dataset_png/246957_p

pdf/small_dataset_png/35620_page_8.png
pdf/small_dataset_png/324076_page_4.png
pdf/small_dataset_png/240042_page_10.png
pdf/small_dataset_png/27331_page_66.png
pdf/small_dataset_png/25156_page_2.png
pdf/small_dataset_png/20805_page_6.png
pdf/small_dataset_png/01507_page_22.png
pdf/small_dataset_png/20997_page_69.png
pdf/small_dataset_png/298137_page_2.png
pdf/small_dataset_png/445712_page_90.png
pdf/small_dataset_png/111610_page_159.png
pdf/small_dataset_png/241875_page_2.png
pdf/small_dataset_png/44700_page_41.png
pdf/small_dataset_png/495995_page_10.png
pdf/small_dataset_png/65218_page_48.png
pdf/small_dataset_png/54069_page_42.png
pdf/small_dataset_png/55136_page_23.png
pdf/small_dataset_png/253953_page_2.png
pdf/small_dataset_png/35565_page_2.png
pdf/small_dataset_png/89141_page_142.png
pdf/small_dataset_png/04565_page_2.png
pdf/small_dataset_png/111610_page_218.png
pdf/small_dataset_png/60196_page_1.png
pdf/small_dataset_png/106406_page_221.png
pdf/small_dataset_png/01246_page_11.

pdf/small_dataset_png/43372_page_22.png
pdf/small_dataset_png/304048_page_43.png
pdf/small_dataset_png/00853_page_35.png
pdf/small_dataset_png/111610_page_83.png
pdf/small_dataset_png/274794_page_4.png
pdf/small_dataset_png/166577_page_1.png
pdf/small_dataset_png/29067_page_5.png
pdf/small_dataset_png/444134_page_2.png
pdf/small_dataset_png/65144_page_37.png
pdf/small_dataset_png/347782_page_48.png
pdf/small_dataset_png/112719_page_7.png
pdf/small_dataset_png/30521_page_18.png
pdf/small_dataset_png/440301_page_12.png
pdf/small_dataset_png/397785_page_7.png
pdf/small_dataset_png/497326_page_10.png
pdf/small_dataset_png/114536_page_23.png
pdf/small_dataset_png/28896_page_2.png
pdf/small_dataset_png/44700_page_155.png
pdf/small_dataset_png/65144_page_2.png
pdf/small_dataset_png/228493_page_113.png
pdf/small_dataset_png/106406_page_3.png
pdf/small_dataset_png/107236_page_96.png
pdf/small_dataset_png/330059_page_18.png
pdf/small_dataset_png/51179_page_1.png
pdf/small_dataset_png/111610_page

pdf/small_dataset_png/380499_page_21.png
pdf/small_dataset_png/192497_page_3.png
pdf/small_dataset_png/44698_page_39.png
pdf/small_dataset_png/297229_page_269.png
pdf/small_dataset_png/00706_page_2.png
pdf/small_dataset_png/43255_page_7.png
pdf/small_dataset_png/26905_page_4.png
pdf/small_dataset_png/03354_page_16.png
pdf/small_dataset_png/54069_page_16.png
pdf/small_dataset_png/02194_page_5.png
pdf/small_dataset_png/41059_page_6.png
pdf/small_dataset_png/30550_page_45.png
pdf/small_dataset_png/27331_page_20.png
pdf/small_dataset_png/42632_page_5.png
pdf/small_dataset_png/43372_page_78.png
pdf/small_dataset_png/09431_page_7.png
pdf/small_dataset_png/02028_page_126.png
pdf/small_dataset_png/26535_page_1.png
pdf/small_dataset_png/25156_page_28.png
pdf/small_dataset_png/19386_page_10.png
pdf/small_dataset_png/435137_page_10.png
pdf/small_dataset_png/53432_page_1.png
pdf/small_dataset_png/08736_page_9.png
pdf/small_dataset_png/48684_page_2.png
pdf/small_dataset_png/26655_page_104.png
pdf/s

KeyboardInterrupt: 

# Medium Dataset

Note: I couldn't get out all of the handwritten scores it seems. Other things to consider might be removing anything where publisher date says it's before 1800 or smth, things like that.

In [None]:
current_dir = '/data1/dbashir/Project/Summer2018/'
topFolder = 'pdf'
resultsFolder = 'results_top50'
intermedFolder = 'results_intermediate'
pdfMedFolder = 'medium_dataset_pdf'
pngMedFolder = 'medium_dataset_png'

dataSetSize = 7000

resultsPath = os.path.join(topFolder,resultsFolder)
intermedPath = os.path.join(topFolder, intermedFolder)
intermedFromTop = os.path.join(current_dir, intermedPath)

pdfDir = os.path.join(current_dir, topFolder)
medPDFDataset = os.path.join(topFolder, pdfMedFolder)
medPNGDataset = os.path.join(pdfDir, pngMedFolder)
pngFromTop = os.path.join(topFolder, pngMedFolder)
pdfFromTop = os.path.join(pdfDir, pdfMedFolder)
rotPngFromTop = os.path.join(topFolder, 'medium_dataset_png_rot')

fileList = []

print(pdfDir)

In [None]:
g = glob.glob(os.path.join(intermedPath,'*.pdf'))
shuf = np.random.permutation(g)
for i in range(dataSetSize):
    name = shuf[i]
    name = name[25:]
    #os.rename(shuf[i], os.path.join(extra_dataset, name)) # also add so we can look at just these 5
    os.rename(shuf[i], os.path.join(medPDFDataset, name))

In [None]:
def pdf_splitter_med(path):
    fname = os.path.splitext(os.path.basename(path))[0]
    
    #print(path)
    path = os.path.join(medPDFDataset, path)
    #print(path)
 
    pdf = PdfFileReader(open(path,"rb"))
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
 
        output_filename = '{}_page_{}.pdf'.format(
            fname, page+1)
    
        output_filename = os.path.join(medPDFDataset, output_filename)
 
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
 
        print('Created: {}'.format(output_filename))

In [None]:
for pdf_file in os.listdir(medPDFDataset):
    try:
        pdf_splitter_med(pdf_file)
    except:
        continue

In [None]:
for pdf_file in os.listdir(medPDFDataset):
    if 'page' not in pdf_file:
        os.remove(os.path.join(medPDFDataset, pdf_file))

In [None]:
pdf_files = glob.glob(medPDFDataset + '/*.pdf')
for pdf_file in pdf_files:
    basename = os.path.splitext(os.path.basename(pdf_file))[0]
    #print(basename)
    pngoutName = basename + '.png'
    #print(pngoutName)
    pngout = os.path.join(pngFromTop, pngoutName)
    print(pngout)  
    if pngoutName not in os.listdir(pngFromTop):
        subprocess.call(['convert', '-density', '300', pdf_file, pngout]) # 72 dpi is the default value
    else:
        print("already converted")
        continue

# Large Dataset

TODO: we need to make sure we correctly crawl the entire database here (not just top 50 composers) since we want this dataset to have a size of 70000. We could also do something like what we did before w the results_intermediate:
- crawl the full dataset
- for i in range 70000: move things to results_intermediate which will actually be final
- convert everything to pages
- convert all to PNG

This seems like it'll take forever though, given what we've seen so far.

In [None]:
current_dir = '/data1/dbashir/Project/Summer2018'
topFolder = 'pdf'
toResults = '/data1/dbashir/Project/'
resultsFolder = 'score_scrape/results/'
intermedFolder = 'results_intermediate_large'
pdfLargeFolder = 'large_dataset_pdf'
pngLargeFolder = 'large_dataset_png'

dataSetSize = 70000

resultsPath = os.path.join(toResults,resultsFolder)
intermedPath = os.path.join(topFolder, intermedFolder)
intermedFromTop = os.path.join(current_dir, intermedPath)

pdfDir = os.path.join(current_dir, topFolder)
largePDFDataset = os.path.join(topFolder, pdfLargeFolder)
largePNGDataset = os.path.join(pdfDir, pngLargeFolder)
pngFromTop = os.path.join(topFolder, pngLargeFolder)
pdfFromTop = os.path.join(pdfDir, pdfLargeFolder)
rotPngFromTop = os.path.join(topFolder, 'large_dataset_png_rot')

fileList = []

print(pdfDir)

In [None]:
def make_bad_pdfs(html_path):
    bad_names = []
    with open(html_path) as f:
        soup = bsoup(f, "html.parser")

    for div in soup.find_all('div', class_ = 'we'):
        # we will contain both an ID and the publisher info for a piece so we can make sure it's not handwritten
        trs = div.find_all('tr') # also get no publisher
        for tr in trs:
            tr = str(tr)
            try:
                if 'Pub' not in tr:
                    value_on = tr[tr.find('indexes='):]
                    bad_name = value_on[8:14]
                    if bad_name[-1] == '/':
                        # if it's a slash, there's something else after it
                        bad_name_two = value_on[14:20]
                        # check if last thing is actually a digit, in case of 5-digit ID
                        if not bad_name_two[-1].isdigit():
                            bad_name_two = bad_name_two[:-1]
                        bad_pdf_two = bad_name_two + '.pdf'
                        bad_names.append(bad_pdf_two)
                    if not bad_name[-1].isdigit():
                        # make could be a 5-digit ID
                        bad_name = bad_name[:-1]
                    bad_pdf = bad_name + '.pdf' # add to .pdf so we have the actual filename we want to avoid/delete if it's in our thing
                    bad_names.append(bad_pdf) # append this to a list of bad name
            except:
                continue
        div = str(div)
        try:
            if 'Manuscript' in div or 'manuscript' in div:
                value_on = div[div.find('indexes='):] # this is where the ID should show up on a page
                bad_name = value_on[8:14] # just get the id, assuming 6 digits
                if bad_name[-1] == '/':
                    # if it's a slash, there's something else after it
                    bad_name_two = value_on[14:20]
                    # check if last thing is actually a digit, in case of 5-digit ID
                    if not bad_name_two[-1].isdigit():
                        bad_name_two = bad_name_two[:-1]
                    bad_pdf_two = bad_name_two + '.pdf'
                    bad_names.append(bad_pdf_two)
                if not bad_name[-1].isdigit():
                    # make could be a 5-digit ID
                    bad_name = bad_name[:-1]
                bad_pdf = bad_name + '.pdf' # add to .pdf so we have the actual filename we want to avoid/delete if it's in our thing
                bad_names.append(bad_pdf) # append this to a list of bad names
        except:
            continue
            
    return bad_names

In [None]:
def make_score_pdfs(html_path):
    score_names = []
    divList = []
    with open(html_path) as f:
        soup = bsoup(f, "html.parser")
    for div in soup.find_all('div', class_ = 'we'):
        # we will contain both an ID and the publisher info for a piece so we can make sure it's not handwritten
        div = str(div)
        if 'Score' in div or 'score' in div:
            divList.append(div)
    for div in divList:
        try:
            value_on = div[div.find('indexes='):]
            score_name = value_on[8:14]
            if score_name[-1] == '/':
                    # if it's a slash, there's something else after it
                    score_name_two = value_on[14:20]
                    # check if last thing is actually a digit, in case of 5-digit ID
                    if not score_name_two[-1].isdigit():
                        score_name_two = score_name_two[:-1]
                    score_pdf_two = score_name_two + '.pdf'
                    score_names.append(score_pdf_two)
            if not score_name[-1].isdigit():
                # make could be a 5-digit ID
                score_name = score_name[:-1]
            score_pdf = score_name + '.pdf' # add to .pdf so we have the actual filename we want to avoid/delete if it's in our thing
            score_names.append(score_pdf) # append this to a list of bad names
        except:
            continue
        
    return score_names

In [None]:
#using os.walk, move all the .pdf files to an intermediate folder for EZ access
all_bad_names = []
all_score_names = []
for subdir, dirs, files in os.walk(resultsPath):
    for pdf_file in files:
        if 'pdf' in pdf_file:
            #move it to intermediate file for shuffling
            html_path = os.path.join(subdir, 'html.txt'j)
            bad_names = make_bad_pdfs(html_path)
            score_names = make_score_pdfs(html_path)
            for bad_name in bad_names:
                all_bad_names.append(bad_name)
            for score_name in score_names:
                all_score_names.append(score_name)
            if pdf_file in bad_names or pdf_file in score_names:
                print("skipping file: " + str(pdf_file))
                continue # skip if it's a handwritten or orchestral score
            if pdf_file not in all_bad_names or pdf_file not in all_score_names:
                pdf_file_path = os.path.join(subdir, pdf_file)
                final_path = os.path.join(intermedPath, pdf_file)
                shutil.copy(pdf_file_path, final_path)