# Objectives

- Increase dpi and redo conversion CHECK
- Require a publisher (this will avoid hand-drawn scores) CHECK
- Use the top 50 composers for now in the toy dataset
- Eliminate orchestral scores
- Make sure there aren’t cover pages, etc.
- Make sure the data is in grayscale (standardize)
- Do as much of this as possible in an automated fashion. The publisher requirement will require some iteration and testing.
- ALSO redo the cell phone pictures, taking these constraints into account.


In [1]:
import glob
import os
import pickle
import re
import subprocess
import time

import librosa
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
from bs4 import BeautifulSoup as bsoup
from IPython import display
from scipy import ndimage
from scipy import signal
from scipy.misc import imsave
from scipy.signal import convolve2d
from skimage.transform import rotate, resize

## Convert with 300 dpi

In [None]:
# All score data lives under one root; the PDF inputs and PNG outputs are
# sibling folders inside it.
root_dir = 'score_data'
pdf_dir = '{}/prepped_pdf'.format(root_dir)  # directory containing pdfs of scores
png_dir = '{}/prepped_png'.format(root_dir)  # root output directory for png files

In [None]:
# Convert every PDF in pdf_dir to PNG at 300 dpi with the `convert` command.
# The output folder is created on first use so `convert` has somewhere to write.
if not os.path.exists(png_dir):
    os.makedirs(png_dir)
# `glob` is imported as a module, so the function is glob.glob
pdf_files = glob.glob(pdf_dir + '/*.pdf')
for pdf_file in pdf_files:
    basename = os.path.splitext(os.path.basename(pdf_file))[0]
    pngout = png_dir + '/' + basename + '.png'
    status = subprocess.call(['convert', '-density', '300', pdf_file, pngout]) # 72 dpi is the default value
    if status != 0:
        # keep going, but make failed conversions visible
        print('convert failed ({}) for {}'.format(status, pdf_file))

## Rotate

In [None]:
def importImage(pngfile):
    """Load an image file as a grayscale array with inverted intensity.

    The file is read with ``flatten=True`` and then rescaled so that an
    input value of 255 becomes 0 and an input value of 0 becomes 1
    (white -> 0, black -> 1).
    """
    gray = ndimage.imread(pngfile, flatten=True)
    return 1 - gray/255.0

In [None]:
def showImage(X, sz = (12,12)):
    """Display ``X`` in a new figure of size ``sz``.

    ``1 - X`` is what gets drawn, on a gray colormap fixed to the range 0..1,
    so an array where ink is 1 is shown dark on a light background.
    """
    plt.figure(figsize=sz)
    inverted = 1 - X
    plt.imshow(inverted, cmap='gray', vmin=0, vmax=1)

In [None]:
def rotateToHorizontal(img, lb=-2, ub=2, incr=.01, topN=40):
    """Search for the rotation angle that makes the image's rows most peaked.

    Every angle in ``np.arange(lb, ub, incr)`` is tried. For each one the
    image is rotated, its rows are summed, and the score is the total of the
    ``topN`` largest row sums. The first angle with the strictly highest
    score wins.

    Returns a tuple ``(rotated_image, best_angle)``. If no candidate beats
    the initial score, the angle is 0.

    NOTE(review): presumably ``img`` has ink as high values (as produced by
    importImage), so horizontal staff lines give large row sums -- confirm.
    """
    def lineScore(angle):
        # total of the topN heaviest rows after rotating by `angle`
        rowTotals = np.sum(rotate(img, angle), axis=1)
        heaviestFirst = list(reversed(sorted(rowTotals)))
        return np.sum(heaviestFirst[:topN])

    bestTheta, bestscore = 0, -np.inf
    for candidate in np.arange(lb, ub, incr):
        score = lineScore(candidate)
        if score > bestscore:
            bestTheta, bestscore = candidate, score
    return rotate(img, bestTheta), bestTheta

In [None]:
# Quick visual check on a single page, searching only the range -1..1.
I = importImage('score_data/prepped_png/score17.png')
Irot, theta = rotateToHorizontal(I, lb=-1, ub=1)
showImage(Irot)

In [None]:
def rotateAllImages(indir, outdir):
    """Straighten every PNG in ``indir`` and save it under ``outdir``.

    Each file is loaded with importImage, rotated with rotateToHorizontal
    (default search range), converted back to 8-bit with the intensity
    inverted again (``255 - uint8(value * 255)``), and written to ``outdir``
    under its original file name.
    """
    for png_file in glob.glob(indir + '/*.png'):
        straightened, _ = rotateToHorizontal(importImage(png_file))
        # undo the inversion applied on import before writing to disk
        pixels = 255 - np.uint8(straightened*255)
        imsave(outdir + '/' + os.path.basename(png_file), pixels)

In [None]:
# Output folder for the rotated png files.
pngrot_dir = '{}/toAnnotate'.format(root_dir)

In [None]:
# Create the output folder only when it is missing, so this cell can be
# rerun without failing on an existing directory.
if not os.path.exists(pngrot_dir):
    os.makedirs(pngrot_dir)
rotateAllImages(png_dir, pngrot_dir)

Here's an example of a score without a publisher: https://imslp.org/wiki/Acis_and_Galatea%2C_HWV_49_(Handel%2C_George_Frideric)
Go to the parts section of the page; it is the first entry, with ID 106106.

## Get bad (no publisher) names

We assume the page's HTML has already been saved to a text file so that it can be parsed with BeautifulSoup. `soup` is the name of the object containing the parsed HTML.

In [None]:
# open html text file as a beautifulsoup object
# (BeautifulSoup is imported under the alias `bsoup`)
with open('html.txt') as f:
    soup = bsoup(f, "html.parser")

In [None]:
# Collect the PDF filenames of manuscript (hand-written) entries.
bad_names = []  # filenames we want to avoid/delete
# The ID(s) follow 'indexes='; several IDs may be chained with '/'.
# Matching whole digit runs handles IDs of any length.
index_pattern = re.compile(r'indexes=([0-9]+(?:/[0-9]+)*)')
for div in soup.find_all('div', class_ = 'we'):
    # 'we' divs contain both an ID and the publisher info for a piece so we can make sure it's not handwritten
    div = str(div)
    if 'Manuscript' not in div and 'manuscript' not in div:
        continue
    found = index_pattern.search(div)
    if found is None:
        continue  # no ID in this div, nothing to record
    # ID + '.pdf' is the actual filename on disk
    bad_names.extend(pdf_id + '.pdf' for pdf_id in found.group(1).split('/'))

## Check if Orchestral Score

Let's see if we can automate checking whether something is an orchestral score or just a part. Let's try it on a Beethoven symphony :D

In [None]:
# Accumulators: the matching divs (as text) and the PDF filenames found in them.
divList, score_names = [], []

In [None]:
# Keep the text of every 'we' div that mentions a score. A 'we' div contains
# both an ID and the publisher info for a piece.
for div in map(str, soup.find_all('div', class_ = 'we')):
    if 'score' in div or 'Score' in div:
        divList.append(div)

In [None]:
# Turn each collected score div into the PDF filename(s) it refers to.
# The ID(s) follow 'indexes='; several IDs may be chained with '/'.
# Matching whole digit runs handles IDs of any length.
index_pattern = re.compile(r'indexes=([0-9]+(?:/[0-9]+)*)')
for div in divList:
    found = index_pattern.search(div)
    if found is None:
        continue  # no ID in this div, nothing to record
    # ID + '.pdf' is the actual filename we want to avoid/delete
    score_names.extend(pdf_id + '.pdf' for pdf_id in found.group(1).split('/'))

# Re-create Dataset

In [2]:
# Folder names used to build every path below.
current_dir = '/data1/dbashir/Project/Summer2018/'
topFolder = 'pdf'
resultsFolder = 'results_top50'
intermedFolder = 'results_intermediate'
pdfMiniFolder = 'mini_dataset_pdf'
pngMiniFolder = 'mini_dataset_png'

# Number of PDFs to draw for the mini dataset.
dataSetSize = 70

fileList = []

# Absolute paths (rooted at current_dir).
pdfDir = os.path.join(current_dir, topFolder)
miniPNGDataset = os.path.join(pdfDir, pngMiniFolder)
pdfFromTop = os.path.join(pdfDir, pdfMiniFolder)

# Relative paths (resolved against the working directory).
resultsPath = os.path.join(topFolder, resultsFolder)
intermedPath = os.path.join(topFolder, intermedFolder)
miniPDFDataset = os.path.join(topFolder, pdfMiniFolder)
pngFromTop = os.path.join(topFolder, pngMiniFolder)
rotPngFromTop = os.path.join(topFolder, 'mini_dataset_png_rot')

# Absolute form of intermedPath.
intermedFromTop = os.path.join(current_dir, intermedPath)

print(pdfDir)

/data1/dbashir/Project/Summer2018/pdf


In [3]:
# Create the intermediate folder if it is missing. intermedPath is a relative
# path, so it is resolved against the current working directory.
if not os.path.exists(intermedPath):
    os.makedirs(intermedPath)

In [4]:
# Create the mini PDF dataset folder if it is missing. miniPDFDataset is a
# relative path, so it is resolved against the current working directory.
if not os.path.exists(miniPDFDataset):
    os.makedirs(miniPDFDataset)

In [5]:
# Create the mini PNG dataset folder if it is missing. Unlike the other
# folders created here, miniPNGDataset is an absolute path built from pdfDir.
if not os.path.exists(miniPNGDataset):
    os.makedirs(miniPNGDataset)

In [6]:
# Create the folder for rotated PNGs if it is missing. rotPngFromTop is a
# relative path, so it is resolved against the current working directory.
if not os.path.exists(rotPngFromTop):
    os.makedirs(rotPngFromTop)

In [3]:
def make_bad_pdfs(html_path):
    """Return the PDF filenames of manuscript entries listed in a saved page.

    The file at ``html_path`` is parsed as HTML. Every ``<div class="we">``
    whose text contains 'Manuscript' or 'manuscript' is searched for the
    first ``indexes=`` followed by digits. Each ID found there (several may
    be chained with '/') is returned with '.pdf' appended.

    IDs are taken as whole digit runs, so any ID length is handled and no
    exception has to be swallowed for divs without an ID.

    Parameters:
        html_path: path of the saved HTML text file.

    Returns:
        list of str: filenames such as '106106.pdf', in page order
        (duplicates are not removed).

    Raises:
        OSError: if ``html_path`` cannot be opened.
    """
    with open(html_path) as f:
        soup = bsoup(f, "html.parser")

    index_pattern = re.compile(r'indexes=([0-9]+(?:/[0-9]+)*)')

    bad_names = []
    for div in soup.find_all('div', class_ = 'we'):
        # 'we' divs contain both an ID and the publisher info for a piece so we can make sure it's not handwritten
        div = str(div)
        if 'Manuscript' not in div and 'manuscript' not in div:
            continue
        found = index_pattern.search(div)
        if found is None:
            continue  # no ID in this div, nothing to record
        # ID + '.pdf' is the actual filename we want to avoid/delete
        bad_names.extend(pdf_id + '.pdf' for pdf_id in found.group(1).split('/'))

    return bad_names

In [4]:
def make_score_pdfs(html_path):
    """Return the PDF filenames of entries that mention a score in a saved page.

    The file at ``html_path`` is parsed as HTML. Every ``<div class="we">``
    whose text contains 'Score' or 'score' is searched for the first
    ``indexes=`` followed by digits. Each ID found there (several may be
    chained with '/') is returned with '.pdf' appended.

    IDs are taken as whole digit runs, so any ID length is handled and no
    exception has to be swallowed for divs without an ID.

    Parameters:
        html_path: path of the saved HTML text file.

    Returns:
        list of str: filenames such as '106106.pdf', in page order
        (duplicates are not removed).

    Raises:
        OSError: if ``html_path`` cannot be opened.
    """
    with open(html_path) as f:
        soup = bsoup(f, "html.parser")

    index_pattern = re.compile(r'indexes=([0-9]+(?:/[0-9]+)*)')

    score_names = []
    for div in soup.find_all('div', class_ = 'we'):
        # 'we' divs contain both an ID and the publisher info for a piece
        div = str(div)
        if 'Score' not in div and 'score' not in div:
            continue
        found = index_pattern.search(div)
        if found is None:
            continue  # no ID in this div, nothing to record
        # ID + '.pdf' is the actual filename we want to avoid/delete
        score_names.extend(pdf_id + '.pdf' for pdf_id in found.group(1).split('/'))

    return score_names

In [5]:
import shutil

In [None]:
#using os.walk, copy all the acceptable .pdf files to an intermediate folder for EZ access
all_bad_names = []
all_score_names = []
rejected = set()  # every filename flagged so far, for fast lookup
for subdir, dirs, files in os.walk(resultsPath):
    pdf_files = [name for name in files if name.endswith('.pdf')]
    if not pdf_files:
        continue
    html_path = os.path.join(subdir, 'html.txt')
    if not os.path.exists(html_path):
        # without the page text the publisher/score filters cannot be applied
        print("no html.txt, skipping directory: " + str(subdir))
        continue
    # parse the page once per directory rather than once per PDF
    bad_names = make_bad_pdfs(html_path)
    score_names = make_score_pdfs(html_path)
    all_bad_names.extend(bad_names)
    all_score_names.extend(score_names)
    rejected.update(bad_names)
    rejected.update(score_names)
    for pdf_file in pdf_files:
        if pdf_file in rejected:
            print("skipping file: " + str(pdf_file))
            continue # skip if it's a handwritten or orchestral score
        pdf_file_path = os.path.join(subdir, pdf_file)
        final_path = os.path.join(intermedPath, pdf_file)
        shutil.copy(pdf_file_path, final_path)

skipping file: 270629.pdf
skipping file: 159741.pdf
skipping file: 42540.pdf
skipping file: 20515.pdf
skipping file: 87055.pdf
skipping file: 270796.pdf
skipping file: 234591.pdf
skipping file: 143296.pdf
skipping file: 272417.pdf
skipping file: 58813.pdf
skipping file: 68459.pdf
skipping file: 270430.pdf
skipping file: 105269.pdf
skipping file: 452822.pdf
skipping file: 42541.pdf
skipping file: 15191.pdf
skipping file: 270632.pdf
skipping file: 385573.pdf
skipping file: 411838.pdf
skipping file: 87056.pdf
skipping file: 240429.pdf
skipping file: 16144.pdf
skipping file: 14555.pdf
skipping file: 504187.pdf
skipping file: 270423.pdf
skipping file: 322115.pdf
skipping file: 385686.pdf
skipping file: 385730.pdf
skipping file: 385691.pdf
skipping file: 337078.pdf
skipping file: 180677.pdf
skipping file: 385711.pdf
skipping file: 383664.pdf
skipping file: 520316.pdf
skipping file: 505600.pdf
skipping file: 291632.pdf
skipping file: 291628.pdf
skipping file: 420951.pdf
skipping file: 102874.

In [34]:
%cd ttemp/Project/Summer2018/

/data1/dbashir/Project/Summer2018


In [36]:
print(miniPDFDataset)

pdf/mini_dataset_pdf


In [42]:
# Move a random sample of the intermediate PDFs into the mini dataset folder.
# Both folders are relative paths, resolved against the working directory.
g = glob.glob(os.path.join(intermedPath,'*.pdf'))
shuf = np.random.permutation(g)
# Slicing caps the sample at the number of files actually available.
for src in shuf[:dataSetSize]:
    # take the file name from the path itself instead of a fixed prefix length
    os.rename(src, os.path.join(miniPDFDataset, os.path.basename(src)))

Before splitting, I'll examine this small dataset manually to see how it looks.

In [58]:
def pdf_splitter(path):
    """Split one PDF inside ``miniPDFDataset`` into single-page PDFs.

    ``path`` is joined onto ``miniPDFDataset`` to locate the source file.
    Page ``k`` (1-based) is written next to it as ``<name>_page_<k>.pdf``,
    where ``<name>`` is the source file name without its extension. One
    'Created: ...' line is printed per page.

    Parameters:
        path: file name (or relative path) of the PDF within miniPDFDataset.
    """
    fname = os.path.splitext(os.path.basename(path))[0]

    path = os.path.join(miniPDFDataset, path)

    # The reader is still used inside the loop, so the source handle stays
    # open until the last page is written and is then closed by `with`.
    with open(path, "rb") as source:
        pdf = PdfFileReader(source)
        for page in range(pdf.getNumPages()):
            pdf_writer = PdfFileWriter()
            pdf_writer.addPage(pdf.getPage(page))

            output_filename = '{}_page_{}.pdf'.format(
                fname, page+1)

            output_filename = os.path.join(miniPDFDataset, output_filename)

            with open(output_filename, 'wb') as out:
                pdf_writer.write(out)

            print('Created: {}'.format(output_filename))

In [None]:
# NOTE(review): after this chdir, relative paths such as miniPDFDataset
# ('pdf/mini_dataset_pdf') are resolved from inside pdfFromTop -- confirm
# this cell is meant to run before the cells that use them.
os.chdir(pdfFromTop)
os.getcwd()

In [51]:
from PyPDF2 import PdfFileWriter, PdfFileReader

In [59]:
# Split every PDF in the mini dataset into single pages.
for pdf_file in os.listdir(miniPDFDataset):
    if '_page_' in pdf_file:
        # already a single-page output of pdf_splitter; don't split it again on a rerun
        continue
    try:
        pdf_splitter(pdf_file)
    except Exception as err:
        # best effort: keep going, but say which file could not be split
        print('Could not split {}: {}'.format(pdf_file, err))

Created: pdf/mini_dataset_pdf/348445_page_1.pdf
Created: pdf/mini_dataset_pdf/348445_page_2.pdf
Created: pdf/mini_dataset_pdf/348445_page_3.pdf
Created: pdf/mini_dataset_pdf/348445_page_4.pdf
Created: pdf/mini_dataset_pdf/348445_page_5.pdf
Created: pdf/mini_dataset_pdf/348445_page_6.pdf
Created: pdf/mini_dataset_pdf/348445_page_7.pdf
Created: pdf/mini_dataset_pdf/348445_page_8.pdf
Created: pdf/mini_dataset_pdf/348445_page_9.pdf
Created: pdf/mini_dataset_pdf/348445_page_10.pdf
Created: pdf/mini_dataset_pdf/348445_page_11.pdf
Created: pdf/mini_dataset_pdf/348445_page_12.pdf
Created: pdf/mini_dataset_pdf/348445_page_13.pdf
Created: pdf/mini_dataset_pdf/348445_page_14.pdf
Created: pdf/mini_dataset_pdf/348445_page_15.pdf
Created: pdf/mini_dataset_pdf/348445_page_16.pdf
Created: pdf/mini_dataset_pdf/348445_page_17.pdf
Created: pdf/mini_dataset_pdf/348445_page_18.pdf
Created: pdf/mini_dataset_pdf/348445_page_19.pdf
Created: pdf/mini_dataset_pdf/348445_page_20.pdf
Created: pdf/mini_dataset_pdf



Created: pdf/mini_dataset_pdf/02054_page_1.pdf
Created: pdf/mini_dataset_pdf/02054_page_2.pdf
Created: pdf/mini_dataset_pdf/02054_page_3.pdf
Created: pdf/mini_dataset_pdf/02054_page_4.pdf
Created: pdf/mini_dataset_pdf/02054_page_5.pdf
Created: pdf/mini_dataset_pdf/02054_page_6.pdf
Created: pdf/mini_dataset_pdf/02054_page_7.pdf
Created: pdf/mini_dataset_pdf/02054_page_8.pdf
Created: pdf/mini_dataset_pdf/02054_page_9.pdf
Created: pdf/mini_dataset_pdf/02054_page_10.pdf
Created: pdf/mini_dataset_pdf/02054_page_11.pdf
Created: pdf/mini_dataset_pdf/02054_page_12.pdf
Created: pdf/mini_dataset_pdf/02054_page_13.pdf
Created: pdf/mini_dataset_pdf/02054_page_14.pdf
Created: pdf/mini_dataset_pdf/02054_page_15.pdf
Created: pdf/mini_dataset_pdf/02054_page_16.pdf
Created: pdf/mini_dataset_pdf/02054_page_17.pdf
Created: pdf/mini_dataset_pdf/02054_page_18.pdf
Created: pdf/mini_dataset_pdf/02054_page_19.pdf
Created: pdf/mini_dataset_pdf/02054_page_20.pdf
Created: pdf/mini_dataset_pdf/02054_page_21.pdf
C

Created: pdf/mini_dataset_pdf/131148_page_38.pdf
Created: pdf/mini_dataset_pdf/131148_page_39.pdf
Created: pdf/mini_dataset_pdf/131148_page_40.pdf
Created: pdf/mini_dataset_pdf/131148_page_41.pdf
Created: pdf/mini_dataset_pdf/131148_page_42.pdf
Created: pdf/mini_dataset_pdf/131148_page_43.pdf
Created: pdf/mini_dataset_pdf/131148_page_44.pdf
Created: pdf/mini_dataset_pdf/131148_page_45.pdf
Created: pdf/mini_dataset_pdf/131148_page_46.pdf
Created: pdf/mini_dataset_pdf/131148_page_47.pdf
Created: pdf/mini_dataset_pdf/131148_page_48.pdf
Created: pdf/mini_dataset_pdf/131148_page_49.pdf
Created: pdf/mini_dataset_pdf/131148_page_50.pdf
Created: pdf/mini_dataset_pdf/131148_page_51.pdf
Created: pdf/mini_dataset_pdf/131148_page_52.pdf
Created: pdf/mini_dataset_pdf/131148_page_53.pdf
Created: pdf/mini_dataset_pdf/131148_page_54.pdf
Created: pdf/mini_dataset_pdf/131148_page_55.pdf
Created: pdf/mini_dataset_pdf/131148_page_56.pdf
Created: pdf/mini_dataset_pdf/131148_page_57.pdf
Created: pdf/mini_da

Created: pdf/mini_dataset_pdf/09949_page_1.pdf
Created: pdf/mini_dataset_pdf/09949_page_2.pdf
Created: pdf/mini_dataset_pdf/09949_page_3.pdf
Created: pdf/mini_dataset_pdf/09949_page_4.pdf
Created: pdf/mini_dataset_pdf/09949_page_5.pdf
Created: pdf/mini_dataset_pdf/09949_page_6.pdf
Created: pdf/mini_dataset_pdf/09949_page_7.pdf
Created: pdf/mini_dataset_pdf/09949_page_8.pdf
Created: pdf/mini_dataset_pdf/09949_page_9.pdf
Created: pdf/mini_dataset_pdf/09949_page_10.pdf
Created: pdf/mini_dataset_pdf/09949_page_11.pdf
Created: pdf/mini_dataset_pdf/09949_page_12.pdf
Created: pdf/mini_dataset_pdf/09949_page_13.pdf
Created: pdf/mini_dataset_pdf/09949_page_14.pdf
Created: pdf/mini_dataset_pdf/09949_page_15.pdf
Created: pdf/mini_dataset_pdf/09949_page_16.pdf
Created: pdf/mini_dataset_pdf/09949_page_17.pdf
Created: pdf/mini_dataset_pdf/09949_page_18.pdf
Created: pdf/mini_dataset_pdf/09949_page_19.pdf
Created: pdf/mini_dataset_pdf/09949_page_20.pdf
Created: pdf/mini_dataset_pdf/09949_page_21.pdf
C

Created: pdf/mini_dataset_pdf/64287_page_38.pdf
Created: pdf/mini_dataset_pdf/64287_page_39.pdf
Created: pdf/mini_dataset_pdf/64287_page_40.pdf
Created: pdf/mini_dataset_pdf/64287_page_41.pdf
Created: pdf/mini_dataset_pdf/64287_page_42.pdf
Created: pdf/mini_dataset_pdf/64287_page_43.pdf
Created: pdf/mini_dataset_pdf/64287_page_44.pdf
Created: pdf/mini_dataset_pdf/64287_page_45.pdf
Created: pdf/mini_dataset_pdf/64287_page_46.pdf
Created: pdf/mini_dataset_pdf/64287_page_47.pdf
Created: pdf/mini_dataset_pdf/64287_page_48.pdf
Created: pdf/mini_dataset_pdf/64287_page_49.pdf
Created: pdf/mini_dataset_pdf/64287_page_50.pdf
Created: pdf/mini_dataset_pdf/64287_page_51.pdf
Created: pdf/mini_dataset_pdf/64287_page_52.pdf
Created: pdf/mini_dataset_pdf/64287_page_53.pdf
Created: pdf/mini_dataset_pdf/64287_page_54.pdf
Created: pdf/mini_dataset_pdf/64287_page_55.pdf
Created: pdf/mini_dataset_pdf/64287_page_56.pdf
Created: pdf/mini_dataset_pdf/64287_page_57.pdf
Created: pdf/mini_dataset_pdf/64287_page

Created: pdf/mini_dataset_pdf/505621_page_6.pdf
Created: pdf/mini_dataset_pdf/505621_page_7.pdf
Created: pdf/mini_dataset_pdf/21377_page_1.pdf
Created: pdf/mini_dataset_pdf/21377_page_2.pdf
Created: pdf/mini_dataset_pdf/21377_page_3.pdf
Created: pdf/mini_dataset_pdf/21377_page_4.pdf
Created: pdf/mini_dataset_pdf/21377_page_5.pdf
Created: pdf/mini_dataset_pdf/21377_page_6.pdf
Created: pdf/mini_dataset_pdf/21377_page_7.pdf
Created: pdf/mini_dataset_pdf/21377_page_8.pdf
Created: pdf/mini_dataset_pdf/21377_page_9.pdf
Created: pdf/mini_dataset_pdf/21377_page_10.pdf
Created: pdf/mini_dataset_pdf/21377_page_11.pdf
Created: pdf/mini_dataset_pdf/21377_page_12.pdf
Created: pdf/mini_dataset_pdf/51731_page_1.pdf
Created: pdf/mini_dataset_pdf/51731_page_2.pdf
Created: pdf/mini_dataset_pdf/51731_page_3.pdf
Created: pdf/mini_dataset_pdf/51731_page_4.pdf
Created: pdf/mini_dataset_pdf/51731_page_5.pdf
Created: pdf/mini_dataset_pdf/51731_page_6.pdf
Created: pdf/mini_dataset_pdf/51731_page_7.pdf
Created:

Now we'll do some conversions.

In [61]:
# Delete every file in the mini dataset whose name does not contain 'page',
# which leaves only the files named like '<name>_page_<k>.pdf'.
for pdf_file in os.listdir(miniPDFDataset):
    if 'page' in pdf_file:
        continue
    os.remove(os.path.join(miniPDFDataset, pdf_file))

In [62]:
#also remove first pages just as a rule
# NOTE(review): only this one hard-coded file is deleted here; the first
# pages of the other scores are not touched by this cell.
os.remove(os.path.join(miniPDFDataset, '66477_page_1.pdf'))

In [None]:
# Convert every single-page PDF of the mini dataset to PNG at 300 dpi.
# `glob` is imported as a module, so the function is glob.glob
pdf_files = glob.glob(miniPDFDataset + '/*.pdf')
for pdf_file in pdf_files:
    basename = os.path.splitext(os.path.basename(pdf_file))[0]
    pngoutName = basename + '.png'
    pngout = os.path.join(pngFromTop, pngoutName)
    print(pngout)
    status = subprocess.call(['convert', '-density', '300', pdf_file, pngout]) # 72 dpi is the default value
    if status != 0:
        # keep going, but make failed conversions visible
        print('convert failed ({}) for {}'.format(status, pdf_file))