In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import xlrd
import pandas as pd

In [2]:
def convert_pdf_to_txt(path):
    '''
    converts a pdf to text using pdfminer

    path: location of the pdf file on disk.

    Returns everything the TextConverter wrote into the in-memory buffer,
    i.e. the extracted text of the pages that were processed.

    The file handle, the converter device and the buffer are released even
    when a page fails to parse; any exception raised by pdfminer or by
    opening the file still propagates to the caller.
    '''
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() instead of the Python-2-only file() builtin; the with-block
        # closes the handle on every exit path.
        with open(path, 'rb') as fp:
            # Same arguments as before: empty page-number set, maxpages=0,
            # empty password, caching on.
            # NOTE(review): an empty set / 0 presumably mean "no restriction"
            # in pdfminer - confirm against the pdfminer documentation.
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the device and buffer are closed below.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [3]:
def get_soc_codes():
    '''
    returns a sorted list of soc_codes from ons data file

    Reads the sheet at index 1 of the ONS index workbook and returns the
    distinct values of its SOC2000 column in ascending order.
    '''
    # The sheet index is passed positionally: the keyword for this argument
    # was renamed from `sheetname` to `sheet_name` across pandas releases,
    # while its position (second argument) is the same under both names.
    index_sheet = pd.read_excel('../data/soc2000index_tcm77-179123.xls', 1)
    return sorted(index_sheet.SOC2000.unique().tolist())

In [4]:
def clean_text(text):
    """
    strip known boilerplate fragments out of the extracted text

    Every occurrence of each fragment below is deleted; the fragments are
    applied one after another in the order listed.
    """
    unwanted_fragments = (
        # repeated title line of the source document
        'Standard Occupational Classification 2000 Volume 1',
        # UTF-8 byte sequence of a bullet character (U+2022)
        '\xe2\x80\xa2',
        # literal "(cid:149)" marker found in the extracted text
        '(cid:149)',
    )
    for fragment in unwanted_fragments:
        text = text.replace(fragment, '')
    return text

In [5]:
def clean_desc(string):
    """
    clean soc descriptor strings

    Finds the leftmost span that starts with a whitespace character followed
    by two or three digits and then runs (greedily, across newlines) up to
    the last '9' in the string, and returns the string with that span cut
    out. A string without such a span is returned unchanged.
    """
    match = re.search(r'\s[0-9]{2,3}.*9', string, re.DOTALL)
    if match is None:
        return string
    # Remove the span by position. Feeding the matched text back into re.sub
    # as a pattern is wrong: any regex metacharacter in it ('(', '+', '.', ...)
    # either raises or silently fails to match the literal text.
    return string[:match.start()] + string[match.end():]

In [6]:
def make_soc_desc_lookup(text, soc_codes):
    """
    make a dictionary from the text {soc_code:description of the job}

    text: the full extracted document text.
    soc_codes: codes in the order their sections are expected to appear;
        each is expected to be a string holding a Python literal
        (e.g. '5315').

    For every usable code the description is the slice of text from the last
    occurrence of that code up to the last occurrence of the next usable
    code; the final usable code runs to the end of the text. If the next
    code's last occurrence lies before the current one's, the slice is empty
    (unchanged from the previous behaviour).

    A code is skipped (no dictionary entry) when it cannot be parsed as a
    literal or when it does not occur in the text at all.
    """
    import ast  # function-scope import: only this function needs it

    # First pass: keep (code, start offset) for each code that parses and
    # actually occurs in the text.
    located = []
    for code in soc_codes:
        try:
            # literal_eval normalises the code like the earlier eval() did,
            # but cannot execute arbitrary expressions read from the data file.
            needle = str(ast.literal_eval(code))
        except (SyntaxError, ValueError):
            continue
        start = text.rfind(needle)
        if start != -1:  # -1 would otherwise be misread as a slice offset
            located.append((code, start))

    # Second pass: each description ends where the next located code begins;
    # the last one has no successor and runs to the end of the text.
    soc_desc = dict()
    for idx, (code, start) in enumerate(located):
        if idx + 1 < len(located):
            end = located[idx + 1][1]
        else:
            end = len(text)
        soc_desc[code] = text[start:end]
    return soc_desc

In [7]:
def soc_descriptor(soc_code):
    """
    print and return the description stored for a soc code

    Looks soc_code up in the module-level soc_desc dictionary. When nothing
    (or an empty description) is stored, the available keys are printed
    instead and the falsy lookup result is returned as is.
    """
    desc = soc_desc.get(soc_code)
    if not desc:
        print ('valid soc_codes are {0}'.format(soc_desc.keys()))
        return desc
    print(desc)
    return desc

In [8]:
def main():
    """
    run the whole pipeline: load the sorted soc codes, extract the text of
    the description pdf, clean it and build the {soc_code: description} dict

    Returns the tuple (soc codes, cleaned text, lookup dict).
    """
    codes = get_soc_codes()

    # The source pdf has a two-column layout; this is the third extraction
    # approach that was tried. The input used here is a re-paginated copy
    # with one column per page, produced following
    # https://stackoverflow.com/questions/29319533/convert-pdf-with-columns-to-text
    #
    #  1. gs     -o left-sections.pdf     -sDEVICE=pdfwrite     -g3000x8500     -c "<</PageOffset [0 0]>> setpagedevice"     -f soc2000vol1v5_tcm77-179121.pdf
    #  2. gs     -o right-sections.pdf     -sDEVICE=pdfwrite     -g3000x8500     -c "<</PageOffset [-300 0]>> setpagedevice"     -f soc2000vol1v5_tcm77-179121.pdf
    #  3. pdftk   A=right-sections.pdf   B=left-sections.pdf shuffle B A   output single-pages-output.pdf
    #
    # Earlier attempts, kept for reference:
    #   text = convert_pdf_to_txt("../data/soc2000/soc2000vol1v5_tcm77-179121.pdf")
    #   with open('../data/soc2000/soc2000-layout.txt', 'r') as myfile:
    #       text = myfile.read() # using text file prepared with pdftotext -layout
    raw_text = convert_pdf_to_txt("../data/soc2000/single-pages-output.pdf")

    cleaned = clean_text(raw_text)
    lookup = make_soc_desc_lookup(cleaned, codes)

    # Per-entry tidy-up with clean_desc is switched off (noted as "currently broken"):
    #   for i, item in enumerate(soc_desc):
    #       soc_desc[item] = clean_desc(soc_desc[item])

    return codes, cleaned, lookup

In [9]:
# Run the pipeline once. The results are bound at module level on purpose:
# soc_descriptor() and the cells below read soc_desc as a global.
soc_codes, text, soc_desc = main()

In [10]:
# uncomment and add tests to check all
# standard format is 4 digit code, job title, typical entry routes and 
# qualifications, tasks, related job titles
# can use headings to make more structured
#
#for soc_code in soc_codes:
#    soc_descriptor(soc_code)

In [11]:
# Load the whole index sheet (same workbook and sheet that get_soc_codes()
# reads) so the descriptions can be attached to it.
# The sheet index is passed positionally: the keyword for this argument was
# renamed from `sheetname` to `sheet_name` across pandas releases, while its
# position (second argument) is the same under both names.
df = pd.read_excel('../data/soc2000index_tcm77-179123.xls', 1)


In [12]:
# Attach the extracted description to every index row via its SOC2000 code;
# dict.get yields None for codes that have no entry in soc_desc.
df['desc'] = df['SOC2000'].map(soc_desc.get)

In [13]:
# Keep only these four columns, in this order. Note that this rebinds df,
# so the other columns of the loaded sheet are no longer reachable through it.
df = df[['SOC','SOC2000','INDEXOCC','desc']]

In [14]:
# Open question (the original note was only "?" plus this link) - presumably
# an idea to try fuzzy matching of job titles:
# http://blog.yhat.com/posts/fuzzy-matching-with-yhat.html

# Show the rows whose INDEXOCC value contains the substring 'Carpenter'.
df[df.INDEXOCC.str.contains('Carpenter')]

Unnamed: 0,SOC,SOC2000,INDEXOCC,desc
3424,570,5315,Carpenter,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...
3425,570,5315,Carpenter and joiner,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...
3426,599,5315,Carpenter-diver,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...


In [15]:
# Spot-check one entry: print the full description text stored under '5315'.
print(soc_desc['5315'])

5315 CARPENTERS AND JOINERS

Carpenters and joiners construct, erect, install and
repair wooden structures and fittings used in internal
and  external  frameworks  and  cut,  shape,  fit  and
assemble wood to make templates, jigs, scale models
and scenic equipment for theatres.

TYPICAL ENTRY ROUTES AND
ASSOCIATED QUALIFICATIONS

There are no formal academic entry requirements,
though GCSEs/S grades are advantageous.  Entry is
typically  through  a  Modern Apprenticeship  or
National Traineeship approved by the Construction
Industry Training Board leading to an NVQ/SVQ in
General Construction at Level 3.

TASKS









examines  drawings  and  specifications  to
determine job requirements;

selects and measures appropriate wood and cuts,
shapes  and  drills  to  specification  using  saws,
planes, chisels and other power or hand tools;

aligns  and  fixes  prepared  wood  pieces  by
screwing, nailing, gluing and dowelling to form
frames,  shop  fronts,  counter  units,  decking,
thea