In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import xlrd
import pandas as pd

In [2]:
def convert_pdf_to_txt(path):
    '''
    Convert a pdf to text using pdfminer.

    path: filesystem path of the pdf to read.

    Returns the extracted text of every page, concatenated, as produced by
    pdfminer's TextConverter with default LAParams and the utf-8 codec.

    The input file, the converter device and the output buffer are always
    released, even when pdfminer raises part-way through the document.
    '''
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0    # 0 is passed straight to get_pages (no page limit requested here)
        caching = True
        pagenos = set()  # empty set: no explicit page selection

        # open() replaces the Python-2-only file() builtin; the with-block
        # guarantees the handle is closed on every exit path.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)

            # read the buffer before anything is closed, as the original did
            text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text

In [3]:
def get_soc_codes():
    '''
    Read the ONS index workbook and return its distinct SOC2000 values, sorted.

    The u'}}}}' entry is dropped because it is not a soc code (it is only
    used in the workbook to help look-ups). A ValueError is raised by
    list.remove if that entry is absent.
    '''
    # NOTE(review): `sheetname` is the older pandas spelling of this keyword
    # (newer releases call it `sheet_name`) -- confirm against the pandas
    # version this notebook is run with.
    index_df = pd.read_excel('../data/soc2000index_tcm77-179123.xls', sheetname=1)
    distinct_codes = index_df['SOC2000'].unique().tolist()
    ordered_codes = sorted(distinct_codes)
    ordered_codes.remove(u'}}}}')  # placeholder value, not a soc code
    return ordered_codes

In [4]:
def clean_text(text):
    """
    Strip boilerplate and awkward byte sequences from the extracted pdf text.

    The substitutions are applied one after another, in the order listed,
    and the cleaned text is returned.
    """
    substitutions = (
        # running title repeated through the document
        ('Standard Occupational Classification 2000 Volume 1', ''),
        # utf-8 bytes of a bullet character
        ('\xe2\x80\xa2', ''),
        # a line consisting of "cafes" (with utf-8 e-acute) is removed outright
        ('\ncaf\xc3\xa9s', ''),
        # utf-8 e-grave and c-cedilla are flattened to plain ascii
        ('\xc3\xa8', 'e'),
        ('\xc3\xa7', 'c'),
        # literal "(cid:149)" markers left in the text
        ('(cid:149)', ''),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

In [5]:
def clean_desc(string):
    """
    Clean a soc descriptor string.

    Finds the span that starts at the first whitespace character followed by
    two or three digits and runs (greedily, across newlines) to the last '9'
    in the string, and returns the string with that span cut out. Strings
    with no such span are returned unchanged.
    """
    match = re.search(r'\s[0-9]{2,3}.*9', string, re.DOTALL)
    if match:
        # Cut the matched span out by position. The previous code fed the
        # matched *text* back into re.sub() as a pattern, so any regex
        # metacharacter in it (e.g. an unbalanced "(" from a job title, or
        # "+") either raised re.error or silently failed to match.
        string = string[:match.start()] + string[match.end():]
    return string

In [6]:
def make_soc_desc_lookup(text, soc_codes):
    """
    Make a dictionary from the text {soc_code: description of the job}.

    text:      the cleaned text of the document.
    soc_codes: sorted list of soc code strings.

    Each description starts at the LAST occurrence of its code in the text
    (rfind is used to avoid hitting the index earlier in the document) and
    runs up to the start of the next code that could be located, or to the
    end of the text if there is none.

    A code that does not occur in the text maps to ''. Previously the -1
    returned by rfind was used directly as a slice bound, which dropped the
    final character of the preceding description and produced a stray
    one-character "description" for the missing code.
    """
    # NOTE(review): eval() is applied to values that come from the
    # spreadsheet. It is kept so existing behaviour is unchanged, but it
    # will execute arbitrary expressions -- consider a plain str()/int()
    # conversion once the expected format of the codes is confirmed.
    starts = [text.rfind(str(eval(code))) for code in soc_codes]

    # Walk backwards so every entry knows where the next located code begins.
    ends = [len(text)] * len(starts)
    next_start = len(text)
    for i in range(len(starts) - 1, -1, -1):
        ends[i] = next_start
        if starts[i] != -1:
            next_start = starts[i]

    lookup = dict()
    for code, start, end in zip(soc_codes, starts, ends):
        lookup[code] = text[start:end] if start != -1 else ''
    return lookup

In [7]:
def soc_descriptor(soc_code):
    """
    Print and return the description stored for a soc code.

    Reads the module-level `soc_desc` dictionary. When there is no
    (non-empty) description for `soc_code`, the list of valid codes is
    printed instead and the looked-up value (None or '') is returned.
    """
    desc = soc_desc.get(soc_code)
    if not desc:
        # unknown code or empty description: show what can be asked for
        print('valid soc_codes are {0}'.format(soc_desc.keys()))
        return desc
    print(desc)
    return desc

In [8]:
def main():
    """
    Build the three objects the rest of the notebook works with.

    Returns a tuple (soc_codes, text, soc_desc):
      soc_codes - sorted list of soc codes from get_soc_codes()
      text      - cleaned text extracted from the soc description pdf
      soc_desc  - clean(ish) dict lookup {soc_code: description}
    """
    # Provenance of the pdf used below -- this was our third attempt at
    # parsing the text from the pdf! The pdf has a two column layout; success
    # was achieved following
    # https://stackoverflow.com/questions/29319533/convert-pdf-with-columns-to-text
    #
    #  1. gs     -o left-sections.pdf     -sDEVICE=pdfwrite     -g3000x8500     -c "<</PageOffset [0 0]>> setpagedevice"     -f soc2000vol1v5_tcm77-179121.pdf
    #  2. gs     -o right-sections.pdf     -sDEVICE=pdfwrite     -g3000x8500     -c "<</PageOffset [-300 0]>> setpagedevice"     -f soc2000vol1v5_tcm77-179121.pdf
    #  3. use krop to trim junk from edges of pdf
    #  4. pdftk   A=right-sections.pdf   B=left-sections.pdf shuffle B A   output single-pages-output.pdf
    #
    # Earlier attempts, kept for reference:
    # text = convert_pdf_to_txt("../data/soc2000/soc2000vol1v5_tcm77-179121.pdf")
    # with open('../data/soc2000/soc2000-layout.txt', 'r') as myfile:
    #    text = myfile.read() # using text file prepared with pdftotext -layout
    pdf_path = "../data/soc2000/single-pages-output-cropped.pdf"

    codes = get_soc_codes()
    cleaned_text = clean_text(convert_pdf_to_txt(pdf_path))
    lookup = make_soc_desc_lookup(cleaned_text, codes)

    # Per-description clean-up is disabled:
    #for i, item in enumerate(soc_desc):
    #    soc_desc[item] = clean_desc(soc_desc[item]) # currently broken

    return codes, cleaned_text, lookup

In [9]:
def short_desc_lookup(soc_code):
    """
    Extract the short description for a soc code.

    Reads the module-level `soc_desc` dictionary (KeyError for an unknown
    code). The short description runs from the first occurrence of the code
    itself up to the 'TYPICAL ENTRY ROUTES AND\\nASSOCIATED QUALIFICATIONS'
    heading that follows it.

    If the heading is absent the text runs to the end of the description;
    previously the -1 from str.find was used as the slice end, which silently
    dropped the final character. If the code itself is absent, '' is
    returned.
    """
    string = soc_desc[soc_code]
    start = string.find(soc_code)
    if start == -1:
        return ''
    # look for the heading only after the start of the section
    end = string.find('TYPICAL ENTRY ROUTES AND\nASSOCIATED QUALIFICATIONS', start)
    if end == -1:
        end = len(string)
    return string[start:end]

In [47]:
def tasks_lookup(soc_code):
    """
    Extract the tasks section for a soc code.

    Reads the module-level `soc_desc` dictionary (KeyError for an unknown
    code). The section runs from the first 'TASKS' heading up to the
    'RELATED JOB TITLES' heading that follows it; the heading text is removed,
    the body is stripped, and the result is returned as
    'TASKS\\n\\n<body>\\n\\n'.

    If 'RELATED JOB TITLES' is absent the section runs to the end of the
    description; previously the -1 from str.find was used as the slice end,
    which dropped the final character. If 'TASKS' is absent the body is
    empty.
    """
    string = soc_desc[soc_code]
    start = string.find('TASKS')
    if start == -1:
        body = ''
    else:
        # look for the closing heading only after the start of the section
        end = string.find('RELATED JOB TITLES', start)
        if end == -1:
            end = len(string)
        body = string[start:end].replace('TASKS', '').strip()
    return 'TASKS\n\n' + body + '\n\n'

In [11]:
def entry_lookup(soc_code):
    """
    Extract the entry-route section for a soc code.

    Reads the module-level `soc_desc` dictionary (KeyError for an unknown
    code). The section runs from the first 'TYPICAL ENTRY' up to the 'TASKS'
    heading that follows it; the full heading is removed, the body is
    stripped, and the result is returned as
    'TYPICAL ENTRY ROUTES AND\\nASSOCIATED QUALIFICATIONS\\n\\n<body>\\n\\n'.

    If 'TASKS' is absent the section runs to the end of the description;
    previously the -1 from str.find was used as the slice end, which dropped
    the final character. If 'TYPICAL ENTRY' is absent the body is empty.
    """
    heading = 'TYPICAL ENTRY ROUTES AND\nASSOCIATED QUALIFICATIONS\n\n'
    string = soc_desc[soc_code]
    start = string.find('TYPICAL ENTRY')
    if start == -1:
        body = ''
    else:
        # look for the closing heading only after the start of the section
        end = string.find('TASKS', start)
        if end == -1:
            end = len(string)
        body = string[start:end].replace(heading, '').strip()
    return heading + body + '\n\n'

In [50]:
def related_lookup(soc_code):
    """
    Extract the related job titles section for a soc code.

    Reads the module-level `soc_desc` dictionary (KeyError for an unknown
    code). The section runs from the first 'RELATED JOB TITLES' heading up to
    the next pair of form feeds ('\\x0c\\x0c') after it; the heading is
    removed, the body is stripped, and the result is returned as
    'RELATED JOB TITLES\\n\\n<body>\\n\\n'.

    If no form-feed pair follows the heading the section runs to the end of
    the description; previously the -1 from str.find was used as the slice
    end, which dropped the final character, and a form-feed pair occurring
    before the heading emptied the section. If the heading is absent the
    body is empty.
    """
    string = soc_desc[soc_code]
    start = string.find('RELATED JOB TITLES')
    if start == -1:
        body = ''
    else:
        # only a form-feed pair *after* the heading can end the section
        end = string.find('\x0c\x0c', start)
        if end == -1:
            end = len(string)
        body = string[start:end].replace('RELATED JOB TITLES\n\n', '').strip()
    return 'RELATED JOB TITLES\n\n' + body + '\n\n'

In [51]:
# Build the code list, the cleaned text and the description lookup once.
# soc_descriptor() and the *_lookup helpers read `soc_desc` as a module-level
# global, so this cell has to run before any of them is called.
soc_codes, text, soc_desc = main()

In [14]:
# uncomment and add tests to check all
# standard format is 4 digit code, job title, typical entry routes and 
# qualifications, tasks, related job titles
# can use headings to make more structured
#
#for soc_code in soc_codes:
#    soc_descriptor(soc_code)

In [27]:
# Re-read the same workbook and sheet that get_soc_codes() uses, this time
# keeping the whole table rather than just the distinct SOC2000 values.
df = pd.read_excel('../data/soc2000index_tcm77-179123.xls', sheetname=1 ) 


In [28]:
# Keep only rows whose SOC2000 value is one of soc_codes (which excludes the
# u'}}}}' placeholder removed in get_soc_codes()).
# NOTE: df is rebound here, so this cell relies on the read_excel cell having
# been run immediately before it.
df = df[df['SOC2000'].isin(soc_codes)]

In [29]:
# Attach the four extracted sections to every index row, keyed on SOC2000.
# The lookup functions are passed to map() directly.
df['short_desc'] = df['SOC2000'].map(short_desc_lookup)
df['tasks'] = df['SOC2000'].map(tasks_lookup)
df['entry'] = df['SOC2000'].map(entry_lookup)
df['related'] = df['SOC2000'].map(related_lookup)

In [30]:
# Narrow the frame to the columns of interest, then remove the section
# headings that the lookup helpers prepend to their return values.
df = df[['SOC','SOC2000','INDEXOCC', 'short_desc', 'entry', 'tasks', 'related']]
df['entry'] = df['entry'].str.replace('TYPICAL ENTRY ROUTES AND\nASSOCIATED QUALIFICATIONS\n\n','')
df['tasks'] = df['tasks'].str.replace('TASKS','').str.strip()
df['related'] = df['related'].str.replace('RELATED JOB TITLES\n\n','')

In [31]:
# Example: case-sensitive search of the index for job titles containing
# 'Carpenter'.
# Possible refinement (not implemented here, presumably an idea to try):
# fuzzy matching, see http://blog.yhat.com/posts/fuzzy-matching-with-yhat.html

df[df.INDEXOCC.str.contains('Carpenter')]

Unnamed: 0,SOC,SOC2000,INDEXOCC,short_desc,entry,tasks,related
3424,570,5315,Carpenter,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...,There are no formal academic entry requirement...,examines drawings and specifications to\nd...,Boat builder\nBuilder's joiner\nCarpenter\nCar...
3425,570,5315,Carpenter and joiner,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...,There are no formal academic entry requirement...,examines drawings and specifications to\nd...,Boat builder\nBuilder's joiner\nCarpenter\nCar...
3426,599,5315,Carpenter-diver,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...,There are no formal academic entry requirement...,examines drawings and specifications to\nd...,Boat builder\nBuilder's joiner\nCarpenter\nCar...


In [43]:
def lookup_soc(soc_code):
    '''
    Print the info held for a given soc2000 code.

    Prints, in order, the short description, the entry routes, the tasks and
    the related job titles, each obtained from the corresponding *_lookup
    function. Nothing is returned.

    print() is called as a function with a single argument, which behaves
    the same under Python 2 and Python 3 and matches soc_descriptor().
    '''
    print(short_desc_lookup(soc_code))
    print(entry_lookup(soc_code))
    print(tasks_lookup(soc_code))
    print(related_lookup(soc_code))

In [54]:
# Spot check: print all four extracted sections for SOC2000 code 5222.
lookup_soc('5222')

5222 TOOL MAKERS, TOOL

FITTERS AND MARKERS-OUT

Tool makers, tool fitters and markers-out mark out
metal  for  machining  and  fit,  assemble  and  repair
machine and press tools, dies, jigs, fixtures and other
tools.


TYPICAL ENTRY ROUTES AND
ASSOCIATED QUALIFICATIONS

There are no formal academic requirements although
some  employers  may  require  GCSEs/S  grades.
Training is usually received on-the-job. NVQs/SVQs
at Level 2 are available.


TASKS

examines  drawings  and  specifications  to
determine appropriate method and  sequence of
operations;

 marks  out  reference  points  using  measuring
instruments and tools such as punches, rules and
squares;

 operates  hand  and  machine  tools  to  shape
workpieces to specifications and checks accuracy
of machining;





assembles prepared parts, checks their alignment
with micrometers, optical projectors and other
measuring equipment and adjusts as necessary;

repairs damaged or worn tools.


RELATED JOB TITLES

Die sinker (metal 

In [58]:
# Index rows whose extracted task text mentions 'asbestos' (case-sensitive);
# head() limits the display to the first five.
df[df.tasks.str.contains('asbestos')].head()

Unnamed: 0,SOC,SOC2000,INDEXOCC,short_desc,entry,tasks,related
71,820,8114,Acidifier,8114 CHEMICAL AND RELATED\n\nPROCESS OPERATIVE...,There are no formal academic entry requirement...,loads prescribed quantities of ingredients...,Colour mixer\nDye house operative (textile mfr...
566,820,8114,Annealer,8114 CHEMICAL AND RELATED\n\nPROCESS OPERATIVE...,There are no formal academic entry requirement...,loads prescribed quantities of ingredients...,Colour mixer\nDye house operative (textile mfr...
648,506,8149,"Asphalter, mastic",8149 CONSTRUCTION\n\nOPERATIVES NEC\n\nWorkers...,There are no formal academic entry requirement...,"fills machine with insulating mixture, positio...",Building site foreman\nCeiling fixer\nPipe lay...
681,820,8114,"Assembler, cell",8114 CHEMICAL AND RELATED\n\nPROCESS OPERATIVE...,There are no formal academic entry requirement...,loads prescribed quantities of ingredients...,Colour mixer\nDye house operative (textile mfr...
992,820,8114,"Assistant, craft",8114 CHEMICAL AND RELATED\n\nPROCESS OPERATIVE...,There are no formal academic entry requirement...,loads prescribed quantities of ingredients...,Colour mixer\nDye house operative (textile mfr...


In [59]:
# Spot check: print all four extracted sections for SOC2000 code 8149.
lookup_soc('8149')

8149 CONSTRUCTION

OPERATIVES NEC

Workers  in  this  unit  group  operate  insulating
equipment, fix plasterboard or dry linings to ceilings
and  walls,  help  construct,  maintain,  repair  and
demolish buildings and clean and resurface eroded
stonework, and lay, join and examine pipe sections
for drainage, gas, water or similar piping systems.


TYPICAL ENTRY ROUTES AND
ASSOCIATED QUALIFICATIONS

There are no formal academic entry requirements.
Training is typically provided on-the-job.  NVQs/
SVQs  in  General  Construction  Operations  are
available at Levels 1, 2 and 3.


TASKS

fills machine with insulating mixture, positions
hose, drills access hole and fills cavities or coats
surfaces to prevent loss or absorption of heat and
provide fire protection;

selects  appropriate  plasterboard  or  dry  lining
panels, cuts them to required size and fixes them
to ceilings and walls;









cuts, shapes and fits wood, lays bricks and tiles,
cleans  exterior  surfaces  of  buildings  

In [71]:
# Codes matched against the SOC column of df in the next cell.
# NOTE(review): the name suggests these are SOC90 codes with a high PMR, and
# "a la peto" presumably credits Peto as the source -- TODO confirm and cite.
highpmrsoc90codes = [211, 516, 521, 532, 533, 534, 541, 570, 893, 896, 913]
# a la peto

In [70]:
# The integer codes are converted to str before matching against the SOC
# column; head() limits the display to the first five matching rows.
df[df.SOC.isin([str(i) for i in highpmrsoc90codes])].head()

Unnamed: 0,SOC,SOC2000,INDEXOCC,short_desc,entry,tasks,related
29,893,8124,APA,8124 ENERGY PLANT OPERATIVES\n\nWorkers in thi...,There are no formal academic entry requirement...,determines job requirements from switchboard\n...,Auxiliary plant attendant\nBoiler attendant\n\...
81,516,5223,"Adjuster, brake",5223 METAL WORKING\nPRODUCTION AND\nMAINTENANC...,"Entrants usually possess GCSEs/S grades, a GNV...",examines drawings and specifications to\nd...,Aircraft engineer\nAircraft fitter\nAircraft g...
89,516,5223,"Adjuster, machine",5223 METAL WORKING\nPRODUCTION AND\nMAINTENANC...,"Entrants usually possess GCSEs/S grades, a GNV...",examines drawings and specifications to\nd...,Aircraft engineer\nAircraft fitter\nAircraft g...
91,516,5223,"Adjuster, spring, set",5223 METAL WORKING\nPRODUCTION AND\nMAINTENANC...,"Entrants usually possess GCSEs/S grades, a GNV...",examines drawings and specifications to\nd...,Aircraft engineer\nAircraft fitter\nAircraft g...
96,516,5223,Adjuster,5223 METAL WORKING\nPRODUCTION AND\nMAINTENANC...,"Entrants usually possess GCSEs/S grades, a GNV...",examines drawings and specifications to\nd...,Aircraft engineer\nAircraft fitter\nAircraft g...


In [67]:
# Scratch cell: shows the string form of the code list.
str(highpmrsoc90codes)

'[211, 516, 521, 532, 533, 534, 541, 570, 893, 896, 913]'

In [75]:
# Show the first index row whose SOC value is the string '211'; iloc[0]
# yields a single row as a Series, which print() renders as plain text.
print(df[df.SOC == '211'].iloc[0])

SOC                                                         211
SOC2000                                                    2122
INDEXOCC                                       Architect, naval
short_desc    2122 MECHANICAL ENGINEERS\n\nMechanical engine...
entry         Mechanical engineers usually possess an accred...
tasks         undertakes research and advises on energy use,...
related       Aeronautical engineer (professional)\nAutomobi...
Name: 595, dtype: object
