In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re
import xlrd
import pandas as pd

In [2]:
def convert_pdf_to_txt(path):
    '''
    Convert a pdf to text using pdfminer.

    Every page yielded by ``PDFPage.get_pages`` is run through a
    ``PDFPageInterpreter`` that writes into an in-memory buffer via a
    ``TextConverter`` (utf-8 codec, default ``LAParams``).

    Args:
        path: filesystem path of the pdf to read.

    Returns:
        The accumulated contents of the buffer, i.e. the extracted text.

    The pdf file, the converter device and the buffer are always released,
    even when opening the file or processing a page raises.
    '''
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        # NOTE(review): maxpages=0 and an empty pagenos set presumably mean
        # "no limit / all pages" in pdfminer -- confirm against its docs.
        maxpages = 0
        caching = True
        pagenos = set()

        # open() instead of the Python-2-only file() builtin; the context
        # manager closes the handle even if a page fails to process.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                interpreter.process_page(page)

        # Read the buffer before the finally-clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [3]:
def get_soc_codes():
    '''
    Return a sorted list of the distinct soc codes in the ONS data file.

    Reads the sheet at index 1 of ``../data/soc2000index_tcm77-179123.xls``
    and collects the unique values of its ``SOC2000`` column.

    Returns:
        list: the unique ``SOC2000`` values in ascending order.
    '''
    # The sheet index is passed positionally: the keyword was renamed from
    # ``sheetname`` to ``sheet_name`` between pandas releases, and the
    # positional form is accepted under either spelling.
    df = pd.read_excel('../data/soc2000index_tcm77-179123.xls', 1)
    return sorted(df.SOC2000.unique().tolist())

In [4]:
def clean_text(text):
    """
    Remove known boilerplate fragments from the extracted text.

    Two fragments are deleted wherever they occur: the running title
    'Standard Occupational Classification 2000 Volume 1' and the sequence
    '\\xe2\\x80\\xa2' (the UTF-8 encoding of the bullet character).

    Returns the text with those fragments removed.
    """
    unwanted_fragments = (
        'Standard Occupational Classification 2000 Volume 1',
        '\xe2\x80\xa2',
    )
    for fragment in unwanted_fragments:
        text = text.replace(fragment, '')
    return text

In [5]:
def clean_desc(string):
    """
    Clean a soc descriptor string.

    Looks for the first span that starts with a non-digit followed by two or
    three digits and runs (across newlines) up to the last '9' in the string,
    and removes that span. Presumably this targets the page-number / margin
    text embedded in descriptors -- confirm against the data.

    Args:
        string: the descriptor text.

    Returns:
        The descriptor with the matched span removed, or the input unchanged
        when nothing matches.
    """
    match = re.search(r'\D[0-9]{2,3}.*9', string, re.DOTALL)
    if match:
        # Remove the matched text literally. Feeding it back to re.sub as a
        # pattern would reinterpret characters such as '(' , ')' or '.' as
        # regex syntax, so the removal could silently fail or raise.
        string = string.replace(match.group(), '')
    return string

In [6]:
def make_soc_desc_lookup(text, soc_codes):
    """
    Make a dictionary from the text {soc_code: description of the job}.

    For each code, the description is the slice of ``text`` from the last
    occurrence of that code up to the last occurrence of the next code in
    ``soc_codes``. For the final code, or when the next code does not occur
    in the text, the slice runs to the end of ``text``. A code that does not
    occur in the text maps to an empty string.

    Args:
        text: the full document text.
        soc_codes: ordered sequence of codes given as strings.

    Returns:
        dict mapping each usable code to its slice of ``text``. A code is
        left out when it, or the code that follows it, cannot be parsed as a
        Python literal.
    """
    import ast  # local import so this cell does not depend on the import cell

    def search_token(code):
        # literal_eval normalises the code the way the previous eval() call
        # did, but only accepts literals, so data is never executed as code.
        return str(ast.literal_eval(code))

    soc_desc = dict()
    last_index = len(soc_codes) - 1
    for i, code in enumerate(soc_codes):
        try:
            # rfind: take the LAST occurrence, to avoid hitting the index
            start = text.rfind(search_token(code))
            if i < last_index:
                end = text.rfind(search_token(soc_codes[i + 1]))
            else:
                # No successor: previously soc_codes[i + 1] raised IndexError.
                end = -1
        except (SyntaxError, ValueError):
            # Unparseable entry (this one or its successor): skip it.
            continue
        if start == -1:
            soc_desc[code] = ''
            continue
        if end == -1:
            end = len(text)
        soc_desc[code] = text[start:end]
    return soc_desc

In [7]:
def soc_descriptor(soc_code):
    """
    Print and return the description stored for ``soc_code``.

    Looks the code up in the module-level ``soc_desc`` dictionary. When the
    lookup yields nothing (missing key or empty description), the available
    keys are printed instead.

    Returns whatever ``soc_desc.get(soc_code)`` returned.
    """
    found = soc_desc.get(soc_code)
    if not found:
        print ('valid soc_codes are {0}'.format(soc_desc.keys()))
        return found
    print(found)
    return found

In [8]:
def main():
    """
    Build the soc-code lookup end to end.

    Steps: load the sorted soc codes, extract the description text from the
    pdf, clean it, and build a clean(ish) {soc_code: description} dictionary.

    Returns a tuple ``(soc_codes, text, soc_desc)``: the code list, the
    cleaned text and the lookup dictionary.
    """
    codes = get_soc_codes()

    # The source pdf has a two-column layout; this input file is the third
    # attempt at getting parseable text out of it. It was produced following
    # https://stackoverflow.com/questions/29319533/convert-pdf-with-columns-to-text
    #
    #  1. gs     -o left-sections.pdf     -sDEVICE=pdfwrite     -g3000x8500     -c "<</PageOffset [0 0]>> setpagedevice"     -f soc2000vol1v5_tcm77-179121.pdf
    #  2. gs     -o right-sections.pdf     -sDEVICE=pdfwrite     -g3000x8500     -c "<</PageOffset [-300 0]>> setpagedevice"     -f soc2000vol1v5_tcm77-179121.pdf
    #  3. pdftk   A=right-sections.pdf   B=left-sections.pdf shuffle B A   output single-pages-output.pdf
    #
    # Earlier attempts, kept for reference:
    #   - convert_pdf_to_txt("../data/soc2000/soc2000vol1v5_tcm77-179121.pdf")
    #   - reading '../data/soc2000/soc2000-layout.txt', a text file prepared
    #     with pdftotext -layout
    raw_text = convert_pdf_to_txt("../data/soc2000/single-pages-output.pdf")

    cleaned_text = clean_text(raw_text)
    lookup = make_soc_desc_lookup(cleaned_text, codes)
    # A clean_desc() pass over every lookup value is deliberately not applied here.

    return codes, cleaned_text, lookup

In [9]:
# Run the pipeline once and bind the results at module level;
# soc_descriptor() reads the soc_desc dictionary bound here.
soc_codes, text, soc_desc = main()

In [10]:
# Spot-check one entry of the lookup by printing the description stored for code 5316.
print(soc_desc['5316'])

5316 GLAZIERS, WINDOW

FABRICATORS AND FITTERS

Workers in this unit group install pre-glazed wooden,
metal or PVC framework, and cut, fit and set glass
in windows, doors, shop fronts, and other structural
frames.

TYPICAL ENTRY ROUTES AND
ASSOCIATED QUALIFICATIONS

There  are  no  formal  academic  entry  requirements.
Entry is typically through a Modern Apprenticeship
in  Glazing  Installation  or  Installing Architectural
Glazing Systems leading to an NVQ/SVQ at Level 3.

TASKS

(cid:149)

(cid:149)

examines drawings or specifications to determine
job requirements;

scores  plain,  coloured,  safety  and  ornamental
glass with hand cutter and breaks off glass by
hand or with pliers;

        191

M

a
j
o
r
 

G
r
o
u
p

 

5

1

2

3

4

5

6

7

8

9

5

 

p
u
o
r
G

 
r
o
j
a

M

1

2

3

4

5

6

7

8

9

(cid:149)

(cid:149)

(cid:149)

smoothes edges of glass and positions and secures
in frame or grooved lead strips;

applies mastic, putty or adhesive between glass
and fram

In [11]:
soc_descriptor('3552') 
# Known bug: descriptors can come out interleaved (e.g. the descriptor for 8223 is
# mixed with 8229). This is because of the way the original two-column pdf text
# has been parsed.

3552 COUNTRYSIDE AND PARK

RANGERS

Countryside  and  park  rangers  look  after  the
countryside for the benefit of wildlife and the public
through  practical  conservation,  environmental
education  and  liaison  between  land  owners,  local
communities and visitors.

TYPICAL ENTRY ROUTES AND
ASSOCIATED QUALIFICATIONS

Entrants usually possess a BTEC/SQA higher award
or degree although some employers may require a
higher  degree  or  postgraduate  qualification.   A
majority of entrants have prior practical experience.
A  range  of  training  courses  from  professional

        147

M

a
j
o
r
 

G
r
o
u
p

 

3

1

2

3

4

5

6

7

8

9

3

 

p
u
o
r
G

 
r
o
j
a

M

1

2

3

4

5

6

7

8

9

associations  and  NVQs/SVQs  in  Environmental
Conservation at Levels 2 and 3 are available.

TASKS

(cid:149)

(cid:149)

advises  visitors,  organises  guided  walks  and
answers questions from the public about an area
and its wildlife;

encourages  environmental  education  through
ad

'3552 COUNTRYSIDE AND PARK\n\nRANGERS\n\nCountryside  and  park  rangers  look  after  the\ncountryside for the benefit of wildlife and the public\nthrough  practical  conservation,  environmental\neducation  and  liaison  between  land  owners,  local\ncommunities and visitors.\n\nTYPICAL ENTRY ROUTES AND\nASSOCIATED QUALIFICATIONS\n\nEntrants usually possess a BTEC/SQA higher award\nor degree although some employers may require a\nhigher  degree  or  postgraduate  qualification.   A\nmajority of entrants have prior practical experience.\nA  range  of  training  courses  from  professional\n\n        147\n\nM\n\na\nj\no\nr\n \n\nG\nr\no\nu\np\n\n \n\n3\n\n1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n7\n\n8\n\n9\n\n\x0c3\n\n \n\np\nu\no\nr\nG\n\n \nr\no\nj\na\n\nM\n\n1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n7\n\n8\n\n9\n\nassociations  and  NVQs/SVQs  in  Environmental\nConservation at Levels 2 and 3 are available.\n\nTASKS\n\n(cid:149)\n\n(cid:149)\n\nadvises  visitors,  organises  guided  walks  and\nanswers 

In [12]:
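# TODO: uncomment the loop below and add tests that check every descriptor.
# The standard format of a descriptor is: 4-digit code, job title, typical entry
# routes and associated qualifications, tasks, related job titles.
# These headings could be used to parse each descriptor into a more structured form.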
#
#for soc_code in soc_codes:
#    soc_descriptor(soc_code)

In [13]:
# Load the sheet at index 1 of the ONS index workbook as a DataFrame.
# The sheet index is passed positionally: the keyword was renamed from
# ``sheetname`` to ``sheet_name`` between pandas releases, and the positional
# form is accepted under either spelling.
df = pd.read_excel('../data/soc2000index_tcm77-179123.xls', 1)


In [14]:
# Attach the parsed description to every row; codes missing from soc_desc get None.
df['desc'] = df['SOC2000'].map(soc_desc.get)

In [15]:
# Keep only these four columns, in this order.
df = df[['SOC','SOC2000','INDEXOCC','desc']]

In [16]:
# NOTE(review): presumably a pointer towards fuzzy matching of job titles as a
# possible follow-up -- confirm:
# ?http://blog.yhat.com/posts/fuzzy-matching-with-yhat.html

# Show the rows whose INDEXOCC value contains 'Carpenter'.
df[df.INDEXOCC.str.contains('Carpenter')]

Unnamed: 0,SOC,SOC2000,INDEXOCC,desc
3424,570,5315,Carpenter,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...
3425,570,5315,Carpenter and joiner,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...
3426,599,5315,Carpenter-diver,5315 CARPENTERS AND JOINERS\n\nCarpenters and ...


In [17]:
# Display the raw (repr) description stored for code 5315, the code of the rows listed above.
soc_desc['5315']

"5315 CARPENTERS AND JOINERS\n\nCarpenters and joiners construct, erect, install and\nrepair wooden structures and fittings used in internal\nand  external  frameworks  and  cut,  shape,  fit  and\nassemble wood to make templates, jigs, scale models\nand scenic equipment for theatres.\n\nTYPICAL ENTRY ROUTES AND\nASSOCIATED QUALIFICATIONS\n\nThere are no formal academic entry requirements,\nthough GCSEs/S grades are advantageous.  Entry is\ntypically  through  a  Modern Apprenticeship  or\nNational Traineeship approved by the Construction\nIndustry Training Board leading to an NVQ/SVQ in\nGeneral Construction at Level 3.\n\n\x0cTASKS\n\n(cid:149)\n\n(cid:149)\n\n(cid:149)\n\n(cid:149)\n\nexamines  drawings  and  specifications  to\ndetermine job requirements;\n\nselects and measures appropriate wood and cuts,\nshapes  and  drills  to  specification  using  saws,\nplanes, chisels and other power or hand tools;\n\naligns  and  fixes  prepared  wood  pieces  by\nscrewing, nailing, gluing 

In [18]:
# Print a slice of the cleaned text by character offset.
# NOTE(review): the offsets 397896 and 400000 are unexplained magic numbers and will
# point elsewhere if the input pdf or clean_text() changes -- confirm what they target.
print(text[397896:400000])

ative functions
in government departments such as recruitment
and training, the negotiation and arrangement of
contracts,  building  and  capital  management,
monitoring  and  authorising  department
expenditure etc.;

(cid:149) organises  resources  for  the  acceptance  and
recording  of  vacancy  details,  the  selection  of
suitable  applicants  and  other  Job  Centre
activities;

        149

M

a
j
o
r
 

G
r
o
u
p

 

3

1

2

3

4

5

6

7

8

9

3

 

p
u
o
r
G

 
r
o
j
a

M

1

2

3

4

5

6

7

8

9

(cid:149)

(cid:149)

authorises the payment of social security benefits,
arranges  for  domiciliary  visits  to  assess  the
financial  circumstances  of  claimants  and
investigates  any  state  insurance  contribution
problems;

advises  public  or  companies  on  general  tax
problems and arranges for the issue, receipt and
examination of tax forms, assessment of PAYE
codes  and  the  computation  of  tax  arrears  and
rebates.

RELATED JOB TITLES

Higher executive officer

In [19]:
# Print the description attached to the row at position 26132 of the index frame.
print(df.iloc[26132]['desc'])

3415 MUSICIANS

Musicians write, arrange, orchestrate, conduct and
perform musical compositions.

TYPICAL ENTRY ROUTES AND
ASSOCIATED QUALIFICATIONS

There are no formal academic entry requirements
although many possess degrees or diploma courses.

Entry to a degree or graduate diploma course requires
A  levels/H  grades.    Entrants  to  the  performers'
diploma course generally possess GCSEs/S grades
and Associated Board examination passes in their
chosen instrument(s) and will be required to audition
for places.

TASKS

(cid:149)

(cid:149)

conceives and writes original music;

tunes instrument and studies and rehearses score;

(cid:149) plays instrument as a soloist or as a member of a

group or orchestra;

(cid:149)

(cid:149)

scores  music  for  different  combinations  of
voices  and  instruments  to  produce  desired
effect;

auditions and selects performers and rehearses
and  conducts  them  in  the  performance  of  the
composition.

RELATED JOB TITLES

Cellist
Composer (m