In [1]:
import re, requests, json, base64
import xml.etree.ElementTree as ET
from datetime import datetime
from xml.sax.saxutils import escape as xml_escape

from lxml import etree as etree_lxml

# Proxy used for both HTTP and HTTPS traffic; passed as `proxies=` to the
# requests.post call in actonomy_esco_query.
# NOTE(review): 172.17.0.1 is presumably the Docker bridge gateway of the host
# running this notebook — confirm before reusing in another environment.
PROXIES = {
    "https" : "http://172.17.0.1:3128",
    "http"  : "http://172.17.0.1:3128"
}

def clean_text(s):
    """Convert free text into a token made only of letters, digits and '_'.

    Known punctuation symbols are first spelled out as short names wrapped in
    underscores (e.g. '&' -> '_and_', '+' -> '_plus_'); afterwards every
    remaining run of characters outside [0-9a-zA-Z] is collapsed into a
    single underscore.

    Args:
        s: the string to clean.

    Returns:
        The cleaned string.
    """
    # (symbol, spelled-out name) pairs, applied in this order
    symbol_names = (
        ('&', '_and_'),
        ('#', '_sharp_'),
        ('@', '_at_'),
        ('*', '_star_'),
        ('%', '_prcnt_'),
        # brackets
        ('(', '_ob_'),
        (')', '_cb_'),
        ('{', '_ocb_'),
        ('}', '_ccb_'),
        ('[', '_osb_'),
        (']', '_csb_'),
        # operators and marks
        ('=', '_eq_'),
        ('>', '_gt_'),
        ('<', '_lt_'),
        ('+', '_plus_'),
        ('-', '_dash_'),
        ('/', '_fsl_'),
        ('?', '_qm_'),
        ('!', '_em_'),
        # punctuation
        ('.', '_dot_'),
        (',', '_coma_'),
        (':', '_cln_'),
        (';', '_scln_'),
    )

    for symbol, spelled_out in symbol_names:
        s = s.replace(symbol, spelled_out)

    # whatever is still not alphanumeric (including runs of '_') becomes one '_'
    return re.sub('[^0-9a-zA-Z]+', '_', s)

def replace_xml_tags(xml_tag):
    """Strip known XML namespaces from a tag and shorten well-known tag names.

    The tag is converted with ``str()`` first, so non-string tags are accepted.
    Substitutions are plain substring replacements applied in the order
    listed below.

    Args:
        xml_tag: the (possibly namespace-qualified) tag of an XML element.

    Returns:
        The shortened tag as a string.
    """
    substitutions = (
        # namespaces are dropped entirely
        ('{http://actonomy.com/hrxml/2.5}', ''),
        ('{http://schemas.xmlsoap.org/soap/envelope/}', ''),
        ('{http://xmp.actonomy.com}', ''),
        # tag names are removed, abbreviated or renamed
        ('StructuredXMLResume', ''),
        ('ContactInfo', 'CI'),
        ('ContactMethod', 'CM'),
        ('EmploymentHistory', 'EH'),
        ('EmployerOrg', 'EO'),
        ('EducationHistory', 'EDH'),
        ('SchoolOrInstitution', 'SOI'),
        ('Qualifications', 'QLS'),
        ('UserArea', ''),
        ('Classifications', 'CLS'),
        ('Value', ''),
        ('Competency', 'Skills'),
        ('LocationSummary', 'Location'),
        ('PositionHistory', 'Position'),
        ('AnyDate', 'YearMonth'),
    )

    short_tag = str(xml_tag)
    for old, new in substitutions:
        short_tag = short_tag.replace(old, new)

    return short_tag

def xml_actonomy_2json(xml_str, tag_prefix='hrx'):
    """Flatten an XML document into a dict of lists keyed by element path.

    The document is parsed with lxml and walked depth-first. Every descendant
    of the root yields a key built from ``tag_prefix`` and the shortened tag
    names (see ``replace_xml_tags``) of its ancestors and itself, joined with
    '_'. The root element itself is not recorded, only its descendants.

    For each element:
      * a ``type`` attribute is appended to the key (only when the current
        prefix is non-empty);
      * a ``name`` attribute is used as the text, or prepended to the text as
        ``"<name> | <text>"``;
      * non-empty text is appended to ``jout[key]``; an (initially empty)
        ``jout[key + '_weighted']`` list is created alongside it;
      * a ``weight`` attribute is stored as a float under
        ``key + '_' + clean_text(text)``, and the text is added to the
        ``_weighted`` list ``round(10 * weight)`` times;
      * childless ``StartDate`` / ``EndDate`` / ``DegreeDate`` elements get a
        placeholder ``'1600-01'`` under ``key + '_YearMonth'``;
      * repeated child tags directly under a ``Position`` element get an
        ``_alv<n>`` suffix (n = occurrence number, starting at 2).

    After the children of an ``EO``, ``Position`` or ``Location`` element have
    been processed, the child tags listed in ``mtags`` that did not occur are
    filled in with the value ``'Unknown'``.

    Args:
        xml_str: the XML document as a ``str``.
        tag_prefix: prefix for every generated key ('' for no prefix).

    Returns:
        dict mapping generated keys to lists of values. An empty dict is
        returned (and the error is printed) if parsing or flattening raises
        any exception.
    """
    jout = {}
    tags = {}   # recursion depth -> shortened tags of the children seen so far at that depth
    tc   = 0    # current recursion depth

    # missing tags to be filled with 'Unknown' in groups specified as keys in this dictionary
    mtags = {
        'EO'         : ['EOName'],
        'Position'   : ['Title'],
        'Location'   : ['Municipality', 'CountryCode']
    }

    def recursive_parse(children, tag_prefix, tc):
        """Record every child of ``children`` in ``jout`` and recurse into it."""
        tc = tc + 1
        tags[tc] = []

        # loop-invariant: the shortened tag of the element whose children we walk
        parent_tag_clean = replace_xml_tags(children.tag)

        for child in list(children):

            # A missing text is None; map it to '' instead of going through
            # str(None) == 'None', so that a literal text "None" is not
            # mistaken for "no text".
            raw_text = child.text if child.text is not None else ''
            child_text = str(raw_text).replace("\n", ' ').replace("\r", ' ').strip()

            child_tag_clean = replace_xml_tags(child.tag)
            tags[tc].append(child_tag_clean)

            # disambiguate repeated tags inside a Position group
            if parent_tag_clean == 'Position':
                child_count = tags[tc].count(child_tag_clean)
                if child_count > 1:
                    child_tag_clean = child_tag_clean + "_alv" + str(child_count)

            if tag_prefix == '':
                child_tag = child_tag_clean
            else:
                child_tag = tag_prefix + "_" + child_tag_clean

                if child.attrib.get('type') is not None:
                    child_tag = child_tag + '_' + child.attrib.get('type', '')

            if child.attrib.get('name') is not None:
                if child_text == '':
                    child_text = child.attrib['name']
                else:
                    child_text = child.attrib['name'] + " | " + child_text

            if child_text != '':
                jout.setdefault(child_tag, []).append(child_text)

                # list of items repeated according to their weight; created
                # even when no weight is present, in which case it stays empty
                jout.setdefault(child_tag + '_weighted', [])

                # extract "weight" attribute and add it as separate json item
                if child.attrib.get('weight') is not None:
                    weight = float(child.attrib.get('weight'))
                    w_tag = child_tag + '_' + clean_text(child_text)

                    jout.setdefault(w_tag, []).append(weight)

                    # repeat item 10 times its weight
                    jout[child_tag + '_weighted'].extend([child_text] * int(round(10 * weight)))

            # apply same function recursively for all children
            if len(list(child)) > 0:
                recursive_parse(child, child_tag, tc)
            else:
                # if no children exist for StartDate or EndDate tag then add empty YearMonth tag to make sure it is recorded
                if child_tag_clean == 'EndDate' or child_tag_clean == 'StartDate' or child_tag_clean == 'DegreeDate':
                    jout.setdefault(child_tag + '_YearMonth', []).append('1600-01')

        # add default value for missing children tags in specified groups
        for ctag in mtags.get(parent_tag_clean, []):
            if ctag not in tags[tc]:
                jout.setdefault(tag_prefix + "_" + ctag, []).append('Unknown')

    # - end of recursive function

    try:
        root = etree_lxml.fromstring(xml_str.encode('utf-8'))
        recursive_parse(root, tag_prefix, tc)
    except Exception as e:
        # best-effort: report the problem and hand back an empty result
        print (e)
        print (str(datetime.now()), 'Error in XML')
        return {}

    return jout

In [2]:
def ontology_json_to_classes(groups, labels):
    """Group labels by their positionally matching group.

    ``groups[i]`` is the group of ``labels[i]``; iteration is driven by the
    length of ``labels``.

    Args:
        groups: sequence of group keys, at least as long as ``labels``.
        labels: sequence of labels.

    Returns:
        dict mapping each group to the list of its labels, in input order.
    """
    classes = {}
    for idx in range(len(labels)):
        key = groups[idx]
        value = labels[idx]
        classes.setdefault(key, []).append(value)
    return classes

In [18]:
def actonomy_esco_query(text_to_parse):
    """Run a ``standardFind`` query (standard ``esco``, algorithm ``PARSE_TEXT``)
    against the Actonomy OntologyService.

    ``text_to_parse`` is XML-escaped and placed into the ``<text>`` element of
    a SOAP envelope, which is POSTed through ``PROXIES`` using HTTP Basic
    authentication.

    Args:
        text_to_parse: free text to send to the service.

    Returns:
        dict with a single key ``"act_xml"`` holding the ``requests`` response
        object (use ``.text`` for the raw XML reply).
    """
    body_terms = """<soapenv:Envelope xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/" xmlns:xmp="http://xmp.actonomy.com">
                        <soapenv:Header/>
                            <soapenv:Body>
                              <xmp:standardFind>
                                 <action>
                                    <standard>esco</standard>
                                    <text>{text}</text>
                                    <searchAlgorithm>PARSE_TEXT</searchAlgorithm>
                                    <ontologyProperties>
                                       <language>ENG</language>
                                       <locale>NLD</locale>
                                       <returnRelations>true</returnRelations>
                                       <returnGroups>true</returnGroups>
                                    </ontologyProperties>
                                 </action>
                              </xmp:standardFind>
                            </soapenv:Body>
                     </soapenv:Envelope>"""

    # Fill the template with the query text. The text is escaped (&, <, >) so
    # that input such as "R&D" cannot break or inject markup into the envelope.
    body_subm = body_terms.format(text=xml_escape(str(text_to_parse)))
    body_subm = body_subm.encode('utf-8')

    # prepare authentication
    # NOTE(review): credentials are hard-coded; move them to configuration or
    # environment variables before sharing this notebook.
    auth_str = "wcn:wcn"
    encoded  = base64.b64encode(auth_str.encode('ascii')).decode('ascii')
    auth     = "Basic " + encoded
    headers  = {'content-type': 'text/soap+xml', 'authorization': auth}

    # NOTE(review): verify=False disables TLS certificate verification, and no
    # timeout is set, so the call can block indefinitely — confirm both are intended.
    r = requests.post('https://10.2.3.21/v5_7/OntologyService', proxies=PROXIES, data=body_subm, headers=headers, verify=False)

    return {"act_xml":r}

In [19]:
# Example query; the result dict (key "act_xml") is kept in `xmlr` for the next cell.
xmlr=actonomy_esco_query('software developer')



In [1]:
# Show the raw response body of the example query.
# Depends on `xmlr` from the previous cell: on a fresh kernel run that cell
# first, otherwise this line raises NameError.
xmlr["act_xml"].text

NameError: name 'xmlr' is not defined