# Extract VERB and NOUN Phrases from Job Descriptions

This notebook is using the verb and noun pattern to extract soft skill phrase candidate from job description.

In [1]:
import spacy
from spacy.util import filter_spans
from spacy import displacy
from spacy.tokens import Span, Doc
from spacy.language import Language
from spacy.matcher import DependencyMatcher
import re
import pandas as pd
import sys
import textacy


In [2]:
lnkjobs = pd.read_csv("../data/dice_solution_architect_2021-12-14T17:45:56.csv")
jobdescs = [jd for jd in lnkjobs ["job_desc"]]

In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
lnkjobs.head(20)

Unnamed: 0,index,subject,url,job_desc
0,0,SAP Solutions Architect,https://www.dice.com/jobs/detail/SAP-Solutions...,"SAP Solution Architect, Remote working from th..."
1,1,SAP Solution Architect,https://www.dice.com/jobs/detail/SAP-Solution-...,Job Title: SAP Solution Architect / Competency...
2,2,Salesforce Solution Architect,https://www.dice.com/jobs/detail/Salesforce-So...,"Hi ,\nHope you are doing good..!\nLocation: Re..."
3,3,"Solutions Architect, Health...",https://www.dice.com/jobs/detail/Solutions-Arc...,Albertsons Companies is one of the largest gro...
4,4,Sr. Solution Architect,https://www.dice.com/jobs/detail/Sr.-Solution-...,"What We'll Bring:\nAt TransUnion, we strive to..."
5,5,"Solutions Architect, Digital...",https://www.dice.com/jobs/detail/Solutions-Arc...,Albertsons Companies is one of the largest foo...
6,6,Cloud Solutions Architect - Manager,https://www.dice.com/jobs/detail/Cloud-Solutio...,Title: Cloud Solution Architect Manager\nLocat...
7,7,Azure Infrastructure Solution...,https://www.dice.com/jobs/detail/Azure-Infrast...,Azure Infrastructure Solution Architect\nAre y...
8,8,Cloud Solutions Architect,https://www.dice.com/jobs/detail/Cloud-Solutio...,Are you looking to make an immediate impact wh...
9,9,"Senior Manager, Cloud Solution...",https://www.dice.com/jobs/detail/Senior-Manage...,"Senior Manager, Cloud Solution Architect (AWS)..."


In [16]:
"""
The dedup phrases functions joins sub phrases that are part of the bigger
phrase. For exmaple, the three candidate phrases will be dedup into the actual phrase 

Candidates:
- supports reusable
- supports reusable application
- supports reusable application components

actual phrase:
- supports reusable application components
"""

def dedup_phrases(phrases: list) -> list:
    
    results = set()
    last_index = len(phrases) - 1
    for i, c in enumerate(phrases):
        if i < last_index:
            results.add(dedup_phrase(c, phrases[i+1]))
        else:
            results.add(str(c))
        
    return list(results)

def dedup_phrase(phrase: str, next_phrase: str) -> str:
    
    result = Span    
    if((phrase in next_phrase) and (len(phrase) < len(next_phrase))):
        result = next_phrase
    elif((next_phrase in phrase) and (len(next_phrase) < len(phrase))):
        result = phrase
    else:
        result = phrase

    return result
    
    

In [21]:
"""
exctrat_phrases function extract phrases from job description. 
The patterns are design to identify noun and verb phrase that can consider 
candidate for job expirence. Example phrases 
"experience delivering solutions"
"roadmap of existing enterprise platforms"

"""
def extract_phrases(texts: list) -> list:
    
    # Regex to clean hits from non alpha numeric characters and hyphens
    regex = re.compile('[^a-zA-Z0-9\-\s]')
    
    patterns = []
    patterns.append([{"POS": "VERB"}, {"POS": {"IN": ["ADP", "ADJ"]}, "OP": "+"}, {"POS":"NOUN", "OP": "+"}])
    patterns.append([{"POS":"NOUN"}, {"POS": {"IN": ["ADP", "ADJ"]}, "OP": "?"}, {"POS": "VERB", "OP": "+"}, {"POS":"NOUN", "OP": "+"}])
    patterns.append([{"POS":"NOUN"}, {"POS": "VERB", "OP": "+"}, {"POS":"PROPN", "OP": "+"}])
    patterns.append([{"POS": "VERB"}, {"POS": "NOUN"}, {"POS": "CCONJ"}, {"POS": {"IN": ["ADP", "ADJ"]}, "OP": "+"}, {"POS": "CCONJ"}, {"POS":"VERB"}])

    records = []

    for desc in texts:
        doc = nlp(desc)
        chunks = []
        for pattern in patterns:
            temp_list = []
            verb_chunks = textacy.extract.token_matches(doc, pattern)
            for vc in verb_chunks:
                ## Convert the span into string and append to list
                temp_list.append(regex.sub('', vc.text).strip())
            if len(temp_list) > 0:
                records.extend(dedup_phrases(temp_list))
    
    return records

In [22]:
extract_phrases(jobdescs)

['Demonstrated executive presence',
 'burning natural gas service',
 'Work on cross-functional projects',
 'deliver complex technology solutions',
 'adhered to in accordance',
 'act as adviser',
 'requires excellent client services',
 'handling multiple projects',
 'architect innovative solutions',
 'defining architectural direction',
 'have good experience',
 'catering to cross-technology integrations',
 'dictates various processes',
 'offers competitive pay',
 'growing natural gas utility',
 'pay including pay',
 'team means embracing excellence',
 'degree in related field',
 'responsibility for achieving goals',
 'ecosystem meet defined Enterprise Architecture',
 'experience using ABAP',
 'involving multiple software components',
 'Drive best practice business processes',
 'have primary responsibility',
 'propose strategic solution',
 'meet dynamic market demands',
 'Drive best practice business',
 'work with customers',
 'use keen business process',
 'Work on billable customer enga

In [12]:
jobdescs

["SAP Solution Architect, Remote working from the United States.\nSummit is a growing natural gas utility providing safe, reliable, and clean-burning natural gas service to homes and businesses in Arkansas, Colorado, Maine, Missouri, and Oklahoma. Being part of the Summit team means embracing excellence, diversity, and innovation, committing to safety each and every day, and doing all that we can to serve each other, our customers, and the communities where we live. We aim to bring warmth and energy to everything we do.\nPOSITION SUMMARY:\nThe purpose of this role is to own and manage end-to-end architecture and service delivery for the SAP solutions and related integrations. The SAP Solution Architect will need to have in-depth knowledge and expertise around SAP products and processes and work enterprise-wide implementation of SAP S/4 HANA. The SAP Solution Architect will define the roadmap and strategy of the Summit Utilities SAP ecosystem. They will need to understand the big pictur