In [36]:
from nltk.corpus import stopwords
from gensim.models import word2vec
import nltk.data
import re
from nltk.stem.snowball import *
import itertools
import html.parser as html_parser
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Shared NLTK helpers. `wordnet_lemmatizer` is the default lemmatizer used by
# `skill_transform`; `porter` is created alongside it for stemming experiments.
wordnet_lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

# Download WordNet into, and look it up from, the same notebook-relative
# directory so the cell does not depend on one user's home directory.
# NOTE(review): assumes the kernel's working directory is the project folder
# that holds `nltk_data/` - confirm before running from elsewhere.
NLTK_DATA_DIR = './nltk_data/'
nltk.download("wordnet", NLTK_DATA_DIR)
nltk.data.path.append(NLTK_DATA_DIR)

import sys
# Keep the Python 2 spelling `unicode` available when running on Python 3.
if sys.version_info[0] >= 3:
    unicode = str

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bharath/sourcecode/cynor/skill2vec/nltk_data/..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
# Punctuation that is deleted outright before any phrase-level rewriting.
_SKILL_STRIP_CHARS = str.maketrans("", "", "-.,:()")

# alias -> canonical spelling. The lookup is an exact match on the whole
# (lower-cased) skill string; canonical casing is normalised at the very end.
_SKILL_ALIASES = {
    alias: canonical
    for canonical, aliases in {
        "javascript": ["js", "java script", "javascripts", "java scrip"],
        "wireframe": ["wireframes", "wire frame", "wire frames", "wire-frame",
                      "wirefram", "wire fram", "wireframing"],
        "OOP": ["object oriented", "object oriented programming"],
        "OOD": ["object oriented design"],
        "OLAP": ["online analytical processing"],
        "Ecommerce": ["e commerce"],
        "consultant": ["consulting"],
        "ux": ["user experience", "web user experience design",
               "user experience design", "ux designer", "user experience/ux"],
        "html5": ["html 5"],
        "j2ee": ["jee"],
        "osx": ["mac os x", "os x"],
        "senior": ["sr"],
        "qa": ["quality"],
        "bigdata": ["big data"],
        "webservice": ["webservices", "website", "webapps"],
        "xml": ["xml file", "xml schemas", "xml/json", "xml web service"],
        "nlp": ["natural language process", "natural language", "nltk"],
        "aws": ["amazon web service"],
    }.items()
    for alias in aliases
}

# Ordered rewrite rules; only the first rule that actually changes the string
# is applied.
_SKILL_REGEX_RULES = (
    (re.compile(r'^angular.*$'), 'angularjs'),
    (re.compile(r'^node.*$'), 'nodejs'),
    (re.compile(r'^(.*)[_\s]js$'), r'\1js'),
    # `\b` keeps words that merely start with "and"/"or" (android, oracle, ...)
    # from being treated as a conjunction.
    (re.compile(r'^(.*) (and|or)\b.*$'), r'\1'),
)

# Generic words dropped from multi-word skills. Every entry needs its own
# comma: adjacent string literals are silently concatenated by Python.
_SKILL_STOPWORDS = frozenset([
    "app", "touch", "the", "application", "programming", "program",
    "design", "developer", "framework", "development", "programmer",
    "technologies", "advance", "core",
])


def skill_transform(skill, remove_stopwords = True, lemmatizer = None):
    """Normalise a free-text skill name into a lower-case, underscore-joined key.

    Steps, in order: HTML-unescape, treat underscores as spaces, drop
    parenthesised text and basic punctuation, collapse whitespace, rewrite
    "&" / " js" / "java script", lower-case, map known aliases to a canonical
    name, apply the first matching regex rule, lemmatise each word as a verb,
    optionally drop stop words, join words with "_" and strip a trailing "js".

    Parameters
    ----------
    skill : object
        Raw skill; converted with ``str()`` first.
    remove_stopwords : bool, default True
        When true, words listed in ``_SKILL_STOPWORDS`` are removed.
    lemmatizer : object, optional
        Anything exposing ``lemmatize(word, pos=...)``. Defaults to the
        module-level ``wordnet_lemmatizer``; it is only looked up when the
        skill is longer than two characters.

    Returns
    -------
    str
        The normalised key (may be empty if every word was removed).

    NOTE(review): the vectors queried later in this notebook were presumably
    keyed by the output of a normaliser like this one - confirm the vocabulary
    still matches whenever a rule here changes.
    """
    # Function-scope import: only `html.parser` is imported at the top of the
    # notebook, and `html.unescape` is the documented home of this function.
    from html import unescape

    skill = unescape(str(skill))

    # Underscores are word separators on input; also collapses all whitespace.
    skill = " ".join(skill.replace("_", " ").split())

    # Greedy on purpose (unchanged): drops everything from the first "(" to
    # the last ")".
    skill = re.sub(r"\(.*\)", "", skill)
    skill = skill.translate(_SKILL_STRIP_CHARS).replace(u"åá", "")

    # Removing text above can leave double or trailing spaces; normalise now so
    # the exact-match alias lookup and the final "_" join see clean input.
    skill = " ".join(skill.split())

    skill = skill.replace(u"&", "and") \
        .replace(" js", "js") \
        .replace("java script", "js")

    skill = skill.lower()

    # Whole-string alias replacement.
    skill = _SKILL_ALIASES.get(skill, skill)

    # First regex rule that changes the string wins.
    for pattern, replacement in _SKILL_REGEX_RULES:
        rewritten = pattern.sub(replacement, skill)
        if rewritten != skill:
            skill = rewritten
            break

    # Very short tokens (e.g. "hr", "qa") are left alone.
    if len(skill) > 2:
        if lemmatizer is None:
            lemmatizer = wordnet_lemmatizer
        skill = " ".join(lemmatizer.lemmatize(word, pos="v")
                         for word in skill.split(" "))

    if remove_stopwords:
        skill = " ".join(word for word in skill.split(" ")
                         if word not in _SKILL_STOPWORDS)

    # Collapse whitespace *before* introducing underscores so no "__" appears.
    skill = "_".join(skill.lower().split())

    # NOTE: replace js tail ("nodejs" -> "node", "angularjs" -> "angular").
    return re.sub('js$', '', skill)

In [38]:
# Load the pre-trained skill vectors; the relative path is resolved against the
# kernel's working directory.
# NOTE(review): no `binary=` argument is passed, so the loader's default applies -
# presumably the file is in the plain-text word2vec format; confirm.
model = word2vec.KeyedVectors.load_word2vec_format('cynor_skill2vec_200d.model')

In [51]:
# skill_transform lower-cases its input, so this looks up the neighbours of "hr".
model.similar_by_word(skill_transform('HR'))

[('human_resource_management', 0.799447238445282),
 ('hr_and_recruitment', 0.7993995547294617),
 ('human_resource', 0.7639515399932861),
 ('talent_acquisition', 0.7623394727706909),
 ('hr_operations', 0.7591198682785034),
 ('hr_generalist', 0.7581818699836731),
 ('hr_executive', 0.7557071447372437),
 ('hr_generalist_activities', 0.7504816055297852),
 ('hr_assistance', 0.748255729675293),
 ('hrm', 0.7438874840736389)]

In [40]:
# Entries most similar to the normalised query "azure".
model.similar_by_word(skill_transform('azure'))

[('microsoft_azure', 0.9070008993148804),
 ('cloud_platforms', 0.8551174402236938),
 ('aws_architect', 0.8526540994644165),
 ('cloud_applications', 0.8447308540344238),
 ('windows_azure', 0.842508852481842),
 ('architecting', 0.8376837968826294),
 ('cloud_architect', 0.833653450012207),
 ('vs', 0.8219773769378662),
 ('aws', 0.8154913783073425),
 ('dotnet_architect', 0.8154746890068054)]

In [41]:
# Entries most similar to the normalised query "devops".
model.similar_by_word(skill_transform('devops'))

[('ansible', 0.9231548309326172),
 ('puppet', 0.9220324158668518),
 ('dev_ops', 0.92069011926651),
 ('devops_engineer', 0.902111828327179),
 ('jenkins', 0.9017714262008667),
 ('cloud_engineer', 0.899010956287384),
 ('jenkin', 0.8910309672355652),
 ('cloudwatch', 0.8891658186912537),
 ('release_manager', 0.8863235712051392),
 ('bash_script', 0.8855523467063904)]

In [42]:
# Entries most similar to the normalised query "waiter" (a non-technical role).
model.similar_by_word(skill_transform('waiter'))

[('bartender', 0.9928094744682312),
 ('fandb_service', 0.9894704818725586),
 ('waitress', 0.9880703687667847),
 ('stewards', 0.9870071411132812),
 ('senior_steward', 0.9853909611701965),
 ('steward', 0.9852117300033569),
 ('fandb_associate', 0.9836377501487732),
 ('barman', 0.9822597503662109),
 ('captain', 0.979358434677124),
 ('cafe', 0.9786368012428284)]

In [46]:
# Entries most similar to the normalised query "teacher".
model.similar_by_word(skill_transform('teacher'))

[('teachers', 0.9782964587211609),
 ('ntt', 0.9612639546394348),
 ('instructor', 0.9572398066520691),
 ('primary_teacher', 0.9555884599685669),
 ('teach_english', 0.953647792339325),
 ('school_teacher', 0.9532197713851929),
 ('kindergarten', 0.9518149495124817),
 ('tuition', 0.9489309787750244),
 ('pgt', 0.9460960030555725),
 ('preschool', 0.9450435042381287)]

In [47]:
# Entries most similar to the normalised query "nurse".
model.similar_by_word(skill_transform('nurse'))

[('medical_cod', 0.8508073687553406),
 ('mbbs', 0.8491320013999939),
 ('staff_nurse', 0.8470920920372009),
 ('head_nurse', 0.839325487613678),
 ('gnm', 0.8389165997505188),
 ('ot', 0.8374587893486023),
 ('ot_nurse', 0.8331429958343506),
 ('icu_nurse', 0.8313146233558655),
 ('icu', 0.8306599855422974),
 ('ward_nurse', 0.8219895362854004)]

In [48]:
# Entries most similar to the normalised query "automation".
model.similar_by_word(skill_transform('automation'))

[('qtp', 0.6914501786231995),
 ('drive', 0.6887904405593872),
 ('winrunner', 0.6606099009513855),
 ('sensors', 0.6561761498451233),
 ('automation_engineer', 0.6472973227500916),
 ('software_test_engineer', 0.6469188928604126),
 ('vb', 0.6374378204345703),
 ('plc', 0.6309812664985657),
 ('automation_tool', 0.6271375417709351),
 ('selenium', 0.6270813345909119)]

In [49]:
# Entries most similar to the normalised query "python".
model.similar_by_word(skill_transform('python'))

[('ruby', 0.776968777179718),
 ('perl', 0.7663470506668091),
 ('languages', 0.7233844995498657),
 ('apache', 0.7142069339752197),
 ('algorithms', 0.7089732885360718),
 ('python_script', 0.7002631425857544),
 ('python_developer', 0.6992126703262329),
 ('open_source', 0.6950978636741638),
 ('postgresql', 0.691146194934845),
 ('data_structure', 0.6888243556022644)]

In [50]:
# Entries most similar to the normalised query "lawyer".
model.similar_by_word(skill_transform('lawyer'))

[('advocate', 0.9932965040206909),
 ('legal_advisor', 0.9762375950813293),
 ('non_litigation', 0.9744182229042053),
 ('affidavits', 0.9731534123420715),
 ('legal_notice', 0.9731290936470032),
 ('property_matter', 0.9682076573371887),
 ('litigation_matter', 0.9678948521614075),
 ('criminal', 0.9671271443367004),
 ('law_firm', 0.9656476974487305),
 ('intellectual_property_law', 0.9655910134315491)]