Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

skill_transform not found #1

Closed
ks716 opened this issue May 23, 2021 · 1 comment
Closed

skill_transform not found #1

ks716 opened this issue May 23, 2021 · 1 comment

Comments

@ks716
Copy link

ks716 commented May 23, 2021

Hi sir,

Could you provide the python file skill_transform, it is imported in the notebooks but not provided in the repo.

Thank you

@ks716 ks716 closed this as completed Jun 1, 2021
@Delmark1904
Copy link

Delmark1904 commented Feb 15, 2023

def skill_transform(skill, remove_stopwords = True):
    skill = str(skill)
    skill = html.unescape(skill)
    
    skill = skill.replace("_", " ").split()
    skill = " ".join([sk for sk in skill if sk])
    
    skill = re.sub(r"\(.*\)", "", skill)
    skill = skill.replace("-", "") \
        .replace(".", "") \
        .replace(",", "") \
        .replace("-", "") \
        .replace(":", "") \
        .replace("(", "") \
        .replace(")", "") \
        .replace(u"åá", "") \
        .replace(u"&", "and") \
        .replace(" js", "js") \
        .replace("-js", "js") \
        .replace("_js", "js") \
        .replace("java script", "js") 
    
    skill = skill.lower()
    
    # Special cases replace
    special_case = {}
    special_case["javascript"] = [ "js", "java script", "javascripts", "java scrip" ]
    special_case["wireframe"] = [ "wireframes", "wire frame", "wire frames", "wire-frame", "wirefram", "wire fram", "wireframing" ]
    special_case["OOP"] = [  "object oriented", "object oriented programming", ]
    special_case["OOD"] = [ "object oriented design", ]
    special_case["OLAP"] = [ "online analytical processing",  ]
    special_case["Ecommerce"] = [ "e commerce",  ]
    special_case["consultant"] = [ "consulting",  ]
    special_case["ux"] = [ "user experience", "web user experience design", "user experience design", "ux designer", "user experience/ux" ]
    special_case["html5"] = [ "html 5",  ]
    special_case["j2ee"] = [ "jee",  ]
    special_case["osx"] = [ "mac os x", "os x" ]
    special_case["senior"] = [ "sr" ]
    special_case["qa"] = [ "quality",  ]
    special_case["bigdata"] = [ "big data",  ]
    special_case["webservice"] = [ "webservices", "website", "webapps" ]
    special_case["xml"] = [ "xml file", "xml schemas", "xml/json", "xml web service" ]
    special_case["bigdata"] = [ "big data",  ]
    special_case["nlp"] = [ "natural language process", "natural language", "nltk" ]
    for root_skill in special_case:
        if skill in special_case[root_skill]:
            skill = root_skill
    
    # Special case regex
    special_case_regex = {
        r'^angular.*$': 'angularjs',
        r'^node.*$': 'nodejs',
        r'^(.*)[_\s]js$': '\\1js',
        r'^(.*) js$': '\\1js',
    }
    for regex_rule in special_case_regex:
        after_skill = re.sub(regex_rule, special_case_regex[regex_rule], skill)
        if after_skill != skill:
            skill = after_skill
            break
    
    # Stem
    if len(skill) > 2:
        skill_after = skill.split(" ")
        skill_after = [wordnet_lemmatizer.lemmatize(sk, pos="v") for sk in skill_after]
        skill_after = " ".join(skill_after)
        skill = skill_after
    
    # skill stopwords 
    if remove_stopwords:
        skill_stopwords = [ "app", "touch", "the", "application" ]
        skill_after = skill.split(" ")
        skill = " ".join([ sk for sk in skill_after if sk not in skill_stopwords ])
    
    skill = skill.lower().strip().replace(" ", "_")
    skill = re.sub(' +',' ', skill)
    
    # NOTE: replace js tail
    skill = re.sub('js$','', skill)
    
    return skill

print(skill_transform("js"), skill_transform("angularjs"), skill_transform("Python"))

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants