# Developer Skills
## In this final notebook, I will be taking a look at what skills are most valued in developers / programmers. I will be doing so by taking our onlinejobpostings data set, and then filtering it, and then running bag of words and td_idf and then reverse frequency
 - find words that companies most value when hiring developers / programmers
 

In [10]:
# here, I will be importing a group of libraries that I may end up using that I found on a Medium article that will help abstract the math away and simply let me run the experiment on my data
import pandas as pd
import spacy
from string import punctuation
from collections import Counter

In [2]:
def pd_import(path):
    return pd.read_csv(path)

In [4]:
online_jobs = pd_import("./onlinejobpostings.csv") # import data

In [6]:
developer_keywrds = ["Software Developer", "Data Engineer", "Web Developer", "Database", "Computer hardware",
                     "Computer Systems", "Information Security", "Systems analyst", "Computer Network", "Network",
                     "Systems engineering", "Web Designer", "Systems Administrator", "Software", "User experience", 
                     "Information technology", "technology", "Video game", "Developer"] # a set of words most commonly attributed to workers working a developer / software engineering / programming job
developer_jobs = pd.concat([online_jobs.loc[online_jobs['jobpost'].str.contains(x, case=True)] for x in developer_keywrds], ignore_index=True) # all developer related jobs within our data set

In [7]:
developer_jobs.drop_duplicates(inplace=True) # removes duplicates

In [56]:
len(developer_jobs)

4776

In [50]:
recruiting_wrds = ["company", 'job', 'title', 'software', 'developer', 'position', 'location', 'assistance', 'development', 'projects', 'required', 'qualifications','background', 'plus', 'excellent', 'knowledge', 'technologies',
                  'experience', 'good', 'knowledge','commensurate', 'norms', 'accepted', 'company', 'application', 'procedures', 'successful', 'candidates', 'submit', 'relevant', 'recommendation', 'letters', 'previous', 'employers', 'copy', '-ies',
                  'relevant', 'certificates', 'available', 'color', 'photo', 'yerevan', 'send', 'following', 'e', 'mail', 'human', 'resources', 'department', 'armine', 'bibilyan', 'mention', 'application', 'letter', 'learned', 'job', 'opportunity', 
                   'career', 'center', 'mention', 'url', 'website', 'thanks', 'application', 'deadline', 'place', 'free', 'posting', 'job', 'career', 'related', 'opportunities', 'developers', 'description', 'looking', 'programmers', 'j2ee', 'minimum', 'years', 'strong',
                  'skills', 'resume', 'listed', 'asked', 'pass', 'interview', "anouncement", "follow", "post", "link", "ability", "work", "activities", 'government','solutions','filled','proven','history','producing','quality','product','commercial','setting','dynamic','workplace',
                  'industry','including','tools','working', "responsibilities", "opening", "term", "date", "opening", "announcement", "time", "technical", "applications", "salary", "project", "information", "develop", "duration", "based",
                  "start", "subject", "interested"]


In [11]:
nlp = spacy.load("en_core_web_lg")

In [51]:
def hot_words(body_text):
    filtered_tokens = []
    pos_tags = ["VERB", "ADJ", "NOUN"]
    doc = nlp(body_text.lower())
    tmp = []
    for token in doc:
        if (token.text in nlp.Defaults.stop_words or
           token.text in punctuation):
            continue
        if (token.pos_ in pos_tags):
            tmp.append(token.text)
    for tokenized_wrd in tmp:
        if tokenized_wrd not in recruiting_wrds and tokenized_wrd not in developer_keywrds:
            filtered_tokens.append(tokenized_wrd)
    return filtered_tokens

In [52]:
output = hot_words(developer_jobs['jobpost'][1])
print(output)

['object', 'oriented', 'web', 'services', 'soap', 'ooa&d', 'practical', 'uml', 'international', 'german', 'languages', 'mandatory', 'big', 'remuneration', 'depends', 'open', 'jv', 'focus', 'web', 'organization']


In [53]:
all_keywrds = []
for index, row in developer_jobs.iterrows():
    all_keywrds.extend(hot_words(row["jobpost"]))
all_keywrds

['rendering',
 'database',
 'management',
 'systems',
 'realization',
 'servers',
 'maintenance',
 'replication',
 'participation',
 'designing',
 'university',
 'degree',
 'economical',
 'windows',
 'server',
 'networking',
 'tcp/',
 'ip',
 'server',
 'visual',
 'basic',
 'database',
 'remuneration',
 'diploma',
 'organization',
 'object',
 'oriented',
 'web',
 'services',
 'soap',
 'ooa&d',
 'practical',
 'uml',
 'international',
 'german',
 'languages',
 'mandatory',
 'big',
 'remuneration',
 'depends',
 'open',
 'jv',
 'focus',
 'web',
 'organization',
 'synergy',
 'international',
 'systems',
 'synergy',
 'systems',
 'seeks',
 'fill',
 'long',
 'focused',
 'core',
 'tasks',
 'synergy',
 'main',
 'focus',
 'developing',
 'integrated',
 'state',
 'art',
 'web',
 'database',
 'web',
 'portal',
 'systems',
 'business',
 'intelligence',
 'management',
 'solid',
 'developing',
 'practice',
 'candidate',
 'aspects',
 'process',
 'design',
 'implementation',
 'testing',
 'delivery',
 'spe

In [55]:
top_skills = [('#' + x[0]) for x in Counter(all_keywrds).most_common(20)]
top_skills[:10]

['#team',
 '#design',
 '#organization',
 '#web',
 '#management',
 '#language',
 '#systems',
 '#support',
 '#communication',
 '#degree']