In [1]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency



In [None]:
# Optional one-time environment setup; left commented out so it does not run on every execution.
# NOTE(review): this upgrade is unpinned — confirm the resulting gensim version still accepts
# the keyword arguments used when the Word2Vec model is constructed below.
# ! pip install --upgrade gensim

In [2]:
import gensim 
from gensim.models import Word2Vec 
import multiprocessing


In [3]:
from gensim.models.phrases import Phrases, Phraser


In [4]:
import nltk
from nltk.util import ngrams


In [5]:
# data preprocessing
# Load the pre-cleaned job postings.
df = pd.read_csv('cleaned_combined_jobs_data.csv')
# A bare `df.shape` in the middle of a cell is evaluated and thrown away (only a
# cell's last expression is displayed), so print it explicitly.
print(df.shape)
# Keep only the rows that actually have a cleaned description.
description = df['description_cleaned'].dropna()


In [6]:
# Peek at the first five rows (five is the default for head()).
df.head()

Unnamed: 0.1,Unnamed: 0,JobTitle,SalaryEstimate,JobDescription,Rating,Company.Name,Location,Headquarters,Size,Founded,Type.of.ownership,Industry,Sector,Revenue,Competitors,Easy.Apply,description_cleaned
0,0,Senior Data Scientist,$111K-$181K (Glassdoor est.),"ABOUT HOPPER\r\n\r\nAt Hopper, we’re on a miss...",3.5,Hopper\r\n3.5,"New York, NY","Montreal, Canada",501 to 1000 employees,2007,Company - Private,Travel Agencies,Travel & Tourism,Unknown / Non-Applicable,-1,-1,about hopper at hopper we re on a mission ...
1,1,"Data Scientist, Product Analytics",$111K-$181K (Glassdoor est.),"At Noom, we use scientifically proven methods ...",4.5,Noom US\r\n4.5,"New York, NY","New York, NY",1001 to 5000 employees,2008,Company - Private,"Health, Beauty, & Fitness",Consumer Services,Unknown / Non-Applicable,-1,-1,at noom we use scientifically proven methods ...
2,2,Data Science Manager,$111K-$181K (Glassdoor est.),Decode_M\r\n\r\nhttps://www.decode-m.com/\r\n\...,-1.0,Decode_M,"New York, NY","New York, NY",1 to 50 employees,-1,Unknown,-1,-1,Unknown / Non-Applicable,-1,TRUE,decode m https www decode m com data ...
3,3,Data Analyst,$111K-$181K (Glassdoor est.),Sapphire Digital seeks a dynamic and driven mi...,3.4,Sapphire Digital\r\n3.4,"Lyndhurst, NJ","Lyndhurst, NJ",201 to 500 employees,2019,Company - Private,Internet,Information Technology,Unknown / Non-Applicable,"Zocdoc, Healthgrades",-1,sapphire digital seeks a dynamic and driven mi...
4,4,"Director, Data Science",$111K-$181K (Glassdoor est.),"Director, Data Science - (200537)\r\nDescripti...",3.4,United Entertainment Group\r\n3.4,"New York, NY","New York, NY",51 to 200 employees,2007,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"BBDO, Grey Group, Droga5",-1,director data science description...


In [7]:
# Preview the cleaned descriptions (pandas truncates the printed Series to its head and tail).
print(description)

0        about hopper    at hopper  we re on a mission ...
1        at noom  we use scientifically proven methods ...
2        decode m    https   www decode m com     data ...
3        sapphire digital seeks a dynamic and driven mi...
4        director  data science             description...
                               ...                        
10249    maintains systems to protect data from unautho...
10250    position   senior data analyst  corporate audi...
10251    title  technical business analyst  sql  data a...
10252    summary    responsible for working cross funct...
10253    you     you bring your body  mind  heart and s...
Name: description_cleaned, Length: 10253, dtype: object


In [8]:
cores = multiprocessing.cpu_count() # Number of CPU cores on this machine; used below to size the Word2Vec worker pool


In [18]:
# set up the parameters of the word2vec model
# gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`; pick the spelling the installed version understands so this
# cell runs on both 3.x and 4.x.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

w2v_model = Word2Vec(min_count=50,      # drop tokens seen fewer than 50 times
                     window=2,          # context: 2 tokens on each side
                     sample=6e-5,       # down-sampling threshold for very frequent tokens
                     alpha=0.03,        # initial learning rate
                     min_alpha=0.0007,  # learning rate decays down to this value
                     negative=20,       # noise words drawn per positive example
                     # Leave one core free, but never request zero workers
                     # (cores - 1 == 0 on a single-core machine).
                     workers=max(1, cores - 1),
                     **{_dim_kwarg: 300})  # embedding dimensionality

In [19]:
# bigrams
# Tokenise each cleaned description on whitespace.
sent = [row.split() for row in description]
# Learn which adjacent token pairs co-occur often enough to be merged into a
# single "a_b" token (pairs seen fewer than 30 times are ignored).
phrases = Phrases(sent, min_count=30, progress_per=10000)
bigram = Phraser(phrases)
# Applying the phraser to a whole corpus yields a lazy stream that redoes the
# phrase detection on every pass. The corpus is iterated many times below
# (frequency count, vocabulary scan, 30 training epochs), so transform it once
# and keep the result in memory.
sentences = list(bigram[sent])



In [20]:
# most frequent words
# Count how often each token (single word or merged bigram) occurs in the corpus.
# The loop variables deliberately avoid the name `sent`: the bigram cell uses it
# for the full tokenised corpus, and reusing it here would silently rebind it to
# the last sentence.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
# Number of distinct tokens.
len(word_freq)

38787

In [21]:
# Ten most frequent tokens, highest count first (ties keep first-seen order).
[token for token, _ in sorted(word_freq.items(), key=lambda kv: kv[1], reverse=True)[:10]]


['and', 'to', 'the', 'of', 'in', 'data', 'a', 'with', 'for', 'experience']

In [22]:
# build the vocabulary of the model
t = time()

w2v_model.build_vocab(sentences, progress_per=10000)

# The timer above was started but never read; report the elapsed time.
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))



In [23]:
# train the model
t = time()

train_stats = w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

# The timer above was started but never read; report the elapsed time.
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

# Keep the value returned by train() as the cell's displayed output.
train_stats



(53154561, 129878190)

In [24]:
# Call init_sims() to make the model much more memory-efficient.
# NOTE(review): replace=True presumably keeps only the normalised vectors, so the model
# should not be trained further after this cell — confirm against the gensim docs for the
# installed version.
w2v_model.init_sims(replace=True)


In [25]:
# Query the trained embeddings for the tokens most similar to the keyword "data".
w2v_model.wv.most_similar(positive=["data"])

# Takeaway for a resume: show your ability to develop your own data pipeline that moves raw data from a source to a destination.
# The destination is where the data is analyzed for business insights.


[('analytics', 0.5672853589057922),
 ('sets', 0.5247077345848083),
 ('analysis', 0.524539589881897),
 ('reporting', 0.5168337821960449),
 ('using', 0.5088383555412292),
 ('models', 0.5065914988517761),
 ('sources', 0.5033448338508606),
 ('datasets', 0.4979720711708069),
 ('tools', 0.4887539744377136),
 ('pipelines', 0.4813288450241089)]

In [26]:
# Tokens most similar to "sql" in the trained embeddings.
w2v_model.wv.most_similar(positive=["sql"])

# SQL-related skills worth listing on a resume: python, tableau, sas, r, excel, relational databases

[('python', 0.6370193958282471),
 ('tableau', 0.6035563945770264),
 ('sql_server', 0.5966885089874268),
 ('sas', 0.5875258445739746),
 ('etl', 0.5672693252563477),
 ('sql_queries', 0.5612906813621521),
 ('r', 0.5506023168563843),
 ('excel', 0.5472790002822876),
 ('sas_r', 0.5463905334472656),
 ('relational_databases', 0.5433073043823242)]

In [46]:
# Tokens most similar to "python" in the trained embeddings.
w2v_model.wv.most_similar(positive=["python"])

# Python-related skills worth listing on a resume: r, sql, java, scala, pyspark

[('python_r', 0.6625280380249023),
 ('sql', 0.6370193958282471),
 ('python_java', 0.6344974637031555),
 ('r', 0.6047457456588745),
 ('java_scala', 0.603253185749054),
 ('programming', 0.5989794731140137),
 ('python_scala', 0.5900561809539795),
 ('pyspark', 0.5864248871803284),
 ('languages_such', 0.5859144330024719),
 ('scala', 0.584870457649231)]

In [48]:
# Tokens most similar to "staffigo" (presumably a company name appearing in the postings — verify against the data).
w2v_model.wv.most_similar(positive=["staffigo"])


[('global_leader', 0.4766501784324646),
 ('ibmers_serving', 0.40292245149612427),
 ('countries_location', 0.3909544348716736),
 ('disrupt', 0.38372525572776794),
 ('nd_century', 0.3729187846183777),
 ('consulting', 0.37085938453674316),
 ('staffing', 0.3646043837070465),
 ('ntt', 0.361350953578949),
 ('accenture_federal', 0.3503355085849762),
 ('disney_streaming', 0.3490871489048004)]

In [31]:
# Tokens most similar to "machine"; the neighbours are mostly machine-learning topics.
w2v_model.wv.most_similar(positive=["machine"])


[('reinforcement_learning', 0.383495032787323),
 ('vector', 0.3684707283973694),
 ('bayesian', 0.35157138109207153),
 ('deep_learning', 0.34417200088500977),
 ('computers', 0.3428729772567749),
 ('boosting', 0.3314419984817505),
 ('text_mining', 0.32837897539138794),
 ('computer', 0.3265228271484375),
 ('image_processing', 0.32563281059265137),
 ('natural_language', 0.32401445508003235)]

In [27]:
# Tokens most similar to "required".
w2v_model.wv.most_similar(positive=["required"])
# NOTE: the next comment describes the neighbours of "machine" (queried in a separate cell), not of "required":
# specific ML algorithms: reinforcement_learning, working with vectors, Bayesian methods, deep_learning, text_mining, image_processing, NLP

# For "required": qualifications/degree and experience matter most

[('preferred', 0.6454149484634399),
 ('minimum', 0.6383762955665588),
 ('qualifications', 0.5970038175582886),
 ('or', 0.5497605800628662),
 ('or_equivalent', 0.5461472272872925),
 ('related_field', 0.5359395742416382),
 ('degree', 0.5336716175079346),
 ('related', 0.5274271965026855),
 ('experience', 0.5252817273139954),
 ('years', 0.5193190574645996)]

In [47]:
# Tokens most similar to "degree".
w2v_model.wv.most_similar(positive=["degree"])
# Degree requirements: a bachelor's or master's in computer science, finance/economics, or a related field

[('related_field', 0.7417489886283875),
 ('computer_science', 0.7334764003753662),
 ('minimum', 0.6908020377159119),
 ('bachelor_s', 0.6806066632270813),
 ('finance_economics', 0.6628279089927673),
 ('or_equivalent', 0.6543084979057312),
 ('related_discipline', 0.6404116749763489),
 ('economics_finance', 0.6378796100616455),
 ('master_s', 0.6327531337738037),
 ('years', 0.6293987035751343)]

In [29]:
# Tokens most similar to "experience".
w2v_model.wv.most_similar(positive=["experience"])
# Experience is usually phrased as years of (preferred or minimum) hands-on experience


[('years', 0.8309797048568726),
 ('preferred', 0.7640348076820374),
 ('preferably', 0.6805927753448486),
 ('minimum', 0.664563775062561),
 ('plus', 0.6520195007324219),
 ('or_equivalent', 0.6492597460746765),
 ('hands_on', 0.6452623605728149),
 ('similar', 0.6380330324172974),
 ('related_field', 0.6343466639518738),
 ('at_least', 0.6137567162513733)]

In [32]:
# Tokens most similar to "cloud".
w2v_model.wv.most_similar(positive=["cloud"])
# Cloud platforms that surface: Microsoft Azure, AWS (Amazon Web Services) and Google Cloud (GCP)

[('cloud_based', 0.6046124696731567),
 ('microsoft_azure', 0.5427299737930298),
 ('on_premise', 0.5403303503990173),
 ('amazon_web', 0.5395830869674683),
 ('azure', 0.522074818611145),
 ('aws_azure', 0.5219202637672424),
 ('aws', 0.5188604593276978),
 ('cloud_computing', 0.5052876472473145),
 ('google_cloud', 0.4945797920227051),
 ('gcp', 0.4886273741722107)]

In [40]:
# Tokens most similar to "skills".
w2v_model.wv.most_similar(positive=["skills"])
# Problem-solving, communication, interpersonal, organizational and presentation skills are important

[('problem_solving', 0.7218472957611084),
 ('strong', 0.6830064058303833),
 ('interpersonal_skills', 0.6514632105827332),
 ('excellent', 0.6427369713783264),
 ('presentation_skills', 0.6413192749023438),
 ('organizational_skills', 0.6064302921295166),
 ('ability', 0.6013692617416382),
 ('communication_skills', 0.6003636121749878),
 ('abilities', 0.5891449451446533),
 ('interpersonal', 0.5771859884262085)]