In [1]:
import spacy
from textacy import preprocessing
from textacy import extract
import pandas as pd
from spacy import displacy
import re
import numpy as np
from sklearn.cluster import KMeans
import pytextrank



In [2]:
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x7fe8619b15b0>

In [11]:
ruler = nlp.add_pipe("entity_ruler", before="ner")
#nlp.remove_pipe("entity_ruler")

In [12]:
patterns = [
    {"label": "SKILL", "pattern": [{"POS": "VERB", "IS_SENT_START": True},  
                                   {"SPACY": True, "OP": "*"}, {"POS": "NOUN"}]},
    {"label": "JOB", "pattern": [{"POS": "NOUN", "LEMMA": {"IN" : ["engineer", "designer", "product management"]}}]},
    {"label": "PROD", "pattern": [{"TEXT": "seeking"},{"TEXT": "S3"}]}
]

ruler.add_patterns(patterns)

doc3 = nlp("Description seeking, seeking experienced, experienced product, designer engineer product leader, leader join, join Boston, MA office, office lead, lead product, product management, management AWS3, AWS newest, newest hybrid, hybrid storage, storage service, AWS Outposts, Outposts provide, provide fully, fully managed, managed AWS, AWS infrastructure, infrastructure services, services customers")
print([(ent.text, ent.label_) for ent in doc3.ents])

[('designer', 'JOB'), ('engineer', 'JOB'), ('Boston', 'GPE'), ('MA', 'ORG'), ('AWS3', 'ORG'), ('AWS', 'ORG'), ('AWS Outposts', 'ORG'), ('Outposts', 'ORG'), ('AWS', 'ORG')]


In [7]:
displacy.render(doc3, style='ent', jupyter=True)

In [17]:
def remove_stop_words(text: str):
    
    doc = nlp(text)
    tokens = [token.text for token in doc if not token.is_stop]
    return " ".join(tokens)

In [18]:
def remove_new_line(text: str):
    return text.replace("\n", " ")

In [19]:
preproc = preprocessing.make_pipeline(
    remove_new_line,
    preprocessing.remove.html_tags,
    remove_stop_words,
    preprocessing.normalize.whitespace, 
    preprocessing.replace.urls, 
    preprocessing.replace.numbers, 
    preprocessing.normalize.unicode
)

In [20]:
lnkjobs = pd.read_csv("../data/linkedin_jobs.csv")
lnkjobs

Unnamed: 0,index,subject,url,job_desc
0,0,Principal Product Manager - Technical - AWS,https://www.linkedin.com/jobs/view/2600123751/...,About the job\nDescription\n\nWe’re seeking an...
1,1,"Principal Product Manager Technical, Amazon De...",https://www.linkedin.com/jobs/view/2798027544/...,About the job\nDescription\n\nAWS External Sec...
2,2,Sr. Data Product Manager,https://www.linkedin.com/jobs/view/2787722444/...,About the job\nAbout CyberArk:\nCyberArk (NASD...
3,3,Senior Principal Product Manager (REMOTE ELIGI...,https://www.linkedin.com/jobs/view/2799481430/...,About the job\nAbout VERITAS\n\nVeritas solves...
4,4,"Principal Product Manager, Cloud-Native Platform",https://www.linkedin.com/jobs/view/2802975863/...,About the job\nJob Description:\n\nPrincipal P...
5,5,"Senior Product Manager, Multi-Cloud Commerce -...",https://www.linkedin.com/jobs/view/2780771184/...,About the job\nWhy will you enjoy this new opp...
6,6,Principal Product Manager - Tech,https://www.linkedin.com/jobs/view/2798052149/...,About the job\nDescription\n\nAre you passiona...
7,7,Group Product Manager,https://www.linkedin.com/jobs/view/2801103520/...,About the job\nCoinbase has built the world's ...
8,8,Principal Product Manager,https://www.linkedin.com/jobs/view/2807025455/...,About the job\nDescription:\nOverview\n\nPerch...
9,9,Senior Product Manager,https://www.linkedin.com/jobs/view/2793489860/...,About the job\nDefine the Future of Work\nAs a...


In [35]:
lnkjobs["clean_job_desc"] = lnkjobs["job_desc"].map(preproc)
lnkjobs["nlp"] = lnkjobs["job_desc"].map(nlp)
lnkjobs

Unnamed: 0,index,subject,url,job_desc,clean_job_desc,nlp
0,0,Principal Product Manager - Technical - AWS,https://www.linkedin.com/jobs/view/2600123751/...,About the job\nDescription\n\nWe’re seeking an...,job Description seeking experienced product le...,"(About, the, job, \n, Description, \n\n, We, ’..."
1,1,"Principal Product Manager Technical, Amazon De...",https://www.linkedin.com/jobs/view/2798027544/...,About the job\nDescription\n\nAWS External Sec...,job Description AWS External Security Services...,"(About, the, job, \n, Description, \n\n, AWS, ..."
2,2,Sr. Data Product Manager,https://www.linkedin.com/jobs/view/2787722444/...,About the job\nAbout CyberArk:\nCyberArk (NASD...,job CyberArk : CyberArk ( NASDAQ : CYBR ) glob...,"(About, the, job, \n, About, CyberArk, :, \n, ..."
3,3,Senior Principal Product Manager (REMOTE ELIGI...,https://www.linkedin.com/jobs/view/2799481430/...,About the job\nAbout VERITAS\n\nVeritas solves...,job VERITAS Veritas solves . industry - leadin...,"(About, the, job, \n, About, VERITAS, \n\n, Ve..."
4,4,"Principal Product Manager, Cloud-Native Platform",https://www.linkedin.com/jobs/view/2802975863/...,About the job\nJob Description:\n\nPrincipal P...,job Job Description : Principal Product Manage...,"(About, the, job, \n, Job, Description, :, \n\..."
5,5,"Senior Product Manager, Multi-Cloud Commerce -...",https://www.linkedin.com/jobs/view/2780771184/...,About the job\nWhy will you enjoy this new opp...,job enjoy new opportunity ? VMware leader virt...,"(About, the, job, \n, Why, will, you, enjoy, t..."
6,6,Principal Product Manager - Tech,https://www.linkedin.com/jobs/view/2798052149/...,About the job\nDescription\n\nAre you passiona...,job Description passionate Artificial Intellig...,"(About, the, job, \n, Description, \n\n, Are, ..."
7,7,Group Product Manager,https://www.linkedin.com/jobs/view/2801103520/...,About the job\nCoinbase has built the world's ...,job Coinbase built world leading compliant cry...,"(About, the, job, \n, Coinbase, has, built, th..."
8,8,Principal Product Manager,https://www.linkedin.com/jobs/view/2807025455/...,About the job\nDescription:\nOverview\n\nPerch...,job Description : Overview Perch new kind dire...,"(About, the, job, \n, Description, :, \n, Over..."
9,9,Senior Product Manager,https://www.linkedin.com/jobs/view/2793489860/...,About the job\nDefine the Future of Work\nAs a...,job Define Future Work thought leader industry...,"(About, the, job, \n, Define, the, Future, of,..."


In [15]:
lnkjobs["clean_job_desc"][0]

results = [x for x in lnkjobs['clean_job_desc']]
doc = nlp(' '.join(results))

In [47]:
ngrams = list(extract.basics.ngrams(doc, 2, min_freq=1))

print(ngrams)

[job Description, Description seeking, seeking experienced, experienced product, product leader, leader join, join Boston, MA office, office lead, lead product, product management, management AWS, AWS newest, newest hybrid, hybrid storage, storage service, AWS Outposts, Outposts provide, provide fully, fully managed, managed AWS, AWS infrastructure, infrastructure services, services customers, premises data, data centers, Amazon S3, S3 Outposts, provides hybrid, hybrid object, object storage, storage enabling, enabling customers, customers store, store retrieve, retrieve data, premises S3, S3 programming, programming model, model features, team responsible, responsible delivering, delivering hybrid, hybrid storage, storage platform, platform Outposts, help shape, shape object, object storage, storage generation, generation hybrid, hybrid computing, computing platform, Product Management, Management AWS, AWS opportunity, opportunity collaborate, collaborate engineering, business develop

In [22]:
#extract.keyterms.textrank(doc, topn=20)

In [23]:
#extract.keyterms.yake(doc, normalize="lemma", ngrams=2, topn=30)

In [36]:
results = [x for x in lnkjobs['nlp']]

for doc in results:
    print('-----------------------------------------------------------------')
    displacy.render(doc, style='ent', jupyter=True)

-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------




-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


-----------------------------------------------------------------


In [72]:
noun_chunks = extract.basics.noun_chunks(nlp(results[0]))
for n in noun_chunks:
    print(n)

job Description
experienced product leader
Boston
MA office
product management AWS
newest hybrid storage service
AWS Outposts
fully managed AWS infrastructure services customers - premises data centers
Amazon S3 Outposts
_
NUMBER
_
hybrid object storage
customers
store
retrieve
team
hybrid storage platform Outposts
team
object storage generation hybrid computing platform
Product Management AWS opportunity
engineering
design
business development teams
entrepreneurial product leader
solutions customers
new AWS business
Successful candidates
strategic roadmap business
technical details
closely engineering team drive delivery
delight customers
experienced team engineers product managers
scaled services Amazon
effectively diverse team peers
Work / Life Balance team
high value work - life balance
flexible people
work
_
NUMBER
_
NUMBER
Mentorship Career Growth team
new team members
team broad mix experience levels
Amazon tenures
building environment
knowledge
sharing mentorship
senior members

In [85]:
terms = extract.basics.terms(nlp(results[0]), ngs=3, ents=False, ncs=False, dedupe = False)
#print(len(list(terms)))
for n in terms:
    print(n)

job Description seeking
Description seeking experienced
seeking experienced product
experienced product leader
product leader join
leader join Boston
MA office lead
office lead product
lead product management
product management AWS
management AWS newest
AWS newest hybrid
newest hybrid storage
hybrid storage service
AWS Outposts provide
Outposts provide fully
provide fully managed
fully managed AWS
managed AWS infrastructure
AWS infrastructure services
infrastructure services customers
premises data centers
Amazon S3 Outposts
provides hybrid object
hybrid object storage
object storage enabling
storage enabling customers
enabling customers store
customers store retrieve
store retrieve data
premises S3 programming
S3 programming model
programming model features
team responsible delivering
responsible delivering hybrid
delivering hybrid storage
hybrid storage platform
storage platform Outposts
help shape object
shape object storage
object storage generation
storage generation hybrid
genera

In [83]:
terms = extract.basics.terms(nlp(results[0]), ngs=2, ents=True, ncs=True)
#print(len(list(terms)))
for n in terms:
    print(n)

job Description
Description seeking
seeking experienced
experienced product
product leader
leader join
join Boston
MA office
office lead
lead product
product management
management AWS
AWS newest
newest hybrid
hybrid storage
storage service
AWS Outposts
Outposts provide
provide fully
fully managed
managed AWS
AWS infrastructure
infrastructure services
services customers
premises data
data centers
Amazon S3
S3 Outposts
provides hybrid
hybrid object
object storage
storage enabling
enabling customers
customers store
store retrieve
retrieve data
premises S3
S3 programming
programming model
model features
team responsible
responsible delivering
delivering hybrid
hybrid storage
storage platform
platform Outposts
help shape
shape object
object storage
storage generation
generation hybrid
hybrid computing
computing platform
Product Management
Management AWS
AWS opportunity
opportunity collaborate
collaborate engineering
business development
development teams
looking entrepreneurial
entrepreneur

In [87]:
lnkjobs.head()

Unnamed: 0,index,subject,url,job_desc,clean_job_desc
0,0,Principal Product Manager - Technical - AWS,https://www.linkedin.com/jobs/view/2600123751/...,About the job\nDescription\n\nWe’re seeking an...,job Description seeking experienced product le...
1,1,"Principal Product Manager Technical, Amazon De...",https://www.linkedin.com/jobs/view/2798027544/...,About the job\nDescription\n\nAWS External Sec...,job Description AWS External Security Services...
2,2,Sr. Data Product Manager,https://www.linkedin.com/jobs/view/2787722444/...,About the job\nAbout CyberArk:\nCyberArk (NASD...,job CyberArk : CyberArk ( NASDAQ : CYBR ) glob...
3,3,Senior Principal Product Manager (REMOTE ELIGI...,https://www.linkedin.com/jobs/view/2799481430/...,About the job\nAbout VERITAS\n\nVeritas solves...,job VERITAS Veritas solves . industry - leadin...
4,4,"Principal Product Manager, Cloud-Native Platform",https://www.linkedin.com/jobs/view/2802975863/...,About the job\nJob Description:\n\nPrincipal P...,job Job Description : Principal Product Manage...


In [95]:
jobs_dict = dict(zip(lnkjobs['url'], lnkjobs['nlp']))

In [149]:
temp_results = {
    'Terms': [],
    'Job_ID': []
}
for key, value in jobs_dict.items():
    chunks = list(extract.basics.terms(value, ngs=2, ents=False, ncs=False, dedupe = False))
    ID = re.search('\d{10}',key).group(0)
    for c in chunks:
        temp_results['Terms'].append(c.text)
        temp_results['Job_ID'].append(ID)

temp_results

{'Terms': ['job Description',
  'Description seeking',
  'seeking experienced',
  'experienced product',
  'product leader',
  'leader join',
  'join Boston',
  'MA office',
  'office lead',
  'lead product',
  'product management',
  'management AWS',
  'AWS newest',
  'newest hybrid',
  'hybrid storage',
  'storage service',
  'AWS Outposts',
  'Outposts provide',
  'provide fully',
  'fully managed',
  'managed AWS',
  'AWS infrastructure',
  'infrastructure services',
  'services customers',
  'premises data',
  'data centers',
  'Amazon S3',
  'S3 Outposts',
  'provides hybrid',
  'hybrid object',
  'object storage',
  'storage enabling',
  'enabling customers',
  'customers store',
  'store retrieve',
  'retrieve data',
  'premises S3',
  'S3 programming',
  'programming model',
  'model features',
  'team responsible',
  'responsible delivering',
  'delivering hybrid',
  'hybrid storage',
  'storage platform',
  'platform Outposts',
  'help shape',
  'shape object',
  'object st

In [150]:
terms = pd.DataFrame(temp_results)
terms.head()

Unnamed: 0,Terms,Job_ID
0,job Description,2600123751
1,Description seeking,2600123751
2,seeking experienced,2600123751
3,experienced product,2600123751
4,product leader,2600123751


In [151]:
grouped = terms.groupby('Terms').count().reset_index()
grouped.sort_values('Job_ID', ascending=False).head(50)

Unnamed: 0,Terms,Job_ID
534,_,77
348,NUMBER,40
69,Amazon,34
2371,sexual orientation,20
1859,national origin,17
1403,gender identity,15
2650,veteran status,13
672,basis race,12
416,Product Manager,11
959,customers,11


In [152]:
grouped[grouped['Terms'].str.contains('roadmap', na=False)]

Unnamed: 0,Terms,Job_ID
1558,infrastructure roadmap,1
1661,"lead delivery high - performance , innovative ...",1
2088,product roadmap,2
2089,product roadmap customers,1
2090,product roadmap meet requirements business plan,1
2091,product roadmaps,2
2274,roadmap,1
2275,roadmap business,1
2276,roadmap customers,1
2277,roadmap meet,1


In [160]:
rm1 = nlp("product roadmap")
rm2 = nlp("infrastructure roadmap")
rm3 = nlp("roadmap business")
rm4 = nlp("customer")
rm5 = nlp("product roadmaps")

rm1.similarity(rm5)

0.8902945854485927

In [199]:
str_terms = [] # string is used to filter out dedups
span_terms = []
for key, value in jobs_dict.items():
    chunks = list(extract.basics.terms(value, ngs=2, ents=False, ncs=True, dedupe = True))
    ID = re.search('\d{10}',key).group(0)
    for c in chunks:
        if(len(c.text) > 6):
            if c.text not in str_terms:
                span_terms.append(c)
                str_terms.append(c.text)



In [217]:
def find_similar_terms(span):
    
    ceil = 1.0
    a = 0.85
    b = 0.72
    c = 0.69
    
    key_a = str(a) + "_" + str(ceil)
    key_b = str(b) + "_" + str(a-0.01)
    key_c = str(c) + "_" + str(b-0.01)
    results = {
        'key_term': span,
        key_a: [],
        key_b: [],
        key_c: []
    }
    
    for t in span_terms:
        score = span.similarity(t)
        if(score >= a and score < ceil):
            results[key_a].append((t, score))
        elif(score >= b and score < a):
            results[key_b].append((t, score))
        elif(score >= c and score < b):
            results[key_c].append((t, score))
    
    return results

In [222]:
terms_clusters = {}
for term in span_terms:
    terms_clusters[term.text] = find_similar_terms(term)
    

  score = span.similarity(t)


In [234]:
#persons_vc_map  = {ent.text:ent.vector  for ent in doc.ents if ent.label_=='PERSON' } 
terms2 = {chunk.text:chunk.vector for chunk in span_terms}


Unnamed: 0,0,1
0,Terms,0 job Description 1 De...
1,Job_ID,0 2600123751 1 2600123751 2 ...


In [256]:
df = pd.DataFrame([(key , val ) for key , val in terms2.items()])
X = np.array([a for a in df[1]])
kmeans_terms = KMeans(n_clusters=50).fit(X)
df ['labels'] =  kmeans_terms.labels_
df

Unnamed: 0,0,1,labels
0,job Description,"[-0.21057501, 0.0414495, -0.420905, -0.045852,...",24
1,Description seeking,"[0.061595, -0.1956505, -0.41362998, 0.08548801...",6
2,seeking experienced,"[0.1363455, -0.138585, -0.179095, 0.051354505,...",20
3,experienced product,"[-0.22814949, 0.251275, -0.0824755, -0.1180255...",8
4,product leader,"[-0.33899498, 0.452585, 0.0947745, 0.128925, 0...",8
...,...,...,...
2639,Sophos Central – centerpiece adaptive cybersec...,"[-0.107221425, 0.08308, 0.124375716, -0.101335...",19
2640,centralized data lake,"[-0.14853166, -0.09586332, 0.13034667, 0.00875...",39
2641,partners,"[-0.25558, -0.39295, 0.0096051, -0.38615, 0.30...",1
2642,Sophos major hubs globe,"[0.24185124, -0.1791905, 0.059178747, -0.10275...",23


In [257]:
for label, names in df.groupby('labels'):
    print("cluster {} , values : {}".format( label, names[0].tolist()))
    print("++++++++++++++++")

cluster 0 , values : ['help shape', 'flexible people', 'people occasionally', 'occasionally need', 'effectively start', 'based help', 'find balance', 'thorough , kind , code reviews', 'help secure', 'apply find', 'reason prevents', 'process place', 'productive comfortable', 'need fully', 'look barriers', 'Shape possible', 'possible today', 'hard problems', 'way defining', 'things look', 'default trust', 'way relevant', 'view Pay', 'need reasonable', 'let know', 'quick access', 'screen reading', 'free step', 'step step', 'step tutorial', 'tutorial found', 'regulate way', 'agreeing use', 'quick access screen', 'free step step tutorial', 'kind direct', 'makes easy', 'access clean', 'Like parent', 'mission clean', 'looks push', 'push envelope', 'process imagine', 'Want fast', 'rewrite way', 'changing treat', 'clothes wear', 'makeup everyday', 'everyday objects', 'pressing problems', 'ways problem', 'advised safe', 'clothes', 'drivers safer', 'things makes', 'safe welcoming', 'help realize'