## Import tools

In [1]:
!pip install spacy



In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     |████████████████████████████████| 13.9 MB 958 kB/s            
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [769]:
import spacy
import pandas as pd
import scattertext as st
import re

In [514]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from tqdm import tqdm

In [5]:
nlp = spacy.load('en_core_web_sm')

## Load data

In [4]:
##Unickle whole dataframe
df = pd.read_pickle('cleaned_tapwage_jds')
df.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,jd_length,header_number,bullets_number,bullets_percentage,bullets_length,bullets_merged
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,11255,15,67,0.585695,6592,maintains appropriate office systems word pro...
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,11493,23,67,0.573567,6592,maintains appropriate office systems word pro...
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,14701,25,63,0.464798,6833,active scholarship and or relevant teaching ex...
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,5075,7,17,0.27665,1404,create predictive models by mining complex dat...
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,3494,5,9,0.275615,963,express recombinant proteins in microbial or e...


In [None]:
## location frequency
## titles frequency
## bullet point lemmas

In [432]:
## Unpickle just JDs with bullets dataframe
df_bullets = pd.read_pickle('bullet_points_corpus_w_punct')

In [433]:
df_bullets.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,jd_length_char,header_number,bullets_number,bullets_length_char,bullets_percentage,bullets_merged
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,11255,15,67,6592,0.585695,"Maintains appropriate office systems, word pro..."
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,11493,23,67,6592,0.573567,"Maintains appropriate office systems, word pro..."
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,14701,25,63,6833,0.464798,Active scholarship and/or relevant teaching ex...
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,5075,7,17,1404,0.27665,Create predictive models by mining complex dat...
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,3494,5,9,963,0.275615,Express recombinant proteins in microbial or e...


In [434]:
df_bullets.shape

(6119, 16)

In [435]:
df_bullets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6119 entries, 0 to 6874
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   title                6119 non-null   object 
 1   company              6119 non-null   object 
 2   location             6119 non-null   object 
 3   uid                  6119 non-null   object 
 4   tags                 6119 non-null   object 
 5   description          6119 non-null   object 
 6   bullets              6119 non-null   object 
 7   headers              6119 non-null   object 
 8   phd_tag              6119 non-null   bool   
 9   tag_number           6119 non-null   int64  
 10  jd_length_char       6119 non-null   int64  
 11  header_number        6119 non-null   int64  
 12  bullets_number       6119 non-null   int64  
 13  bullets_length_char  6119 non-null   int64  
 14  bullets_percentage   6119 non-null   float64
 15  bullets_merged       6119 non-null   o

In [436]:
## UGH there are duplicates that were not detected due to spaces!!
df_bullets['uid_cleaned'] = df_bullets.uid.apply(lambda x: str.strip(x))

In [437]:
df_bullets.uid.value_counts()

33687104     3
45496105     2
50299789     2
50299808     2
50299993     2
            ..
50160288     1
50180523     1
50197064     1
50211756     1
49806221     1
Name: uid, Length: 5224, dtype: int64

In [438]:
df_bullets.uid_cleaned.value_counts()

33687104    3
45496105    2
50299789    2
50299808    2
50299993    2
           ..
50160288    1
50180523    1
50197064    1
50211756    1
49806221    1
Name: uid_cleaned, Length: 5224, dtype: int64

In [439]:
df_bullets[df_bullets.uid_cleaned == '33687104']

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,jd_length_char,header_number,bullets_number,bullets_length_char,bullets_percentage,bullets_merged,uid_cleaned
2189,"Assistant, Associate , OR Full Professor (HS C...",University of California San Diego,"San Diego, CA",33687104,"[PhD, Biology, Universities, Junior, Full-Time...","Assistant, Associate, or Full Professor (HS Cl...",[PATHOLOGY / School of Medicine / UC San Diego...,"[Open date:, Next review date:, Final date:, D...",False,6,5063,10,8,1036,0.204622,PATHOLOGY / School of Medicine / UC San Diego ...,33687104
5150,"Assistant, Associate , OR Full Professor (HS C...",University of California San Diego,"San Diego, CA",33687104,"[PhD, Biology, Universities, Junior, Full-Time...","Assistant, Associate, or Full Professor (HS Cl...",[PATHOLOGY / School of Medicine / UC San Diego...,"[Open date:, Next review date:, Final date:, D...",False,6,5063,10,8,1036,0.204622,PATHOLOGY / School of Medicine / UC San Diego ...,33687104
6129,"Assistant, Associate , OR Full Professor (HS C...",University of California San Diego,"San Diego, CA",33687104,"[PhD, Biology, Universities, Junior, Full-Time...","Assistant, Associate, or Full Professor (HS Cl...",[PATHOLOGY / School of Medicine / UC San Diego...,"[Open date:, Next review date:, Final date:, D...",False,6,5063,10,8,1036,0.204622,PATHOLOGY / School of Medicine / UC San Diego ...,33687104


In [440]:
df_bullets.drop_duplicates(subset=['uid_cleaned'], inplace=True)

In [441]:
df_bullets.shape

(5224, 17)

In [442]:
df_bullets.uid.value_counts()

48022361     1
48586227     1
48374423     1
48374594     1
48505219     1
            ..
50250897     1
50253056     1
50266739     1
50271008     1
49806221     1
Name: uid, Length: 5224, dtype: int64

In [443]:
title_freq = df_bullets.title.value_counts()

In [444]:
title_freq.head(15)

Applied Scientist job                     56
Research Fellow job                       28
PostDoctoral Scholar job                  24
Applied Scientist, Alexa AI job           22
Post-Doctoral Fellow job                  22
Senior Data Scientist job                 22
Senior Applied Scientist job              22
PostDoctoral Associate job                19
Senior Applied Scientist, Alexa AI job    17
Data Scientist job                        16
PostDoctoral Research Associate job       16
PostDoctoral Fellow job                   13
Senior Software Engineer job              13
Behavioral Case Manager job               12
Staff Software Engineer job               12
Name: title, dtype: int64

In [445]:
df_bullets['spacy_title'] = list(nlp.pipe(df_bullets.title))

In [446]:
df_bullets.spacy_title.head(10)

0     (PhD, Program, &, Lab, Manager, (, Hybrid, ), ...
1     (PhD, Program, &, Lab, Manager, (, Hybrid, ), ...
2     (Assistant, Professor, -, Clinical, PhD, Progr...
3     (Data, Scientist, /, Statistician, Intern, (, ...
4                     (PhD, Scientist, Internship, job)
5     (PhD, Program, Administrator, (, Academic, Ser...
8     (Associate, Director, ,, MD, -, PhD, Program, ...
9      (PhD, &, Certificate, Program, Coordinator, job)
10    (Temporary, Summer, PhD, Research, Specialist,...
11    (University, Recruiting, Technical, Sourcer, ,...
Name: spacy_title, dtype: object

In [447]:
print(df_bullets.spacy_title[0])
for token in df_bullets.spacy_title[0]:
    #print(token.text, token.pos_, token.lemma_, token.is_stop)
    print()
    print(token.text, token.pos_, token.lemma_, token.dep_)

PhD Program & Lab Manager (Hybrid) job

PhD NOUN phd compound

Program PROPN Program nmod

& CCONJ & cc

Lab PROPN Lab conj

Manager PROPN Manager nmod

( PUNCT ( punct

Hybrid PROPN Hybrid appos

) PUNCT ) punct

job NOUN job ROOT


In [448]:
print(df_bullets.spacy_title[9])
for token in df_bullets.spacy_title[9]:
    #print(token.text, token.pos_, token.lemma_, token.is_stop)
    print()
    print(token.text, token.pos_, token.lemma_, token.dep_)

PhD & Certificate Program Coordinator job

PhD PROPN PhD nmod

& CCONJ & cc

Certificate PROPN Certificate conj

Program PROPN Program compound

Coordinator PROPN Coordinator appos

job NOUN job ROOT


In [449]:
location_freq = df_bullets.location.value_counts()

In [450]:
location_freq.head(10)

Seattle, WA          290
New York City, NY    274
Austin, TX           165
Los Angeles, CA      163
San Diego, CA        163
Boston, MA           157
Foster City, CA      148
Cambridge, MA        130
San Francisco, CA    119
Sunnyvale, CA        115
Name: location, dtype: int64

In [451]:
## Drop lines where uid is longer than 9 b/c scraped did not parse properly

df_bullets['uid_len'] = df_bullets.uid.apply(lambda x: len(x))
uid_len_freq = df_bullets.uid_len.value_counts()

In [452]:
uid_len_freq.head(10)

9      5174
8         5
98        4
116       3
131       3
118       2
123       2
109       2
102       2
139       2
Name: uid_len, dtype: int64

In [453]:
df_bullets.uid[df_bullets.uid_len > 9].shape

(45,)

In [454]:
df_bullets.drop(df_bullets.uid[df_bullets.uid_len > 9].index, inplace = True)
df_bullets.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,jd_length_char,header_number,bullets_number,bullets_length_char,bullets_percentage,bullets_merged,uid_cleaned,spacy_title,uid_len
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,11255,15,67,6592,0.585695,"Maintains appropriate office systems, word pro...",48022361,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,11493,23,67,6592,0.573567,"Maintains appropriate office systems, word pro...",47983977,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,14701,25,63,6833,0.464798,Active scholarship and/or relevant teaching ex...,48973936,"(Assistant, Professor, -, Clinical, PhD, Progr...",9
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,5075,7,17,1404,0.27665,Create predictive models by mining complex dat...,39232138,"(Data, Scientist, /, Statistician, Intern, (, ...",9
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,3494,5,9,963,0.275615,Express recombinant proteins in microbial or e...,49810371,"(PhD, Scientist, Internship, job)",9


In [455]:
df_bullets.shape

(5179, 19)

In [456]:
## try title and location freq again w/o mis-scraped JDs
title_freq = df_bullets.title.value_counts()

In [457]:
title_freq.head(15)

Applied Scientist job                     56
Research Fellow job                       28
PostDoctoral Scholar job                  24
Senior Data Scientist job                 22
Senior Applied Scientist job              22
Post-Doctoral Fellow job                  22
Applied Scientist, Alexa AI job           22
Senior Applied Scientist, Alexa AI job    17
Data Scientist job                        16
PostDoctoral Research Associate job       16
PostDoctoral Associate job                15
Senior Software Engineer job              13
PostDoctoral Fellow job                   12
Staff Software Engineer job               12
Behavioral Case Manager job               12
Name: title, dtype: int64

In [458]:
location_freq = df_bullets.location.value_counts()

In [459]:
location_freq.head(15)

Seattle, WA          290
New York City, NY    274
Austin, TX           165
San Diego, CA        163
Los Angeles, CA      163
Boston, MA           157
Foster City, CA      148
Cambridge, MA        130
San Francisco, CA    119
Sunnyvale, CA        115
Atlanta, GA          102
Aurora, CO           102
Redmond, WA           97
Golden, CO            97
McLean, VA            75
Name: location, dtype: int64

In [460]:
company_freq = df_bullets.company.value_counts()

In [461]:
company_freq.head(30)

Amazon                                   585
Visa                                     562
Northrop Grumman Corporation             289
General Atomics                          129
Boeing                                   104
University of Colorado                   100
Facebook                                  74
Cedars Sinai                              74
LLNL                                      73
Capital One                               69
Massachusetts General Hospital            64
AbbVie                                    62
National Renewable Energy Laboratory      58
ZS Associates                             53
NREL                                      52
Microsoft                                 49
Intel                                     46
University of California San Diego        45
Oregon Health & Science University        44
Sanofi                                    41
MITRE                                     41
University of California Irvine           40
Pfizer    

In [462]:
def get_lemma(series):
    return pd.Series([token.lemma_ for _list in series for token in _list if token.pos_ == "PROPN"])


In [463]:
prop_n_freq_title = get_lemma(df_bullets["spacy_title"]).value_counts()

In [464]:
prop_n_freq_title.head(30)

Engineer        1198
Scientist       1152
Senior          1131
Research         693
Principal        655
Applied          446
Manager          416
Associate        411
Director         403
PostDoctoral     392
Software         337
Systems          312
Data             291
Professor        282
Assistant        238
Fellow           217
Development      196
Researcher       192
Staff            191
Science          169
AI               165
Lab              154
Program          151
Intern           145
Learning         141
Design           139
Lead             133
II               128
Analyst          126
Clinical         122
dtype: int64

In [465]:
df_bullets['pn_in_title'] = df_bullets.spacy_title.apply(lambda x: [token.lemma_ for token in x if token.pos_ == "PROPN"])



In [466]:
df_bullets.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,jd_length_char,header_number,bullets_number,bullets_length_char,bullets_percentage,bullets_merged,uid_cleaned,spacy_title,uid_len,pn_in_title
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,11255,15,67,6592,0.585695,"Maintains appropriate office systems, word pro...",48022361,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]"
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,11493,23,67,6592,0.573567,"Maintains appropriate office systems, word pro...",47983977,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]"
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,14701,25,63,6833,0.464798,Active scholarship and/or relevant teaching ex...,48973936,"(Assistant, Professor, -, Clinical, PhD, Progr...",9,"[Assistant, Professor, Clinical, Program, Los,..."
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,5075,7,17,1404,0.27665,Create predictive models by mining complex dat...,39232138,"(Data, Scientist, /, Statistician, Intern, (, ...",9,"[Data, Scientist, Statistician, Intern, BS, MS..."
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,3494,5,9,963,0.275615,Express recombinant proteins in microbial or e...,49810371,"(PhD, Scientist, Internship, job)",9,"[Scientist, Internship]"


In [467]:
df_bullets['pn_in_title_merged'] = df_bullets.pn_in_title.apply(lambda x: ' '.join([str(lemma) for lemma in x]))


In [468]:
df_bullets.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,...,header_number,bullets_number,bullets_length_char,bullets_percentage,bullets_merged,uid_cleaned,spacy_title,uid_len,pn_in_title,pn_in_title_merged
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,15,67,6592,0.585695,"Maintains appropriate office systems, word pro...",48022361,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,23,67,6592,0.573567,"Maintains appropriate office systems, word pro...",47983977,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,...,25,63,6833,0.464798,Active scholarship and/or relevant teaching ex...,48973936,"(Assistant, Professor, -, Clinical, PhD, Progr...",9,"[Assistant, Professor, Clinical, Program, Los,...",Assistant Professor Clinical Program Los Angeles
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,...,7,17,1404,0.27665,Create predictive models by mining complex dat...,39232138,"(Data, Scientist, /, Statistician, Intern, (, ...",9,"[Data, Scientist, Statistician, Intern, BS, MS...",Data Scientist Statistician Intern BS MS PhD S...
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,...,5,9,963,0.275615,Express recombinant proteins in microbial or e...,49810371,"(PhD, Scientist, Internship, job)",9,"[Scientist, Internship]",Scientist Internship


In [469]:
## Title frequency with only lemmas of proper nouns (as parsed by spaCy)
lemmatized_title_freq = df_bullets.pn_in_title_merged.value_counts()

In [470]:
lemmatized_title_freq.head(15)

Applied Scientist                    58
                                     58
Research Fellow                      28
PostDoctoral Scholar                 24
Senior Applied Scientist             23
Senior Data Scientist                23
Applied Scientist Alexa AI           22
Fellow                               22
Data Scientist                       18
PostDoctoral Research Associate      18
Senior Applied Scientist Alexa AI    17
Applied Scientist II                 15
PostDoctoral Associate               15
Staff Software Engineer              13
Senior Software Engineer             13
Name: pn_in_title_merged, dtype: int64

In [471]:
lemmatized_title_freq.tail(15)

Data Engineer                                                           1
Senior Manager Treasury M&A Integration                                 1
Engineer V                                                              1
Director UX Research Insights Innovation Consumer Trust                 1
Senior Firmware Engineer                                                1
Nanomaterials PostDoctoral Researcher                                   1
PostDoctoral Scholar Kong Lab                                           1
Laboratory Research Technician Sperling Lab                             1
Principal FPGA Design Engineer Rolling Meadows IL                       1
Principal Automation Engineer                                           1
Assistant Academic Research Scientist School Medicine Human Genetics    1
Staff Systems Engineer Learning Technology                              1
Senior Staff Systems Engineer                                           1
Research Specialist Microscopy        

In [472]:
df_bullets[df_bullets.pn_in_title_merged.str.contains('AAV')]

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,...,header_number,bullets_number,bullets_length_char,bullets_percentage,bullets_merged,uid_cleaned,spacy_title,uid_len,pn_in_title,pn_in_title_merged
1750,"Senior Scientist I/II, Recombinant AAV Purific...",AbbVie,"Worcester, MA",45272194,"[PhD, Bachelor's Degree, Pharma / Biotech, Hea...",About AbbVieAbbVie’s mission is to discover an...,[Purification of rAAV serotypes from mammalian...,"[About AbbVie, Qualifications, Significant Wor...",False,9,...,8,20,1880,0.483166,Purification of rAAV serotypes from mammalian ...,45272194,"(Senior, Scientist, I, /, II, ,, Recombinant, ...",9,"[Senior, Scientist, I, II, Recombinant, AAV, P...",Senior Scientist I II Recombinant AAV Purifica...


### Topic Modeling with Lemmatized Proper Nouns in Titles

In [346]:
## Make Count Vectorizer of just titles
## Only include words that appear in at least 10 titles
cv = CountVectorizer(min_df = .001)
X = cv.fit_transform(df_bullets.pn_in_title_merged)
dtm = pd.DataFrame(X.toarray(), columns = cv.get_feature_names())



In [347]:
dtm.head()

Unnamed: 0,academic,accelerator,access,account,acquisition,additive,adjunct,administrative,administrator,ads,...,visiting,west,wind,work,worker,workforce,world,writer,writing,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [348]:
dtm.shape

(5179, 552)

In [371]:
## Started with 9 becuase the average JD has 9 tags. This turned out to be too many, so I lowered it.
lsa = TruncatedSVD(4)

In [372]:
lsa.fit(dtm)

TruncatedSVD(n_components=4)

In [351]:
print(len(lsa.components_[0]))
print(len(lsa.components_[1]))

552
552


In [363]:
#cv.get_feature_names()

In [353]:
doc_by_topic = lsa.transform(dtm)

In [354]:
doc_by_topic.shape

(5179, 4)

In [355]:
df_by_topic = pd.DataFrame(lsa.components_,
                index = ["component_1", "component_2", "component_3",
                        "component_4"], 
                         #"component_5", "component_6", "component_7",
                       # "component_8", "component_9"],
                columns = cv.get_feature_names())



In [356]:
df_by_topic

Unnamed: 0,academic,accelerator,access,account,acquisition,additive,adjunct,administrative,administrator,ads,...,visiting,west,wind,work,worker,workforce,world,writer,writing,young
component_1,0.001121,0.000505,0.001168,0.002159,0.001228,0.000882,0.000441,0.000116,0.001548,0.002348,...,0.000473,0.0007,0.000489,0.00147,1.7e-05,0.000725,0.001066,0.001185,0.0001,2e-06
component_2,0.002271,0.000592,-0.000382,0.001252,0.001728,-0.000313,0.001926,0.000479,-0.000405,0.00329,...,0.001852,0.000486,0.000335,-0.000268,3.9e-05,0.000458,0.001956,0.000621,0.000298,8e-06
component_3,0.006034,0.001791,0.000487,-0.003823,0.000444,0.003014,0.017872,0.00228,0.000394,-0.002556,...,0.005355,-0.000813,3.8e-05,0.003134,0.000348,-0.000431,-0.001304,-0.000144,0.002915,8.3e-05
component_4,0.00874,-0.000688,0.000894,0.010983,0.002898,-0.001685,0.016843,0.000401,0.001767,0.00162,...,0.002191,0.003464,0.001813,0.002625,0.000417,0.00475,0.003016,0.004537,0.004108,2.5e-05


In [357]:
# Function to display the top n terms in each topic
## Adapted from Metis topic modeling exercise
def display_topics(model, feature_names, num_top_words, topic_names = None): 
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix + 1)
        else:
            print("\nTopic: ", topic_names[ix])
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))
    print("\n")
    return model, feature_names, num_top_words

In [374]:
## I would say -- 
## Topic 1 = Industry, Engineer/Software Engineer
## Topic 2 = Industry, Applied Scientist/Data Scientist
## Topic 3 = Academia, Postdoc Researcher/Professor
## Topic 4 = Industry/Academia, Leadership Position as Director/Senior/Mananger/PI
top_7_in_topics = display_topics(lsa, cv.get_feature_names(), 7)


Topic  1
engineer, senior, principal, scientist, systems, software, applied

Topic  2
scientist, applied, research, senior, data, ai, alexa

Topic  3
research, associate, postdoctoral, professor, engineer, assistant, fellow

Topic  4
senior, associate, director, manager, professor, assistant, program




In [375]:
## Try a different kind of modeling
nmf = NMF(6)
nmf.fit(dtm)



NMF(n_components=6)

In [376]:
df_by_topic_nmf = pd.DataFrame(nmf.components_,
                index = ["component_1", "component_2", "component_3",
                        "component_4", "component_5", "component_6"], 
                               #"component_7"],
                       # "component_8", "component_9"],
                columns = cv.get_feature_names())



In [377]:
df_by_topic_nmf

Unnamed: 0,academic,accelerator,access,account,acquisition,additive,adjunct,administrative,administrator,ads,...,visiting,west,wind,work,worker,workforce,world,writer,writing,young
component_1,0.0,0.005869,0.012665,0.0,0.0,0.01567,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.012518,6e-05,0.0,0.0,0.0,0.0,0.0
component_2,0.00016,0.003327,0.0,0.0,0.001315,0.0,0.0,0.0,0.0,0.016584,...,0.003649,0.0,0.0,0.0,0.0,0.0,0.007276,0.0,0.0,0.0
component_3,0.001464,0.008342,0.0,0.0,0.011393,0.016663,0.0,0.01364,0.00638,0.0,...,0.017096,0.0,0.004648,0.00119,0.0,0.0,0.0,0.006922,0.0,0.000591
component_4,0.01492,0.0,0.009219,0.064322,0.017818,0.0,0.0,0.0,0.005488,0.02898,...,0.0,0.019704,0.011596,0.004964,0.0,0.01894,0.017525,0.025143,0.0,0.0
component_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017271,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9e-06
component_6,0.074694,0.002939,0.005049,0.0,0.001844,0.0,0.213941,0.004692,0.0,0.0,...,0.035989,0.001448,0.0,0.030021,0.004619,0.011378,0.007035,0.000382,0.043491,0.0


In [378]:
## WOW these topics are way better!!
## I would say -- 
## Topic 1 = Industry, Engineer/Software Engineer
## Topic 2 = Industry, Applied Scientist/Data Scientist/Decision Scientist
## Topic 3 = Academia, Junior Researcher/Postdoc/Intern
## Topic 4 = Industry, Data Scientist/Data Analyst/Risk Analyst
## Topic 5 = Industry, Systems Engineer/Engineering Manager/Security
## Topic 6 = Academia, Professor/Clinical Professor/Research Director
top_7_in_topics = display_topics(nmf, cv.get_feature_names(), 7)


Topic  1
engineer, software, systems, staff, design, development, test

Topic  2
scientist, applied, ai, alexa, data, ii, learning

Topic  3
research, postdoctoral, fellow, associate, lab, intern, scientist

Topic  4
senior, manager, data, director, risk, analyst, science

Topic  5
principal, systems, senior, engineer, gbsd, cyber, researcher

Topic  6
professor, associate, assistant, director, clinical, rank, open






In [382]:
## Try just two broad categories to see what happens
nmf_2 = NMF(2)
nmf_2.fit(dtm)



NMF(n_components=2)

In [383]:
## I was hoping this would be a Academia/Industry split...looks like it's actually 'Engineer'
## vs. 'Not an Engineer'.
top_10_in_topics = display_topics(nmf_2, cv.get_feature_names(), 10)


Topic  1
engineer, principal, senior, systems, software, design, gbsd, staff, development, mechanical

Topic  2
scientist, applied, senior, research, data, ai, alexa, associate, postdoctoral, ii






### Topic Modeling with Bullet Points

In [473]:
df_bullets['bullets_merged_spacy'] = df.bullets.apply(lambda x: ' '.join([str(sentence) for sentence in x]))

In [474]:
df_bullets['bullets_merged_spacy'] = list(nlp.pipe(df_bullets.bullets_merged_spacy))

In [475]:
df_bullets.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,...,bullets_number,bullets_length_char,bullets_percentage,bullets_merged,uid_cleaned,spacy_title,uid_len,pn_in_title,pn_in_title_merged,bullets_merged_spacy
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,67,6592,0.585695,"Maintains appropriate office systems, word pro...",48022361,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid,"(Maintains, appropriate, office, systems, ,, w..."
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,67,6592,0.573567,"Maintains appropriate office systems, word pro...",47983977,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid,"(Maintains, appropriate, office, systems, ,, w..."
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,...,63,6833,0.464798,Active scholarship and/or relevant teaching ex...,48973936,"(Assistant, Professor, -, Clinical, PhD, Progr...",9,"[Assistant, Professor, Clinical, Program, Los,...",Assistant Professor Clinical Program Los Angeles,"(Active, scholarship, and/or, relevant, teachi..."
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,...,17,1404,0.27665,Create predictive models by mining complex dat...,39232138,"(Data, Scientist, /, Statistician, Intern, (, ...",9,"[Data, Scientist, Statistician, Intern, BS, MS...",Data Scientist Statistician Intern BS MS PhD S...,"(Create, predictive, models, by, mining, compl..."
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,...,9,963,0.275615,Express recombinant proteins in microbial or e...,49810371,"(PhD, Scientist, Internship, job)",9,"[Scientist, Internship]",Scientist Internship,"(Express, recombinant, proteins, in, microbial..."


In [476]:
## Make sure there are no blank cells
df_bullets[df_bullets.bullets_merged_spacy == '']

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,...,bullets_number,bullets_length_char,bullets_percentage,bullets_merged,uid_cleaned,spacy_title,uid_len,pn_in_title,pn_in_title_merged,bullets_merged_spacy


In [569]:
## Lemmatize and pick out only nouns and verbs
pos_list_n_v = ['PROPN', 'NOUN', 'VERB']
df_bullets['lemmatized_n_v'] = df_bullets.bullets_merged_spacy.apply(lambda x: [token.lemma_ for token in x if token.pos_ in pos_list_n_v])
df_bullets['merged_n_v'] = df_bullets.lemmatized_n_v.apply(lambda x: ' '.join([str(lemma) for lemma in x]))

## Also try just nouns as a separate column
pos_list_n = ['PROPN', 'NOUN']
df_bullets['lemmatized_n'] = df_bullets.bullets_merged_spacy.apply(lambda x: [token.lemma_ for token in x if token.pos_ in pos_list_n])
df_bullets['merged_n'] = df_bullets.lemmatized_n.apply(lambda x: ' '.join([str(lemma) for lemma in x]))


In [494]:
df_bullets.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,...,uid_cleaned,spacy_title,uid_len,pn_in_title,pn_in_title_merged,bullets_merged_spacy,lemmatized_n_v,merged_n_v,lemmatized_n,merged_n
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,48022361,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid,"(Maintains, appropriate, office, systems, ,, w...","[maintain, office, system, word, processing, d...",maintain office system word processing databas...,"[maintain, office, system, word, processing, d...",maintain office system word processing databas...
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,47983977,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid,"(Maintains, appropriate, office, systems, ,, w...","[maintain, office, system, word, processing, d...",maintain office system word processing databas...,"[maintain, office, system, word, processing, d...",maintain office system word processing databas...
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,...,48973936,"(Assistant, Professor, -, Clinical, PhD, Progr...",9,"[Assistant, Professor, Clinical, Program, Los,...",Assistant Professor Clinical Program Los Angeles,"(Active, scholarship, and/or, relevant, teachi...","[scholarship, teaching, experience, African, A...",scholarship teaching experience African Americ...,"[scholarship, teaching, experience, African, A...",scholarship teaching experience African Americ...
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,...,39232138,"(Data, Scientist, /, Statistician, Intern, (, ...",9,"[Data, Scientist, Statistician, Intern, BS, MS...",Data Scientist Statistician Intern BS MS PhD S...,"(Create, predictive, models, by, mining, compl...","[create, model, mine, datum, formulating, test...",create model mine datum formulating testing in...,"[model, datum, formulating, testing, insight, ...",create model mine datum formulating testing in...
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,...,49810371,"(PhD, Scientist, Internship, job)",9,"[Scientist, Internship]",Scientist Internship,"(Express, recombinant, proteins, in, microbial...","[express, protein, expression, system, scale, ...",express protein expression system scale range ...,"[protein, expression, system, scale, hundred, ...",express protein expression system scale range ...


In [495]:
df_bullets.shape

(5179, 26)

In [481]:
df_bullets.merged_n_v[df_bullets.merged_n_v.str.contains('EMI')]

1351    support mission system concept development sat...
1405    coordinate test verify EMI EMC requirement tea...
1406    review approve supplier test procedure report ...
1432    conception development power product include t...
1903    Bachelor Science degree Electrical Engineering...
2245    coordinate test verify tempest requirement tea...
2728    Software Engineer responsibility work member J...
2868    coordinate test verify EMI EMC requirement tea...
2869    coordinate test verify EMI EMC requirement tea...
2870    coordinate test verify EMI EMC requirement tea...
3019    design power conversion circuitry utilize powe...
3073    coordinate test verify tempest requirement tea...
3203    Medical Dental Vision coverage Educational Ass...
3596    have year hand industry experience power analo...
3673    coordinate test verify EMI EMC requirement tea...
3810    coordinate test verify EMI EMC requirement tea...
4207    MSME require phd prefer experience electro des...
4316    concep

In [389]:
df_bullets.merged_n_v[df_bullets.merged_n_v.str.contains('Colorado')]

24      PKD PKD Translational Group National Institute...
334     Professional Development II Professional Devel...
377     Anschutz Campus Denver Campus Exemptions Centr...
397     Anschutz Campus Central Services Administratio...
401     Anschutz Campus Denver Campus Exemptions Centr...
433     Anschutz Campus Central Services Administratio...
434     MS MPH Associate Professor Professor Universit...
455     Anschutz Campus Denver Campus Exemptions Centr...
466     Anschutz Campus Denver Campus Exemptions Centr...
549     TBI Concussions Pulmonary Critical Care Medici...
648     Anschutz Campus Denver Campus Exemptions Centr...
693     Record PI Conduct Animal Modify Develop ChIP R...
716     Anschutz Campus Denver Campus Exemptions Centr...
778     Anschutz Campus Denver Campus Exemptions Centr...
790     Anschutz Campus Denver Campus Exemptions Centr...
800     Anschutz Campus Central Services Administratio...
814     Anschutz Campus Central Services Administratio...
822     Assess

In [482]:
## Make Count Vectorizer of bullet points
## Only include words that appear in at least 20 docs
cv = CountVectorizer(min_df = .003)
X = cv.fit_transform(df_bullets['merged_n_v'])
dtm_bullets = pd.DataFrame(X.toarray(), columns = cv.get_feature_names())



In [483]:
dtm_bullets

Unnamed: 0,178c,19,1st,2d,3d,401,401k,abbvie,abet,ability,...,year,years,yield,york,yr,yrs,zemax,zone,zoom,zs
0,0,0,0,0,0,0,0,0,0,4,...,1,0,0,0,0,0,0,0,2,0
1,0,0,0,0,0,0,0,0,0,4,...,1,0,0,0,0,0,0,0,2,0
2,0,0,0,0,0,0,0,0,0,10,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5174,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
5175,0,0,0,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,0,0,0
5176,0,0,0,0,0,0,0,0,0,1,...,7,0,0,0,0,0,0,0,0,0
5177,0,0,0,0,0,0,0,0,0,3,...,1,0,0,0,0,0,0,0,0,0


In [484]:
## Since NMF worked well for titles, let's try that here as well
## Started with 9 topics b/c average # of Tags for one JD is 9
## Best fit appears to be 17!
## THIS IS FOR PROPER NOUNS ONLY!
#nmf = NMF(7)
#nmf = NMF(13)
#nmf = NMF(14)
#nmf = NMF(15)
#nmf = NMF(16)
#nmf = NMF(17)
#nmf = NMF(18)
nmf = NMF(17)
#nmf = NMF(18)
#nmf = NMF(22)
#nmf = NMF(20)
nmf.fit(dtm_bullets)



NMF(n_components=17)

In [490]:
## Finding number for N AND V AND PN!
nmf = NMF(30)

nmf.fit(dtm_bullets)



NMF(n_components=30)

In [427]:
## What about responsibilities?? Haven't handled those yet

## I would say --
## Topic 1 -- Experience -- Engineering (not Software)
## Topic 2 -- Qualifications -- Higher Education
## Topic 3 -- Skills -- Data Science tools
## Topic 4 -- Experience -- Academic Research
## Topic 5 -- Experience -- ??
## Topic 6 -- Skills -- Business/Management
## Topic 7 -- Experience -- STEM Education
## Topic 8 -- Qualifications -- Security Clearance
## Topic 9 -- Application Materials -- Cover letter, CV, Diversity statement
## Topic 10 -- Skills -- Software Engineering
## Topic 11 -- Experience -- University Admin
## Topic 12 -- Benefits -- Health Insurance, Vision/Dental Insurance, Life Insurance, Paid Time Off
## Topic 13 -- Admin Skills -- Microsoft Office, Project Management
## Topic 14 -- Qualifications -- Certifications/Licensing
## Topic 15 -- Skills -- Machine Learning/Deep Learning
## Topic 16 -- Skills -- Engineering Quality Assurance
## Topic 17 -- Qualifications -- Clinical Licensing (Colorado is for vaccine exemptions)
## Topic 18 -- Experience -- Software Development
## 
top_10_in_topics = display_topics(nmf, cv.get_feature_names(), 10)


Topic  1
engineering, electrical, systems, mechanical, physics, aerospace, chemical, matlab, technology, power

Topic  2
degree, md, mba, masters, jd, bachelor, advanced, client, product, java

Topic  3
python, ml, java, learning, machine, ai, statistics, cs, master, mathematics

Topic  4
research, job, academic, postdoctoral, md, department, pi, business, working, specialty

Topic  5
visa, direct, bachelor, risk, client, global, degree, marketing, account, technology

Topic  6
medical, affairs, global, ta, md, director, medicine, associate, scientific, education

Topic  7
experience, software, systems, design, bachelor, emi, fpga, emc, rf, familiarity

Topic  8
master, bachelor, degree, stem, technology, science, accreditation, pe, math, python

Topic  9
statement, optional, cover, vitae, letter, research, school, misc, additional, excellence

Topic  10
science, computer, data, mathematics, statistics, math, applied, ai, algorithm, meta

Topic  11
university, program, director, schoo



In [491]:
top_10_in_topics_N_V_PN = display_topics(nmf, cv.get_feature_names(), 10)


Topic  1
experience, field, have, phd, relate, tool, obtain, python, hand, industry

Topic  2
ability, skill, communication, knowledge, demonstrate, write, problem, solve, include, environment

Topic  3
research, develop, publication, field, review, conduct, journal, peer, science, study

Topic  4
system, requirement, architecture, performance, systems, integration, develop, control, clearance, hardware

Topic  5
student, faculty, university, program, graduate, include, participation, service, course, admission

Topic  6
learning, machine, model, language, computer, field, python, science, business, phd

Topic  7
development, lead, develop, strategy, leadership, support, role, drug, technology, candidate

Topic  8
engineering, degree, discipline, master, bachelor, accredit, abet, phd, science, technology

Topic  9
analysis, model, study, performance, method, modeling, use, perform, result, datum

Topic  10
datum, data, analytic, science, source, technology, use, analyze, analysis, dat



In [276]:
doc_topic_bullets = nmf.transform(dtm_bullets)

In [279]:
doc_topic_df = pd.DataFrame(doc_topic_bullets.round(5), 
                            index = df_bullets.uid, columns = ["engineering_experience",
                                                         "higher_ed_qualifications",
                                                         "data_science_skills",
                                                         "clinical/medical_experience",
                                                         "academic_reserach_experience",
                                                         "business/management_experience",
                                                         "stem_education_qualifications", 
                                                         "security_clearance_qualifications",
                                                         "application_materials",
                                                         "software_engineering_skills",
                                                         "university_admin_experience",
                                                         "benefits", "admin_skills",
                                                         "certifications_qualifications",
                                                         "machine/deep_learning_skills",
                                                         "engineering_skills",
                                                         "clinical_licensing_qualifications",
                                                         "software_engineering_experience"])


In [280]:
doc_topic_df.head()

Unnamed: 0_level_0,engineering_experience,higher_ed_qualifications,data_science_skills,clinical/medical_experience,academic_reserach_experience,business/management_experience,stem_education_qualifications,security_clearance_qualifications,application_materials,software_engineering_skills,university_admin_experience,benefits,admin_skills,certifications_qualifications,machine/deep_learning_skills,engineering_skills,clinical_licensing_qualifications,software_engineering_experience
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
48022361,0.0,0.00722,0.0,0.00389,0.05326,0.00489,0.0,0.01757,0.02833,0.0,0.06381,0.00538,0.09678,0.09041,0.0,0.0141,0.0,0.00554
47983977,0.0,0.00722,0.0,0.00389,0.05326,0.00489,0.0,0.01757,0.02833,0.0,0.06381,0.00538,0.09678,0.09041,0.0,0.0141,0.0,0.00554
48973936,0.0,0.0,0.04843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.97913,0.0,0.0,0.0,0.0,0.0,0.20369,0.0
39232138,0.0,0.02193,0.07156,0.0,0.03983,0.0,0.0,0.00758,0.0103,0.0,0.0,0.00049,0.1675,0.0,0.0,0.0,0.0,0.0
49810371,0.0,0.00101,0.0,0.0,0.0,0.00098,0.0,0.0,0.00015,0.0,0.0,0.0,0.00493,0.0,0.0012,0.01557,0.0,3e-05


In [292]:
doc_topic_df.shape

(6063, 18)

In [285]:
df_smaller = df_bullets.loc[:, 'title':'tags']

In [286]:
df_smaller.head()

Unnamed: 0,title,company,location,uid,tags
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry..."
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry..."
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]"
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program..."
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat..."


In [291]:
df_smaller.shape

(6063, 5)

In [305]:
df_smaller.uid.value_counts()

33687104     3
45511502     2
50299808     2
50301219     2
50364839     2
            ..
50160288     1
50180523     1
50197064     1
50211756     1
49806221     1
Name: uid, Length: 5179, dtype: int64

In [298]:
new_df = pd.merge(df_smaller, doc_topic_df, how='inner', on='uid')

In [299]:
new_df.head()

Unnamed: 0,title,company,location,uid,tags,engineering_experience,higher_ed_qualifications,data_science_skills,clinical/medical_experience,academic_reserach_experience,...,application_materials,software_engineering_skills,university_admin_experience,benefits,admin_skills,certifications_qualifications,machine/deep_learning_skills,engineering_skills,clinical_licensing_qualifications,software_engineering_experience
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",0.0,0.00722,0.0,0.00389,0.05326,...,0.02833,0.0,0.06381,0.00538,0.09678,0.09041,0.0,0.0141,0.0,0.00554
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",0.0,0.00722,0.0,0.00389,0.05326,...,0.02833,0.0,0.06381,0.00538,0.09678,0.09041,0.0,0.0141,0.0,0.00554
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",0.0,0.0,0.04843,0.0,0.0,...,0.0,0.0,0.97913,0.0,0.0,0.0,0.0,0.0,0.20369,0.0
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...",0.0,0.02193,0.07156,0.0,0.03983,...,0.0103,0.0,0.0,0.00049,0.1675,0.0,0.0,0.0,0.0,0.0
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",0.0,0.00101,0.0,0.0,0.0,...,0.00015,0.0,0.0,0.0,0.00493,0.0,0.0012,0.01557,0.0,3e-05


In [300]:
new_df.shape

(7833, 23)

In [302]:
new_df.drop_duplicates(subset=['uid'], inplace=True)

In [303]:
new_df.shape

(5179, 23)

In [511]:
## Try modeling with just nouns and proper nouns
## Make Count Vectorizer of bullet points
## Only include words that appear in at least 20 docs
## Exclude terms that occur in 95 percent of documents
cv = CountVectorizer(min_df = .003, max_df = .50)
X = cv.fit_transform(df_bullets['merged_n'])
dtm_bullets_n = pd.DataFrame(X.toarray(), columns = cv.get_feature_names())

In [512]:
## Finding number for just nouns and proper nouns
nmf = NMF(20)

nmf.fit(dtm_bullets_n)



NMF(n_components=20)

In [513]:
top_10_in_topics_N_V_PN = display_topics(nmf, cv.get_feature_names(), 10)


Topic  1
project, management, manage, lead, issue, ensure, stakeholder, leadership, activity, business

Topic  2
system, analysis, requirement, performance, architecture, systems, tool, integration, control, modeling

Topic  3
learning, machine, model, field, language, computer, science, python, business, algorithm

Topic  4
science, meeting, project, problem, result, laboratory, method, provide, technology, technique

Topic  5
student, program, graduate, faculty, office, policy, course, event, support, admission

Topic  6
engineering, bachelor, discipline, master, program, accredit, science, technology, abet, clearance

Topic  7
datum, analysis, data, analytic, use, source, science, method, technology, model

Topic  8
risk, management, business, visa, assessment, identify, control, understanding, provide, payment

Topic  9
product, customer, technology, market, feature, support, solution, need, requirement, define

Topic  10
health, care, service, family, provide, patient, staff, pra



### Try bullets with TFIDF

In [556]:
## Nouns, Proper Nouns, and Verbs
#tfidf = TfidfVectorizer(min_df = .01, max_df = .90)
tfidf = TfidfVectorizer(min_df = .003)
X = tfidf.fit_transform(df_bullets['merged_n_v'])
tfidf_bullets_n_v = pd.DataFrame(X.toarray(), columns = tfidf.get_feature_names())



In [557]:
tfidf_bullets_n_v

Unnamed: 0,178c,19,1st,2d,3d,401,401k,abbvie,abet,ability,...,year,years,yield,york,yr,yrs,zemax,zone,zoom,zs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042840,...,0.009454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092118,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042840,...,0.009454,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092118,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.104340,...,0.009211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066433,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028661,...,0.025300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.046349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009845,...,0.060837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
5177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083633,...,0.024609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


In [558]:
tfidf_bullets_n_v.shape

(5179, 2903)

In [559]:
## Finding number of topics for nouns and verbs
nmf_tfidf = NMF(17)

nmf_tfidf.fit(tfidf_bullets_n_v)



NMF(n_components=17)

In [560]:
top_10_in_topics_N_V_PN = display_topics(nmf_tfidf, tfidf.get_feature_names(), 10)


Topic  1
engineering, system, clearance, dod, security, discipline, requirement, stem, degree, systems

Topic  2
learning, machine, model, language, ml, experience, computer, python, algorithm, programming

Topic  3
research, journal, publication, field, conference, peer, funding, review, experience, publish

Topic  4
client, product, business, visa, work, market, payment, sale, jd, partner

Topic  5
statement, optional, diversity, letter, equity, cover, vitae, curriculum, inclusion, teaching

Topic  6
cell, biology, assay, culture, immunology, protein, experience, mouse, biochemistry, cytometry

Topic  7
datum, analytic, source, analysis, data, love, model, know, technology, machine

Topic  8
experience, year, degree, bachelor, phd, work, masters, master, require, jd

Topic  9
shift, research, equipment, specialization, academic, postdoctoral, maintenance, type, job, protocol

Topic  10
software, system, architecture, development, computer, experience, code, cloud, security, programm



In [570]:
len(df_bullets.merged_n[0])

3759

In [571]:
len(df_bullets.merged_n_v[0])

4517

In [596]:
## Nouns and Proper Nouns
#tfidf = TfidfVectorizer(min_df = .003)
tfidf = TfidfVectorizer(min_df = .003, max_df = .95)
X = tfidf.fit_transform(df_bullets['merged_n'])
tfidf_bullets_n = pd.DataFrame(X.toarray(), columns = tfidf.get_feature_names())

In [597]:
tfidf_bullets_n.shape

(5179, 2553)

In [602]:
## Finding number of topics for nouns and verbs
## Wow these categories are nonsense.
#nmf_tfidf_n = NMF(15)
#nmf_tfidf_n = NMF(17)
nmf_tfidf_n = NMF(18)
#nmf_tfidf_n = NMF(20)

nmf_tfidf_n.fit(tfidf_bullets_n)



NMF(n_components=18)

In [603]:
top_10_in_topics_N_PN = display_topics(nmf_tfidf_n, tfidf.get_feature_names(), 10)


Topic  1
experience, year, degree, work, bachelor, masters, phd, jd, md, master

Topic  2
learning, machine, model, experience, algorithm, ml, computer, language, business, python

Topic  3
research, journal, publication, peer, funding, field, laboratory, conference, phd, ability

Topic  4
client, business, visa, mba, work, marketing, team, payment, project, sale

Topic  5
statement, optional, diversity, letter, equity, cover, inclusion, curriculum, vitae, teaching

Topic  6
cell, biology, culture, immunology, assay, protein, experience, laboratory, mouse, biochemistry

Topic  7
datum, analytic, analysis, data, source, science, experience, sql, python, model

Topic  8
student, faculty, program, university, course, teaching, graduate, education, campus, participation

Topic  9
shift, equipment, research, specialization, academic, postdoctoral, maintenance, type, job, protocol

Topic  10
engineering, clearance, system, dod, year, stem, experience, discipline, degree, security

Topic  11



### Turn each doc into score in each of 18 categories, merge with title/company/location info using UID column, then try to predict academic or non academic (academic = 'University' in company)

In [605]:
doc_topic_matrix = nmf_tfidf_n.transform(tfidf_bullets_n)

In [606]:
doc_topic_nmf_tfidf_n = pd.DataFrame(doc_topic_matrix.round(5), 
                            index = df_bullets.uid, columns = ["degree_or_experience",
                                                               "machine_learning",
                                                               "academic_research",
                                                               "business_or_marketing",
                                                               "application_materials",
                                                               "bio_or_med_lab",
                                                               "data_science",
                                                               "teaching",
                                                               "experiments",
                                                               "security_clearance",
                                                               "risk_assessment",
                                                               "benefits",
                                                               "software_engineering",
                                                               "project_management",
                                                               "product_development",
                                                               "engineering",
                                                               "employer_requirements",
                                                               "speech_recognition"])

In [607]:
doc_topic_nmf_tfidf_n.head()

Unnamed: 0_level_0,degree_or_experience,machine_learning,academic_research,business_or_marketing,application_materials,bio_or_med_lab,data_science,teaching,experiments,security_clearance,risk_assessment,benefits,software_engineering,project_management,product_development,engineering,employer_requirements,speech_recognition
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
48022361,0.0,0.0,0.01878,0.0,0.0,0.0,0.00552,0.18616,0.00366,0.00957,0.00284,0.00545,0.00078,0.05476,0.00016,0.00032,0.00157,0.00777
47983977,0.0,0.0,0.01878,0.0,0.0,0.0,0.00552,0.18616,0.00366,0.00957,0.00284,0.00545,0.00078,0.05476,0.00016,0.00032,0.00157,0.00777
48973936,0.0,0.00459,0.02206,0.0,0.01601,0.0,0.0,0.25397,0.0,0.0,0.0,0.02368,0.0,0.0,0.0,0.0,0.00023,0.0
39232138,0.0,0.11196,0.0,0.0,0.0,0.0,0.06987,0.00745,0.0,0.0,0.0003,0.0,0.0,0.02286,0.0,0.01745,0.0,0.0
49810371,0.0,0.00282,0.0,0.0,0.00103,0.05397,0.0,0.0,0.0,0.01058,0.0,0.0,0.0,0.0301,0.00386,0.00767,0.01534,0.0


In [608]:
doc_topic_nmf_tfidf_n.shape

(5179, 18)

In [609]:
df_smaller = df_bullets.loc[:, 'title':'tags']

In [610]:
df_smaller.head()

Unnamed: 0,title,company,location,uid,tags
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry..."
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry..."
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]"
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program..."
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat..."


In [611]:
df_smaller.shape

(5179, 5)

In [612]:
prediction_df = pd.merge(df_smaller, doc_topic_nmf_tfidf_n, how='inner', on='uid')

In [613]:
prediction_df.head()

Unnamed: 0,title,company,location,uid,tags,degree_or_experience,machine_learning,academic_research,business_or_marketing,application_materials,...,experiments,security_clearance,risk_assessment,benefits,software_engineering,project_management,product_development,engineering,employer_requirements,speech_recognition
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",0.0,0.0,0.01878,0.0,0.0,...,0.00366,0.00957,0.00284,0.00545,0.00078,0.05476,0.00016,0.00032,0.00157,0.00777
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",0.0,0.0,0.01878,0.0,0.0,...,0.00366,0.00957,0.00284,0.00545,0.00078,0.05476,0.00016,0.00032,0.00157,0.00777
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",0.0,0.00459,0.02206,0.0,0.01601,...,0.0,0.0,0.0,0.02368,0.0,0.0,0.0,0.0,0.00023,0.0
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...",0.0,0.11196,0.0,0.0,0.0,...,0.0,0.0,0.0003,0.0,0.0,0.02286,0.0,0.01745,0.0,0.0
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",0.0,0.00282,0.0,0.0,0.00103,...,0.0,0.01058,0.0,0.0,0.0,0.0301,0.00386,0.00767,0.01534,0.0


In [614]:
prediction_df.shape

(5179, 23)

In [615]:
## Pickle this dataframe!
prediction_df.to_pickle('job_titles_plus_topics')

## Scattertext

In [616]:
df_bullets.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,...,uid_cleaned,spacy_title,uid_len,pn_in_title,pn_in_title_merged,bullets_merged_spacy,lemmatized_n_v,merged_n_v,lemmatized_n,merged_n
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,48022361,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid,"(Maintains, appropriate, office, systems, ,, w...","[maintain, office, system, word, processing, d...",maintain office system word processing databas...,"[maintain, office, system, word, processing, d...",maintain office system word processing databas...
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,47983977,"(PhD, Program, &, Lab, Manager, (, Hybrid, ), ...",9,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid,"(Maintains, appropriate, office, systems, ,, w...","[maintain, office, system, word, processing, d...",maintain office system word processing databas...,"[maintain, office, system, word, processing, d...",maintain office system word processing databas...
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,...,48973936,"(Assistant, Professor, -, Clinical, PhD, Progr...",9,"[Assistant, Professor, Clinical, Program, Los,...",Assistant Professor Clinical Program Los Angeles,"(Active, scholarship, and/or, relevant, teachi...","[scholarship, teaching, experience, African, A...",scholarship teaching experience African Americ...,"[scholarship, teaching, experience, African, A...",scholarship teaching experience African Americ...
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,...,39232138,"(Data, Scientist, /, Statistician, Intern, (, ...",9,"[Data, Scientist, Statistician, Intern, BS, MS...",Data Scientist Statistician Intern BS MS PhD S...,"(Create, predictive, models, by, mining, compl...","[create, model, mine, datum, formulating, test...",create model mine datum formulating testing in...,"[model, datum, formulating, testing, insight, ...",model datum formulating testing insight Implem...
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,...,49810371,"(PhD, Scientist, Internship, job)",9,"[Scientist, Internship]",Scientist Internship,"(Express, recombinant, proteins, in, microbial...","[express, protein, expression, system, scale, ...",express protein expression system scale range ...,"[protein, expression, system, scale, hundred, ...",protein expression system scale hundred liter ...


In [617]:
df_bullets['academic'] = df_bullets.company.str.contains('University|College')

In [618]:
df_bullets[df_bullets.academic == True].shape

(763, 27)

In [626]:
df_bullets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5179 entries, 0 to 6707
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 5179 non-null   object 
 1   company               5179 non-null   object 
 2   location              5179 non-null   object 
 3   uid                   5179 non-null   object 
 4   tags                  5179 non-null   object 
 5   description           5179 non-null   object 
 6   bullets               5179 non-null   object 
 7   headers               5179 non-null   object 
 8   phd_tag               5179 non-null   bool   
 9   tag_number            5179 non-null   int64  
 10  jd_length_char        5179 non-null   int64  
 11  header_number         5179 non-null   int64  
 12  bullets_number        5179 non-null   int64  
 13  bullets_length_char   5179 non-null   int64  
 14  bullets_percentage    5179 non-null   float64
 15  bullets_merged       

In [629]:
df_bullets = df_bullets.astype({"academic": str}, errors='raise') 

In [630]:
df_bullets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5179 entries, 0 to 6707
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 5179 non-null   object 
 1   company               5179 non-null   object 
 2   location              5179 non-null   object 
 3   uid                   5179 non-null   object 
 4   tags                  5179 non-null   object 
 5   description           5179 non-null   object 
 6   bullets               5179 non-null   object 
 7   headers               5179 non-null   object 
 8   phd_tag               5179 non-null   bool   
 9   tag_number            5179 non-null   int64  
 10  jd_length_char        5179 non-null   int64  
 11  header_number         5179 non-null   int64  
 12  bullets_number        5179 non-null   int64  
 13  bullets_length_char   5179 non-null   int64  
 14  bullets_percentage    5179 non-null   float64
 15  bullets_merged       

In [644]:
corpus = st.CorpusFromPandas(df_bullets, category_col = 'academic', text_col = 'bullets_merged_spacy', nlp = nlp).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))

In [645]:
html = st.produce_scattertext_explorer(corpus, category='True', category_name='Academic', 
                                       not_category_name='Industry', width_in_pixels=1000)


In [646]:
#open('./example.html', 'w').write(html)
open("academic_vs_industry.html", 'wb').write(html.encode('utf-8'))

11990153

In [787]:
## Let's try with just the nouns and proper nouns
## First let's only use words longer than 1 character to get rid of some weird abbreviations
def drop_shortest_words(sentence):
    new_sentence = []
    for i in range(len(sentence)):
        if re.search('www.cu.edu', sentence[i]):
            #print(sentence[i])
            pass
        elif re.search('phd|PHD|PhD', sentence[i]):
            #print(sentence[i])
            pass
        elif len(sentence[i]) > 1:
            new_sentence.append(sentence[i])
        else:
            pass
    return(new_sentence)
    

In [788]:
test = df_bullets.lemmatized_n[3]
test_result = drop_shortest_words(test)
print(len(test_result))
print(len(df_bullets.lemmatized_n[3]))

97
106


In [789]:
df_bullets['n_short_dropped'] = df_bullets.lemmatized_n.apply(lambda x: drop_shortest_words(x))

In [790]:
df_bullets['merged_n_short_dropped'] = df_bullets.n_short_dropped.apply(lambda x: ' '.join([str(sentence) for sentence in x]))

In [791]:
df_bullets.head()

Unnamed: 0,title,company,location,uid,tags,description,bullets,headers,phd_tag,tag_number,...,pn_in_title,pn_in_title_merged,bullets_merged_spacy,lemmatized_n_v,merged_n_v,lemmatized_n,merged_n,academic,merged_n_short_dropped,n_short_dropped
0,PhD Program & Lab Manager (Hybrid) job,Mass General Brigham,"Charlestown, MA",48022361,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid,"(Maintains, appropriate, office, systems, ,, w...","[maintain, office, system, word, processing, d...",maintain office system word processing databas...,"[maintain, office, system, word, processing, d...",maintain office system word processing databas...,False,maintain office system word processing databas...,"[maintain, office, system, word, processing, d..."
1,PhD Program & Lab Manager (Hybrid) job,Massachusetts General Hospital,"Charlestown, MA",47983977,"[PhD, Masters Degree, Bachelor's Degree, Entry...",DescriptionJob Summary:The School of Health an...,"[Maintains appropriate office systems, word pr...","[Job Summary:, Job Duties:, Program Manager Ro...",False,9,...,"[Program, Lab, Manager, Hybrid]",Program Lab Manager Hybrid,"(Maintains, appropriate, office, systems, ,, w...","[maintain, office, system, word, processing, d...",maintain office system word processing databas...,"[maintain, office, system, word, processing, d...",maintain office system word processing databas...,False,maintain office system word processing databas...,"[maintain, office, system, word, processing, d..."
2,Assistant Professor - Clinical PhD Program - L...,Bertelsmann,"Los Angeles, CA",48973936,"[PhD, Fortune500, Media, Junior, Full-Time]",Assistant Professor - Clinical PhD Program - L...,[Active scholarship and/or relevant teaching e...,"[Our Mission:, Our Vision:, Duties & Responsib...",False,5,...,"[Assistant, Professor, Clinical, Program, Los,...",Assistant Professor Clinical Program Los Angeles,"(Active, scholarship, and/or, relevant, teachi...","[scholarship, teaching, experience, African, A...",scholarship teaching experience African Americ...,"[scholarship, teaching, experience, African, A...",scholarship teaching experience African Americ...,False,scholarship teaching experience African Americ...,"[scholarship, teaching, experience, African, A..."
3,Data Scientist / Statistician Intern (BS / MS ...,Lubrizol,"Hazelwood, DE",39232138,"[PhD, Bachelor's Degree, Data Science, Program...","Lubrizol, a Berkshire Hathaway company, is a m...",[Create predictive models by mining complex da...,[Data Scientist / Statistician Intern (BS/MS/P...,False,12,...,"[Data, Scientist, Statistician, Intern, BS, MS...",Data Scientist Statistician Intern BS MS PhD S...,"(Create, predictive, models, by, mining, compl...","[create, model, mine, datum, formulating, test...",create model mine datum formulating testing in...,"[model, datum, formulating, testing, insight, ...",model datum formulating testing insight Implem...,False,model datum formulating testing insight Implem...,"[model, datum, formulating, testing, insight, ..."
4,PhD Scientist Internship job,Danaher Corporation,"Madison, WI",49810371,"[PhD, Fortune500, Publicly Listed, Conglomerat...",Aldevron is an industry pioneer with a core co...,[Express recombinant proteins in microbial or ...,"[Job Summary:, Responsibilities:, Qualificatio...",False,10,...,"[Scientist, Internship]",Scientist Internship,"(Express, recombinant, proteins, in, microbial...","[express, protein, expression, system, scale, ...",express protein expression system scale range ...,"[protein, expression, system, scale, hundred, ...",protein expression system scale hundred liter ...,False,protein expression system scale hundred liter ...,"[protein, expression, system, scale, hundred, ..."


In [792]:
df_bullets.merged_n_short_dropped[2]

'scholarship teaching experience African American psychology Latinx psychology LGBTQ psychology Strong research statistic background Interest Health Multi Interest Option emphasis area ability principle inclusion diversity equity teaching scholarship interest teaching student community background ability community curriculum teaching philosophy evidence psychologist identity involvement psychology association interest community organization health service agency Learning Facilitating Classroom Distance Learning course preparation class curriculum coordination Assessment evaluation learner student selection orientation assessment advising student Field placement supervision coordination Research scholarship mentoring supervision Scholarship Scholarship/ research generation integration dissemination application knowledge contribution ’s field commensurate rank Service faculty engage service University Community Profession rank example category Service University faculty developing curric

In [647]:

corpus = st.CorpusFromPandas(df_bullets, 
                             category_col = 'academic', 
                             text_col = 'merged_n', nlp = nlp).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))


In [648]:
html = st.produce_scattertext_explorer(corpus, category='True', category_name='Academia', 
                                       not_category_name='Industry', width_in_pixels=1000)


In [649]:
open("academic_vs_industry_job_description_nouns.html", 'wb').write(html.encode('utf-8'))

6813905

In [793]:
## Make visualization with only nouns and proper nouns longer than 1 character
new_corpus = st.CorpusFromPandas(df_bullets, 
                             category_col = 'academic', 
                             text_col = 'merged_n_short_dropped', nlp = nlp).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))


In [794]:
new_html = st.produce_scattertext_explorer(new_corpus, category='True', category_name='Academia', 
                                       not_category_name='Industry', width_in_pixels=1000)


In [795]:
open("academic_vs_industry_job_description_nouns_long.html", 'wb').write(new_html.encode('utf-8'))

6773432