In [214]:
!pip install -r requirements.txt



In [None]:
# standard library imports
import json
import re

# third party imports
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\willi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
# Using df = pd.read_json("./data/careerbuilder_usa.ldjson", lines=True, orient='records') did not work due to missing newline characters
# Hence, I used the following approach
jsonls = []
with open('./data/careerbuilder_usa.ldjson', 'r') as f:
    jsonls = []
    for i, line in enumerate(f):
        l = line.replace('}{"', '}\n{"')
        for newline in l.split('\n')[:-1]:
            jsonls.append(json.loads(newline))

df_raw = pd.DataFrame(jsonls)

In [7]:
df_raw.columns

Index(['uniq_id', 'crawl_timestamp', 'url', 'job_title', 'company_name',
       'city', 'state', 'country', 'post_date', 'job_description', 'job_type',
       'inferred_salary_time_unit', 'company_description', 'salary_offered',
       'job_board', 'geo', 'job_post_lang', 'valid_through',
       'html_job_description', 'inferred_iso2_lang_code',
       'inferred_iso3_lang_code', 'site_name', 'domain', 'postdate_yyyymmdd',
       'has_expired', 'last_expiry_check_date', 'latest_expiry_check_date',
       'duplicate_status', 'postdate_in_indexname_format', 'inferred_city',
       'inferred_state', 'inferred_country', 'fitness_score',
       'inferred_salary_from', 'inferred_salary_to',
       'inferred_salary_currency', 'is_consumed_job', 'job_requirements',
       'contact_email', 'test_contact_email'],
      dtype='object')

In [8]:
# Focus on columns that may be useful for uncovering skills trends in data science
df = df_raw[['job_title', 'job_description', 'company_description', 'job_requirements']]
display(df.head())

# Replace empty company description with null value, then drop columns with >80% null values
df.loc[:,'company_description'] = df['company_description'].replace('', None)
df = df.dropna(thresh=int(0.8*df.shape[0]), axis=1)
print("After dropping null/empty values:")
df.head()

Unnamed: 0,job_title,job_description,company_description,job_requirements
0,Asphalt/Concrete Senior Project Manager,SR. PROJECT MANAGER WANTED!!! My client is a M...,,
1,Amazon Warehouse Team - Full Time,"Shifts: Overnight, Early Morning, Day, Evening...",,
2,Amazon Warehouse Associate - Morning Shifts Av...,"Shifts: Early Morning, Day, Evening Location: ...",,
3,Assembly Electrical,Do you pride yourself on attention to detail a...,,
4,Graphics Designer,In your role as Graphics Designer for Alaska C...,,


After dropping null/empty values:


Unnamed: 0,job_title,job_description
0,Asphalt/Concrete Senior Project Manager,SR. PROJECT MANAGER WANTED!!! My client is a M...
1,Amazon Warehouse Team - Full Time,"Shifts: Overnight, Early Morning, Day, Evening..."
2,Amazon Warehouse Associate - Morning Shifts Av...,"Shifts: Early Morning, Day, Evening Location: ..."
3,Assembly Electrical,Do you pride yourself on attention to detail a...
4,Graphics Designer,In your role as Graphics Designer for Alaska C...


In [9]:
has_data = df['job_title'].str.lower().str.contains('data')
not_data_management = ~df['job_title'].str.lower().str.contains('data entr') \
    & ~df['job_title'].str.lower().str.contains('engin') \
    & ~df['job_title'].str.lower().str.contains('archi') \
    & ~df['job_title'].str.lower().str.contains('admin')
subset_df = df[(has_data & not_data_management)]

In [10]:
subset_df

Unnamed: 0,job_title,job_description
439,Data Analyst,Ref ID: 01030 [ Phone number blocked ] Classif...
740,Lead Azure Data Factory/Logic Apps Developer,Lead Azure Data Factory/Logic Apps Developer P...
865,Database Developer,Ref ID: 02910-0011489692 Classification: Datab...
1578,Data Analyst - Point Mugu,JT4 provides engineering and technical support...
1744,Big Data Business Analyst,Important Note: During the application process...
...,...,...
28445,Data Analyst,Ref ID: 02380 [ Phone number blocked ] Classif...
28648,Buyside - Data Scientist/Research Analyst,We're currently working closely with a prestig...
28654,Business Initiatives Consultant 3 / Data Initi...,Important Note: During the application process...
28882,Data Team Manager,JOB OVERVIEW: Capital Insurance Group is seeki...


In [90]:
standardisation_dict = {
    "natural language processing": "nlp",
    "machine learning": "ml",
    "artificial intelligence": "ai",
    "google cloud platform": "gcp",
    "google cloud": "gcp",
    "amazon web services": "aws",
    "microsoft azure": "azure",
    "big data": "big_data"
}
def clean_text(text):
    text = text.lower()
    #remove punctuations
    text = re.sub(r'[,\.!?":\[\]\(\)-;•\*\$]', '', text)
    #remove numbers
    text = re.sub(r'\d+', ' ', text)
    for key, value in standardisation_dict.items():
        text = text.replace(key, value)
    return text

In [91]:
subset_df['cleaned_jd'] = subset_df['job_description'].apply(clean_text)
subset_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['cleaned_jd'] = subset_df['job_description'].apply(clean_text)


Unnamed: 0,job_title,job_description,cleaned_jd,tokens
439,Data Analyst,Ref ID: 01030 [ Phone number blocked ] Classif...,ref id phone number blocked classification ...,"[ref, id, phone, number, blocked, classificati..."
740,Lead Azure Data Factory/Logic Apps Developer,Lead Azure Data Factory/Logic Apps Developer P...,lead azure data factorylogic apps developer pr...,"[lead, azure, data, factory, logic, apps, deve..."
865,Database Developer,Ref ID: 02910-0011489692 Classification: Datab...,ref id classification database developer comp...,"[ref, id, classification, database, developer,..."
1578,Data Analyst - Point Mugu,JT4 provides engineering and technical support...,jt provides engineering and technical support ...,"[jt, provides, engineering, technical, support..."
1744,Big Data Business Analyst,Important Note: During the application process...,important note during the application process ...,"[important, note, application, process, ensure..."
...,...,...,...,...
28445,Data Analyst,Ref ID: 02380 [ Phone number blocked ] Classif...,ref id phone number blocked classification ...,"[ref, id, phone, number, blocked, classificati..."
28648,Buyside - Data Scientist/Research Analyst,We're currently working closely with a prestig...,we're currently working closely with a prestig...,"[currently, working, closely, prestigious, inv..."
28654,Business Initiatives Consultant 3 / Data Initi...,Important Note: During the application process...,important note during the application process ...,"[important, note, application, process, ensure..."
28882,Data Team Manager,JOB OVERVIEW: Capital Insurance Group is seeki...,job overview capital insurance group is seekin...,"[job, overview, capital, insurance, group, see..."


In [118]:
stop_words = stopwords.words('english') + ["n't", "job", "requirements", "experience", "skills", "years", "required", "must", "&", "'", "please", "careerbuilder", "work", "fargo"]
vectorizer = CountVectorizer(tokenizer=word_tokenize, stop_words=stop_words, max_features=5000)
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(vectorizer.fit_transform(subset_df['cleaned_jd']))

In [119]:
vocab = vectorizer.get_feature_names_out()
def get_top_words(model, feature_names, n_words=10):
    topics = []
    for comp in model.components_:
        word_idx = comp.argsort()[::-1][:n_words]
        top_words = [feature_names[i] for i in word_idx]
        topics.append(top_words)
    return topics

topics = get_top_words(lda, vocab, n_words=10)
for idx, words in enumerate(topics):
    print(f"Topic {idx}: {', '.join(words)}")

Topic 0: sql, data, server, knowledge, database, transplant, r, services, processes, etl
Topic 1: data, database, business, management, ability, development, may, modeling, andor, link
Topic 2: data, global, team, independently, use, operations, stakeholders, intelligence, ability, investment
Topic 3: ml, learning, models, deep, teaching, engineering, bcforward, ai, ’, analytics
Topic 4: facility, maintain, data, module, management, dmlssfm, support, maintenance, dmlss, facilities
Topic 5: data, business, team, management, ability, solutions, systems, analysis, support, technical
Topic 6: data, link, removed, technology, half, process, us, software, robert, research
Topic 7: data, wells, business, management, team, customers, st, information, analysis, financial
Topic 8: insurance, general, national, team, data, development, individuals, essential, status, protected
Topic 9: data, information, security, provide, applying, apply, report, database, use, team
