In [1]:
import glob
import pandas as pd
import numpy as np
import json

### Merge separate data files into one data frame `(df_main)`

In [2]:
df_master = pd.DataFrame()

files = glob.glob('../data_raw/*.csv')
files

['../data_raw/indeed-8-5.csv',
 '../data_raw/indeed-8-7.csv',
 '../data_raw/linkedin-8-5.csv',
 '../data_raw/linkedin-8-9.csv',
 '../data_raw/linkedin-8-8.csv',
 '../data_raw/indeed-8-8.csv',
 '../data_raw/indeed-8-9.csv']

In [3]:
for i, filepath in enumerate(files):
    df = pd.read_csv(filepath, encoding='latin1')
    filename_elements = filepath.split('/')[2].split('.')[0].split('-')  # splits up the filename to extract platform & search date info
    df['platform'] = filename_elements[0]  # first element is the platform
    df['search_date'] = f'{filename_elements[1]}-{filename_elements[2]}-2019'  # second/third elements are the month & date of the search
    
    frames = [df_master, df]
    df_master = pd.concat(frames, sort=True)

In [4]:
df_master = df_master.dropna(subset=['description'])  # Drop rows with null values in description
df_master.head(20)


Unnamed: 0,company,description,employment_type,full_desc,industries,job_function,location,platform,post_date,salary,search_city,search_date,search_role,seniority_level,title,url
0,Noom Inc.,"\n At Noom, we use scientifically proven meth...",,,,,"New York, NY",indeed,30+,,,8-5-2019,Data Scientist,,Data Engineer,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
1,Strategic Financial Solutions,\n Overview\nDo you love numbers and finding ...,,,,,"New York, NY 10018",indeed,18,,,8-5-2019,Data Scientist,,Lead Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2,Averity,\n We are one of the world's premiere travel ...,,,,,"New York, NY",indeed,12,"$175,000 - $225,000 a year",,8-5-2019,Data Scientist,,"Manager, Data Science",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
3,Lenmar Consulting,\n Medical Data Scientist \n Must have at lea...,,,,,"Woodcliff Lake, NJ",indeed,0,$75 an hour,,8-5-2019,Data Scientist,,Medical Data Scientist,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
4,Memorial Sloan Kettering,"\n <h2 class=""jobSectionHeader"">Company Overv...",,,,,"New York, NY 10017",indeed,16,,,8-5-2019,Data Scientist,,Sr Strategic Consultant / Data Scientist (Pati...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
5,Digitalogy,\n What you will do\n\nResponsible for assist...,,,,,"New York, NY",indeed,5,$50 - $80 an hour,,8-5-2019,Data Scientist,,Data Scientist,https://www.indeed.com/rc/clk?jk=83e3f665b22e5...
6,Viacom,"\n <h2 class=""jobSectionHeader"">Overview and ...",,,,,"New York, NY 10036",indeed,7,,,8-5-2019,Data Scientist,,Jr. Data Scientist,https://www.indeed.com/rc/clk?jk=1942a17d2bd16...
7,Butterfly Network,\n Job Description<br/>\n<br/>\nWe are lookin...,,,,,"New York, NY",indeed,3,,,8-5-2019,Data Scientist,,Data Scientist,https://www.indeed.com/rc/clk?jk=14bc4e97c18a1...
8,Celonis,\n Are you ready for a new challenge?\n \n <b...,,,,,"New York, NY 10036",indeed,2,,,8-5-2019,Data Scientist,,Data Scientist,https://www.indeed.com/rc/clk?jk=755c879e19b03...
9,Macy's,\n Job Overview:<br/>\n\n\nAs part of the Sup...,,,,,"New York, NY 10001",indeed,14,,,8-5-2019,Data Scientist,,Associate Data Scientist,https://www.indeed.com/rc/clk?jk=126152117b65e...


### Observations
- Inconsistency in which features are included. There are many null values throughout the data
- Very inconsistent/sparse salary column 

### Save master data frame

In [172]:
df_subset = df_master[['company', 'description', 'location', 'title', 'url']]
df_subset.to_csv("../data_processed/concatenated_data.csv", index=False)

### Group job titles into categories

In [25]:
all_job_titles = list(df_master['title'].unique())

# data scientists
data_scientist_count = 0

# engineers
data_engineer_count = 0
machine_learning_engineer_count = 0

# analysts
data_analyst_count = 0
business_analyst_count = 0

# research
ml_ai_research_count = 0
statistics_count = 0

job_title_dictionary = {}

for job_title in all_job_titles:
    if 'data' in job_title.lower() and 'scien' in job_title.lower():
        job_title_dictionary[job_title] = ['Data Scientist']
        data_scientist_count += 1
    if 'data' in job_title.lower() and 'engineer' in job_title.lower():
        data_engineer_count += 1
        if job_title in job_title_dictionary.keys():  # if the job title has already been given one of the previous labels, add on to it
            labels = job_title_dictionary[job_title]
            labels.append('Data Engineer')
            job_title_dictionary[job_title] = labels
        else:
            job_title_dictionary[job_title] = ['Data Engineer']
    if ('machine learning' in job_title.lower() or 'ml' in job_title.lower() or 'deep learning' in job_title.lower() or 'computer vision' in job_title.lower())\
        and ('engineer' in job_title.lower() or 'developer' in job_title.lower() or 'software' in job_title.lower()):
        machine_learning_engineer_count += 1
        if job_title in job_title_dictionary.keys():  # if the job title has already been given one of the previous labels, add on to it
            labels = job_title_dictionary[job_title]
            labels.append('Machine Learning Engineer')
            job_title_dictionary[job_title] = labels
        else:
            job_title_dictionary[job_title] = ['Machine Learning Engineer']
    if 'analy' in job_title.lower() and 'data' in job_title.lower():
        data_analyst_count += 1
        if job_title in job_title_dictionary.keys():  # if the job title has already been given one of the previous labels, add on to it
            labels = job_title_dictionary[job_title]
            labels.append('Data Analyst')
            job_title_dictionary[job_title] = labels
        else:
            job_title_dictionary[job_title] = ['Data Analyst']
    if ('analy' in job_title.lower() or 'intelli' in job_title.lower()) and ('business' in job_title.lower() or 'product' in job_title.lower()\
        or 'market' in job_title.lower() or 'strategy' in job_title.lower() or 'management' in job_title.lower()):
        business_analyst_count += 1
        if job_title in job_title_dictionary.keys():  # if the job title has already been given one of the previous labels, add on to it
            labels = job_title_dictionary[job_title]
            labels.append('Business Analyst')
            job_title_dictionary[job_title] = labels
        else:
            job_title_dictionary[job_title] = ['Business Analyst']
    if ('research' in job_title.lower() or 'data' not in job_title.lower()) and ('machine learning' in job_title.lower() or 'ml' in job_title.lower()\
        or 'ai' in job_title.lower() or 'artificial intelligence' in job_title.lower()):
        ml_ai_research_count += 1
        if job_title in job_title_dictionary.keys():  # if the job title has already been given one of the previous labels, add on to it
            labels = job_title_dictionary[job_title]
            labels.append('ML/AI Researcher')
            job_title_dictionary[job_title] = labels
        else:
            job_title_dictionary[job_title] = ['ML/AI Researcher']
    if ('statistic' in job_title.lower() or 'quant' in job_title.lower() or 'math' in job_title.lower())\
        and ('model' in job_title.lower() or 'research' in job_title.lower()):
        statistics_count += 1
        if job_title in job_title_dictionary.keys():  # if the job title has already been given one of the previous labels, add on to it
            labels = job_title_dictionary[job_title]
            labels.append('Statistical Modeler/Researcher')
            job_title_dictionary[job_title] = labels
        else:
            job_title_dictionary[job_title] = ['Statistical Modeler/Researcher']

with open('job_title_dictionary.json', 'w') as json_file:
    json.dump(job_title_dictionary, json_file)

In [26]:
count = 0
for job_title in all_job_titles:
    if job_title not in job_title_dictionary.keys():
        count += 1
        print(job_title)
count

Software Engineer 
decision scientist, Applied Analytics
Siri 
Graduate Studies Program 
Mid
Federal 
Data Architect 
Quantitative Finance Analyst 
Senior Biostatistician, Cardiology Projects 
Biologist/Water Resources Scientist 
Health Actuary 
Power Plant Data Integrator
Informatics Analyst 
Government Grants Analyst 
Quant Risk Analyst 
Crime Analyst, Level I 
Finance, Full Time Analyst (North America 
Retirement Plan Analyst 
Treasury Systems Analyst 
Information Security Analyst 
Crime Analyst 
Fraud Analyst 
Software Quality Assurance Analyst 
Commission Analyst 
Consumer Insights Analyst 
E
Program Analyst 
Quality Review Analyst NF
Analyst
Entry
Consultant 
Analyst 
Intelligence Analyst 
K
Policy and Planning Analyst 
Intel Analyst (ISA I â C) 
Strategy and Insights Manager 
Healix 
Platform Integration Manager 
Strategic Analyst 
Social Media Intelligence Analyst 
Sr Engineer, Software 
DHS Intelligence Analyst 
PT job interviewer needed for cleaning company. Bi
INTERNSHIP (

907

In [118]:
# deprecated code - retaining these keywords for seniority
#if 'junior' in job_title.lower()\
#    or ('jr' in job_title.lower() and 'sr' not in job_title.lower())\
#    or 'intern' in job_title.lower():
#    job_title_dictionary[job_title] = ['Junior Data Scientist']
#    junior_data_scientist_count += 1
#elif 'senior' in job_title.lower()\
#    or 'sr' in job_title.lower()\
#    or 'lead' in job_title.lower()\
#    or 'chief' in job_title.lower()\
#    or 'principal' in job_title.lower()\
#    or 'manager' in job_title.lower():
#    job_title_dictionary[job_title] = ['Senior Data Scientist']
#    senior_data_scientist_count += 1