In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the job-postings DataFrame from a pickle file in the working directory
# (presumably the output of an earlier cleaning step — confirm with the author).
# NOTE(review): the extension is ".pk1" (digit one), not ".pkl" — confirm it matches the file on disk.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
df = pd.read_pickle("./cleaned_jobs.pk1")

Exploring the data and updating the dataframe by cleaning and parsing new features

1. Simplify job titles
2. Add seniority based on title
3. Correct state variable
4. Add length of Job description. Might indicate patterns
5. Parse and count competitors for the data 
6. Convert hourly wage to annual wage to make it consistent

Simplify job titles and add seniority

In [3]:
# Define function to parse and simplify job titles

def simple_jobs(title):
    """Map a raw job title onto one of a few coarse role labels.

    The title is lower-cased and searched for each known role keyword.
    When several keywords occur in the same title, the one listed last
    in the keyword tuple wins. Returns 'na' when no keyword occurs.
    """
    # Ordered from lowest to highest precedence.
    role_keywords = ('data scientist', 'machine learning', 'data engineer', 'manager', 'director')
    lowered = title.lower()

    # Scan from the highest-precedence keyword down and stop at the first hit.
    for keyword in reversed(role_keywords):
        if keyword in lowered:
            return keyword

    return 'na'

def seniority(title):
    """Classify a job title as 'senior', 'junior' or 'na'.

    The full words 'senior', 'lead', 'principal' and 'junior' are matched
    anywhere in the lower-cased title. The abbreviations 'sr' and 'jr' are
    matched only as stand-alone tokens, so that titles which merely contain
    those letters inside another word (e.g. "Israel") are not misclassified.
    Senior indicators take precedence over junior ones.
    """
    lowered = title.lower()

    # Replace every non-alphanumeric character with a space so that forms
    # such as "Sr." or "Jr," split into clean 'sr' / 'jr' tokens.
    tokens = set(''.join(ch if ch.isalnum() else ' ' for ch in lowered).split())

    if 'sr' in tokens or any(word in lowered for word in ('senior', 'lead', 'principal')):
        return 'senior'
    if 'jr' in tokens or 'junior' in lowered:
        return 'junior'
    return 'na'
        

In [4]:
# Derive the two title-based features by running each parser over every job title.
df['job_simplified'] = df['Job Title'].map(simple_jobs)
df['seniority'] = df['Job Title'].map(seniority)

In [5]:
# Frequency of each simplified job label.
# NOTE(review): this is not the last expression in the cell, so its result is
# not rendered; wrap it in display(...) or move it to its own cell if it should be shown.
df['job_simplified'].value_counts()
# Frequency of each seniority label — this is the table the cell displays.
df['seniority'].value_counts()

na        519
senior    220
junior      3
Name: seniority, dtype: int64

Update the job state variable

In [6]:
# Frequency of each headquarters value.
# NOTE(review): this is not the last expression in the cell, so its result is
# not rendered; wrap it in display(...) or move it to its own cell if it should be shown.
df['Headquarters'].value_counts()
# Frequency of each job_state value — this is the table the cell displays.
df['job_state'].value_counts()

 CA             151
 MA             103
 NY              72
 VA              41
 IL              40
 MD              35
 PA              33
 TX              28
 NC              21
 WA              21
 NJ              17
 FL              16
 OH              14
 TN              13
 CO              11
 DC              11
 IN              10
 UT              10
 WI              10
 AZ               9
 MO               9
 AL               8
 DE               6
 KY               6
 MI               6
 GA               6
 IA               5
 CT               5
 NE               4
 OR               4
 LA               4
 KS               3
 NM               3
 MN               2
 ID               2
 SC               1
 RI               1
 Los Angeles      1
Name: job_state, dtype: int64

In [7]:
# Normalise the state codes: strip surrounding whitespace and map the
# 'Los Angeles' entry (a city, not a state) to its state code, 'CA'.

def _clean_state(raw_state):
    """Return raw_state without surrounding whitespace; 'los angeles' (any case) becomes 'CA'."""
    trimmed = raw_state.strip()
    if trimmed.lower() == 'los angeles':
        return 'CA'
    return trimmed

df['job_state'] = df['job_state'].apply(_clean_state)

# Re-check the counts after the correction.
df['job_state'].value_counts()

CA    152
MA    103
NY     72
VA     41
IL     40
MD     35
PA     33
TX     28
NC     21
WA     21
NJ     17
FL     16
OH     14
TN     13
DC     11
CO     11
UT     10
WI     10
IN     10
MO      9
AZ      9
AL      8
DE      6
MI      6
GA      6
KY      6
IA      5
CT      5
OR      4
LA      4
NE      4
KS      3
NM      3
MN      2
ID      2
RI      1
SC      1
Name: job_state, dtype: int64

Add length of job descriptions


In [8]:
# Store the character count of every job description as a new feature.
df['job_desc_len'] = df['Job Description'].map(len)

df['job_desc_len']

0      2536
1      4783
2      3461
3      3883
4      2728
       ... 
950    6162
951    6130
952    3078
953    1642
955    3673
Name: job_desc_len, Length: 742, dtype: int64

Parse and count competitors

In [9]:
# Inspect the raw competitor column before parsing it.
# Per the output below, entries are comma-separated names, with '-1' where none are given.

df['Competitors']

0                                                     -1
1                                                     -1
2                                                     -1
3      Oak Ridge National Laboratory, National Renewa...
4                   Commerce Signals, Cardlytics, Yodlee
                             ...                        
950                           Pfizer, AstraZeneca, Merck
951                      See Tickets, TicketWeb, Vendini
952                                                   -1
953                                                   -1
955                                                   -1
Name: Competitors, Length: 742, dtype: object

In [10]:
# Count the comma-separated names in each 'Competitors' entry.
# The placeholder string '-1' (value not available) is counted as 0.

def _count_competitors(entry):
    """Return the number of comma-separated names in entry, or 0 for the '-1' placeholder."""
    if entry == '-1':
        return 0
    return len(entry.split(','))

df['num_competitors'] = df['Competitors'].apply(_count_competitors)

df['num_competitors']

0      0
1      0
2      0
3      3
4      3
      ..
950    3
951    3
952    0
953    0
955    0
Name: num_competitors, Length: 742, dtype: int64

Convert hourly wages to annual

Assumption: 40 hours per week and 52 weeks per year. Since all salaries are in thousands, we divide the annualized figure by 1,000 to keep it consistent.



In [11]:
# List the column names.
# NOTE(review): this is not the last expression in the cell, so its result is
# not rendered; wrap it in display(...) or move it to its own cell if it should be shown.
df.columns.values

# Count rows per value of the hourly flag — this is the table the cell displays.
df['hourly flag'].value_counts()

0    718
1     24
Name: hourly flag, dtype: int64

In [12]:
# Put every salary on the same annual scale.
# Rows whose 'hourly flag' is 0 are kept as-is; all other rows are treated as
# hourly wages and annualised. Salaries are expressed in thousands, so the
# annual amount (hours/week * weeks/year * wage) is divided by 1,000.
HOURS_PER_WEEK = 40
WEEKS_PER_YEAR = 52
HOURLY_TO_ANNUAL_THOUSANDS = HOURS_PER_WEEK * WEEKS_PER_YEAR / 1000

is_annual = df['hourly flag'] == 0

for salary_col in ('min_salary', 'max_salary'):
    # Series.where keeps values where the condition holds and takes the
    # replacement elsewhere — a vectorised form of the per-row if/else.
    df[salary_col + '_updated'] = df[salary_col].where(
        is_annual, df[salary_col] * HOURLY_TO_ANNUAL_THOUSANDS
    )

# Drop the original salary columns now that the normalised versions exist.
# NOTE: re-running this cell after the drop raises KeyError because the source
# columns are gone; reload df first.
df = df.drop(columns=['min_salary', 'max_salary'])

In [13]:
# Confirm the final column set: the '*_updated' salary columns are present
# and the original 'min_salary' / 'max_salary' columns have been dropped.
df.columns.values

array(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Headquarters', 'Size', 'Founded',
       'Type of ownership', 'Industry', 'Sector', 'Revenue',
       'Competitors', 'hourly flag', 'employer provided flag',
       'company_txt', 'job_state', 'hq_state', 'job_in_HQ_flag',
       'age_company', 'sas_flag', 'spark_flag', 'python_flag',
       'matlab_flag', 'tensorflow_flag', 'tableau_flag', 'r_flag',
       'aws_flag', 'hadoop_flag', 'job_simplified', 'seniority',
       'job_desc_len', 'num_competitors', 'min_salary_updated',
       'max_salary_updated'], dtype=object)

# Understanding final data frame and creating visualizations