### Glassdoor data cleaning

In [1]:
# Imports
import pandas as pd
import pickle
import re
import unicodedata
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
with open('glass_jobs.json', 'rb') as data:
    df = pd.read_json(data)

In [3]:
df.head()

Unnamed: 0,description,header,salary_estimation,title
0,[We are seeking a Data Analyst to join our Peo...,"[Data Analyst, People Analytics 3.3 WeWork – N...","[$90,000]","[Data Analyst, People Analytics]"
1,[A data scientist at Kensho is passionate abou...,"[Data Scientist 5.0 Kensho – New York, NY 11 d...","[$148,000]",[Data Scientist]
2,"[OverviewAt New York Blood Center, one of the ...",[Data Analyst 2.9 New York Blood Center – New ...,"[$73,000]",[Data Analyst]
3,[DATA SCIENTISTAbout us ERGO Interactive is a ...,[Data Scientist 3.3 ERGO Interactive – New Yor...,"[$116,000]",[Data Scientist]
4,[Review mortgage applications and credit.Analy...,[],[],[]


In [4]:
for column in df:
    df[column] = df[column].str.get(0)

In [5]:
# Entries without titles do not contain info for the headers
df = df.dropna(subset=['title'])

In [6]:
df.shape

(6010, 4)

In [7]:
# descriptions include entire job posting by company. If there are duplicates
# they are definately the same job
df = df.drop_duplicates(subset=['description'])
df.shape

(5080, 4)

In [8]:
df.title = df.title.str.lower()
df.description = df.description.str.lower()
df.head()

Unnamed: 0,description,header,salary_estimation,title
0,we are seeking a data analyst to join our peop...,"Data Analyst, People Analytics 3.3 WeWork – Ne...","$90,000","data analyst, people analytics"
1,a data scientist at kensho is passionate about...,"Data Scientist 5.0 Kensho – New York, NY 11 da...","$148,000",data scientist
2,"overviewat new york blood center, one of the m...",Data Analyst 2.9 New York Blood Center – New Y...,"$73,000",data analyst
3,data scientistabout us ergo interactive is a m...,Data Scientist 3.3 ERGO Interactive – New York...,"$116,000",data scientist
5,intent media isn't your usual company. our wor...,"Data Scientist 4.7 Intent Media – New York, NY...","$140,000",data scientist


Focus is on data related jobs. First filter out any obvious job names that dont have to do with data

In [9]:
needed_jobs = 'data|learning|predictive|engineer|scientist|science|developer|analyst|\
               software|java|consultant|information|product|devops|analytics|intern'

needed_skills = 'python|sql|java|tableau|c/+/+|c/#|vba|pivot tables|ios|database|programmer'
    
df = df[(df.title.str.contains(needed_jobs)) | (df.description.str.contains(needed_skills))]
df.shape

(3201, 4)

This filter accounts for any jobs that would contain useful information to train the model. 

Next, I need to extract categorical features from the job descriptions and headers. 

These will then be made into dummy variables that can be used to train the model.

In [10]:
df = df.reset_index(drop=True)
# normalize annoying unicode format to allow for regex match
df.header = df.header.map(lambda x: unicodedata.normalize('NFKD', x))
df.header = df.header.map(lambda x: x.encode('utf-8'))
df.header = df.header.map(lambda x: re.sub("\xe2\x80\x93", "?", x))

In [11]:
def get_location(column):
    match_location = re.search(r'\w([a-zA-Z]+, [a-zA-z]+) ', column)
    return match_location.group()

In [12]:
# Only select the state from the header. If cities are specified, the model overfits majorly due
# to single jobs per city
#df['location'] = df.header.str.extract(r'(?<=\? )([a-zA-Z]* *[a-zA-Z]+, [a-zA-z]+)')
#df = df.dropna(subset=['location'])
df.shape

(3201, 4)

In [13]:
# target variable will be average salary, if no salary info is in the header or under salary_estimation,
# the rows need to be dropped
nanrows = df[df.salary_estimation.isnull()]

In [14]:
nanrows.header.str.extract(r'(Glassdoor Est.)').isnull().shape

(1119, 1)

In [15]:
nanrows.header.str.extract(r'(Employer Est.)').isnull().shape

(1119, 1)

So it looks like Glassdoors salary estimation is based on the ranges of values in the header.

This means any values that are NaN for salary_estimation need to be dropped as there is no target variable.

In [16]:
df = df.dropna(subset=['salary_estimation']).reset_index(drop=True)
df.shape

(2082, 4)

Now extract the features from job descriptions

In [17]:
df = df.drop(['header'], axis=1)

In [18]:
df.description = df.description.str.replace('[^\w\s]','')

In [19]:
df.title = df.title.str.replace('[^\w\s]','').str.lower()

In [20]:
# Dictionary of feature names with corresponding re's
re_dict = {'lang_python': re.compile(r'python'),
           'lang_r': re.compile(r'\br\b|r studio'),
           'lang_sql': re.compile(r'\bsql\b'),
           'lang_java': re.compile(r'\bjava\b'),
           'lang_javascript': re.compile(r'javascript'),
           'company_vacaton': re.compile(r'vacation'),
           'company_401k': re.compile(r'401k'),
           'company_benefits': re.compile(r'health|medical|dental|vision|benefits'),
           'company_travel': re.compile(r'travel'),
           'skill_communication': re.compile(r'communication *skills'),
           'company_food': re.compile(r'food|lunch|breakfast|snacks'),
           'company_equity': re.compile(r'equity'),
           'skill_datamining': re.compile(r'data *mining|mine|database|warehouse|warehousing'),
           'skill_stats': re.compile(r'statistic|statistical|statistics|stats'),
           'skill_predictive': re.compile(r'predict'),
           'skill_algorithm': re.compile(r'algorithm'),
           'skill_present': re.compile(r'\bpresent'),
           'skill_machinelearning': re.compile(r'machine learning|data science|scientist'),
           'skill_aws': re.compile(r'\baws\b'),
           'skill_bigdata': re.compile(r'big data|hadoop|impala|spark'),
           'skill_visualization': re.compile(r'visualization|tableau|\bd3\b|ggplot'),
           'company_pets': re.compile(r'\bpet\b|dog'),
           'degree_bs': re.compile(r'bachelor|\bbs\b'),
           'degree_ms': re.compile(r'master|\bms\b'),
           'degree_phd': re.compile(r'phd|doctorate|doctoral'),
           'experience_low': re.compile(r'([1-3][^0].*(?<=year).*experience)'),
           'experience_high': re.compile(r'([4-9][^0].*(?<=year).*experience)')   
          }



In [21]:
# Set all necessary dummy columns 
for key, _ in re_dict.items():
    df[key] = 0

In [22]:
# Create dummy variables for description features
for index, value in df.description.iteritems():
        for key, regex in re_dict.items():
            if re.search(regex, value) is not None:
                df.loc[index, key] = 1
            else:
                df.loc[index, key] = 0

In [23]:
# Dummy variable dict for title features
title_re_dict = {'title_intern': re.compile(r'intern'),
                 'title_senior': re.compile(r'senior|\bsr\b'),
                 'title_junior': re.compile(r'junior|\bjr\b'),
                 'title_manager': re.compile(r'manager'),
                 'title_director': re.compile(r'director'),
                 #'specialist': re.compile(r'specialist')
                }

In [24]:
# Set all necessary dummy columns 
for key, _ in title_re_dict.items():
    df[key] = 0

In [25]:
# Create dummy variables for title features ie: intern/fulltime/contract (none labeled contract)
for index, value in df.title.iteritems():
        for key, regex in title_re_dict.items():
            if re.search(regex, value) is not None:
                df.loc[index, key] = 1
            else:
                df.loc[index, key] = 0

In [26]:
# Locations are all in same format, get_dummies can be used
#loc_df = pd.get_dummies(df.location, prefix='loc_')

In [27]:
#final_df = df.join(loc_df)
final_df = df

In [28]:
final_df.salary_estimation = final_df.salary_estimation.infer_objects()
final_df.salary_estimation = final_df.salary_estimation.replace('[\$,]', '', regex=True).astype(float)

In [29]:
final_df.drop(['description', 'title'], axis=1, inplace=True)
final_df = final_df[final_df.salary_estimation > 50]

In [30]:
#save the data set and load in new notebook to avoid confusion
with open('final_data.pkl', 'wb') as f:
    pickle.dump(final_df,f)