# Title/Salary Key Term Associations

In [1]:
#imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import scipy
import numpy as np

Data Science Jobs dataset from glassdoor (accessed via kaggle): https://www.kaggle.com/datasets/georgejnr/advertised-data-science-jobs-dataset?resource=download

### Initial Loading of Data and EDA

In [2]:
#Loading Data
ds_jobs = pd.read_csv("./data_sci_jobs.csv")
ds_jobs

Unnamed: 0,Company,Job Title,Location,Estimated Salary
0,Dataquestcorp,Data Science,"Branchburg, NJ",$70K - $79K (Employer est.)
1,Paliwoda Group,"Junior Researcher, Data Science","New York, NY",
2,"JPMorgan Chase Bank, N.A.",2023 Advancing Hispanics & Latinos Fellowship ...,"New York, NY",$26.45 Per Hour(Employer est.)
3,Discord,Data Engineering Manager,"San Francisco, CA",$192K (Employer est.)
4,YT Global Network,Data Engineer- Remote,Remote,$90.00 - $120.00 Per Hour(Employer est.)
...,...,...,...,...
2425,C3 AI,Data Science Instructor,"Redwood City, CA",$109K - $142K (Employer est.)
2426,pulseData,Data Science Associate (Remote),"New York, NY",$50K - $83K (Glassdoor est.)
2427,Bayer,NAFTO Data Science Co-op,United States,$35.00 Per Hour(Employer est.)
2428,MediaMath,"VP, Data Science","New York, NY",$231K - $300K (Employer est.)


In [3]:
#check for nulls in job title and estimated salary
print(ds_jobs["Job Title"].isna().sum())
print(ds_jobs["Estimated Salary"].isna().sum())

0
482


In [4]:
#drop null salary jobs
ds_jobs.dropna(subset=["Estimated Salary"], inplace=True)
ds_jobs

Unnamed: 0,Company,Job Title,Location,Estimated Salary
0,Dataquestcorp,Data Science,"Branchburg, NJ",$70K - $79K (Employer est.)
2,"JPMorgan Chase Bank, N.A.",2023 Advancing Hispanics & Latinos Fellowship ...,"New York, NY",$26.45 Per Hour(Employer est.)
3,Discord,Data Engineering Manager,"San Francisco, CA",$192K (Employer est.)
4,YT Global Network,Data Engineer- Remote,Remote,$90.00 - $120.00 Per Hour(Employer est.)
5,excelon solutuion,Hiring for Data Science and AWS on w2,Remote,$45.00 - $50.00 Per Hour(Employer est.)
...,...,...,...,...
2425,C3 AI,Data Science Instructor,"Redwood City, CA",$109K - $142K (Employer est.)
2426,pulseData,Data Science Associate (Remote),"New York, NY",$50K - $83K (Glassdoor est.)
2427,Bayer,NAFTO Data Science Co-op,United States,$35.00 Per Hour(Employer est.)
2428,MediaMath,"VP, Data Science","New York, NY",$231K - $300K (Employer est.)


### Convert Salaries to Numeric Data Type / Clean Data and Standardize Units

In [5]:
#remove estimate provider statement from salaries
import re
    
ds_jobs["Estimated Salary"] = ds_jobs["Estimated Salary"].map(lambda s: re.sub("\(.*?\)", "", s))
ds_jobs

Unnamed: 0,Company,Job Title,Location,Estimated Salary
0,Dataquestcorp,Data Science,"Branchburg, NJ",$70K - $79K
2,"JPMorgan Chase Bank, N.A.",2023 Advancing Hispanics & Latinos Fellowship ...,"New York, NY",$26.45 Per Hour
3,Discord,Data Engineering Manager,"San Francisco, CA",$192K
4,YT Global Network,Data Engineer- Remote,Remote,$90.00 - $120.00 Per Hour
5,excelon solutuion,Hiring for Data Science and AWS on w2,Remote,$45.00 - $50.00 Per Hour
...,...,...,...,...
2425,C3 AI,Data Science Instructor,"Redwood City, CA",$109K - $142K
2426,pulseData,Data Science Associate (Remote),"New York, NY",$50K - $83K
2427,Bayer,NAFTO Data Science Co-op,United States,$35.00 Per Hour
2428,MediaMath,"VP, Data Science","New York, NY",$231K - $300K


In [6]:
#convert suffix K to thousands of dollars
ds_jobs["Estimated Salary"] = ds_jobs["Estimated Salary"].map(lambda s: s.replace("K", "000"))
ds_jobs

Unnamed: 0,Company,Job Title,Location,Estimated Salary
0,Dataquestcorp,Data Science,"Branchburg, NJ",$70000 - $79000
2,"JPMorgan Chase Bank, N.A.",2023 Advancing Hispanics & Latinos Fellowship ...,"New York, NY",$26.45 Per Hour
3,Discord,Data Engineering Manager,"San Francisco, CA",$192000
4,YT Global Network,Data Engineer- Remote,Remote,$90.00 - $120.00 Per Hour
5,excelon solutuion,Hiring for Data Science and AWS on w2,Remote,$45.00 - $50.00 Per Hour
...,...,...,...,...
2425,C3 AI,Data Science Instructor,"Redwood City, CA",$109000 - $142000
2426,pulseData,Data Science Associate (Remote),"New York, NY",$50000 - $83000
2427,Bayer,NAFTO Data Science Co-op,United States,$35.00 Per Hour
2428,MediaMath,"VP, Data Science","New York, NY",$231000 - $300000


In [7]:
#convert per hour to annual salary based on assumption of 40-hour workweek 
def annual_calc(num):
    """simple helper function for unit conversion from hourly to annual salary"""
    annual = num*40*52
    return annual

def hourly_conversion(string):
    """Converts hourly salaries and salary ranges to annual equivalents based on a 40-hour workweek.
    Note that this assumes holidays and PTO are paid at normal rate and no additional hours are worked.
    
    ***Arguments***
    string: individual input strings from the salary column. Assesses string for hourly units. If 
            string represents hourly units, the string is parsed to obtain numeric representation for 
            hourly salary or salary ranges and calculates annual salary equivalent
            
    ***Returns***
    function returns annual salary equivalents as a numeric string (for consistency with raw salary 
    ranges remaining in the data)
    """
    #convert on conditional
    if "Per Hour" in string:
        string.replace("Per Hour", "")
        substrings = re.findall("\d+", string)
        substrings = [annual_calc(int(s)) for s in substrings]
        #remove extra strings due to decimal
        substrings = [s for s in substrings if s!=0]
        #joining converted salaries
        new_string = ""
        for i in substrings:
            new_string += f" {i} -"
        return new_string[1:-1]
    
    #if not hourly salary, return string unchanged
    else:
        return string

#conversion
ds_jobs["Estimated Salary"] = ds_jobs["Estimated Salary"].apply(hourly_conversion)
ds_jobs

Unnamed: 0,Company,Job Title,Location,Estimated Salary
0,Dataquestcorp,Data Science,"Branchburg, NJ",$70000 - $79000
2,"JPMorgan Chase Bank, N.A.",2023 Advancing Hispanics & Latinos Fellowship ...,"New York, NY",54080 - 93600
3,Discord,Data Engineering Manager,"San Francisco, CA",$192000
4,YT Global Network,Data Engineer- Remote,Remote,187200 - 249600
5,excelon solutuion,Hiring for Data Science and AWS on w2,Remote,93600 - 104000
...,...,...,...,...
2425,C3 AI,Data Science Instructor,"Redwood City, CA",$109000 - $142000
2426,pulseData,Data Science Associate (Remote),"New York, NY",$50000 - $83000
2427,Bayer,NAFTO Data Science Co-op,United States,72800
2428,MediaMath,"VP, Data Science","New York, NY",$231000 - $300000


In [8]:
#convert listed salaries and salary ranges to 3 columns: upper bound, median, and lower bound
#if range is provided, median value fills the median column
#if specific salary is provided, a 20% over/under tolerance formula is used to estimate upper/lower

def salary(string):
    """Helper function to ingest strings from estimated salary column and convert them to 
    upper bound, median, and lower bound values. Returns these numbers as a list of integers"""
    
    substrings = re.findall("\d+", string)
    if len(substrings)==1:
        
    else:
        
    
    return salary_bounds

ds_jobs["Estimated Salary"] = ds_jobs["Estimated Salary"].apply()