In [256]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import matplotlib.pyplot as plt

In [257]:
df1 = pd.read_csv('data/raw/DataEngineer.csv',sep=',') # Comma-seperated values file and Comma-delimited 
df1.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Easy Apply
0,Data Engineer,$80K-$150K (Glassdoor est.),Company Description\nSagence is a management a...,4.5,Sagence\n4.5,"New York, NY","Chicago, IL",1 to 50 employees,2009,Company - Private,Consulting,Business Services,$10 to $25 million (USD),"WCI Consulting, PwC",-1
1,Senior Data Engineer (Healthcare Domain experi...,$80K-$150K (Glassdoor est.),"Key Responsibilities\n\n- Architect, build, an...",3.4,Enterprise Integration\n3.4,"New York, NY","Jacksonville, FL",51 to 200 employees,1998,Company - Private,IT Services,Information Technology,$25 to $50 million (USD),-1,-1
2,Data Engineers,$80K-$150K (Glassdoor est.),Overview\n\nJob description\n\nPosition Overvi...,5.0,Maestro Technologies\n5.0,"New York, NY","Trenton, NJ",51 to 200 employees,2003,Company - Private,IT Services,Information Technology,$5 to $10 million (USD),-1,-1
3,Client Trade Support Engineer,$80K-$150K (Glassdoor est.),About the Position\n\n\nThis position will sup...,4.8,Jane Street\n4.8,"New York, NY","New York, NY",501 to 1000 employees,2000,Company - Private,Investment Banking & Asset Management,Finance,Unknown / Non-Applicable,-1,-1
4,Data Engineer,$80K-$150K (Glassdoor est.),Data Engineer\n\nJob Details\nLevel\nExperienc...,3.7,GNY Insurance Companies\n3.7,"New York, NY","New York, NY",201 to 500 employees,1914,Company - Private,Insurance Carriers,Insurance,$100 to $500 million (USD),"Travelers, Chubb, Crum & Forster",True


In [258]:
# Removed the "Job Description"
# Removed "Easy Apply" because it is a unique feature on Glassdoor job website that allows the applicant to directly apply on Glassdoor instead of going being redirected to the company website application portal
df2 = df1.drop(['Job Description',"Easy Apply","Competitors","Industry","Sector","Revenue","Type of ownership","Headquarters","Founded"],axis='columns') 
df2.head()

Unnamed: 0,Job Title,Salary Estimate,Rating,Company Name,Location,Size
0,Data Engineer,$80K-$150K (Glassdoor est.),4.5,Sagence\n4.5,"New York, NY",1 to 50 employees
1,Senior Data Engineer (Healthcare Domain experi...,$80K-$150K (Glassdoor est.),3.4,Enterprise Integration\n3.4,"New York, NY",51 to 200 employees
2,Data Engineers,$80K-$150K (Glassdoor est.),5.0,Maestro Technologies\n5.0,"New York, NY",51 to 200 employees
3,Client Trade Support Engineer,$80K-$150K (Glassdoor est.),4.8,Jane Street\n4.8,"New York, NY",501 to 1000 employees
4,Data Engineer,$80K-$150K (Glassdoor est.),3.7,GNY Insurance Companies\n3.7,"New York, NY",201 to 500 employees


In [259]:
df2.info()
# "Keep in mind that in Pandas, string data is always stored with an object dtype." from Class3B lecture

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2528 entries, 0 to 2527
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Job Title        2528 non-null   object 
 1   Salary Estimate  2528 non-null   object 
 2   Rating           2528 non-null   float64
 3   Company Name     2528 non-null   object 
 4   Location         2528 non-null   object 
 5   Size             2528 non-null   object 
dtypes: float64(1), object(5)
memory usage: 118.6+ KB


In [260]:
# The "Company Rating" column might have duplicate in there 
df2 = df2.rename(columns = {"Rating" : "Company Rating","Size":"Company Size"})

In [261]:
# Converted "-1" into str "Unknown"
# checked on https://www.glassdoor.ca/Job/burnaby-engineering-jobs-SRCH_IL.0,7_IC2278200_KO8,19.htm 
# That "Unknown" in appeared in some company revenue, size, and sector so I can assume that "-1" in this dataset is Unknown


In [262]:
# For the "Rating" column, turning int -1 to 0 for the mean calculation so the mean it will be correct
# reference: https://stackoverflow.com/questions/53587315/pandas-find-specific-value-in-entire-dataframe
print("Before:",np.mean(df2["Company Rating"]))
df2["Company Rating"].loc[df2["Company Rating"]==-1] = 0.0
print("After:",np.mean(df2["Company Rating"]))

Before: 3.3795490506329116
After: 3.472112341772152


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [263]:
df2["Company Size"].loc[df2["Company Size"] == "-1"] = "Unknown"

In [264]:
# Remove \n4.5 attached at the end of the company name & (Glassdoor est.) in Salary Estimate column
# https://stackoverflow.com/questions/51956572/modify-string-values-of-a-pandas-dataframe-column
df2["Company Name"] = df2["Company Name"].replace(r'\n.*', '', regex=True)
df2["Salary Estimate"] = df2["Salary Estimate"].replace(r'\(.*', '', regex=True)

In [265]:
# Convert str "80K - 150K" in the "Salary Estimate" to int
# split string at the "-" and replace "K" in str then convert srt to int 
# https://stackoverflow.com/questions/32464280/converting-currency-with-to-numbers-in-python-pandas/32465968 
# https://www.geeksforgeeks.org/python-pandas-split-strings-into-two-list-columns-using-str-split/

df2["Salary Estimate"] = df2["Salary Estimate"].replace('[\$]', '', regex=True)


new = df2["Salary Estimate"].str.split("-", n = 1, expand = True)
df2["Minimum Salary"]= new[0]
df2["Maximum Salary"]= new[1]
df2.drop(columns =["Salary Estimate"],inplace = True)

#df2["Salary Estimate"] = df2["Salary Estimate"].replace('[\-]', '', regex=True)

df2["Minimum Salary"] = df2["Minimum Salary"].replace('K', '', regex=True)
df2["Maximum Salary"] = df2["Maximum Salary"].replace('K', '', regex=True)
df2["Minimum Salary"] = df2["Minimum Salary"].astype(int)
df2["Maximum Salary"] = df2["Maximum Salary"].astype(int)

df2

Unnamed: 0,Job Title,Company Rating,Company Name,Location,Company Size,Minimum Salary,Maximum Salary
0,Data Engineer,4.5,Sagence,"New York, NY",1 to 50 employees,80,150
1,Senior Data Engineer (Healthcare Domain experi...,3.4,Enterprise Integration,"New York, NY",51 to 200 employees,80,150
2,Data Engineers,5.0,Maestro Technologies,"New York, NY",51 to 200 employees,80,150
3,Client Trade Support Engineer,4.8,Jane Street,"New York, NY",501 to 1000 employees,80,150
4,Data Engineer,3.7,GNY Insurance Companies,"New York, NY",201 to 500 employees,80,150
...,...,...,...,...,...,...,...
2523,Cloud Engineer,4.5,PMG Global,"Westlake, TX",1 to 50 employees,76,128
2524,Electrical Engineer,2.9,Albin Engineering Services,"Fort Worth, TX",51 to 200 employees,76,128
2525,Sr. Big Data Engineer,3.7,Cincinnati Bell Technology Solutions,"Irving, TX",501 to 1000 employees,76,128
2526,Senior Big Data / ETL Engineer,3.5,Schwab,"Westlake, TX",Unknown,76,128


In [266]:
# Calculating on Min and Max Salary Column
df2["Minimum Salary"] = df2["Minimum Salary"]*1000
df2["Maximum Salary"] = df2["Maximum Salary"]*1000
df2

Unnamed: 0,Job Title,Company Rating,Company Name,Location,Company Size,Minimum Salary,Maximum Salary
0,Data Engineer,4.5,Sagence,"New York, NY",1 to 50 employees,80000,150000
1,Senior Data Engineer (Healthcare Domain experi...,3.4,Enterprise Integration,"New York, NY",51 to 200 employees,80000,150000
2,Data Engineers,5.0,Maestro Technologies,"New York, NY",51 to 200 employees,80000,150000
3,Client Trade Support Engineer,4.8,Jane Street,"New York, NY",501 to 1000 employees,80000,150000
4,Data Engineer,3.7,GNY Insurance Companies,"New York, NY",201 to 500 employees,80000,150000
...,...,...,...,...,...,...,...
2523,Cloud Engineer,4.5,PMG Global,"Westlake, TX",1 to 50 employees,76000,128000
2524,Electrical Engineer,2.9,Albin Engineering Services,"Fort Worth, TX",51 to 200 employees,76000,128000
2525,Sr. Big Data Engineer,3.7,Cincinnati Bell Technology Solutions,"Irving, TX",501 to 1000 employees,76000,128000
2526,Senior Big Data / ETL Engineer,3.5,Schwab,"Westlake, TX",Unknown,76000,128000


In [268]:
unique_companyname = list(set(df2["Company Name"]))
unique_companyname[0:15]

['American Public Media',
 'All Campus',
 'Trace3',
 'Modis',
 'Rockstar Games',
 'Sendlane',
 'Subaru of America',
 'Stellent IT LLC',
 'Russell Tobin & Associates',
 'Clearcover',
 "The Farmer's Dog",
 'HIMSS',
 'RF-SMART, a division of ICS',
 'Quantum World Technologies Inc',
 'LiveMindz']

In [269]:
# There are a lot of different unique job title, but with similiar position (i.e Data Engineer & Data Engineer Python)
# In order to use a count plot with job title we can make the job title more general, instead of "Data Engineer - Python" it can be "Data Engineer"
# Its corresponding information will make it unique from the others Data Engineer titles
# Or the dataset might be large enough to only include the job with basic title (i.e Data Engineer Software Engineer)
unique_job = list(set(df2["Job Title"]))
unique_job[0:15]

['Site Reliability Engineer',
 'Big Data Engineer (PA)',
 'Security & Infrastructure Engineer',
 'Software Engineer (Data Team - Password Cracking)',
 'Data Analyst - EDSS',
 'Platform Software Engineer',
 'Senior Platform Engineer',
 'Lead Software Engineer - .Net / DevOps',
 'Data Center Hardware Engineer',
 '(SONUS) Engineer/Senior Engineer, IT Network Infrastructure',
 'Flow Data Project Engineer',
 'Software Development Engineer, Amazon Fashion',
 '7232 Data Engineer (Analyst/Programmer - Career), Information Technology DevOps',
 'Senior Data Engineer (St. Louis, Austin or Toronto)',
 'Software Engineer, Machine Learning']

In [271]:
unique_size = set(df2["Company Size"])
unique_size

{'1 to 50 employees',
 '10000+ employees',
 '1001 to 5000 employees',
 '201 to 500 employees',
 '5001 to 10000 employees',
 '501 to 1000 employees',
 '51 to 200 employees',
 'Unknown'}

In [None]:
# I am going to categorize the size of the company into using this Canadian goverment website with small modifcation to work with this data set
# https://www150.statcan.gc.ca/n1/pub/11f0027m/2011069/part-partie1-eng.htm
# The exact number of the company size is not known in this data set so I would say it's okay to go over the limit that was described in the goverment link
# Also the goverment link is archived so the infornmation is not updated. This is just for use of referencing

# 1 to to 200 is small
# 201 to 500 is medium
# +500 is large
# -1 is assumed to be unknown 

# Method Chaining

# Research Question
***Should you work for a small company instead of a larger one?***
 - Company size
 - Salary Estimate
 - Rating 
 
 Note: Rating can be posted by anyone so some might be fake and a few could come from disgrunted employees

"More for visulization"
 - Job Title
 - Industry and Sector 
 - Location 