### Importing Libraries & Data

In [4]:
import pandas as pd
import os
import plotly.express as px
import numpy as np

In [5]:
data_import = pd.read_csv(r'C:\Users\Damon\OneDrive\Documents\Personal Portfolio\DS Salaries\ds_salaries.csv')

In [6]:
df = data_import.copy()

In [7]:
df = df.drop(columns='Unnamed: 0', axis= 0)
df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L
...,...,...,...,...,...,...,...,...,...,...,...
602,2022,SE,FT,Data Engineer,154000,USD,154000,US,100,US,M
603,2022,SE,FT,Data Engineer,126000,USD,126000,US,100,US,M
604,2022,SE,FT,Data Analyst,129000,USD,129000,US,0,US,M
605,2022,SE,FT,Data Analyst,150000,USD,150000,US,100,US,M


In [8]:
df['work_year'] = df['work_year'].astype(str)

### General Data Cleaning

In [9]:
# Cleaning the experience_level column for readability

conditions = [
    df['experience_level'] == 'MI',
    df['experience_level'] == 'SE',
    df['experience_level'] == 'EN',
    df['experience_level'] == 'EX'
]

values = [
    'Mid Level',
    'Senior Level',
    'Entry Level',
    'Experienced'
]

df['experience_level'] = np.select(conditions,values)

### Exploratory Data Analysis

In [10]:
# Basic exploratory analysis
fig = px.histogram(
    df,
    x='experience_level',
    y='salary_in_usd', 
    barmode='group', 
    histfunc='avg', 
    text_auto='.2s',
    title= 'Average USD Salary by Experience Level',
    facet_col= 'company_size')

fig.update_traces(textfont_size=12, textangle=0, textposition='outside')

fig

In [11]:
px.histogram(df,
    x='experience_level',
    title='Majority of Observations are based on Mid & Senior Levels',
    text_auto= '.2s')

### What is the trend of salaries over time?

In [12]:
px.histogram(df,x='work_year',y='salary_in_usd',histfunc='avg',text_auto='.2s', barmode= 'group', title='Salaries within Data have been increasing since 2020')

### Which job categories are the most lucrative?

In [13]:
conditions = [
    df['job_title'].str.contains('Scientist') | df['job_title'].str.contains('Science') | df['job_title'].str.contains('Machine Learning'),
    df['job_title'].str.contains('Analyst') | df['job_title'].str.contains('Analytics') | df['job_title'].str.contains('Specialist') | df['job_title'].str.contains('Head of Data'),
    df['job_title'].str.contains('Engineer') | df['job_title'].str.contains('Engineering')| df['job_title'].str.contains('Architect')| df['job_title'].str.contains('ETL')
]

values = [
    'Data Science',
    'Data Analytics',
    'Data Engineer'
]


df['job_category'] = np.select(conditions, values)

In [14]:
df = df[df['job_title'] != '3D Computer Vision Researcher']

In [15]:
px.histogram(df,x='job_category',y='salary_in_usd',histfunc='avg',text_auto='.2s', barmode= 'group', title='Data Science & Engineering are more lucrative career paths than Analytics')

In [16]:
px.histogram(df,x='work_year',y='salary_in_usd',histfunc='avg',text_auto='.2s', barmode= 'group', facet_col='job_category', title='While Data Science pays the most, Data Engineering salaries appears to be increasing at a steeper rate')

### How does an employee's Working Arrangement correlate with their salary?

In [17]:
conditions = [
    df['remote_ratio'] == 0,
    df['remote_ratio'] == 50,
    df['remote_ratio'] == 100
]

values = [
    'In-Person',
    'Hybrid',
    'Remote'
]

df['working_arrangement'] = np.select(conditions,values)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
px.histogram(df, x='working_arrangement',y='salary_in_usd', histfunc='avg', text_auto='.2s', title='Remote workers tend to earn the most')

In [19]:
px.histogram(df, x='working_arrangement',color='experience_level', title= 'A large portion of remote workers are senior level, which could explain why remote workers earn the most')

### Does company size impact what data-related job category is more prevalent?

In [20]:
px.histogram(df, x='company_size', text_auto='.2s', color='job_category', barmode= 'overlay', title='Data Analytics is potentially more important as a company scales, then Data Science is the more critical function')