In [19]:
# goal 

# dash where the first page is just avg. it's be KDE with overlap
#options are states

# source: https://www.kaggle.com/datasets/thedevastator/jobs-dataset-from-glassdoor/data


In [20]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/gsteinmetzsilber/DATA-608/main/Story%204/glassdoor_salary_data.csv") 
print(df.head())
print(df.shape)
print(df.describe())
print(df["job_simp"].unique()) 
print(df["job_state"].unique()) #not all 50 states, that's okay there's a lot


print(df.isna().sum())

   Unnamed: 0                  Job Title              Salary Estimate  \
0           0             Data Scientist   $53K-$91K (Glassdoor est.)   
1           1  Healthcare Data Scientist  $63K-$112K (Glassdoor est.)   
2           2             Data Scientist   $80K-$90K (Glassdoor est.)   
3           3             Data Scientist   $56K-$97K (Glassdoor est.)   
4           4             Data Scientist  $86K-$143K (Glassdoor est.)   

                                     Job Description  Rating  \
0  Data Scientist\nLocation: Albuquerque, NM\nEdu...     3.8   
1  What You Will Do:\n\nI. General Summary\n\nThe...     3.4   
2  KnowBe4, Inc. is a high growth information sec...     4.8   
3  *Organization and Job ID**\nJob ID: 310709\n\n...     3.8   
4  Data Scientist\nAffinity Solutions / Marketing...     2.9   

                                 Company Name         Location  \
0                      Tecolote Research\n3.8  Albuquerque, NM   
1  University of Maryland Medical System\n3.

In [21]:
# let's only keep 4 roles:

titles = ["analyst", "data engineer", "data scientist", "mle"]
df_filtered = df[df["job_simp"].isin(titles)]

#while cleaning, let's just get the titles in better shape
df_filtered.loc[:, "job_simp"] = df_filtered["job_simp"].replace("mle", "Machine Learning Engineer")
df_filtered.loc[:, "job_simp"] = df_filtered["job_simp"].str.title()

# and order by state to make the later dropdown menu better
df_filtered.sort_values("job_state", inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [22]:
import numpy as np
from dash import Dash, dcc, html, Input, Output
import plotly.graph_objects as go 
from scipy.stats import gaussian_kde 

# a bunch of the states don't have a lot of data / very low variance; both make plots look terrible. I define a function for determining whether there's enough quality data: 
def state_has_valid_data(state, df, job_roles, min_variance=0.01, min_records=3):
    df_state = df[df["job_state"] == state]
    valid_job_roles_count = 0 #start counter

    for job_role in job_roles:
        job_df = df_state[df_state["job_simp"] == job_role]
        if len(job_df) >= min_records and job_df["avg_salary"].var() >= min_variance: # Now a further check that there's more than one role to plot...otherwise it's not illuminating
            valid_job_roles_count += 1  
    
    if valid_job_roles_count > 1:
        return True  
    return False

job_roles = df_filtered["job_simp"].unique() #unique roles
valid_states = [state for state in df_filtered["job_state"].unique() if state_has_valid_data(state, df_filtered, job_roles)]

# Setting up Dash app
app = Dash(__name__)

app.layout = html.Div([
    html.H1("Salary Distributions for Data-Related Jobs"), #title
    dcc.Dropdown(
        id="state_dropdown",
        options=[{"label": "All States", "value": "All"}] + [{"label": state, "value": state} for state in valid_states],
        value="All States", #default
        clearable=False,
    ),
    dcc.Graph(id="kde")
])

@app.callback(
    Output("kde", "figure"), #the figure identified by kde changes when...
    [Input("state_dropdown", "value")] # the value of the state changes
)
def update_graph(selected_state):
    df_plot = df_filtered[df_filtered["job_state"] == selected_state] if selected_state != "All States" else df_filtered #unless we're looking at all states, filter to only look at the relevant state's rows
    fig = go.Figure() #empty Plotly figure

    # Create a trace for each job role even if it's not present in the current state to standardize the legend
    for job_role in job_roles:
        job_df = df_plot[df_plot["job_simp"] == job_role] #filtering again to only look at one job type at a time

        if not job_df.empty and job_df["avg_salary"].var() >= 0.01:
            try:
                x = np.linspace(job_df["avg_salary"].min(), job_df["avg_salary"].max(), 200) #an array of 200 points btwn the min and max
                kde = gaussian_kde(job_df["avg_salary"]) #calculate the KDE
                y = kde(x)
                fig.add_trace(go.Scatter(x=x, y=y, fill="tozeroy", name=job_role)) #add KDE at scatterplot and FILL
            except Exception as e:
                print(f"Error with {job_role} in {selected_state}: {e}")

    # layout
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), #horizontal legend, outside plot area
                      title_text=f"Salary KDE Plot for {selected_state}",
                      xaxis_title="Salary (in $000s)",
                      yaxis_title="Density",
                      showlegend=True)

    # Initially the legend wouldn't be consistent across the states, adding a dummy trace gets around this problem      
    if len(fig.data) == 0:
        for job_role in job_roles:
            fig.add_trace(go.Scatter(x=[None], y=[None], mode="lines", name=job_role))

    return fig

if __name__ == "__main__":
    app.run_server()



