In [2]:
from datetime import date, datetime, timedelta

import requests
import pandas as pd
import numpy as np

import tqdm
import time
import sqlalchemy as sqlalc

import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.ticker import FormatStrFormatter

In [3]:
# Allowing for display of all columns in the DataFrame

pd.options.display.max_columns = None

In [4]:
# Importing file from source https://www.bls.gov/oes/current/oessrcst.htm

df = pd.read_excel('BLS_2023_State_Wages.xlsx')
df.head()

Unnamed: 0,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,OCC_TITLE,O_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_QUOTIENT,PCT_TOTAL,PCT_RPT,H_MEAN,A_MEAN,MEAN_PRSE,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,total,2053090,0.0,1000.0,1.0,,,25.67,53400,0.2,10.87,14.22,19.88,30.09,46.18,22620,29580,41350,62580,96050,,
1,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,major,105580,0.8,51.424,0.74,,,56.21,116920,0.5,24.38,35.18,47.95,67.22,95.44,50710,73180,99740,139810,198520,,
2,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,detailed,720,6.8,0.348,0.25,,,106.26,221030,5.8,31.59,59.6,79.48,102.01,#,65700,123960,165320,212180,#,,
3,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1021,General and Operations Managers,detailed,34450,2.7,16.781,0.73,,,62.17,129310,1.1,23.11,34.74,49.67,78.25,112.54,48080,72260,103320,162760,234080,,
4,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1031,Legislators,detailed,1140,9.1,0.555,2.6,,,*,33690,5.1,*,*,*,*,*,18320,19670,24470,45050,55070,True,


In [5]:
df.dtypes

AREA              int64
AREA_TITLE       object
AREA_TYPE         int64
PRIM_STATE       object
NAICS             int64
NAICS_TITLE      object
I_GROUP          object
OWN_CODE          int64
OCC_CODE         object
OCC_TITLE        object
O_GROUP          object
TOT_EMP          object
EMP_PRSE         object
JOBS_1000        object
LOC_QUOTIENT     object
PCT_TOTAL       float64
PCT_RPT         float64
H_MEAN           object
A_MEAN           object
MEAN_PRSE        object
H_PCT10          object
H_PCT25          object
H_MEDIAN         object
H_PCT75          object
H_PCT90          object
A_PCT10          object
A_PCT25          object
A_MEDIAN         object
A_PCT75          object
A_PCT90          object
ANNUAL           object
HOURLY           object
dtype: object

In [6]:
# Converting the data types of the numeric columns to be that of floats/integers

columns_to_convert = [
    'TOT_EMP', 'EMP_PRSE', 'JOBS_1000', 'LOC_QUOTIENT', 'H_MEAN', 
    'A_MEAN', 'MEAN_PRSE', 'H_PCT10', 'H_PCT25', 'H_MEDIAN', 
    'H_PCT75', 'H_PCT90', 'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90']

df[columns_to_convert] = df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

In [7]:
df.dtypes

AREA              int64
AREA_TITLE       object
AREA_TYPE         int64
PRIM_STATE       object
NAICS             int64
NAICS_TITLE      object
I_GROUP          object
OWN_CODE          int64
OCC_CODE         object
OCC_TITLE        object
O_GROUP          object
TOT_EMP         float64
EMP_PRSE        float64
JOBS_1000       float64
LOC_QUOTIENT    float64
PCT_TOTAL       float64
PCT_RPT         float64
H_MEAN          float64
A_MEAN          float64
MEAN_PRSE       float64
H_PCT10         float64
H_PCT25         float64
H_MEDIAN        float64
H_PCT75         float64
H_PCT90         float64
A_PCT10         float64
A_PCT25         float64
A_MEDIAN        float64
A_PCT75         float64
A_PCT90         float64
ANNUAL           object
HOURLY           object
dtype: object

In [8]:
# Renaming the columns to something more legible

df = df.rename(columns={"AREA_TITLE": "State", "PRIM_STATE": "State Abbreviation", "OCC_TITLE": "Job Title",
                   "TOT_EMP": "Total Employed", "JOBS_1000": "Employment per 1,000 Jobs", "H_MEAN": "Hourly Mean",
                   "A_MEAN": "Annual Mean", "H_PCT10": "Hourly 10th Percentile", "H_PCT25": "Hourly 25th Percentile",
                    "H_MEDIAN": "Hourly Median", "H_PCT75": "Hourly 75th Percentile", "H_PCT90": "Hourly 90th Percentile",
                    "A_PCT10": "Annual 10th Percentile", "A_PCT25": "Annual 25th Percentile","A_MEDIAN": "Annual Median",
                    "A_PCT75": "Annual 75th Percentile", "A_PCT90": "Annual 90th Percentile"
                  })

In [9]:
# Dropping unnecessary columns

df = df.drop(['AREA', 'AREA_TYPE', 'NAICS', 'NAICS_TITLE', 'I_GROUP', 'OWN_CODE', 'OCC_CODE', 'O_GROUP', 'EMP_PRSE',
              'LOC_QUOTIENT', 'PCT_TOTAL', 'PCT_RPT', 'MEAN_PRSE', 'ANNUAL', 'HOURLY'], axis=1)
df.head()

Unnamed: 0,State,State Abbreviation,Job Title,Total Employed,"Employment per 1,000 Jobs",Hourly Mean,Annual Mean,Hourly 10th Percentile,Hourly 25th Percentile,Hourly Median,Hourly 75th Percentile,Hourly 90th Percentile,Annual 10th Percentile,Annual 25th Percentile,Annual Median,Annual 75th Percentile,Annual 90th Percentile
0,Alabama,AL,All Occupations,2053090.0,1000.0,25.67,53400.0,10.87,14.22,19.88,30.09,46.18,22620.0,29580.0,41350.0,62580.0,96050.0
1,Alabama,AL,Management Occupations,105580.0,51.424,56.21,116920.0,24.38,35.18,47.95,67.22,95.44,50710.0,73180.0,99740.0,139810.0,198520.0
2,Alabama,AL,Chief Executives,720.0,0.348,106.26,221030.0,31.59,59.6,79.48,102.01,,65700.0,123960.0,165320.0,212180.0,
3,Alabama,AL,General and Operations Managers,34450.0,16.781,62.17,129310.0,23.11,34.74,49.67,78.25,112.54,48080.0,72260.0,103320.0,162760.0,234080.0
4,Alabama,AL,Legislators,1140.0,0.555,,33690.0,,,,,,18320.0,19670.0,24470.0,45050.0,55070.0


In [10]:
print(df["State"].unique())  # Check unique state names in your DataFrame

['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Colorado'
 'Connecticut' 'Delaware' 'District of Columbia' 'Florida' 'Georgia'
 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas' 'Kentucky'
 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota'
 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire'
 'New Jersey' 'New Mexico' 'New York' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'South Dakota' 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia'
 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming' 'Guam' 'Puerto Rico'
 'Virgin Islands']


In [11]:
# Cleaning entries that are not officially 'states'

states_to_remove = ["Guam", "Puerto Rico", "Virgin Islands", "District of Columbia" ]

removed_states = df[df["State"].isin(states_to_remove)].index

df = df.drop(removed_states)

In [12]:
# Storing the states in a region variable to display on a graph with less clutter

usa_regions = {
    "Northeast": ["Maine", "New Hampshire", "Maryland", "Vermont", "Massachusetts", "Rhode Island", "Connecticut", "New York", "Pennsylvania", "New Jersey"],
    "Midwest": ["Ohio", "Michigan", "Indiana", "Illinois", "Wisconsin", "Minnesota", "Iowa", "Missouri", "North Dakota", "South Dakota", "Nebraska", "Kansas"],
    "Southeast": ["Delaware", "Virginia", "West Virginia", "North Carolina", "South Carolina", "Georgia", "Florida", "Kentucky", "Tennessee", "Alabama", "Mississippi", "Arkansas", "Louisiana"],
    "Southwest": ["Texas", "Oklahoma", "New Mexico", "Arizona"],
    "West": ["Colorado", "Wyoming", "Montana", "Idaho", "Washington", "Oregon", "Utah", "Nevada", "California", "Alaska", "Hawaii"]
}

In [13]:
# Adding the USA Region column and matching it to the state for which it is for

df["USA Region"] = df["State"].map(usa_regions)
df.head()

Unnamed: 0,State,State Abbreviation,Job Title,Total Employed,"Employment per 1,000 Jobs",Hourly Mean,Annual Mean,Hourly 10th Percentile,Hourly 25th Percentile,Hourly Median,Hourly 75th Percentile,Hourly 90th Percentile,Annual 10th Percentile,Annual 25th Percentile,Annual Median,Annual 75th Percentile,Annual 90th Percentile,USA Region
0,Alabama,AL,All Occupations,2053090.0,1000.0,25.67,53400.0,10.87,14.22,19.88,30.09,46.18,22620.0,29580.0,41350.0,62580.0,96050.0,
1,Alabama,AL,Management Occupations,105580.0,51.424,56.21,116920.0,24.38,35.18,47.95,67.22,95.44,50710.0,73180.0,99740.0,139810.0,198520.0,
2,Alabama,AL,Chief Executives,720.0,0.348,106.26,221030.0,31.59,59.6,79.48,102.01,,65700.0,123960.0,165320.0,212180.0,,
3,Alabama,AL,General and Operations Managers,34450.0,16.781,62.17,129310.0,23.11,34.74,49.67,78.25,112.54,48080.0,72260.0,103320.0,162760.0,234080.0,
4,Alabama,AL,Legislators,1140.0,0.555,,33690.0,,,,,,18320.0,19670.0,24470.0,45050.0,55070.0,


In [34]:
# Counting N/As or NaN

df.isna().sum()

State                            0
State Abbreviation               0
Job Title                        0
Total Employed                 834
Employment per 1,000 Jobs      834
Hourly Mean                   3054
Annual Mean                    680
Hourly 10th Percentile        3054
Hourly 25th Percentile        3188
Hourly Median                 3464
Hourly 75th Percentile        3727
Hourly 90th Percentile        4179
Annual 10th Percentile         680
Annual 25th Percentile         814
Annual Median                 1096
Annual 75th Percentile        1376
Annual 90th Percentile        1886
USA Region                   36127
dtype: int64

In [14]:
# A data frame only containing the "All Occupations" summary for each state

df_all_occupations = df[df["Job Title"] == "All Occupations"]
df_all_occupations.head()

Unnamed: 0,State,State Abbreviation,Job Title,Total Employed,"Employment per 1,000 Jobs",Hourly Mean,Annual Mean,Hourly 10th Percentile,Hourly 25th Percentile,Hourly Median,Hourly 75th Percentile,Hourly 90th Percentile,Annual 10th Percentile,Annual 25th Percentile,Annual Median,Annual 75th Percentile,Annual 90th Percentile,USA Region
0,Alabama,AL,All Occupations,2053090.0,1000.0,25.67,53400.0,10.87,14.22,19.88,30.09,46.18,22620.0,29580.0,41350.0,62580.0,96050.0,
738,Alaska,AK,All Occupations,311610.0,1000.0,33.6,69880.0,15.0,18.62,26.99,40.52,58.35,31200.0,38720.0,56140.0,84280.0,121370.0,
1319,Arizona,AZ,All Occupations,3129720.0,1000.0,30.31,63040.0,14.84,17.38,22.92,35.05,51.67,30870.0,36150.0,47680.0,72900.0,107470.0,
2076,Arkansas,AR,All Occupations,1271320.0,1000.0,24.64,51250.0,12.67,14.42,18.78,28.32,40.5,26360.0,30000.0,39060.0,58900.0,84240.0,
2774,California,CA,All Occupations,17945910.0,1000.0,37.0,76960.0,16.43,18.21,25.98,44.83,73.07,34170.0,37890.0,54030.0,93250.0,151990.0,


In [15]:
# A data frame removing the summary job title for each state

df_jobs = df[df["Job Title"] != "All Occupations"]
df_jobs.head()

Unnamed: 0,State,State Abbreviation,Job Title,Total Employed,"Employment per 1,000 Jobs",Hourly Mean,Annual Mean,Hourly 10th Percentile,Hourly 25th Percentile,Hourly Median,Hourly 75th Percentile,Hourly 90th Percentile,Annual 10th Percentile,Annual 25th Percentile,Annual Median,Annual 75th Percentile,Annual 90th Percentile,USA Region
1,Alabama,AL,Management Occupations,105580.0,51.424,56.21,116920.0,24.38,35.18,47.95,67.22,95.44,50710.0,73180.0,99740.0,139810.0,198520.0,
2,Alabama,AL,Chief Executives,720.0,0.348,106.26,221030.0,31.59,59.6,79.48,102.01,,65700.0,123960.0,165320.0,212180.0,,
3,Alabama,AL,General and Operations Managers,34450.0,16.781,62.17,129310.0,23.11,34.74,49.67,78.25,112.54,48080.0,72260.0,103320.0,162760.0,234080.0,
4,Alabama,AL,Legislators,1140.0,0.555,,33690.0,,,,,,18320.0,19670.0,24470.0,45050.0,55070.0,
5,Alabama,AL,Advertising and Promotions Managers,70.0,0.032,53.99,112290.0,36.77,39.88,50.37,64.03,71.21,76480.0,82950.0,104770.0,133170.0,148110.0,
