In [1]:
# Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling as profile_report
from pathlib import Path
%matplotlib inline

# Read in rate data
rates = pd.read_csv('merged_rates/gsa.csv')
rates.rename(columns={'Role':'role', 'Education':'education', 'Functional Responsibility':'responsibility',
             'Years of Experience':'yoe', 'Company':'company', 'Program Name':'program'}, inplace=True)
rates.drop('Unnamed: 0', axis=1, inplace=True)

In [2]:
rates.head(3)

Unnamed: 0,role,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,education,responsibility,yoe,company,socio_economic
0,Senior Program Manager,442.04,454.86,468.05,481.62,,,,Bachelor's,The Senior Program Manager has overall account...,15,Accenture Federal Services,Other than small business
1,Program Manager,389.42,400.71,412.33,424.29,,,,Bachelor's,Program Managers plan and manage projects to c...,12,Accenture Federal Services,Other than small business
2,Project Manager,252.85,260.18,267.73,275.49,,,,Bachelor's,"The Project Manager manages, plans and coordin...",10,Accenture Federal Services,Other than small business


In [3]:
rates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 628 entries, 0 to 627
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   role            628 non-null    object 
 1   2018_2019       504 non-null    float64
 2   2019_2020       600 non-null    float64
 3   2020_2021       628 non-null    float64
 4   2021_2022       584 non-null    float64
 5   2022_2023       382 non-null    float64
 6   2023_2024       273 non-null    float64
 7   2024_2025       28 non-null     float64
 8   education       628 non-null    object 
 9   responsibility  628 non-null    object 
 10  yoe             628 non-null    int64  
 11  company         628 non-null    object 
 12  socio_economic  492 non-null    object 
dtypes: float64(7), int64(1), object(5)
memory usage: 63.9+ KB


In [4]:
rates.describe()

Unnamed: 0,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,yoe
count,504.0,600.0,628.0,584.0,382.0,273.0,28.0,628.0
mean,129.732242,134.111,135.703089,138.103767,137.320314,131.854908,115.535714,5.711783
std,71.443933,61.182919,62.942137,64.828446,64.535651,67.591964,65.380389,4.797134
min,25.57,9.82,9.82,9.82,9.82,9.82,38.45,0.0
25%,87.42,87.655,88.4325,89.47,91.4025,86.65,80.3525,2.0
50%,118.205,122.39,123.34,125.77,124.75,111.65,102.28,5.0
75%,161.5,168.8425,171.825,173.565,166.8775,165.96,134.58,8.0
max,1113.97,454.86,468.05,481.62,349.25,343.73,349.92,20.0


In [5]:
# Grab mean, min and max data and merge into rates dataframe
mean_ = rates.groupby('role').mean()
max_ = rates.groupby('role').max(numeric_only=True)
min_ = rates.groupby('role').min(numeric_only=True)

# Rename mean, min and columns
mean_.rename(columns={'2018_2019':'2018_2019_mean', '2019_2020':'2019_2020_mean', 
                      '2020_2021':'2020_2021_mean', '2021_2022':'2021_2022_mean', 
                      '2022_2023':'2022_2023_mean', '2023_2024':'2023_2024_mean', 
                      '2024_2025':'2024_2025_mean'}, inplace=True)

min_.rename(columns={'2018_2019':'2018_2019_min', '2019_2020':'2019_2020_min', 
                      '2020_2021':'2020_2021_min', '2021_2022':'2021_2022_min', 
                      '2022_2023':'2022_2023_min', '2023_2024':'2023_2024_min', 
                      '2024_2025':'2024_2025_min'}, inplace=True)

max_.rename(columns={'2018_2019':'2018_2019_max', '2019_2020':'2019_2020_max', 
                      '2020_2021':'2020_2021_max', '2021_2022':'2021_2022_max', 
                      '2022_2023':'2022_2023_max', '2023_2024':'2023_2024_max', 
                      '2024_2025':'2024_2025_max'}, inplace=True)

# Merge aggregated data to rates
first = pd.merge(rates, mean_.reset_index(), how='left', on='role')
second = pd.merge(first, max_.reset_index(), how='left', on='role')
third = pd.merge(second, min_.reset_index(), how='left', on='role')
rates = third.copy()

# Find year to year change in rates
rates['1819_1920_change'] = (rates['2019_2020'] - rates['2018_2019']) / rates['2018_2019']
rates['1920_2021_change'] = (rates['2020_2021'] - rates['2019_2020']) / rates['2019_2020']
rates['2021_2122_change'] = (rates['2021_2022'] - rates['2020_2021']) / rates['2020_2021']
rates['2122_2223_change'] = (rates['2022_2023'] - rates['2021_2022']) / rates['2021_2022']
rates['2223_2324_change'] = (rates['2023_2024'] - rates['2022_2023']) / rates['2022_2023']
rates['2324_2425_change'] = (rates['2024_2025'] - rates['2023_2024']) / rates['2023_2024']

rates.head(3)

Unnamed: 0,role,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,education,responsibility,...,2022_2023_min,2023_2024_min,2024_2025_min,yoe_y,1819_1920_change,1920_2021_change,2021_2122_change,2122_2223_change,2223_2324_change,2324_2425_change
0,Senior Program Manager,442.04,454.86,468.05,481.62,,,,Bachelor's,The Senior Program Manager has overall account...,...,,,,15,0.029002,0.028998,0.028993,,,
1,Program Manager,389.42,400.71,412.33,424.29,,,,Bachelor's,Program Managers plan and manage projects to c...,...,74.81,74.81,,4,0.028992,0.028999,0.029006,,,
2,Project Manager,252.85,260.18,267.73,275.49,,,,Bachelor's,"The Project Manager manages, plans and coordin...",...,67.63,67.63,134.58,4,0.02899,0.029018,0.028984,,,


In [6]:
rates.columns

Index(['role', '2018_2019', '2019_2020', '2020_2021', '2021_2022', '2022_2023',
       '2023_2024', '2024_2025', 'education', 'responsibility', 'yoe_x',
       'company', 'socio_economic', '2018_2019_mean', '2019_2020_mean',
       '2020_2021_mean', '2021_2022_mean', '2022_2023_mean', '2023_2024_mean',
       '2024_2025_mean', 'yoe_y', '2018_2019_max', '2019_2020_max',
       '2020_2021_max', '2021_2022_max', '2022_2023_max', '2023_2024_max',
       '2024_2025_max', 'yoe_x', '2018_2019_min', '2019_2020_min',
       '2020_2021_min', '2021_2022_min', '2022_2023_min', '2023_2024_min',
       '2024_2025_min', 'yoe_y', '1819_1920_change', '1920_2021_change',
       '2021_2122_change', '2122_2223_change', '2223_2324_change',
       '2324_2425_change'],
      dtype='object')

In [8]:
rates

Unnamed: 0,role,2018_2019,2019_2020,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,education,responsibility,...,2022_2023_min,2023_2024_min,2024_2025_min,yoe_y,1819_1920_change,1920_2021_change,2021_2122_change,2122_2223_change,2223_2324_change,2324_2425_change
0,Senior Program Manager,442.04,454.86,468.05,481.62,,,,Bachelor's,The Senior Program Manager has overall account...,...,,,,15,0.029002,0.028998,0.028993,,,
1,Program Manager,389.42,400.71,412.33,424.29,,,,Bachelor's,Program Managers plan and manage projects to c...,...,74.81,74.81,,4,0.028992,0.028999,0.029006,,,
2,Project Manager,252.85,260.18,267.73,275.49,,,,Bachelor's,"The Project Manager manages, plans and coordin...",...,67.63,67.63,134.58,4,0.028990,0.029018,0.028984,,,
3,Task Manager,190.81,196.34,202.03,207.89,,,,Bachelor's,Task Managers apply their broad management ski...,...,,,,4,0.028982,0.028980,0.029006,,,
4,Subject Matter Expert 1,225.85,232.40,239.14,246.08,,,,Bachelor's,The Subject Matter Expert 1 has industry exper...,...,,,,10,0.029002,0.029002,0.029021,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
623,Junior Database Administrator,113.46,115.61,117.81,120.05,,,,Bachelor's,Independently or as part of the team provides ...,...,,,,2,0.018949,0.019029,0.019014,,,
624,Network Analyst,92.42,94.18,95.97,97.79,,,,Bachelor's,"Responsible for site survey, acquisition, inst...",...,,,,2,0.019043,0.019006,0.018964,,,
625,Telecommunications Network Specialist,101.97,103.91,105.88,107.89,,,,Bachelor's,"Analyzes network characteristics (e.g., traffi...",...,,,,2,0.019025,0.018959,0.018984,,,
626,Data Communication Specialist,105.26,107.26,109.30,111.38,,,,Bachelor's,Analyzes data communication needs through surv...,...,,,,4,0.019001,0.019019,0.019030,,,


In [7]:
# Grab mean, min and max data and merge into rates dataframe
agg = rates.groupby('role').agg(['mean', 'min', 'max'])
agg.head()

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [9]:
rates.to_csv('merged_rates/gsa_analysis.csv')

In [None]:
import sweetviz as sv

my_report = sv.analyze(rates)
my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"
# my_report.show_notebook()

In [None]:
# Create html window to open report
profile = rates.profile_report()
profile.to_file(Path('./rates_eda.html'))