In [82]:
import numpy as np
import pandas as pd


In [83]:
# Lift the row-display cap: with the default limit I couldn't see enough output
pd.options.display.max_rows = None

In [84]:
# Load the "format 4" export; it is the example whose salary column contains commas
df_four = pd.read_csv('export_format4.csv')

In [85]:
# Preview the first rows. Notice the comma (thousands separator) in the salary value at row index 4.
df_four.head()

Unnamed: 0,employer,period_begin,period_end,post_date,salary,sheltered,unsheltered,total
0,Orleans,1/1/2002,1/31/2002,1/29/2002,262.62,0.0,18.38,18.38
1,Orleans,2/1/2002,2/28/2002,2/22/2002,525.24,0.0,36.76,36.76
2,Orleans,3/1/2002,3/31/2002,3/27/2002,525.24,0.0,36.76,36.76
3,Orleans,4/1/2002,4/30/2002,4/26/2002,525.24,0.0,36.76,36.76
4,Orleans,5/1/2002,5/31/2002,6/11/2002,1021.93,0.0,71.53,71.53


In [86]:
# Notice that salary has Dtype = object (text) rather than float64: the commas keep pandas
# from parsing the column as a number
df_four.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582 entries, 0 to 581
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   employer      582 non-null    object 
 1   period_begin  582 non-null    object 
 2   period_end    582 non-null    object 
 3   post_date     582 non-null    object 
 4   salary        582 non-null    object 
 5   sheltered     582 non-null    float64
 6   unsheltered   582 non-null    float64
 7   total         582 non-null    float64
dtypes: float64(3), object(5)
memory usage: 36.5+ KB


In [87]:
# The 'salary' column comes back as 'object' (string) because of the commas.
# Remove the commas!
# The dtype guard keeps this cell safe to re-run and reusable on exports whose salary column
# has no commas and is therefore already numeric (the .str accessor only works on text columns
# and raises AttributeError otherwise).
if not pd.api.types.is_numeric_dtype(df_four['salary']):
    # regex=False: treat ',' as a literal character, not as a pattern
    df_four['salary'] = df_four['salary'].str.replace(',', '', regex=False)
df_four.head()

Unnamed: 0,employer,period_begin,period_end,post_date,salary,sheltered,unsheltered,total
0,Orleans,1/1/2002,1/31/2002,1/29/2002,262.62,0.0,18.38,18.38
1,Orleans,2/1/2002,2/28/2002,2/22/2002,525.24,0.0,36.76,36.76
2,Orleans,3/1/2002,3/31/2002,3/27/2002,525.24,0.0,36.76,36.76
3,Orleans,4/1/2002,4/30/2002,4/26/2002,525.24,0.0,36.76,36.76
4,Orleans,5/1/2002,5/31/2002,6/11/2002,1021.93,0.0,71.53,71.53


In [88]:
# With the commas stripped, the salary text can be parsed into real numbers
df_four['salary'] = df_four['salary'].pipe(pd.to_numeric)

In [89]:
# Notice that the salary column is now float64 (numeric) and no longer an object!
df_four.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582 entries, 0 to 581
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   employer      582 non-null    object 
 1   period_begin  582 non-null    object 
 2   period_end    582 non-null    object 
 3   post_date     582 non-null    object 
 4   salary        582 non-null    float64
 5   sheltered     582 non-null    float64
 6   unsheltered   582 non-null    float64
 7   total         582 non-null    float64
dtypes: float64(4), object(4)
memory usage: 36.5+ KB


In [90]:
# Now, we group by period end (grouping only aggregates; the chronological sort comes later).
# Because there are exact equal values for period end in both the parish and state salary reports,
# grouping on it combines (sums) both the parish and state amounts per month.
# numeric_only=True states explicitly that only the numeric columns (salary, sheltered,
# unsheltered, total) are summed. Without it, pandas warns that silently dropping the text
# columns is deprecated, and newer pandas versions try to "sum" the text columns as well.
df_four = df_four.groupby('period_end').sum(numeric_only=True)
    

  df_four = df_four.groupby('period_end').sum()


In [91]:
# I am glad I checked my work! Look, period_end is not in the time order I expected.
# I expected to see linear time: January through December of one year, then on to the next year.
# period_end is still text at this point, so the group labels come back in string order
# (1/31/2002, 1/31/2003, ...) rather than in date order.
df_four.head()

Unnamed: 0_level_0,salary,sheltered,unsheltered,total
period_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1/31/2002,525.02,0.0,36.75,36.75
1/31/2003,1274.12,0.0,89.2,89.2
1/31/2004,1458.7,0.0,102.12,102.12
1/31/2005,1625.6,113.8,0.0,113.8
1/31/2006,1520.64,106.44,0.0,106.44


In [92]:
# Put the timeline in linear (chronological) order. After the groupby, period_end is the
# index, so sort the index and let pandas' to_datetime turn its text labels into dates
# for the comparison. The labels themselves stay as text.
df_four = df_four.sort_index(key=pd.to_datetime)
df_four.head()

Unnamed: 0_level_0,salary,sheltered,unsheltered,total
period_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1/31/2002,525.02,0.0,36.75,36.75
2/28/2002,1050.04,0.0,73.5,73.5
3/31/2002,1050.04,0.0,73.5,73.5
4/30/2002,1050.04,0.0,73.5,73.5
5/31/2002,2044.57,0.0,143.13,143.13


In [93]:
# Average salary over each run of 60 consecutive monthly rows, using pandas' built-in
# rolling window. The window counts rows, so the first 59 rows have no average (NaN).
# Displaying 62 rows is a sanity check: the first value should appear at row 60,
# which shows the function really waits for 60 rows of data.
salary_window = df_four['salary'].rolling(window=60)
df_four['rolling_60'] = salary_window.mean()
df_four.head(62)

Unnamed: 0_level_0,salary,sheltered,unsheltered,total,rolling_60
period_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1/31/2002,525.02,0.0,36.75,36.75,
2/28/2002,1050.04,0.0,73.5,73.5,
3/31/2002,1050.04,0.0,73.5,73.5,
4/30/2002,1050.04,0.0,73.5,73.5,
5/31/2002,2044.57,0.0,143.13,143.13,
6/30/2002,1225.6,0.0,85.8,85.8,
7/31/2002,1249.86,0.0,87.5,87.5,
8/31/2002,1274.12,0.0,89.2,89.2,
9/30/2002,1274.12,0.0,89.2,89.2,
10/31/2002,1274.12,0.0,89.2,89.2,


In [99]:
# Compare this to my samp_avg_salary.xlsx. Samp_avg_salary is only applicable to export_format.csv at this time.
# The Excel workbook also computes 60-month averages, so using two separate methods to compute
# the same outcome lets me check my work. Since my key is only available for one example source,
# the whole pipeline has to be rerun on that source file. (I ran it on example 4 first because
# it has the commas issue, and I wanted to show how to fix that with examples.)
def monthly_salary_rolling_mean(raw, window=60):
    """Total an export per period_end and add a rolling mean of the salary totals.

    Parameters
    ----------
    raw : pandas.DataFrame
        Rows of a salary export. Must contain 'period_end' and 'salary' columns.
        'salary' may already be numeric, or text containing thousands-separator commas.
    window : int, default 60
        Number of consecutive rows (one row per distinct period_end) in each average.

    Returns
    -------
    pandas.DataFrame
        One row per period_end in chronological order, holding the summed numeric
        columns plus a 'rolling_<window>' column ('rolling_60' by default). The input
        frame is not modified.
    """
    salary = raw['salary']
    # For this export in particular, the member evidently did not have a high salary, so there
    # were no commas and pandas already read the column as float64. Calling .str.replace on a
    # numeric column breaks the program, so commas are only stripped when the column is text.
    if not pd.api.types.is_numeric_dtype(salary):
        salary = salary.str.replace(',', '', regex=False)
    monthly = (
        raw.assign(salary=pd.to_numeric(salary))
        .groupby('period_end')
        # numeric_only=True: sum only the numeric columns; avoids the deprecation warning
        # about text columns being dropped implicitly
        .sum(numeric_only=True)
        # period_end labels are text, so compare them as dates to get a linear timeline
        .sort_values(by='period_end', key=pd.to_datetime)
    )
    # The rolling window counts rows, not calendar months
    monthly[f'rolling_{window}'] = monthly['salary'].rolling(window).mean()
    return monthly


df = monthly_salary_rolling_mean(pd.read_csv('export_format.csv'))

df.tail(15)

  df = df.groupby('period_end').sum()


Unnamed: 0_level_0,salary,sheltered,unsheltered,total,rolling_60
period_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5/31/2021,6881.82,481.72,0.0,481.72,6291.5405
6/30/2021,6881.81,481.72,0.0,481.72,6379.814
7/31/2021,9480.96,663.66,0.0,663.66,6516.691333
8/31/2021,7056.9,493.98,0.0,493.98,6610.742833
9/30/2021,7056.93,493.98,0.0,493.98,6700.0695
10/31/2021,8047.49,563.32,0.0,563.32,6811.563333
11/30/2021,8341.16,583.88,0.0,583.88,6922.294
12/31/2021,10840.73,758.85,0.0,758.85,7097.314833
1/31/2022,7192.15,503.45,0.0,503.45,7132.009167
2/28/2022,7237.23,506.6,0.0,506.6,7146.029833


In [100]:
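# TODO: The numbers above do NOT match the expected values in samp_avg_salary.xlsx. Why?
# Things to check: rolling(60) averages 60 consecutive rows, not 60 calendar months, so a
# missing or repeated period_end would shift the window; the workbook may also define its
# 60-month window differently.
# I'll have to check some other export files as well.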