# Analyzing SD vendor checkbook data

A starting point for loading data from [the state's vendor checkbook CSV files](https://open.sd.gov/vendor.aspx) and analyzing the data in pandas.

In [29]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from get_latest_data import CSV_READ_SETTINGS

In [30]:
# uncomment and run to refresh local csv files
# %run get_latest_data

In [31]:
# show big floats as plain two-decimal numbers instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

In [48]:
# agency codes are identifiers rather than quantities, so read that column as text
df_codes = pd.read_csv(
    'sd-agency-codes.csv',
    dtype={'agency_code': str},
)

In [50]:
df_codes.head()

Unnamed: 0,agency_code,agency_name
0,10,GOVERNOR'S OFFICE
1,11,BUREAU OF FINANCE & MANAGEMENT
2,12,BUREAU OF ADMINISTRATION
3,13,BUREAU OF INFORMATION & TELE.
4,14,BUREAU OF HUMAN RESOURCES


In [51]:
# Load every CSV in data/ into a single frame. The paths are sorted because
# Path.glob() yields files in arbitrary, filesystem-dependent order, which
# would otherwise make the concatenation order (and so the relative order of
# rows that tie on the sort keys below) vary from machine to machine.
csv_paths = sorted(Path('data').glob('*.csv'))

if not csv_paths:
    raise FileNotFoundError(
        'No CSV files found in data/ -- run get_latest_data to download them'
    )

# n.b. concat keeps each file's own row labels, so index values presumably
# repeat across files -- don't rely on the index as a unique key
df = pd.concat([pd.read_csv(csv_path, **CSV_READ_SETTINGS) for csv_path in csv_paths])

# chronological by payment date, then alphabetical by vendor name
df = df.sort_values(['ap_payment_date', 'vendor_name'])

In [52]:
df.head()

Unnamed: 0,document_date,document_number,vendor_name,vendor_number,vendor_group_number,ap_payment_date,voucher_number,amt,agency_code,agency_name
0,2020-06-17,215850,3D SPECIALTIES INC,12154482,,2020-07-01,,951.78,6,"GAME, FISH AND PARKS"
1,2020-06-30,JULY2020,4 B HOLDINGS LLC,12291623,,2020-07-01,,700.5,9,HEALTH
2,2020-06-19,IN736438,A & B BUSINESS INC,12036980,,2020-07-01,,66.29,11,BUREAU OF FINANCE & MANAGEMENT
3,2020-06-19,IN736423,A & B BUSINESS INC,12036980,,2020-07-01,,345.31,10,LABOR AND REGULATION
4,2020-06-19,IN736422,A & B BUSINESS INC,12036980,,2020-07-01,,203.23,10,LABOR AND REGULATION


In [53]:
len(df)

1150180

### Explore payments to a specific vendor

As an example, let's check out payments to vendors whose names contain the names of a few cities in the Northern Hills.

In [54]:
# show me records where the vendor name contains any of these city names
# (plain substring match, so "lead" also hits any longer word that contains it)
city_pattern = 'spearfish|lead|deadwood|whitewood'

# na=False: a record with a missing vendor name counts as "no match" instead of
# putting NaN into the mask, which boolean indexing refuses to accept
df[df['vendor_name'].str.contains(city_pattern, case=False, na=False)]

Unnamed: 0,document_date,document_number,vendor_name,vendor_number,vendor_group_number,ap_payment_date,voucher_number,amt,agency_code,agency_name
153,2020-06-30,M020BTR024,CITY OF DEADWOOD,12054391,02,2020-07-01,,129405.23,02,REVENUE
405,2020-06-30,M020BTR024,LEAD-CITY OF,12054774,,2020-07-01,,75329.23,02,REVENUE
406,2020-06-23,01X1818-10805,LEAD-CITY OF,12054774,,2020-07-01,,75507.77,010,GOVERNOR'S OFFICE
678,2020-06-30,M020BTR024,SPEARFISH-CITY OF,12055082,,2020-07-01,,565773.78,02,REVENUE
787,2020-06-30,M020BTR024,WHITEWOOD-CITY OF,12055252,,2020-07-01,,19378.47,02,REVENUE
...,...,...,...,...,...,...,...,...,...,...
7001,2024-10-24,11332842,KNECHT HOME CENTER SPEARFISH,12058975,,2024-11-08,697467,229.00,06,"GAME, FISH AND PARKS"
7002,2024-10-24,11332850,KNECHT HOME CENTER SPEARFISH,12058975,,2024-11-08,697466,68.44,06,"GAME, FISH AND PARKS"
7003,2024-10-25,11338526,KNECHT HOME CENTER SPEARFISH,12058975,,2024-11-08,697469,67.92,06,"GAME, FISH AND PARKS"
7004,2024-10-28,11348456,KNECHT HOME CENTER SPEARFISH,12058975,,2024-11-08,697468,70.01,06,"GAME, FISH AND PARKS"


In [55]:
# vendor numbers of interest: the four city governments from the search results
vendor_ids = [
    '12054774',  # LEAD-CITY OF
    '12054391',  # CITY OF DEADWOOD
    '12055082',  # SPEARFISH-CITY OF
    '12055252',  # WHITEWOOD-CITY OF
]

In [56]:
# keep only the rows whose vendor number is on the list, as a new df
is_northern_hills_vendor = df['vendor_number'].isin(vendor_ids)
df_northern_hills = df.loc[is_northern_hills_vendor]

In [57]:
df_northern_hills.head()

Unnamed: 0,document_date,document_number,vendor_name,vendor_number,vendor_group_number,ap_payment_date,voucher_number,amt,agency_code,agency_name
153,2020-06-30,M020BTR024,CITY OF DEADWOOD,12054391,2.0,2020-07-01,,129405.23,2,REVENUE
405,2020-06-30,M020BTR024,LEAD-CITY OF,12054774,,2020-07-01,,75329.23,2,REVENUE
406,2020-06-23,01X1818-10805,LEAD-CITY OF,12054774,,2020-07-01,,75507.77,10,GOVERNOR'S OFFICE
678,2020-06-30,M020BTR024,SPEARFISH-CITY OF,12055082,,2020-07-01,,565773.78,2,REVENUE
787,2020-06-30,M020BTR024,WHITEWOOD-CITY OF,12055252,,2020-07-01,,19378.47,2,REVENUE


### Break down spending by month

Add a `yearmonth` column to allow grouping by month.

In [58]:
# render each payment date as a yyyymm string, e.g. 2020-07-01 -> '202007'
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.html
payment_dates = df['ap_payment_date']
df['yearmonth'] = payment_dates.dt.strftime('%Y%m')

In [59]:
df.head()

Unnamed: 0,document_date,document_number,vendor_name,vendor_number,vendor_group_number,ap_payment_date,voucher_number,amt,agency_code,agency_name,yearmonth
0,2020-06-17,215850,3D SPECIALTIES INC,12154482,,2020-07-01,,951.78,6,"GAME, FISH AND PARKS",202007
1,2020-06-30,JULY2020,4 B HOLDINGS LLC,12291623,,2020-07-01,,700.5,9,HEALTH,202007
2,2020-06-19,IN736438,A & B BUSINESS INC,12036980,,2020-07-01,,66.29,11,BUREAU OF FINANCE & MANAGEMENT,202007
3,2020-06-19,IN736423,A & B BUSINESS INC,12036980,,2020-07-01,,345.31,10,LABOR AND REGULATION,202007
4,2020-06-19,IN736422,A & B BUSINESS INC,12036980,,2020-07-01,,203.23,10,LABOR AND REGULATION,202007


In [60]:
# pivot table to show payments by vendor by month:
# one row per vendor_number, one column per yearmonth, cells are summed amounts.
# reset_index() turns vendor_number back into a regular column, and
# vendor/month combinations with no payments are filled with 0.0
pivot_by_vendor_by_month = (
    df.pivot_table(
        index='vendor_number',
        columns='yearmonth',
        values='amt',
        aggfunc='sum',
    )
    .reset_index()
    .fillna(0.0)
)

In [61]:
pivot_by_vendor_by_month.head()

yearmonth,vendor_number,202007,202008,202009,202010,202011,202012,202101,202102,202103,...,202402,202403,202404,202405,202406,202407,202408,202409,202410,202411
0,12001820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12001823,53353.87,52533.01,52723.84,52346.98,52186.3,52215.97,51974.41,52402.83,51845.38,...,52003.77,50806.43,50887.02,50903.19,50961.36,59863.0,42274.8,50680.2,50443.03,0.0
2,12001827,0.0,0.0,0.0,0.0,0.0,0.0,675.0,0.0,736.0,...,0.0,0.0,0.0,0.0,795.0,0.0,0.0,0.0,0.0,0.0
3,12001831,3029.28,10061.58,11085.74,4028.23,0.0,8776.72,6357.01,3394.56,5616.32,...,5455.13,5455.13,5455.13,19092.96,5455.13,12879.23,10971.15,9029.85,7989.69,16212.92
4,12001855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,375.0,0.0,0.0,0.0


In [62]:
# pivot table to show spending by agency by month:
# one row per agency_name, one column per yearmonth, cells are summed amounts.
# reset_index() turns agency_name back into a regular column, and
# agency/month combinations with no spending are filled with 0.0
pivot_by_agency_by_month = (
    df.pivot_table(
        index='agency_name',
        columns='yearmonth',
        values='amt',
        aggfunc='sum',
    )
    .reset_index()
    .fillna(0.0)
)

In [63]:
pivot_by_agency_by_month.head()

yearmonth,agency_name,202007,202008,202009,202010,202011,202012,202101,202102,202103,...,202402,202403,202404,202405,202406,202407,202408,202409,202410,202411
0,AGRICULTURE & NAT. RESOURCES,4553984.51,5459280.11,3184810.95,4564255.96,4019103.13,4723435.97,2674370.7,3198881.83,2096449.41,...,14229833.84,11700310.54,7032988.68,14136190.68,17029075.53,12401399.02,45672584.4,26606985.79,30061236.07,14115749.84
1,ATTORNEY GENERAL,957625.66,735497.69,623860.35,785921.39,506460.41,1007728.01,459006.43,853824.42,793472.82,...,1587772.59,817600.08,1276515.93,1518962.41,991720.68,657263.13,715125.96,988635.91,1078970.11,167327.87
2,BOARD OF REGENTS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1028.97,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BUREAU OF ADMINISTRATION,3290216.74,2439932.15,3072094.16,3313497.95,4101399.72,4510967.2,2326289.87,3640654.57,2937420.32,...,3475160.38,4323393.88,5065135.19,5169617.06,7130525.84,7887776.25,10047661.25,7000913.05,10110080.86,1063294.43
4,BUREAU OF FINANCE & MANAGEMENT,293867.53,29844357.42,16697248.8,45557236.23,26539516.4,284083649.0,251454551.14,66176055.6,10850474.69,...,173531.45,186822.79,700950.57,978116.96,172462.68,255161.93,1547130.4,1338720.18,1559813.58,23572.35


In [64]:
# the latest month is the last column in the df; rank agencies by it, biggest first
latest_month = pivot_by_agency_by_month.columns[-1]
pivot_by_agency_by_month.sort_values(by=latest_month, ascending=False).head()

yearmonth,agency_name,202007,202008,202009,202010,202011,202012,202101,202102,202103,...,202402,202403,202404,202405,202406,202407,202408,202409,202410,202411
29,TRANSPORTATION,112242554.86,80742028.73,85169178.59,88238239.08,44262726.16,49199800.15,25372804.4,17977873.11,57502298.39,...,22599819.48,35750577.19,41640872.61,115481889.64,78348045.34,119425064.75,159138597.17,132505886.87,135698490.61,46156435.95
0,AGRICULTURE & NAT. RESOURCES,4553984.51,5459280.11,3184810.95,4564255.96,4019103.13,4723435.97,2674370.7,3198881.83,2096449.41,...,14229833.84,11700310.54,7032988.68,14136190.68,17029075.53,12401399.02,45672584.4,26606985.79,30061236.07,14115749.84
22,REVENUE,80743795.95,61222012.16,38239412.77,61372287.64,36028745.61,39450414.97,98740129.01,32738523.57,36940879.35,...,60359014.27,44363913.08,52300078.93,87354891.21,27154137.15,96872414.35,83173946.49,57511586.07,72830985.17,9521183.05
9,"GAME, FISH AND PARKS",3071236.15,2996261.84,1673556.95,7232983.0,2659616.31,4191427.3,5704466.46,2322395.91,2441040.53,...,1089645.11,5417656.35,3775662.49,5552082.46,2506706.32,4135232.01,4702056.25,3923403.85,8946373.7,7431060.89
25,SOCIAL SERVICES,8910804.44,14696494.19,9760285.98,15355313.05,11015205.61,11838650.16,14656744.66,6937901.9,12034195.0,...,14512702.16,21799718.48,18706107.99,30067493.11,24497915.39,13687160.1,20922160.25,32659616.81,18581456.48,5533726.56


### Filter to examine spending by one agency

Example: Look at spending by the state Department of Education.

In [65]:
# find the code to filter on: case-insensitive search of the agency names
name_matches = df_codes['agency_name'].str.contains('education', case=False)
df_codes[name_matches]

Unnamed: 0,agency_code,agency_name
15,12,EDUCATION


In [66]:
# agency_code is text, so compare against the string '12' (EDUCATION)
education = df.loc[df['agency_code'].eq('12')]

In [67]:
education.head()

Unnamed: 0,document_date,document_number,vendor_name,vendor_number,vendor_group_number,ap_payment_date,voucher_number,amt,agency_code,agency_name,yearmonth
52,2020-06-16,895571X06242020,AT&T MOBILITY II LLC,12279233,,2020-07-01,,660.63,12,EDUCATION,202007
79,2020-06-12,SC12019C-463-2,BLACK HILLS SPECIAL SERVICES,12037282,,2020-07-01,,4107.5,12,EDUCATION,202007
186,2020-06-10,SC12119C-371-22,CN RESOURCE LLC,12118456,,2020-07-01,,32094.3,12,EDUCATION,202007
235,2020-03-31,SC12120C-211-2,EAST DAKOTA EDUCATIONAL COOP,12003510,,2020-07-01,,14200.0,12,EDUCATION,202007
272,2020-06-25,SC12120C-295-2,FLORIDA STATE UNIVERSITY,12115848,,2020-07-01,,3000.0,12,EDUCATION,202007


In [68]:
len(education)

77136

### Add up spending before and after a given date

E.g., compare spending before and after July 1, 2021.

In [69]:
# the cutoff to compare spending around: July 1, 2021
target_date = datetime(year=2021, month=7, day=1)

In [70]:
# split on the payment date: strictly earlier than the target date
# vs. on or after it
payment_dates = df['ap_payment_date']

spending_before_date = df[payment_dates.lt(target_date)]
spending_after_date = df[payment_dates.ge(target_date)]

In [71]:
# report the summed payment amount on each side of the target date
for label, spending in (('Before', spending_before_date), ('After', spending_after_date)):
    total_amt = spending['amt'].sum()
    print(f'{label}: ${total_amt:,.2f}')

Before: $3,801,255,903.43
After: $12,983,635,221.82


### Break down payment frequency to vendors by month

Use case: See whether payments to a vendor or a group of vendors went up or down, or stopped or started, in a given month. E.g., a new rule went into effect and you want to see if/how that affected the frequency/amount of spending, but instead of just a before/after sum you want a monthly breakdown of previous spending to get a sense of how often payments were made.

In [72]:
# what's the target date of the change we're looking at?
target_date = datetime(year=2022, month=10, day=1)

# the pivot table's month columns are named yyyymm, so render the date the same way
# c.f. http://strftime.org
target_yearmonth = format(target_date, '%Y%m')

In [73]:
# goal is to get a list of column names with yearmonth values
# before and after the target_yearmonth

# gonna do this one step at a time

# show column names
pivot_by_vendor_by_month.columns

Index(['vendor_number', '202007', '202008', '202009', '202010', '202011',
       '202012', '202101', '202102', '202103', '202104', '202105', '202106',
       '202107', '202108', '202109', '202110', '202111', '202112', '202201',
       '202202', '202203', '202204', '202205', '202206', '202207', '202208',
       '202209', '202210', '202211', '202212', '202301', '202302', '202303',
       '202304', '202305', '202306', '202307', '202308', '202309', '202310',
       '202311', '202312', '202401', '202402', '202403', '202404', '202405',
       '202406', '202407', '202408', '202409', '202410', '202411'],
      dtype='object', name='yearmonth')

In [74]:
# show column names except the initial vendor_number column
pivot_by_vendor_by_month.columns[1:]

Index(['202007', '202008', '202009', '202010', '202011', '202012', '202101',
       '202102', '202103', '202104', '202105', '202106', '202107', '202108',
       '202109', '202110', '202111', '202112', '202201', '202202', '202203',
       '202204', '202205', '202206', '202207', '202208', '202209', '202210',
       '202211', '202212', '202301', '202302', '202303', '202304', '202305',
       '202306', '202307', '202308', '202309', '202310', '202311', '202312',
       '202401', '202402', '202403', '202404', '202405', '202406', '202407',
       '202408', '202409', '202410', '202411'],
      dtype='object', name='yearmonth')

In [75]:
# figure out the index position of the target_yearmonth in the list of columns
# see list.index() docs https://docs.python.org/3/tutorial/datastructures.html
# n.b., .columns is a pandas Index, so it's converted to a plain list first
# in order to use the list .index() method

column_names = pivot_by_vendor_by_month.columns.tolist()
target_col_idx = column_names.index(target_yearmonth)

In [76]:
# sanity check: the column sitting at that position really is `target_yearmonth`
selected_column = pivot_by_vendor_by_month.columns[target_col_idx]
assert selected_column == target_yearmonth

In [77]:
# finally, get the yearmonth columns before the target month ... and after.
#
# Columns are selected by their yyyymm name rather than by position so that
# this cell can be re-run safely: once the pre_date_total / post_date_total
# columns have been added to the pivot, a positional slice running to the end
# of the frame would sweep them into post_date_columns and inflate any
# recomputed totals. Matching on the name also leaves out vendor_number.
all_columns = pivot_by_vendor_by_month.columns
yearmonth_columns = all_columns[all_columns.str.fullmatch(r'\d{6}', na=False)]

# yyyymm strings sort chronologically, so a plain string comparison splits them
pre_date_columns = yearmonth_columns[yearmonth_columns < target_yearmonth]

# ... and from the target month onward
post_date_columns = yearmonth_columns[yearmonth_columns >= target_yearmonth]

In [78]:
# next, goal is to add up total spending prior to this month and after
# define a function that will sum spending across a row for the
# selected columns

def sum_totals_yearmonth(row, direction='pre'):
    """Add up one row's amounts across the pre- or post-target month columns.

    Meant to be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: a mapping-like row (e.g. a pandas Series) indexable by the
            yearmonth column names.
        direction: 'pre' to sum the columns in ``pre_date_columns`` (the
            default), 'post' to sum the columns in ``post_date_columns``.

    Returns:
        The total, rounded to 2 decimal places.

    Raises:
        ValueError: if ``direction`` is neither 'pre' nor 'post'. Without this
            check a typo would silently fall through to the 'pre' columns.
    """
    if direction not in ('pre', 'post'):
        raise ValueError(f"direction must be 'pre' or 'post', got {direction!r}")

    # both column lists are module-level names defined earlier in the notebook
    cols = post_date_columns if direction == 'post' else pre_date_columns

    return round(sum(row[col] for col in cols), 2)

In [79]:
# add one total column per side of the target month, computed row by row
# (axis=1 hands each row to the function); 'pre' is added first, then 'post'
for direction in ('pre', 'post'):
    pivot_by_vendor_by_month[f'{direction}_date_total'] = pivot_by_vendor_by_month.apply(
        sum_totals_yearmonth,
        axis=1,
        direction=direction,
    )

In [80]:
pivot_by_vendor_by_month.head()

yearmonth,vendor_number,202007,202008,202009,202010,202011,202012,202101,202102,202103,...,202404,202405,202406,202407,202408,202409,202410,202411,pre_date_total,post_date_total
0,12001820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10608.7
1,12001823,53353.87,52533.01,52723.84,52346.98,52186.3,52215.97,51974.41,52402.83,51845.38,...,50887.02,50903.19,50961.36,59863.0,42274.8,50680.2,50443.03,0.0,1398656.23,1292617.36
2,12001827,0.0,0.0,0.0,0.0,0.0,0.0,675.0,0.0,736.0,...,0.0,0.0,795.0,0.0,0.0,0.0,0.0,0.0,2155.0,2245.0
3,12001831,3029.28,10061.58,11085.74,4028.23,0.0,8776.72,6357.01,3394.56,5616.32,...,5455.13,19092.96,5455.13,12879.23,10971.15,9029.85,7989.69,16212.92,184974.46,213257.76
4,12001855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,375.0,0.0,0.0,0.0,0.0,375.0


In [81]:
# make a quick df to look up vendor names by number
# (only the first row encountered for each vendor_number is kept)
vendor_lookup = (
    df.loc[:, ['vendor_number', 'vendor_name']]
    .drop_duplicates(subset='vendor_number')
)

In [82]:
vendor_lookup.head()

Unnamed: 0,vendor_number,vendor_name
0,12154482,3D SPECIALTIES INC
1,12291623,4 B HOLDINGS LLC
2,12036980,A & B BUSINESS INC
6,12130696,ABERDEEN ENERGY
7,12115184,AFLAC


In [83]:
# merge with pivoted df: a left join keeps every pivot row and attaches
# the vendor name wherever the vendor_number matches
pivoted_with_vendor_names = pivot_by_vendor_by_month.merge(
    vendor_lookup,
    on='vendor_number',
    how='left',
)

In [84]:
pivoted_with_vendor_names.head()

Unnamed: 0,vendor_number,202007,202008,202009,202010,202011,202012,202101,202102,202103,...,202405,202406,202407,202408,202409,202410,202411,pre_date_total,post_date_total,vendor_name
0,12001820,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10608.7,LL BEAN INC
1,12001823,53353.87,52533.01,52723.84,52346.98,52186.3,52215.97,51974.41,52402.83,51845.38,...,50903.19,50961.36,59863.0,42274.8,50680.2,50443.03,0.0,1398656.23,1292617.36,UNUM LIFE INS CO OF AMERICA
2,12001827,0.0,0.0,0.0,0.0,0.0,0.0,675.0,0.0,736.0,...,0.0,795.0,0.0,0.0,0.0,0.0,0.0,2155.0,2245.0,NATIONAL RURAL HEALTH ASSOC
3,12001831,3029.28,10061.58,11085.74,4028.23,0.0,8776.72,6357.01,3394.56,5616.32,...,19092.96,5455.13,12879.23,10971.15,9029.85,7989.69,16212.92,184974.46,213257.76,IDEXX LABORATORIES INC
4,12001855,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,375.0,0.0,0.0,0.0,0.0,375.0,ASSOCIATION FORENSIC QA MGRS


In [85]:
# the merge should neither add nor drop rows -- check the record counts match
rows_before_merge = len(pivot_by_vendor_by_month)
rows_after_merge = len(pivoted_with_vendor_names)
assert rows_before_merge == rows_after_merge

In [86]:
# filter to get vendors who were paid something in the months before
# the date of interest but not after
paid_before = pivoted_with_vendor_names['pre_date_total'] > 0
unpaid_after = pivoted_with_vendor_names['post_date_total'] == 0
some_before_none_after = pivoted_with_vendor_names[paid_before & unpaid_after]

print(f'{len(some_before_none_after):,} vendors were paid something before the target month but nothing after')
print()

# starting with the month immediately preceding the target yearmonth,
# count up how many vendors were paid that month -- the
# goal is to get a sense of consecutive payments each month that stopped,
# and it's accomplished by gradually filtering the same dataframe
# with a filter that iterates backward by month
# (n.b. this means some_before_none_after ends up holding only the
# narrowest cohort once the loop has finished)

# loop over the list of pre-date columns in reverse
for i, yearmonth in enumerate(reversed(pre_date_columns), 1):

    # the message template below already supplies the leading "the", so the
    # phrase itself must not repeat it (this used to print "in the the month")
    month_phrase = 'month' if i == 1 else f'{i} consecutive months'

    # filter to get vendors who were paid more than 0 in this month
    some_before_none_after = some_before_none_after[some_before_none_after[yearmonth] > 0]

    # get the total number in this cohort
    total = len(some_before_none_after)

    # singular/plural agreement for the count
    plural_phrase = 'vendor was' if total == 1 else 'vendors were'

    msg = f'{total:>4,} {plural_phrase} paid something in the {month_phrase} before the target month, but nothing after'
    print(msg)

12,678 vendors were paid something before the target month but nothing after

 358 vendors were paid something in the the month before the target month, but nothing after
  42 vendors were paid something in the 2 consecutive months before the target month, but nothing after
  15 vendors were paid something in the 3 consecutive months before the target month, but nothing after
  12 vendors were paid something in the 4 consecutive months before the target month, but nothing after
   9 vendors were paid something in the 5 consecutive months before the target month, but nothing after
   7 vendors were paid something in the 6 consecutive months before the target month, but nothing after
   6 vendors were paid something in the 7 consecutive months before the target month, but nothing after
   4 vendors were paid something in the 8 consecutive months before the target month, but nothing after
   4 vendors were paid something in the 9 consecutive months before the target month, but nothing aft