This script processes ONE MONTH of tax distributions at a time, read from a single "Distribution to Finance" .txt file from the Tax Commission. It applies to files from FISCAL YEAR 2010 ONWARD.

In [1]:
import pandas as pd
import numpy as np
from datetime import date

This cell immediately below is the only one that requires changes to run for a new month.

In [2]:
#Load the Tax Commission's fixed-width "Distribution to Finance" .txt file.
#The file has no header row, so the four column names are supplied here;
#column dtypes are left to pandas' own inference.
df = pd.read_fwf(
    '../data/2018-07- DISTRIBUTION FILE TO FINANCE.txt',
    header=None,
    names=['Date', 'Tax', 'Location', 'Distribution'],
)

In [3]:
#Preview the first five rows to confirm the file parsed into the four named columns
df.head()

Unnamed: 0,Date,Tax,Location,Distribution
0,201807SEM,ES,01000Beaver County,656221
1,201807SEM,ES,03038Logan,5581794
2,201807SEM,ES,06000Davis County,7881883
3,201807SEM,ES,06004Bountiful,4393324
4,201807SEM,ES,06008Clearfield,2603077


In [None]:
#Summarize the raw frame (row count, column dtypes, non-null counts)
df.info()

In [None]:
#Only keep and relabel relevant tax categories:
    # RR = Resort Communities Tax Gross
    # RA = Resort Communities Tax Gross - Addl

    # FG = Tourism Restaurant Tax Gross

    # FF = Tourism Leasing Tax Gross - from Sales Distr.
    # FP = Tourism Leasing Tax Gross - from Population

    # TR = Transient Room Tax Gross - only Counties
    # TM = Municipal Transient Room Tax Gross
    # TA = Municipal Transient Room Tax Gross - Addl
    # TC = Transient Room Tax Gross - Convention - discontinued
    # TT = Tourism Transient Room Tax Gross

    # LS = Local Sales (1% of state sales/use collection returned to localities)

    #Later sum Resort Communities categories, and MTRT categories
#Raw two-letter code -> label used from here on (RR/RA and TM/TA share a label)
tax_labels = {
    "RR": "ResortComm",
    "RA": "ResortComm",
    "FG": "Restaurant",
    "FF": "CarRental_Sales",
    "FP": "CarRental_Pop",
    "TR": "TRT",
    "TM": "MTRT",
    "TA": "MTRT",
    "TC": "TRT_Convention",
    "TT": "TRT_Tourism",
    "LS": "Local_Sales",
}
keep = list(tax_labels)

#Filter and re-index first so the relabel below writes to a fresh frame,
#not to a slice of the original.
df = df[df['Tax'].isin(keep)].reset_index(drop=True)

#Relabel only the Tax column in a single pass; a frame-wide replace() would
#also rewrite any other cell that happened to equal one of these codes.
df['Tax'] = df['Tax'].map(tax_labels)

df.info()

In [None]:
#Split the Location field: its first five characters are the numeric location
#code and the remainder is the place name. Also drop the last three characters
#of Date (the 'SEM' suffix).
location_text = df['Location']
df['LocationCode'] = location_text.str[:5].astype(np.int64)
df['Location'] = location_text.str[5:]
df['Date'] = df['Date'].str[:-3]
df.info()

In [None]:
#Print the frame to eyeball the new LocationCode column and the trimmed Date values
print(df)

In [None]:
#Convert Distribution amounts to dollars; rename column
#    The raw values carry two implied decimal places (e.g. 656221 -> 6562.21).
#    Dividing by 100 keeps that scaling correct for amounts with fewer than
#    three digits and for negative values, where splicing a "." into the digit
#    string gives a wrong or unparseable number.
df['Distribution'] = df['Distribution'].astype(np.float64) / 100
df.rename(columns = {'Distribution': 'Dollars_Distributed'}, inplace=True)
df.info()
df.head(15)

In [None]:
#Take the Date value from the first row (the file is expected to hold one month only)
year_month = df['Date'].iloc[0]
print(type(year_month))
print(year_month)

In [None]:
#Split the date string: the last two characters are the month, the rest is the year
year_text, month_text = year_month[:-2], year_month[-2:]
year = int(year_text)
print(type(year))
print(year)
month = int(month_text)
print(type(month))
print(month)

In [None]:
#Convert the Date to Calendar from Fiscal year
    #FY 2010 month 1 = July 2009
df = df.rename(columns={'Date': 'Date_CY'})

if 1 <= month <= 6:
    #Fiscal months 1-6 are July-December of the PREVIOUS calendar year
    month += 6
    year -= 1
elif 7 <= month <= 12:
    #Fiscal months 7-12 are January-June of the same calendar year
    month -= 6

In [None]:
#Build a date object for the first day of the calendar month
when = date(year=year, month=month, day=1)
print(when)

In [None]:
#Overwrite every value in Date_CY with the single calendar date object
df = df.assign(Date_CY=when)

In [None]:
#Preview the first ten rows to confirm Date_CY now holds the calendar date
df.head(10)

In [None]:
#Pull the Resort Comm. Tax rows into their own frame with a fresh 0..n-1 index.
#resorts_list (a boolean mask over df) is kept so those rows can be removed from df later.
resorts_list = df['Tax'].eq('ResortComm')
resorts = df.loc[resorts_list].reset_index(drop=True)

In [None]:
#Sum Resort Communities distr$ for locations containing this tax on >1 rows
resort_dict = {} #Location code -> summed distr$; one entry per location
keep_rows = [] #One bool per row; True only for the first row seen for a location
for location, amount in zip(resorts['LocationCode'], resorts['Dollars_Distributed']):
    if location in resort_dict:
        #Repeat location: fold this row's amount into the running total and drop the row
        to_this = resort_dict[location]
        sum_resort = amount + to_this
        print('For Loc {}: adding {} to {} = {}'.format(location, amount, to_this, sum_resort))
        resort_dict[location] = sum_resort
        keep_rows.append(False)
    else:
        resort_dict[location] = amount #put loc & distr$ in dict
        keep_rows.append(True) #Keep row (NOT a duplicate location)

#Index with a boolean ARRAY: a plain empty list would be read as "select no
#columns" when there are no Resort Comm. rows, stripping every column.
resorts = resorts[np.array(keep_rows, dtype=bool)]
resorts = resorts.reset_index(drop = True)

In [None]:
#Print dictionary of sums -- looks correct
#for keys, values in resort_dict.items():
#    print(keys)
#    print(values)

In [None]:
#Update distr$ on rows by location code
#    resorts has a 0..n-1 index here, so the enumerate position is the row label.
#    DataFrame.set_value was removed from pandas; .at is its replacement.
for r, location in enumerate(resorts['LocationCode']):
    resorts.at[r, 'Dollars_Distributed'] = resort_dict[location]

In [None]:
#Drop the original Resort Comm. rows from df and append the summed
#one-row-per-location versions; ignore_index gives the result a fresh 0..n-1 index.
df = pd.concat([df[~resorts_list.values], resorts], ignore_index=True)

In [None]:
#Display the Resort Comm. rows to check the summed values
df[df['Tax'] == 'ResortComm']

In [None]:
#Repeat same summation process for MTRT
#Pull the MTRT rows into their own frame with a fresh 0..n-1 index.
#mtrt_list (a boolean mask over df) is kept so those rows can be removed from df later.
mtrt_list = df['Tax'].eq('MTRT')
mtrt = df.loc[mtrt_list].reset_index(drop=True)

In [None]:
#Sum MTRT distr$ for locations containing this tax on >1 rows, and drop those duplicate rows
mtrt_dict = {} #Location code -> summed distr$; one entry per location
keeprs = [] #One bool per row; True only for the first row seen for a location
for location, amount in zip(mtrt['LocationCode'], mtrt['Dollars_Distributed']):
    if location in mtrt_dict:
        #Repeat location: fold this row's amount into the running total and drop the row
        to_this = mtrt_dict[location]
        sum_mtrt = amount + to_this
        print('For Loc {}: adding {} to {} = {}'.format(location, amount, to_this, sum_mtrt))
        mtrt_dict[location] = sum_mtrt
        keeprs.append(False)
    else:
        mtrt_dict[location] = amount #put loc & distr$ in dict
        keeprs.append(True) #Keep row (NOT a duplicate location)

#Index with a boolean ARRAY: a plain empty list would be read as "select no
#columns" when there are no MTRT rows, stripping every column.
mtrt = mtrt[np.array(keeprs, dtype=bool)]
mtrt = mtrt.reset_index(drop = True)

In [None]:
#Update distr$ on rows by location code
#    mtrt has a 0..n-1 index here, so the enumerate position is the row label.
#    DataFrame.set_value was removed from pandas; .at is its replacement.
for r, location in enumerate(mtrt['LocationCode']):
    mtrt.at[r, 'Dollars_Distributed'] = mtrt_dict[location]

In [None]:
#Drop the original MTRT rows from df and append the summed
#one-row-per-location versions; ignore_index gives the result a fresh 0..n-1 index.
df = pd.concat([df[~mtrt_list.values], mtrt], ignore_index=True)

In [None]:
#Display the MTRT rows to check the summed values
df[df['Tax'] == 'MTRT']

In [None]:
#Make the calendar date (Date_CY) the index in place of the default integer index
df = df.set_index('Date_CY')

In [None]:
#Put the columns in their final order: tax, location code, location name, dollars
df = df.loc[:, ['Tax', 'LocationCode', 'Location', 'Dollars_Distributed']]

In [None]:
#Spot-check the first 20 rows of the reordered, date-indexed frame
df.head(20)

In [None]:
#Spot-check the last 20 rows of the frame
df.tail(20)

In [None]:
#Display the CarRental_Sales rows
df[df['Tax'] == 'CarRental_Sales']

In [None]:
#Display the CarRental_Pop rows
df[df['Tax'] == 'CarRental_Pop']

In [None]:
#Display the TRT rows
df[df['Tax'] == 'TRT']

In [None]:
#Local sales taxes go in a separate db table; display those rows before moving them out of df
df.loc[df['Tax'] == 'Local_Sales']

In [None]:
#Move the Local_Sales rows into their own frame and keep everything else in df
ls_taxes = df.loc[df['Tax'].eq('Local_Sales')]
df = df.loc[df['Tax'].ne('Local_Sales')]
#'top' should NOT be 'Local_Sales'; should be 'MTRT' with 7 unique
df['Tax'].describe()

In [None]:
#Every row in ls_taxes has the same Tax label, so drop that column
ls_taxes = ls_taxes.drop(['Tax'], axis='columns')
ls_taxes.head()

In [None]:
#Publish from here