# Initialize

In [1]:
# Options

## Debugging level
## Passed as the only argument to init_preprocessing() in the next cell.
## NOTE(review): 0 presumably means "no debug output" -- confirm against init_preprocessing.
debug_level = 0


In [2]:
# Initialize framework
# init_preprocessing() returns a pair that is unpacked into `constants` (used below for
# file paths) and `tools` (used below for the parsing helpers).
# NOTE(review): init_preprocessing is not imported anywhere in the visible notebook;
# it must already be present in the kernel namespace -- confirm where it comes from.
constants, tools = init_preprocessing(debug_level)
# Render matplotlib figures inside the notebook output.
%matplotlib inline


In [3]:
# Read anonymized data
#
# Each anonymized CSV is loaded into its own DataFrame, indexed by the column that
# holds the customer / organization / payee name. All file paths come from `constants`.

## No pandas import is visible elsewhere in this notebook, so bring `pd` into scope
## explicitly at its first use instead of relying on whatever the kernel already holds.
## Re-importing is harmless if `pd` is already defined.
import pandas as pd

atlas                   = pd.read_csv(constants.ANON_ATLAS_FILE_PATH, index_col="Customers")
forecast                = pd.read_csv(constants.ANON_FORECAST_DATA_FILE_PATH, index_col="Organization Name")
revenue2020             = pd.read_csv(constants.ANON_REVENUE2020_FILE_PATH, index_col="Name")
revenue2020A            = pd.read_csv(constants.ANON_REVENUE2020A_FILE_PATH, index_col="Payee Name")


In [4]:
# Clean up ATLAS data
# Replaces the raw text columns of `atlas` with parsed values; the frame is modified in place.

date_helpers = tools.tool_date

## Invoice Date
atlas["Invoice Date"] = date_helpers.cleanup_date_string_list(atlas["Invoice Date"])

## Invoice Amount (the dollar-string helper is exposed on tool_date)
atlas["Invoice Amount"] = date_helpers.cleanup_dollar_string_list(atlas["Invoice Amount"])

## Dates of service
## The helper's result is indexed positionally: item 0 becomes the start column, item 1 the end column.
dates_of_service = tools.tool_special_date.extract_subscription_dates_list(atlas["Dates of service "])
atlas["Service Start"], atlas["Service End"] = dates_of_service[0], dates_of_service[1]
## The combined source column (its name ends with a space) is removed once it has been split.
atlas.drop("Dates of service ", axis="columns", inplace=True)

## Transform Subscription into a duration to enable math
duration_helpers = tools.tool_duration_parsing
subscription_text = atlas["Subscription"]
atlas["SubscriptionDuration_Timedelta"] = duration_helpers.parse_duration_str_list_to_timedelta(subscription_text)
atlas["SubscriptionDuration_Years"] = duration_helpers.parse_duration_str_list_to_years(subscription_text)

## Preview the cleaned frame
atlas.head()


Unnamed: 0_level_0,Number of Users,Invoice Date,Invoice #,Invoice Amount,Subscription,Account Code,Address,State,Lat,Long,Service Start,Service End,SubscriptionDuration_Timedelta,SubscriptionDuration_Years
Customers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Alabama A&M University-College Of Education,,2018-03-23 00:00:00-04:00,105,5460.0,3 Years,4700-0-00-00000-00-0000,"4900 Meridian St N, Huntsville, AL 35811",AL,34.785061,-86.57328,2018-03-21 00:00:00-04:00,2021-03-31 00:00:00-04:00,1095 days,3.0
Albertus Magnus College,,2018-05-16 00:00:00-04:00,127,5670.0,3 Years,4700-0-00-00000-00-0000,"700 Prospect St, New Haven, CT 06511",CT,41.331299,-72.92144,2018-09-01 00:00:00-04:00,2021-08-31 00:00:00-04:00,1095 days,3.0
Albuquerque Public Schools,,2018-11-20 00:00:00-05:00,161,2270.0,1 Year,4700-0-00-00000-00-0000,"Po Box 25704 Albuquerque, NM 87125",NM,35.09241,-106.64295,2018-11-19 00:00:00-05:00,2019-11-30 00:00:00-05:00,365 days,1.0
Alcorn State University,,2017-10-31 00:00:00-04:00,,1700.0,1 Year,4700-0-00-00000-00-0000,"1000 Asu Dr, Lorman, MS 39096",MS,31.87571,-91.141739,2017-08-31 00:00:00-04:00,2018-08-31 00:00:00-04:00,365 days,1.0
Alderson Broaddus University,50.0,2020-10-07 00:00:00-04:00,NB100820K,500.0,3 Months,4700-0-00-00000-00-0000,"101 College Hill Rd, Philippi, WV 26416",WV,39.160261,-80.049048,2020-10-02 00:00:00-04:00,2020-12-31 00:00:00-05:00,90 days,0.25


In [5]:
# Show the dtype of every `atlas` column to check the result of the clean-up cell above.
atlas.dtypes

Number of Users                                            float64
Invoice Date                      datetime64[ns, America/New_York]
Invoice #                                                   object
Invoice Amount                                             float64
Subscription                                                object
Account Code                                                object
Address                                                     object
State                                                       object
Lat                                                        float64
Long                                                       float64
Service Start                     datetime64[ns, America/New_York]
Service End                       datetime64[ns, America/New_York]
SubscriptionDuration_Timedelta                     timedelta64[ns]
SubscriptionDuration_Years                                 float64
dtype: object

In [6]:
# Clean up 2021 forecast data
# Drops the blank placeholder columns, then replaces the raw text columns of `forecast`
# with parsed values. The frame is modified in place.

## Remove blank columns
## The columns to drop are named "Unnamed: 8" through "Unnamed: 113" (inclusive).
## The names are generated rather than typed out one by one; the resulting list is
## identical to the previous hand-written one.
blank_columns = ["Unnamed: {}".format(position) for position in range(8, 114)]
forecast.drop(columns=blank_columns, inplace=True)

## Parse dates
forecast["Contract Start Date"]     = tools.tool_date.cleanup_date_string_list(forecast["Contract Start Date"])
forecast["Contract End Date"]       = tools.tool_date.cleanup_date_string_list(forecast["Contract End Date"])

## Parse dollars (the dollar-string helper is exposed on tool_date)
forecast["Subscription Fee"]        = tools.tool_date.cleanup_dollar_string_list(forecast["Subscription Fee"])

## Transform Subscription Type into a duration to enable math
forecast["SubscriptionDuration_Timedelta"] = tools.tool_duration_parsing.parse_duration_str_list_to_timedelta(forecast["Subscription Type"])
forecast["SubscriptionDuration_Years"] = tools.tool_duration_parsing.parse_duration_str_list_to_years(forecast["Subscription Type"])

## Preview the cleaned frame
forecast.head()

Unnamed: 0_level_0,Mailing State/Province,Organization Record Type,ATLAS Customer Type,Subscription Fee,Subscription Type,Contract Start Date,Contract End Date,SubscriptionDuration_Timedelta,SubscriptionDuration_Years
Organization Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
University Of West Alabama,AL,College/University,System Wide IHE,62.6,3-Month,2020-04-01 00:00:00-04:00,2020-06-30 00:00:00-04:00,90 days,0.25
Georgia Gwinnett College,GA,College/University,Teacher Prep Program,250.0,3-Month,2020-03-24 00:00:00-04:00,2020-06-30 00:00:00-04:00,90 days,0.25
University Of Guam,GU,College/University,,250.0,3-Month,2020-03-25 00:00:00-04:00,2020-06-30 00:00:00-04:00,90 days,0.25
University Of Hawaii At Manoa,Hawaii,College/University,Teacher Prep Program,425.0,3-Month,2020-03-25 00:00:00-04:00,2020-06-30 00:00:00-04:00,90 days,0.25
University Of Jamestown,North Dakota,General Organization,,425.0,3-Month,2020-04-09 00:00:00-04:00,2020-06-30 00:00:00-04:00,90 days,0.25


In [7]:
# Show the dtype of every `forecast` column to check the result of the clean-up cell above.
forecast.dtypes

Mailing State/Province                                      object
Organization Record Type                                    object
ATLAS Customer Type                                         object
Subscription Fee                                           float64
Subscription Type                                           object
Contract Start Date               datetime64[ns, America/New_York]
Contract End Date                 datetime64[ns, America/New_York]
SubscriptionDuration_Timedelta                     timedelta64[ns]
SubscriptionDuration_Years                                 float64
dtype: object

In [8]:
# Clean up revenue2020 data

## Parse dates
## CAUTION: "Month" holds only a month label (e.g. "Jan"). The parser fills in the missing
## day-of-month and year from the date on which the cell is run, so only the month
## component of "Month_Dateformat" is accurate.
month_labels = revenue2020["Month"]
revenue2020["Month_Dateformat"] = tools.tool_date.cleanup_date_string_list(month_labels)

## Preview the cleaned frame
revenue2020.head()

Unnamed: 0_level_0,Month,Total,Month_Dateformat
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Al A&M Univ College,Jan,1820.0,2020-01-05 00:00:00-05:00
Albertus Magnus College,Jan,1890.0,2020-01-05 00:00:00-05:00
American College Of Ed,Jan,883.33,2020-01-05 00:00:00-05:00
Bainbridge Island,Jan,1200.0,2020-01-05 00:00:00-05:00
Cerra Scnbn,Jan,1166.67,2020-01-05 00:00:00-05:00


In [9]:
# Clean up revenue2020A data
# Drops the blank placeholder columns, converts the raw text columns of `revenue2020A`
# into numeric / dollar / date values, and derives the service period and its duration.
# The frame is modified in place.

## Remove blank columns
revenue2020A.drop(
    columns=["Unnamed: 8", "Unnamed: 9", "Unnamed: 10"],
    inplace=True)

## Numeric types
## errors='coerce' turns every value that cannot be read as a number into NaN.
## NOTE(review): the ATLAS data contains alphanumeric invoice numbers (e.g. "NB100820K");
## any such identifier in this file would be silently replaced by NaN -- confirm this is intended.
revenue2020A["Invoice #"]           = pd.to_numeric(revenue2020A["Invoice #"], errors='coerce')
revenue2020A["Vendor ID"]           = pd.to_numeric(revenue2020A["Vendor ID"], errors='coerce')

## Dollar figures (the dollar-string helper is exposed on tool_date)
revenue2020A["Invoice Amount"]      = tools.tool_date.cleanup_dollar_string_list(revenue2020A["Invoice Amount"])
revenue2020A["Amount Paid"]         = tools.tool_date.cleanup_dollar_string_list(revenue2020A["Amount Paid"])

## Parse dates (the " Pymt Received" column name starts with a space in the source file)
revenue2020A["Invoice Date"]        = tools.tool_date.cleanup_date_string_list(revenue2020A["Invoice Date"])
revenue2020A["Mail Date"]           = tools.tool_date.cleanup_date_string_list(revenue2020A["Mail Date"])
revenue2020A[" Pymt Received"]      = tools.tool_date.cleanup_date_string_list(revenue2020A[" Pymt Received"])

## Dates of service
## The helper's result is indexed positionally: item 0 is the start, item 1 the end.
dates_of_service            = tools.tool_special_date.extract_subscription_dates_list(revenue2020A["Dates of service "])
revenue2020A["Service Start"]      = dates_of_service[0]
revenue2020A["Service End"]        = dates_of_service[1]
revenue2020A.drop(columns=["Dates of service "], inplace=True)

## Compute service duration
## The parsed service dates carry America/New_York UTC offsets (-04:00 / -05:00).
## Subtracting timezone-aware timestamps measures absolute elapsed time, so a period that
## crosses a daylight-saving change would come out one hour short or long of a whole
## number of days. Dropping the timezone first subtracts the local wall-clock values,
## which gives whole-day durations for these midnight-to-midnight dates.
service_start_local = revenue2020A["Service Start"].dt.tz_localize(None)
service_end_local = revenue2020A["Service End"].dt.tz_localize(None)
revenue2020A["Service Duration"] = service_end_local - service_start_local

## Preview the cleaned frame
revenue2020A.head()

Unnamed: 0_level_0,Invoice Date,Invoice #,Vendor ID,Invoice Amount,Account Code,Mail Date,Amount Paid,Pymt Received,Service Start,Service End,Service Duration
Payee Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Southeast/South-Central Educational Cooporative,2019-11-18 00:00:00-05:00,241.0,,200.0,4700-0-00-00000-00-0000,2019-11-18 00:00:00-05:00,200.0,2020-01-08 00:00:00-05:00,2019-05-22 00:00:00-04:00,2022-05-31 00:00:00-04:00,1105 days
Austin Peay State University,2019-12-16 00:00:00-05:00,248.0,,2500.0,4700-0-00-00000-00-0000,2019-12-16 00:00:00-05:00,2500.0,2020-01-09 00:00:00-05:00,2019-12-31 00:00:00-05:00,2020-12-31 00:00:00-05:00,366 days
"University Of Californina,Riverside",2019-11-18 00:00:00-05:00,240.0,,1250.0,4700-0-00-00000-00-0000,2019-11-18 00:00:00-05:00,1250.0,2020-01-10 00:00:00-05:00,2019-12-31 00:00:00-05:00,2020-12-31 00:00:00-05:00,366 days
Four Corners Coalition/Farmington High School,2019-11-18 00:00:00-05:00,212.0,,500.0,4700-0-00-00000-00-0000,2019-11-18 00:00:00-05:00,500.0,2020-01-13 00:00:00-05:00,2019-08-31 00:00:00-04:00,2020-08-31 00:00:00-04:00,366 days
Nm Nt Santa Fe - Santa Fe Public Schools,2019-11-20 00:00:00-05:00,244.0,,250.0,4700-0-00-00000-00-0000,2019-11-20 00:00:00-05:00,250.0,2020-01-13 00:00:00-05:00,2019-11-30 00:00:00-05:00,2020-11-30 00:00:00-05:00,366 days


In [10]:
# Export all files
# Pickle each preprocessed frame to the output path configured for it on `constants`.
export_targets = (
    (atlas, constants.PREPROCESSED_ATLAS_FILE_PATH),
    (forecast, constants.PREPROCESSED_FORECAST_DATA_FILE_PATH),
    (revenue2020, constants.PREPROCESSED_REVENUE2020_FILE_PATH),
    (revenue2020A, constants.PREPROCESSED_REVENUE2020A_FILE_PATH),
)
for frame, destination in export_targets:
    frame.to_pickle(destination)
