In [8]:
# Initial imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.pandas
import panel as pn
from pathlib import Path
from dotenv import load_dotenv
from panel.interact import interact
from panel import widgets
from string import digits
import csv
import json
import numpy as np
import dateparser
from Constants import Constants
from PreprocessingTools import CustomerNameCleaningFunctions
from PreprocessingTools import DateCleaningFunctions
from PreprocessingTools import SpecializedDateCleaningFunctions

# Initialise Panel for notebook use (no optional extensions are requested).
pn.extension()

# IPython magic selecting matplotlib's inline backend for this notebook.
%matplotlib inline

In [9]:
# Build tools
# `debug_level` is handed unchanged to the constructor of every cleaning helper.
debug_level = 0
constants = Constants()
tool_customer, tool_date, tool_special_date = (
    cleaner_class(debug_level)
    for cleaner_class in (
        CustomerNameCleaningFunctions,
        DateCleaningFunctions,
        SpecializedDateCleaningFunctions,
    )
)


In [10]:
# Read anonymized data
# Each CSV is loaded with its customer/organisation name column as the index.
atlas, forecast, revenue2020, revenue2020A = (
    pd.read_csv(csv_path, index_col=name_column)
    for csv_path, name_column in (
        (constants.ANON_ATLAS_FILE_PATH, "Customers"),
        (constants.ANON_FORECAST_DATA_FILE_PATH, "Organization Name"),
        (constants.ANON_REVENUE2020_FILE_PATH, "Name"),
        (constants.ANON_REVENUE2020A_FILE_PATH, "Payee Name"),
    )
)

In [11]:
# Clean up ATLAS data
# Every step below overwrites or adds a column on `atlas` itself.

## Invoice Date / Invoice Amount: replace each raw column with its cleaned form
for raw_column, cleaner in (
    ("Invoice Date", tool_date.cleanup_date_string_list),
    ("Invoice Amount", tool_date.cleanup_dollar_string_list),
):
    atlas[raw_column] = cleaner(atlas[raw_column])

## Dates of service
# The exported header carries a trailing space; it must be matched exactly.
service_column = "Dates of service "
dates_of_service = tool_special_date.extract_subscription_dates_list(atlas[service_column])
# Element 0 is stored as the service start, element 1 as the service end.
atlas["Service Start"], atlas["Service End"] = dates_of_service[0], dates_of_service[1]
# The combined source column is no longer needed once it has been split.
atlas.drop(columns=[service_column], inplace=True)

atlas.head()

Unnamed: 0_level_0,Invoice Date,Invoice #,Invoice Amount,Subscription,Account Code,Service Start,Service End
Customers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
University 1,2015-03-20 00:00:00-04:00,ATLAS 315,72000.0,1 Year,4700-0-00-00000-18-0000,2015-03-18 00:00:00-04:00,2016-06-30 00:00:00-04:00
University 102,2015-05-28 00:00:00-04:00,AJ501,3500.0,1 Year,4700-0-00-00000-16-0000,2015-06-01 00:00:00-04:00,2016-06-30 00:00:00-04:00
University 3,2015-06-23 00:00:00-04:00,AJ502,3500.0,1 Year,4700-0-00-00000-17-0000,2015-06-10 00:00:00-04:00,2016-06-30 00:00:00-04:00
University 4,2015-06-26 00:00:00-04:00,AJ503,6500.0,1 Year,4700-0-00-00000-32-0000,2015-06-01 00:00:00-04:00,2016-06-30 00:00:00-04:00
University 5,2015-10-07 00:00:00-04:00,AJ504,750.0,1 Year,4700-0-00-00000-20-0000,2015-10-05 00:00:00-04:00,2016-09-30 00:00:00-04:00


In [12]:
# Clean up 2021 forecast data
# Mutates `forecast` in place: drops the header-less filler columns, then
# replaces the date and fee columns with their cleaned equivalents.

## Remove blank columns
# The export carries a long run of header-less columns that show up with
# "Unnamed: <n>" labels. Select them by that prefix instead of spelling out
# every label, so the cell keeps working when the export's width changes and
# the drop no longer raises KeyError if the columns are already gone.
blank_columns = [
    column for column in forecast.columns
    if str(column).startswith("Unnamed: ")
]
forecast.drop(columns=blank_columns, inplace=True)

## Parse dates
forecast["Contract Start Date"]     = tool_date.cleanup_date_string_list(forecast["Contract Start Date"])
forecast["Contract End Date"]       = tool_date.cleanup_date_string_list(forecast["Contract End Date"])

## Parse dollars
forecast["Subscription Fee"]        = tool_date.cleanup_dollar_string_list(forecast["Subscription Fee"])

# Show the first rows as the cell's output to eyeball the result.
forecast.head()

Unnamed: 0_level_0,Mailing State/Province,Organization Record Type,ATLAS Customer Type,Subscription Fee,Subscription Type,Contract Start Date,Contract End Date
Organization Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
University 1,AL,College/University,System Wide IHE,62.6,3-Month,2020-04-01 00:00:00-04:00,2020-06-30 00:00:00-04:00
University 2,GA,College/University,Teacher Prep Program,250.0,3-Month,2020-03-24 00:00:00-04:00,2020-06-30 00:00:00-04:00
University 3,GU,College/University,,250.0,3-Month,2020-03-25 00:00:00-04:00,2020-06-30 00:00:00-04:00
University 4,Hawaii,College/University,Teacher Prep Program,425.0,3-Month,2020-03-25 00:00:00-04:00,2020-06-30 00:00:00-04:00
University 92,North Dakota,General Organization,,425.0,3-Month,2020-04-09 00:00:00-04:00,2020-06-30 00:00:00-04:00


In [13]:
# Disabled scratch call; the forecast clean-up cell above already applies it to this column:
# tool_date.cleanup_dollar_string_list(forecast["Subscription Fee"])

In [14]:
# Display the per-column dtypes of the cleaned forecast frame.
forecast.dtypes

Mailing State/Province                                object
Organization Record Type                              object
ATLAS Customer Type                                   object
Subscription Fee                                     float64
Subscription Type                                     object
Contract Start Date         datetime64[ns, America/New_York]
Contract End Date           datetime64[ns, America/New_York]
dtype: object