In [19]:
# Initial imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.pandas
import panel as pn
from pathlib import Path
from dotenv import load_dotenv
from panel.interact import interact
from panel import widgets
from string import digits
import csv
import json
import numpy as np
import dateparser

# Load Panel's notebook extension so Panel objects can render in this notebook.
pn.extension()

# Render matplotlib figures inline in the cell output.
%matplotlib inline

# Verbosity switch for diagnostic printing: extract_subscription_dates_list
# prints the raw and parsed dates of every row when this is >= 2.
debug_level = 2

In [20]:
# Function definitions: TODO move to .py

## Data cleaning
def remove_strings_from_customer_names(original_customer_name):
    """
    Strip bookkeeping markers from a customer name.

    Every occurrence of "PYMT" is deleted first, then every occurrence of
    "DUE". Matching is case-sensitive and works on substrings, so the markers
    are removed even when embedded in a longer word.

    Parameters
    ----------
    original_customer_name: str
        Customer name as found in the raw data.

    Returns
    -------
    str
        The name without the markers and without leading/trailing whitespace.
    """
    return original_customer_name.replace("PYMT", "").replace("DUE", "").strip()

def remove_numbers_from_customer_names(original_customer_name):
    """
    Delete every ASCII digit (0-9) from a customer name.

    Parameters
    ----------
    original_customer_name: str
        Customer name that may contain digits.

    Returns
    -------
    str
        The name without digits and without leading/trailing whitespace.
    """
    # A translate table that maps a code point to None deletes that character.
    digit_deletions = {ord(digit_char): None for digit_char in digits}
    stripped_of_digits = original_customer_name.translate(digit_deletions)
    return stripped_of_digits.strip()

def cleanup_customer_names(paying_customers_raw):
    """
    Normalise a sequence of raw customer names.

    Each entry is converted with str(), then passed through
    remove_strings_from_customer_names and remove_numbers_from_customer_names,
    in that order. No case normalisation is applied.

    Parameters
    ----------
    paying_customers_raw: iterable
        Raw customer names; non-string entries are stringified first.

    Returns
    -------
    list[str]
        One cleaned name per input entry, in the original order.
    """
    return [
        remove_numbers_from_customer_names(
            remove_strings_from_customer_names(str(raw_name))
        )
        for raw_name in paying_customers_raw
    ]

def build_name_mapping(paying_customers_cleanedup):
    """
    Assign an anonymous label to every distinct customer name.

    Labels have the form "University N", where N starts at 1 and increases in
    the order in which names are first encountered. Repeated names keep the
    label they received the first time.

    Parameters
    ----------
    paying_customers_cleanedup: iterable
        Cleaned customer names (must be hashable).

    Returns
    -------
    dict
        Mapping of customer name to its "University N" label.
    """
    name_mapping = {}
    for customer in paying_customers_cleanedup:
        if customer in name_mapping:
            continue
        # The next free number is always one more than the names seen so far.
        name_mapping[customer] = "University " + str(len(name_mapping) + 1)
    return name_mapping

def read_name_mapping(mapping_file_path=None):
    """
    Load the customer-name mapping from a JSON file.

    Parameters
    ----------
    mapping_file_path: str or path-like, optional
        JSON file to read. When omitted, the global MAPPING_FILE_PATH is used;
        it is looked up at call time, so it only has to exist by the time this
        function is called, not when it is defined.

    Returns
    -------
    object
        Whatever the JSON document decodes to.
    """
    if mapping_file_path is None:
        mapping_file_path = MAPPING_FILE_PATH
    with open(mapping_file_path, "r") as file:
        return json.load(file)

def anonymize_customer_list(customer_list, name_mapping=None):
    """
    Replace every customer name by its anonymous label.

    Parameters
    ----------
    customer_list: iterable
        Customer names to translate.
    name_mapping: mapping, optional
        Lookup from customer name to anonymous label. When omitted, the global
        `customer_name_mapping` is used (resolved at call time).

    Returns
    -------
    list
        The label of each customer, in input order.

    Raises
    ------
    KeyError
        If a customer is missing from the mapping.
    """
    if name_mapping is None:
        name_mapping = customer_name_mapping
    return [name_mapping[customer] for customer in customer_list]


In [6]:
# Constants - TODO Move to .py

# Resource directories (Path objects), relative to the working directory.
MAPPING_DIR = Path("../Resources/Mappings")
DATA_DIR_RAW = Path("../Resources/01_Raw")
DATA_DIR_ANONYMIZED = Path("../Resources/02_Anonymized")
DATA_DIR_PREPROCESSED = Path("../Resources/03_Preprocessed")
DATA_DIR_PROCESSED = Path("../Resources/04_Processed")

# File locations are plain strings, not Path objects.
MAPPING_FILE_PATH = str(MAPPING_DIR / "CustomerNameMapping.json")

# Raw exports
RAW_ATLAS_FILE_PATH = str(DATA_DIR_RAW / "ATLAS.csv")
RAW_FORECAST_DATA_FILE_PATH = str(DATA_DIR_RAW / "2021 forecast CSV.csv")
RAW_REVENUE2020_FILE_PATH = str(DATA_DIR_RAW / "Revenue2020.csv")
RAW_REVENUE2020A_FILE_PATH = str(DATA_DIR_RAW / "Revenue2020A.csv")

# Anonymized counterparts (same file names, different directory)
ANON_ATLAS_FILE_PATH = str(DATA_DIR_ANONYMIZED / "ATLAS.csv")
ANON_FORECAST_DATA_FILE_PATH = str(DATA_DIR_ANONYMIZED / "2021 forecast CSV.csv")
ANON_REVENUE2020_FILE_PATH = str(DATA_DIR_ANONYMIZED / "Revenue2020.csv")
ANON_REVENUE2020A_FILE_PATH = str(DATA_DIR_ANONYMIZED / "Revenue2020A.csv")

In [14]:
def cleanup_date_string_list(date_string_list):
    """
    Parse every date string in an iterable, producing one result per input.

    The result always has the same length and order as the input, so it can
    be assigned straight back to the DataFrame column it was read from.

    Parameters
    ----------
    date_string_list: iterable
        Date strings to parse. Values that already are pd.Timestamp objects
        are passed through unchanged.

    Returns
    -------
    list
        For each input: the value returned by parse_date_string, the input
        itself if it was already a pd.Timestamp, or "" if parsing raised.
    """
    date_list = []
    for date_string in date_string_list:
        # Already converted (e.g. the calling cell was run twice): keep as is.
        if isinstance(date_string, pd.Timestamp):
            date_list.append(date_string)
            continue
        try:
            date_list.append(parse_date_string(date_string))
        except Exception:
            print(f"Failed to parse: {date_string}")
            # Keep the slot so rows stay aligned with the input; "" is the
            # same failure marker parse_date_string uses.
            date_list.append("")
    return date_list

def cleanup_dollar_string(dollars_string):
    """
    Convert a formatted dollar amount such as "$72,000.00" to a float.

    Dollar signs and thousands separators (commas) are removed before the
    remaining text is handed to float().

    Parameters
    ----------
    dollars_string: str
        Amount as text.

    Returns
    -------
    float
        The numeric amount.

    Raises
    ------
    ValueError
        If the remaining text is not a valid float literal.
    """
    bare_number = dollars_string
    for symbol in ("$", ","):
        bare_number = bare_number.replace(symbol, "")
    return float(bare_number)

def cleanup_dollar_string_list(dollars_list_in):
    """
    Convert an iterable of formatted dollar amounts to floats.

    The result always has the same length and order as the input, so it can
    be assigned straight back to the DataFrame column it was read from.

    Parameters
    ----------
    dollars_list_in: iterable
        Amounts as text (see cleanup_dollar_string). Values that already are
        int or float are converted with float() without string parsing.

    Returns
    -------
    list[float]
        One float per input; NaN where the value could not be parsed.
    """
    dollars_list_out = []
    for dollars_string in dollars_list_in:
        # Already numeric (e.g. the calling cell was run twice): no parsing.
        if isinstance(dollars_string, (int, float)) and not isinstance(dollars_string, bool):
            dollars_list_out.append(float(dollars_string))
            continue
        try:
            dollars_list_out.append(cleanup_dollar_string(dollars_string))
        except Exception:
            print(f"Failed to parse: {dollars_string}")
            # Keep the slot so rows stay aligned with the input.
            dollars_list_out.append(float("nan"))
    return dollars_list_out


def _find_standalone_to(text):
    """Return the index of the first "to" that is not part of a longer word, or -1."""
    position = text.find("to")
    while position >= 0:
        end = position + len("to")
        letter_before = position > 0 and text[position - 1].isalpha()
        letter_after = end < len(text) and text[end].isalpha()
        if not (letter_before or letter_after):
            return position
        # e.g. the "to" inside "October": keep looking further right.
        position = text.find("to", position + 1)
    return -1


def _split_subscription_date_range(subscription_dates_string):
    """Split one ATLAS subscription string into (start, end) date substrings."""
    if not isinstance(subscription_dates_string, str):
        # Missing cell (e.g. NaN): nothing to split.
        return "", ""

    # Keep only what follows "Subscription"; without the marker use everything.
    marker = "Subscription"
    marker_at = subscription_dates_string.find(marker)
    if marker_at >= 0:
        date_range = subscription_dates_string[marker_at + len(marker):].strip()
    else:
        date_range = subscription_dates_string.strip()

    # Preferred separator is the word "to"; fall back to the first "-".
    separator = "to"
    separator_at = _find_standalone_to(date_range)
    if separator_at < 0:
        separator = "-"
        separator_at = date_range.find(separator)
    if separator_at < 0:
        # No separator at all: treat the whole text as the start date.
        return date_range, ""

    start_text = date_range[:separator_at].strip()
    end_text = date_range[separator_at + len(separator):].strip()
    return start_text, end_text


def extract_subscription_dates_list(subscription_dates_string_list):
    """
    Parses subscription date strings from the ATLAS data export.

    A sample is: "1 Year Subscription 3/18/15 to 6/30/16"

    The text after "Subscription" is split into a start and an end date at
    the word "to" (a "to" inside a longer word such as "October" is ignored)
    or, failing that, at the first "-". Each half is handed to
    parse_date_string. When no separator is found the whole text is treated
    as the start date and the end date is parsed from an empty string;
    non-string entries are parsed as two empty strings.

    Parameters
    ----------
    subscription_dates_string_list: list[string]
        List or iterable of strings in the ATLAS subscription date string format.

    Returns
    -------
    [list, list]
        List containing one list of start dates, followed by one list of end dates.
        Both have one entry per input string, as returned by parse_date_string.
    """

    subscription_dates_start_list = []
    subscription_dates_end_list = []

    for subscription_dates_string in subscription_dates_string_list:
        date1_str, date2_str = _split_subscription_date_range(subscription_dates_string)

        # Parse dates (with optional per-row diagnostics)
        if debug_level >= 2:
            print(f"date1_str: {date1_str}  date2_str: {date2_str}")
        date1 = parse_date_string(date1_str)
        date2 = parse_date_string(date2_str)
        if debug_level >= 2:
            print(f"    date1_str: {date1_str}  date1: {date1}")
            print(f"    date2_str: {date2_str}  date2: {date2}")

        # Build lists
        subscription_dates_start_list.append(date1)
        subscription_dates_end_list.append(date2)

    return [ subscription_dates_start_list, subscription_dates_end_list ]

def parse_date_string(date_str):
    """
    Parse a free-form date string into a timestamp.

    Doubled slashes ("//") are collapsed to "/" before the text is handed to
    dateparser.parse; the result is converted with
    convert_datetime_to_timestamp.

    Parameters
    ----------
    date_str: str
        Date text. Non-string input is not handled here: the `.replace` call
        runs before the try block, so its exception propagates to the caller.

    Returns
    -------
    object
        The value returned by convert_datetime_to_timestamp, or "" when the
        text could not be parsed or converted (a message is printed).
    """
    date_str = date_str.replace("//", "/")
    try:
        date = dateparser.parse(date_str)
        if date is None:
            # dateparser reports "could not parse" by returning None.
            print(f"Failed to parse: {date_str}")
            return ""
        return convert_datetime_to_timestamp(date)
    except Exception:
        # Catch ordinary errors only; KeyboardInterrupt/SystemExit propagate.
        print(f"Failed to parse: {date_str}")
        return ""

def convert_datetime_to_timestamp(date_datetime, tz="America/New_York"):
    """
    Convert a datetime into a timezone-aware pandas Timestamp.

    Parameters
    ----------
    date_datetime: datetime.datetime
        Naive or timezone-aware datetime.
    tz: str or tzinfo, optional
        Target time zone (default "America/New_York").

    Returns
    -------
    pd.Timestamp
        For naive input, the same wall-clock time localised to `tz`.
        For aware input, the same instant expressed in `tz`.
    """
    if date_datetime.tzinfo is None:
        return pd.Timestamp(date_datetime.isoformat(), tz=tz)
    # pd.Timestamp rejects being given both `tz` and a non-None `tzinfo`,
    # so aware datetimes are built first and then converted.
    return pd.Timestamp(date_datetime).tz_convert(tz)

In [15]:
# Scratch check: run the subscription-date parser on the first 20 ATLAS rows.
# Requires `atlas` to be loaded already (the read_csv cell appears further
# down in the notebook).
# NOTE(review): the column label used here ends with a space, unlike the
# "Dates of service" label in the commented-out ATLAS clean-up code - confirm
# against the CSV header.
extract_subscription_dates_list(atlas["Dates of service "].head(20))

date1_str: 3/18/15  date2_str: 6/30/16
    date1_str: 3/18/15  date1: 2015-03-18 00:00:00-04:00
    date2_str: 6/30/16  date2: 2016-06-30 00:00:00-04:00
date1_str: 6/1/15  date2_str: 6/30/16
    date1_str: 6/1/15  date1: 2015-06-01 00:00:00-04:00
    date2_str: 6/30/16  date2: 2016-06-30 00:00:00-04:00
date1_str: 6/10/15  date2_str: 6/30/16
    date1_str: 6/10/15  date1: 2015-06-10 00:00:00-04:00
    date2_str: 6/30/16  date2: 2016-06-30 00:00:00-04:00
date1_str: 6/1/15  date2_str: 6/30/16
    date1_str: 6/1/15  date1: 2015-06-01 00:00:00-04:00
    date2_str: 6/30/16  date2: 2016-06-30 00:00:00-04:00
date1_str: 10/05/15  date2_str: 9/30/16
    date1_str: 10/05/15  date1: 2015-10-05 00:00:00-04:00
    date2_str: 9/30/16  date2: 2016-09-30 00:00:00-04:00
date1_str: 10/01/15  date2_str: 6/30/17
    date1_str: 10/01/15  date1: 2015-10-01 00:00:00-04:00
    date2_str: 6/30/17  date2: 2017-06-30 00:00:00-04:00
date1_str: 09/18/15  date2_str: 12/31/16
    date1_str: 09/18/15  date1: 2015-09-1

[[Timestamp('2015-03-18 00:00:00-0400', tz='America/New_York'),
  Timestamp('2015-06-01 00:00:00-0400', tz='America/New_York'),
  Timestamp('2015-06-10 00:00:00-0400', tz='America/New_York'),
  Timestamp('2015-06-01 00:00:00-0400', tz='America/New_York'),
  Timestamp('2015-10-05 00:00:00-0400', tz='America/New_York'),
  Timestamp('2015-10-01 00:00:00-0400', tz='America/New_York'),
  Timestamp('2015-09-18 00:00:00-0400', tz='America/New_York'),
  Timestamp('2015-09-18 00:00:00-0400', tz='America/New_York'),
  Timestamp('2016-07-01 00:00:00-0400', tz='America/New_York'),
  Timestamp('2015-10-22 00:00:00-0400', tz='America/New_York'),
  Timestamp('2016-01-01 00:00:00-0500', tz='America/New_York'),
  Timestamp('2016-01-01 00:00:00-0500', tz='America/New_York'),
  Timestamp('2016-01-07 00:00:00-0500', tz='America/New_York'),
  Timestamp('2018-01-07 00:00:00-0500', tz='America/New_York'),
  Timestamp('2017-01-07 00:00:00-0500', tz='America/New_York'),
  Timestamp('2016-01-06 00:00:00-0500', 

In [11]:
# Read anonymized data
# index_col selects the column used as the row index of each frame.
atlas = pd.read_csv(filepath_or_buffer=ANON_ATLAS_FILE_PATH, index_col="Customers")
forecast = pd.read_csv(filepath_or_buffer=ANON_FORECAST_DATA_FILE_PATH, index_col="Organization Name")
revenue2020 = pd.read_csv(filepath_or_buffer=ANON_REVENUE2020_FILE_PATH, index_col="Name")
revenue2020A = pd.read_csv(filepath_or_buffer=ANON_REVENUE2020A_FILE_PATH, index_col="Payee Name")

In [31]:
# Clean up ATLAS data
# Both assignments overwrite the raw string columns of `atlas` in place with
# the parsed values returned by the cleanup helpers.
# NOTE(review): because the cell mutates `atlas`, what happens on a second run
# depends on how the helpers treat already-converted values; re-read the CSV
# first if in doubt.
atlas["Invoice Date"]       = cleanup_date_string_list(atlas["Invoice Date"])
atlas["Invoice Amount"]     = cleanup_dollar_string_list(atlas["Invoice Amount"])
# Disabled: splitting "Dates of service" into start/end columns.
# NOTE(review): extract_subscription_dates_start_list / _end_list are not
# defined in the cells visible here; the helper defined above is
# extract_subscription_dates_list, which returns both lists at once.
# atlas["Service Start"]      = extract_subscription_dates_start_list(atlas["Dates of service"])
# atlas["Service End"]        = extract_subscription_dates_end_list(atlas["Dates of service"])
# atlas.drop(columns=["Dates of service"], inplace=True)
# Show the first rows as the cell output.
atlas.head()

Unnamed: 0_level_0,Invoice Date,Invoice #,Invoice Amount,Subscription,Account Code,Dates of service
Customers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
University 1,2015-03-20,ATLAS 315,72000.0,1 Year,4700-0-00-00000-18-0000,1 Year Subscription 3/18/15 to 6/30/16
University 102,2015-05-28,AJ501,3500.0,1 Year,4700-0-00-00000-16-0000,1 Year Subscription 6/1/15 to 6/30/16
University 3,2015-06-23,AJ502,3500.0,1 Year,4700-0-00-00000-17-0000,1 Year Subscription 6/10/15 to 6/30/16
University 4,2015-06-26,AJ503,6500.0,1 Year,4700-0-00-00000-32-0000,1 Year Subscription 6/1/15 to 6/30/16
University 5,2015-10-07,AJ504,750.0,1 Year,4700-0-00-00000-20-0000,1 Year Subscription 10/05/15 to 9/30/16
