In [7]:
# System imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.pandas
import panel as pn
from pathlib import Path
from dotenv import load_dotenv
from panel.interact import interact
from panel import widgets
from string import digits
import csv
import json
import numpy as np
import dateparser

pn.extension()

%matplotlib inline

In [8]:
# Local imports
import sys
sys.path.append("../lib2")

from Constants import Constants
from PreprocessingTools import CustomerNameCleaningFunctions
from PreprocessingTools import MappingFunctions


In [9]:
# Instantiate the shared helpers used by the cells below.

# Project constants object; the RAW_* / ANON_* file paths are read from it.
constants = Constants()

# Debug level handed to both preprocessing tool constructors.
debug_level = 0
tool_customer = CustomerNameCleaningFunctions(debug_level)
tool_mapping = MappingFunctions(debug_level)


In [10]:
# Build the anonymous customer name mapping
#
# Every stage below walks the five datasets in the same fixed order
# (atlas, forecast, revenue2020, revenue2020A, atlas2), one stage at a time.

## Stage 1: load each raw CSV, indexed by the column that holds the customer name
atlas, forecast, revenue2020, revenue2020A, atlas2 = (
    pd.read_csv(raw_path, index_col=name_column)
    for raw_path, name_column in (
        (constants.RAW_ATLAS_FILE_PATH, "Customers"),
        (constants.RAW_FORECAST_DATA_FILE_PATH, "Organization Name"),
        (constants.RAW_REVENUE2020_FILE_PATH, "Name"),
        (constants.RAW_REVENUE2020A_FILE_PATH, "Payee Name"),
        (constants.RAW_ATLAS_2_FILE_PATH, "Customers"),
    )
)

## Stage 2: pull the customer names (the index labels) out as plain lists
(
    atlas_customers_raw,
    forecast_customers_raw,
    revenue2020_customers_raw,
    revenue2020A_customers_raw,
    atlas2_customers_raw,
) = (
    frame.index.to_list()
    for frame in (atlas, forecast, revenue2020, revenue2020A, atlas2)
)

## Stage 3: run every raw name list through the customer-name cleaner
(
    atlas_customers_cleaned,
    forecast_customers_cleaned,
    revenue2020_customers_cleaned,
    revenue2020A_customers_cleaned,
    atlas2_customers_cleaned,
) = (
    tool_customer.cleanup_customer_names(raw_names)
    for raw_names in (
        atlas_customers_raw,
        forecast_customers_raw,
        revenue2020_customers_raw,
        revenue2020A_customers_raw,
        atlas2_customers_raw,
    )
)

## Stage 4: build one name mapping per dataset
(
    name_mapping_atlas,
    name_mapping_forecast,
    name_mapping_revenue2020,
    name_mapping_revenue2020A,
    name_mapping_atlas2,
) = (
    tool_mapping.build_name_mapping(cleaned_names)
    for cleaned_names in (
        atlas_customers_cleaned,
        forecast_customers_cleaned,
        revenue2020_customers_cleaned,
        revenue2020A_customers_cleaned,
        atlas2_customers_cleaned,
    )
)

## Stage 5: merge the per-dataset maps; on a duplicate key the later dataset wins
customer_name_mapping = {
    name: mapped_name
    for partial_mapping in (
        name_mapping_atlas,
        name_mapping_forecast,
        name_mapping_revenue2020,
        name_mapping_revenue2020A,
        name_mapping_atlas2,
    )
    for name, mapped_name in partial_mapping.items()
}

## Stage 6: persist the combined mapping
tool_mapping.write_customer_name_mapping(customer_name_mapping)


In [11]:
# Anonymize the raw data files
#
# Depends on the previous cell for the raw DataFrames (atlas, forecast, ...)
# and the *_customers_raw name lists.

# WARNING(review): with this switch set to False the customer-name mapping is
# NOT applied, so the files written to the ANON_* paths carry the cleaned real
# customer names. False reproduces what this cell did before (the mapping calls
# were commented out and the cleaned names were passed straight through).
# Set to True to apply the mapping; confirm that anonymize_customer_list works
# for all five datasets before relying on it.
ANONYMIZE_CUSTOMER_NAMES = False

## Import name mapping
customer_name_mapping = tool_mapping.read_customer_name_mapping()

## Clean up names
atlas_customers_cleaned         = tool_customer.cleanup_customer_names(atlas_customers_raw)
forecast_customers_cleaned      = tool_customer.cleanup_customer_names(forecast_customers_raw)
revenue2020_customers_cleaned   = tool_customer.cleanup_customer_names(revenue2020_customers_raw)
revenue2020A_customers_cleaned  = tool_customer.cleanup_customer_names(revenue2020A_customers_raw)
atlas2_customers_cleaned        = tool_customer.cleanup_customer_names(atlas2_customers_raw)

## Map the customer names
if ANONYMIZE_CUSTOMER_NAMES:
    atlas_customers_anonymized          = tool_customer.anonymize_customer_list(atlas_customers_cleaned, customer_name_mapping)
    forecast_customers_anonymized       = tool_customer.anonymize_customer_list(forecast_customers_cleaned, customer_name_mapping)
    revenue2020_customers_anonymized    = tool_customer.anonymize_customer_list(revenue2020_customers_cleaned, customer_name_mapping)
    revenue2020A_customers_anonymized   = tool_customer.anonymize_customer_list(revenue2020A_customers_cleaned, customer_name_mapping)
    atlas2_customers_anonymized         = tool_customer.anonymize_customer_list(atlas2_customers_cleaned, customer_name_mapping)
else:
    # Pass-through: the "anonymized" lists are the cleaned real names.
    atlas_customers_anonymized          = atlas_customers_cleaned
    forecast_customers_anonymized       = forecast_customers_cleaned
    revenue2020_customers_anonymized    = revenue2020_customers_cleaned
    revenue2020A_customers_anonymized   = revenue2020A_customers_cleaned
    atlas2_customers_anonymized         = atlas2_customers_cleaned

## Fix the customers in the DataFrames
# Each frame is rebound to the result of set_customers_index, called with the
# frame, its name list and the name of its customer column.
atlas           = tool_customer.set_customers_index(atlas, atlas_customers_anonymized, "Customers")
forecast        = tool_customer.set_customers_index(forecast, forecast_customers_anonymized, "Organization Name")
revenue2020     = tool_customer.set_customers_index(revenue2020, revenue2020_customers_anonymized, "Name")
revenue2020A    = tool_customer.set_customers_index(revenue2020A, revenue2020A_customers_anonymized, "Payee Name")
atlas2          = tool_customer.set_customers_index(atlas2, atlas2_customers_anonymized, "Customers")

## Export
atlas.to_csv(constants.ANON_ATLAS_FILE_PATH)
forecast.to_csv(constants.ANON_FORECAST_DATA_FILE_PATH)
revenue2020.to_csv(constants.ANON_REVENUE2020_FILE_PATH)
revenue2020A.to_csv(constants.ANON_REVENUE2020A_FILE_PATH)
atlas2.to_csv(constants.ANON_ATLAS_2_FILE_PATH)


In [12]:
# Show the atlas frame as this cell's output (the rendering is truncated).
# NOTE(review): the rendered index contains real institution names; clear this
# output before sharing if the data is meant to be anonymous.
atlas

Unnamed: 0_level_0,ID,Number of Users,Invoice Date,Invoice #,Year,Month,Invoice Amount,Subscription,2021 Prediction,Lost Customer,Customers Status,Account Code,Dates of service,Address,State,Lat,Long
Customers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Alabama A&M University-College of Education,1,246.0,3/23/18,105,2018,March,"$5,460.00",3 Years,"$1,820.00",,New,4700-0-00-00000-00-0000,3 Years Subscription 3/21/18 to 3/31/21,"4900 Meridian St N, Huntsville, AL 35811",AL,34.785061,-86.573280
Albertus Magnus College,2,267.0,5/16/18,127,2018,May,"$5,670.00",3 Years,"$1,890.00",,New,4700-0-00-00000-00-0000,3 Years Subscription 9/1/18 to 8/31/21,"700 Prospect St, New Haven, CT 06511",CT,41.331299,-72.921440
Albuquerque Public Schools,3,77.0,11/20/18,161,2018,November,"$2,270.00",1 Year,,2019.0,New,4700-0-00-00000-00-0000,1 Year Subscription 11/19/18 to 11/30/19,"Po Box 25704 Albuquerque, NM 87125",NM,35.092410,-106.642950
Alcorn State University,4,20.0,10/31/17,,2017,October,"$1,700.00",1 Year,,2018.0,New,4700-0-00-00000-00-0000,1 Year Subscription 8/31/17 to 8/31/18,"1000 Asu Dr, Lorman, MS 39096",MS,31.875710,-91.141739
Alderson Broaddus University,5,50.0,10/7/20,NB100820K,2020,October,$500.00,3 Months,$500.00,,New,4700-0-00-00000-00-0000,3 Months Subscription 10/2/2020 to 12/31/2020,"101 College Hill Rd, Philippi, WV 26416",WV,39.160261,-80.049048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Winthrop University,403,375.0,9/8/20,NB090820S,2020,September,$937.50,3 Months,,,Returning,4700-0-00-00000-00-0000,3 Months Subscription 9/1/2020 to 11/30/2020,"701 Oakland Ave, Rock Hill, SC 29730",SC,34.938466,-81.029872
Winthrop University,403,100.0,10/13/20,NB101320C,2020,October,"$1,000.00",1 Year,"$1,000.00",,Returning,4700-0-00-00000-00-0000,1 Year Subscription 9/30/2020 to 9/30/2021,"701 Oakland Ave, Rock Hill, SC 29730",SC,34.938466,-81.029872
Yakima Valley College,404,56.0,9/21/20,NB092120A,2020,September,"$1,030.00",6 Months,"$1,030.00",,New,4700-0-00-00000-00-0000,6 Months Subscription 9/18/2020 to 03/18/2021,"PO Box 22520, Yakima, WA 99807-2520",WA,-35.001338,117.875053
Young Harris College/Miller Library,405,60.0,11/28/18,162,2018,November,"$2,100.00",1 Year,,,New,4700-0-00-00000-00-0000,1 Year Subscription 11/27/18 to 11/30/19,"1 College Street Young Harris, GA 30582",GA,34.934079,-83.846801
