In [None]:
# System imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.pandas
import panel as pn
from pathlib import Path
from dotenv import load_dotenv
from panel.interact import interact
from panel import widgets
from string import digits
import csv
import json
import numpy as np
import dateparser

# Load the Panel notebook extension so Panel objects can be displayed in cell outputs
pn.extension()

# IPython magic: draw matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Local imports
import sys
sys.path.append("../libs")

from Constants import Constants
from PreprocessingTools import CustomerNameCleaningFunctions
from PreprocessingTools import MappingFunctions


In [None]:
# Construct the shared helper objects used by the cells below
debug_level = 0  # handed unchanged to both preprocessing tool constructors
constants = Constants()

# Both tools take the same single constructor argument; build them in order
# (customer-name cleaner first, then the mapping tool).
tool_customer, tool_mapping = (
    tool_class(debug_level)
    for tool_class in (CustomerNameCleaningFunctions, MappingFunctions)
)


In [None]:
# Build the anonymous customer name mapping

## Read the datasets into Pandas DataFrames, each indexed by its customer-name column
atlas = pd.read_csv(constants.RAW_ATLAS_FILE_PATH, index_col="Customers")
forecast = pd.read_csv(constants.RAW_FORECAST_DATA_FILE_PATH, index_col="Organization Name")
revenue2020 = pd.read_csv(constants.RAW_REVENUE2020_FILE_PATH, index_col="Name")
revenue2020A = pd.read_csv(constants.RAW_REVENUE2020A_FILE_PATH, index_col="Payee Name")
atlas2 = pd.read_csv(constants.RAW_ATLAS_2_FILE_PATH, index_col="Customers")
potential_customers_file = pd.read_csv(
    constants.RAW_POTENTIAL_CUSTOMERS_FILE_PATH,
    index_col="Customer Name",
)

## Extract the customer names (the index of each frame) as plain lists
(
    atlas_customers_raw,
    forecast_customers_raw,
    revenue2020_customers_raw,
    revenue2020A_customers_raw,
    atlas2_customers_raw,
    potential_customers_raw,
) = (
    frame.index.to_list()
    for frame in (
        atlas,
        forecast,
        revenue2020,
        revenue2020A,
        atlas2,
        potential_customers_file,
    )
)

## Clean up names, one list at a time, in the same dataset order as above
(
    atlas_customers_cleaned,
    forecast_customers_cleaned,
    revenue2020_customers_cleaned,
    revenue2020A_customers_cleaned,
    atlas2_customers_cleaned,
    potential_customers_cleaned,
) = (
    tool_customer.cleanup_customer_names(raw_names)
    for raw_names in (
        atlas_customers_raw,
        forecast_customers_raw,
        revenue2020_customers_raw,
        revenue2020A_customers_raw,
        atlas2_customers_raw,
        potential_customers_raw,
    )
)

## Build name mapping
# The five revenue/forecast datasets share one mapping: start from an empty
# dict and feed each call's result into the next call.
name_mapping = {}
for _cleaned_names in (
    atlas_customers_cleaned,
    forecast_customers_cleaned,
    revenue2020_customers_cleaned,
    revenue2020A_customers_cleaned,
    atlas2_customers_cleaned,
):
    name_mapping = tool_mapping.build_name_mapping(_cleaned_names, name_mapping)
del _cleaned_names  # do not leave the loop variable behind in the notebook namespace

# Potential customers get their own mapping, built from a fresh empty dict.
name_mapping_potential_customers = tool_mapping.build_name_mapping(potential_customers_cleaned, {})

## Export the name mappings
tool_mapping.write_customer_name_mapping(name_mapping)
tool_mapping.write_lookup_table(
    name_mapping_potential_customers,
    constants.LUT_POTENTIAL_CUSTOMER_NAME_FILE_PATH,
)


In [None]:
# Anonymize the raw data files

## Import name mapping
# NOTE(review): presumably this reads back what write_customer_name_mapping
# exported in the previous cell -- confirm in MappingFunctions.
customer_name_mapping = tool_mapping.read_customer_name_mapping()

## Clean up names (recomputed here from the raw name lists of the previous cell)
(
    atlas_customers_cleaned,
    forecast_customers_cleaned,
    revenue2020_customers_cleaned,
    revenue2020A_customers_cleaned,
    atlas2_customers_cleaned,
) = (
    tool_customer.cleanup_customer_names(raw_names)
    for raw_names in (
        atlas_customers_raw,
        forecast_customers_raw,
        revenue2020_customers_raw,
        revenue2020A_customers_raw,
        atlas2_customers_raw,
    )
)

## Map the customer names through the shared mapping, dataset by dataset
(
    atlas_customers_anonymized,
    forecast_customers_anonymized,
    revenue2020_customers_anonymized,
    revenue2020A_customers_anonymized,
    atlas2_customers_anonymized,
) = (
    tool_customer.anonymize_customer_list(cleaned_names, customer_name_mapping)
    for cleaned_names in (
        atlas_customers_cleaned,
        forecast_customers_cleaned,
        revenue2020_customers_cleaned,
        revenue2020A_customers_cleaned,
        atlas2_customers_cleaned,
    )
)

## Fix the customers in the DataFrames
# Each call receives the frame, its anonymized name list, and the same column
# label that was used as index_col when the frame was read.
atlas = tool_customer.set_customers_index(atlas, atlas_customers_anonymized, "Customers")
forecast = tool_customer.set_customers_index(forecast, forecast_customers_anonymized, "Organization Name")
revenue2020 = tool_customer.set_customers_index(revenue2020, revenue2020_customers_anonymized, "Name")
revenue2020A = tool_customer.set_customers_index(revenue2020A, revenue2020A_customers_anonymized, "Payee Name")
atlas2 = tool_customer.set_customers_index(atlas2, atlas2_customers_anonymized, "Customers")

## Export the anonymized frames to their ANON_* CSV paths
atlas.to_csv(constants.ANON_ATLAS_FILE_PATH)
forecast.to_csv(constants.ANON_FORECAST_DATA_FILE_PATH)
revenue2020.to_csv(constants.ANON_REVENUE2020_FILE_PATH)
revenue2020A.to_csv(constants.ANON_REVENUE2020A_FILE_PATH)
atlas2.to_csv(constants.ANON_ATLAS_2_FILE_PATH)
