In [34]:
# Initial imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.pandas
import panel as pn
from pathlib import Path
from dotenv import load_dotenv
from panel.interact import interact
from panel import widgets
from string import digits
import csv
import json

# Initialise Panel's notebook integration before any Panel object is displayed.
pn.extension()

# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline

In [35]:
# Function definitions: TODO move to .py

## Data cleaning
def remove_strings_from_customer_names(original_customer_name):
    """Strip the bookkeeping markers "PYMT" and "DUE" from a customer name.

    Every occurrence of each marker is deleted (case-sensitive, "PYMT" first,
    then "DUE"), after which leading and trailing whitespace is trimmed.
    Whitespace left in the middle of the name is not collapsed.

    Args:
        original_customer_name: The customer name as a string.

    Returns:
        The name with the markers removed and outer whitespace stripped.
    """
    stripped_name = original_customer_name
    for marker in ("PYMT", "DUE"):
        stripped_name = stripped_name.replace(marker, "")
    return stripped_name.strip()

def remove_numbers_from_customer_names(original_customer_name):
    """Drop every ASCII digit (0-9) from a customer name.

    Args:
        original_customer_name: The customer name as a string.

    Returns:
        The name without digits, with leading/trailing whitespace stripped.
        Whitespace left in the middle of the name is not collapsed.
    """
    without_digits = "".join(
        char for char in original_customer_name if char not in digits
    )
    return without_digits.strip()

def cleanup_customer_names(paying_customers_raw):
    """Normalise a collection of raw customer names.

    Each entry is converted with ``str()``, has the payment markers removed,
    and then has its digits removed. Letter case is left exactly as found.

    Args:
        paying_customers_raw: Iterable of raw customer names.

    Returns:
        A new list of cleaned names, in the same order and of the same length
        as the input (duplicates are kept).
    """
    return [
        remove_numbers_from_customer_names(
            remove_strings_from_customer_names(str(raw_name))
        )
        for raw_name in paying_customers_raw
    ]

def build_name_mapping(paying_customers_cleanedup):
    """Assign a sequential placeholder alias to every distinct customer name.

    Names are numbered in order of first appearance: the first distinct name
    becomes "University 1", the next "University 2", and so on. Repeated names
    keep the alias they were given the first time.

    Args:
        paying_customers_cleanedup: Iterable of cleaned customer names.

    Returns:
        dict mapping each distinct name to its "University N" alias.
    """
    aliases = {}
    for cleaned_name in paying_customers_cleanedup:
        if cleaned_name in aliases:
            continue
        # The next number is always one more than the aliases handed out so far.
        aliases[cleaned_name] = f"University {len(aliases) + 1}"
    return aliases

def read_name_mapping():
    """Load the customer-name mapping stored as JSON at MAPPING_FILE_PATH.

    Returns:
        The decoded JSON content of the mapping file.
    """
    with open(MAPPING_FILE_PATH, "r") as mapping_file:
        return json.load(mapping_file)

def anonymize_customer_list(customer_list):
    """Translate customer names into their anonymized aliases.

    Looks every name up in the module-level ``customer_name_mapping``.

    Args:
        customer_list: Iterable of customer names that are keys of the mapping.

    Returns:
        A list of aliases in the same order as the input.

    Raises:
        KeyError: If a name is not present in ``customer_name_mapping``.
    """
    return [customer_name_mapping[name] for name in customer_list]


In [36]:
# Constants - TODO Move to .py

# Resource directories, relative to the notebook's working directory.
MAPPING_DIR                     = Path("../Resources/Mappings")
DATA_DIR_RAW                    = Path("../Resources/01_Raw")
DATA_DIR_ANONYMIZED             = Path("../Resources/02_Anonymized")
DATA_DIR_PREPROCESSED           = Path("../Resources/03_Preprocessed")
DATA_DIR_PROCESSED              = Path("../Resources/04_Processed")

# File locations are stored as plain strings (directory joined with file name).

# JSON file holding the customer name -> alias mapping.
MAPPING_FILE_PATH               = str(MAPPING_DIR / "CustomerNameMapping.json")

# Raw input files.
RAW_ATLAS_FILE_PATH             = str(DATA_DIR_RAW / "ATLAS.csv")
RAW_FORECAST_DATA_FILE_PATH     = str(DATA_DIR_RAW / "2021 forecast CSV.csv")
RAW_REVENUE2020_FILE_PATH       = str(DATA_DIR_RAW / "Revenue2020.csv")
RAW_REVENUE2020A_FILE_PATH      = str(DATA_DIR_RAW / "Revenue2020A.csv")

# Anonymized output files (same file names as the raw inputs).
ANON_ATLAS_FILE_PATH            = str(DATA_DIR_ANONYMIZED / "ATLAS.csv")
ANON_FORECAST_DATA_FILE_PATH    = str(DATA_DIR_ANONYMIZED / "2021 forecast CSV.csv")
ANON_REVENUE2020_FILE_PATH      = str(DATA_DIR_ANONYMIZED / "Revenue2020.csv")
ANON_REVENUE2020A_FILE_PATH     = str(DATA_DIR_ANONYMIZED / "Revenue2020A.csv")

In [37]:
# Build the anonymous customer name mapping

## Read the datasets into Pandas DataFrames (customer name column becomes the index)
atlas_raw               = pd.read_csv(RAW_ATLAS_FILE_PATH, index_col="Customers")
forecast_data_raw       = pd.read_csv(RAW_FORECAST_DATA_FILE_PATH, index_col="Organization Name")
revenue2020_raw         = pd.read_csv(RAW_REVENUE2020_FILE_PATH, index_col="Name")
revenue2020A_raw        = pd.read_csv(RAW_REVENUE2020A_FILE_PATH, index_col="Payee Name")

## Extract paying customers
atlas_customers_raw             = atlas_raw.index
forecast_data_customers_raw     = forecast_data_raw.index
revenue2020_customers_raw       = revenue2020_raw.index
revenue2020A_customers_raw      = revenue2020A_raw.index

## Clean up names
atlas_customers_cleaned         = cleanup_customer_names(atlas_customers_raw)
forecast_data_customers_cleaned = cleanup_customer_names(forecast_data_customers_raw)
revenue2020_customers_cleaned   = cleanup_customer_names(revenue2020_customers_raw)
revenue2020A_customers_cleaned  = cleanup_customer_names(revenue2020A_customers_raw)

## Build name mapping
# Number the customers ONCE across all datasets. Building one mapping per file
# and merging the dicts restarts the counter at 1 for every file, so unrelated
# customers from different files would receive the same "University N" alias.
name_mapping = build_name_mapping(
    atlas_customers_cleaned
    + forecast_data_customers_cleaned
    + revenue2020_customers_cleaned
    + revenue2020A_customers_cleaned
)

## Per-dataset views of the shared mapping (same variable names as before)
name_mapping_atlas          = {name: name_mapping[name] for name in atlas_customers_cleaned}
name_mapping_forecast_data  = {name: name_mapping[name] for name in forecast_data_customers_cleaned}
name_mapping_revenue2020    = {name: name_mapping[name] for name in revenue2020_customers_cleaned}
name_mapping_revenue2020A   = {name: name_mapping[name] for name in revenue2020A_customers_cleaned}

## Export the name mapping
with open(MAPPING_FILE_PATH, "w") as file:
    file.write(json.dumps(name_mapping))

In [38]:
# Anonymize the raw data files

## Import name mapping
customer_name_mapping = read_name_mapping()

## Read the data into Pandas DataFrames (customer name column becomes the index)
atlas               = pd.read_csv(RAW_ATLAS_FILE_PATH, index_col="Customers")
forecast            = pd.read_csv(RAW_FORECAST_DATA_FILE_PATH, index_col="Organization Name")
revenue2020         = pd.read_csv(RAW_REVENUE2020_FILE_PATH, index_col="Name")
revenue2020A        = pd.read_csv(RAW_REVENUE2020A_FILE_PATH, index_col="Payee Name")

## Extract paying customers
# Take the names from the frames loaded in THIS cell, not from the *_raw frames
# of the mapping-build cell: the anonymized names are written back row by row,
# so they must come from exactly the rows being exported, and the cell must not
# depend on leftover kernel state.
atlas_customers_raw             = atlas.index.to_list()
forecast_data_customers_raw     = forecast.index.to_list()
revenue2020_customers_raw       = revenue2020.index.to_list()
revenue2020A_customers_raw      = revenue2020A.index.to_list()

## Clean up names
atlas_customers_cleaned         = cleanup_customer_names(atlas_customers_raw)
forecast_data_customers_cleaned = cleanup_customer_names(forecast_data_customers_raw)
revenue2020_customers_cleaned   = cleanup_customer_names(revenue2020_customers_raw)
revenue2020A_customers_cleaned  = cleanup_customer_names(revenue2020A_customers_raw)

## Map the customer names (raises KeyError for a name missing from the mapping)
atlas_customers_anonymized              = anonymize_customer_list(atlas_customers_cleaned)
forecast_data_customers_anonymized      = anonymize_customer_list(forecast_data_customers_cleaned)
revenue2020_customers_anonymized        = anonymize_customer_list(revenue2020_customers_cleaned)
revenue2020A_customers_anonymized       = anonymize_customer_list(revenue2020A_customers_cleaned)

## Replace the customer index of each DataFrame with the anonymized names
# The index keeps its original column name so the exported CSV header is unchanged.
atlas.index         = pd.Index(atlas_customers_anonymized, name="Customers")
forecast.index      = pd.Index(forecast_data_customers_anonymized, name="Organization Name")
revenue2020.index   = pd.Index(revenue2020_customers_anonymized, name="Name")
revenue2020A.index  = pd.Index(revenue2020A_customers_anonymized, name="Payee Name")

## Export
atlas.to_csv(ANON_ATLAS_FILE_PATH)
forecast.to_csv(ANON_FORECAST_DATA_FILE_PATH)
revenue2020.to_csv(ANON_REVENUE2020_FILE_PATH)
revenue2020A.to_csv(ANON_REVENUE2020A_FILE_PATH)