In [6]:
# initial imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.pandas
import panel as pn
from pathlib import Path
from dotenv import load_dotenv
from panel.interact import interact
from panel import widgets
from string import digits
import csv
import json

pn.extension()

%matplotlib inline

In [7]:
# Function definitions: TODO move to .py

## Data cleaning
def remove_strings_from_customer_names(original_customer_name):
    invalid_strings = ["PYMT", "DUE"]
    for invalid_item in invalid_strings:
        original_customer_name = original_customer_name.replace(invalid_item, "")
    return original_customer_name.strip()

def remove_numbers_from_customer_names(original_customer_name):
    remove_digits = str.maketrans('', '', digits) 
    return original_customer_name.translate(remove_digits).strip()  

def cleanup_customer_names(paying_customers_raw):
    paying_customers_cleanedup = []
    for customer in paying_customers_raw:
        #corrected_customer_name = str(customer).upper()
        corrected_customer_name = remove_strings_from_customer_names(corrected_customer_name)
        corrected_customer_name = remove_numbers_from_customer_names(corrected_customer_name)
        #corrected_customer_name = corrected_customer_name.title()
        paying_customers_cleanedup.append(corrected_customer_name)
    return paying_customers_cleanedup

def build_name_mapping(paying_customers_cleanedup):
    name_mapping = {}
    n = 1
    for customer in paying_customers_cleanedup:
        if not customer in name_mapping:
            name_mapping[customer] = "University " + str(n)
            n += 1
    return name_mapping

def read_name_mapping():
    with open(MAPPING_FILE_NAME, "r") as file:
        return json.loads(file.read())

def de_anonymize_customer_list(customer_list):
    deanonymized_customer_list = []
    for customer in customer_list:
        deanonymized_customer_list.append(customer_name_mapping[customer])


In [8]:
# Constants
MAPPING_FILE_NAME = "Resources/Mappings/CustomerNameMapping.json"

In [9]:
# Build the de-anonymous customer name mapping

## Read the datasets into Pandas DataFrames
atlas_file_path = Path("Resources/01_Raw/ATLAS.csv")
forecast_data_file_path = Path("Resources/01_Raw/2021 forecast CSV.csv")
atlas_raw = pd.read_csv(atlas_file_path, index_col="Customers")
forecast_data_raw = pd.read_csv(forecast_data_file_path, index_col="Organization Name")

## Extract paying customers
atlas_customers_raw = atlas_raw.index
forecast_data_customers_raw = forecast_data_raw.index

## Clean up names
atlas_customers_cleaned = cleanup_customer_names(atlas_customers_raw)
forecast_data_customers_cleaned = cleanup_customer_names(forecast_data_customers_raw)

## Build name mapping
name_mapping_atlas = build_name_mapping(atlas_customers_cleaned)
name_mapping_forecast_data = build_name_mapping(forecast_data_customers_cleaned)

## Combine name maps
name_mapping = { **name_mapping_atlas, **name_mapping_forecast_data }

## Export the name mapping
with open(MAPPING_FILE_NAME, "w") as file:
     file.write(json.dumps(name_mapping))

UnboundLocalError: local variable 'corrected_customer_name' referenced before assignment

In [5]:
# Fix ATLAS data

## Import name mapping
customer_name_mapping = read_name_mapping()

# Read the ATLAS data into a Pandas DataFrame
atlas_file_path = Path("Resources/01_Raw/ATLAS.csv")
forecast_data_file_path = Path("Resources/01_Raw/2021 forecast CSV.csv")
atlas_raw = pd.read_csv(atlas_file_path, index_col="Customers")
forecast_data_raw = pd.read_csv(forecast_data_file_path, index_col="Organization Name")

## Extract paying customers
atlas_customers_raw = atlas_raw.index
forecast_data_customers_raw = forecast_data_raw.index

## Clean up names
atlas_customers_cleaned = cleanup_customer_names(atlas_customers_raw)
forecast_data_customers_cleaned = cleanup_customer_names(forecast_data_customers_raw)

## Map the customer names
atlas_customers_deanonymized = de_anonymize_customer_list(atlas_customers_cleaned)
forecast_data_customers_deanonymized = de_anonymize_customer_list(forecast_data_customers_cleaned)


## Rebuild the DataFrames
atlas_deanonymized = pd.concat([
    atlas_raw["Invoice Date"],
    atlas_raw["Invoice #"],
    atlas_raw["Invoice Amount"],
    atlas_raw["Subscription"],
    atlas_raw["Account Code "],
    atlas_raw["Dates of service "]
    ], axis="columns", join="inner")
atlas_deanonymized.index = atlas_customers_deanonymized
atlas_deanonymized.index.name = "Customers"

forecast_data_deanonymized = pd.concat([
    forecast_data_raw["Mailing State/Province"],
    forecast_data_raw["Organization Record Type"],
    forecast_data_raw["ATLAS Customer Type"],
    forecast_data_raw["Subscription Fee"],
    forecast_data_raw["Subscription Type"],
    forecast_data_raw["Contract Start Date"],
    forecast_data_raw["Contract End Date"]
    ], axis="columns", join="inner")
forecast_data_deanonymized.index = atlas_customers_deanonymized
forecast_data_deanonymized.index.name = "Organization Name"

## Result
atlas_deanonymized.head()
forecast_data_deanonymized.head()

## Export
atlas_deanonymized.to_csv("Resources/02_Deanonymized/ATLAS.csv")
forecast_data_deanonymized.to_csv("Resources/02_Deanonymized/2021 forecast CSV.csv")
