In [20]:
# initial imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import hvplot.pandas
import panel as pn
from pathlib import Path
from dotenv import load_dotenv
from panel.interact import interact
from panel import widgets
from string import digits
import csv
import json

pn.extension()

%matplotlib inline

In [46]:
# Function definitions: TODO move to .py

## Data cleaning
def remove_strings_from_customer_names(original_customer_name):
    invalid_strings = ["PYMT", "DUE"]
    for invalid_item in invalid_strings:
        original_customer_name = original_customer_name.replace(invalid_item, "")
    return original_customer_name.strip()

def remove_numbers_from_customer_names(original_customer_name):
    remove_digits = str.maketrans('', '', digits) 
    return original_customer_name.translate(remove_digits).strip()  

def cleanup_customer_names(paying_customers_raw):
    paying_customers_cleanedup = []
    for customer in paying_customers_raw:
        customer_capitalized = str(customer).upper()
        corrected_customer_name = remove_strings_from_customer_names(customer_capitalized)
        corrected_customer_name = remove_numbers_from_customer_names(corrected_customer_name)
        paying_customers_cleanedup.append(corrected_customer_name)
    return paying_customers_cleanedup

def build_name_mapping(paying_customers_cleanedup):
    name_mapping = {}
    n = 1
    for customer in paying_customers_cleanedup:
        if not customer in name_mapping:
            name_mapping[customer] = "University " + str(n)
            n += 1
    return name_mapping

def read_name_mapping():
    with open(MAPPING_FILE_NAME, "r") as file:
        return json.loads(file.read())

In [47]:
# Constants
MAPPING_FILE_NAME = "Resources/Mappings/CustomerNameMapping.json"

In [48]:
# Build the de-anonymous customer name mapping

## Read the datasets into Pandas DataFrames
atlas_file_path = Path("Resources/Raw/ATLAS.csv")
forecast_data_file_path = Path("Resources/Raw/2021 forecast CSV.csv")
ATLAS_raw = pd.read_csv(atlas_file_path, index_col="Customers")
forecast_data_raw = pd.read_csv(forecast_data_file_path, index_col="Organization Name")

## Extract paying customers
atlas_customers_raw = ATLAS_raw.index
forecast_data_customers_raw = forecast_data_raw.index

## Clean up names
atlas_customers_cleaned = cleanup_customer_names(atlas_customers_raw)
forecast_data_customers_cleaned = cleanup_customer_names(forecast_data_customers_raw)

## Build name mapping
name_mapping_atlas = build_name_mapping(atlas_customers_cleaned)
name_mapping_forecast_data = build_name_mapping(forecast_data_customers_cleaned)

## Combine name maps
name_mapping = { **name_mapping_atlas, **name_mapping_forecast_data }

## Export the name mapping
with open(MAPPING_FILE_NAME, "w") as file:
     file.write(json.dumps(name_mapping))

536

In [None]:
# Fix ATLAS data

## Import name mapping
name_mapping = read_name_mapping()

# Read the ATLAS data into a Pandas DataFrame
file_path = Path("Resources/Raw/ATLAS.csv")
ATLAS_raw = pd.read_csv(file_path, index_col="Customers")
ATLAS_raw.head()
ATLAS_raw[0:10]


# Function definitions
# TODO move to .py
def remove_strings_from_customer_names(original_customer_name):
    invalid_strings = ["PYMT", "DUE"]
    for invalid_item in invalid_strings:
        original_customer_name = original_customer_name.replace(invalid_item, "")
    return original_customer_name.strip()

def remove_numbers_from_customer_names(original_customer_name):
    remove_digits = str.maketrans('', '', digits) 
    return original_customer_name.translate(remove_digits).strip()  


# Extract paying customers
paying_customers_raw = ATLAS_raw.index


# Clean up names
paying_customers_cleanedup = []
for customer in paying_customers_raw:
    customer_capitalized = customer.upper()
    corrected_customer_name = remove_strings_from_customer_names(customer_capitalized)
    corrected_customer_name = remove_numbers_from_customer_names(corrected_customer_name)
    paying_customers_cleanedup.append(corrected_customer_name)
    #print(f"Raw customer name: \"{customer_capitalized}\"  Corrected customer name: \"{corrected_customer_name}\"")


# Define needed variables for re-naming
name_mapping = {}
n = 1


# Build name mapping
for customer in paying_customers_cleanedup:
    if not customer in name_mapping:
        name_mapping[customer] = "University " + str(n)
        n += 1


# Re-name the universities in the data
paying_customers_deanonymized = []
for customer in paying_customers_cleanedup:
    paying_customers_deanonymized.append(name_mapping[customer])


# Build a de-anonymized DataFrame
ATLAS_deanon = pd.concat([
    ATLAS_raw["Invoice Date"],
    ATLAS_raw["Invoice #"],
    ATLAS_raw["Invoice Amount"],
    ATLAS_raw["Subscription"],
    ATLAS_raw["Account Code "],
    ATLAS_raw["Dates of service "]
    ], axis="columns", join="inner")

## Fix index
ATLAS_deanon.index = paying_customers_deanonymized
ATLAS_deanon.index.name = "Customers"

## Result
ATLAS_deanon.head()


# Export
ATLAS_deanon.to_csv("Resources/RawDeanonymized/ATLAS.csv")


# Export the name mapping
MAPPING_FILE_NAME = "Resources/Mappings/CustomerNameMapping.json"
with open(MAPPING_FILE_NAME, "w") as file:
     file.write(json.dumps(name_mapping)) # use `json.loads` to do the reverse
