In [3]:
import pandas as pd 
import numpy as np

from pathlib import Path

In [11]:
# Input locations, relative to the parent of the current working directory.
path_hub = "sample-input/hubspot-crm-exports-sync-erwin-hubspot-2023-10-16.xlsx"
path_er = "sample-input/crm.lead.csv"
parent_dir = Path.cwd().parents[0]


def _read_table(path):
    """Load a tabular file into a DataFrame, picking the reader from the extension.

    Excel extensions go straight to ``pd.read_excel`` instead of first being
    parsed as CSV and relying on a ``UnicodeDecodeError`` to trigger the
    fallback. Any other extension is read as CSV, keeping the previous
    fallback to ``pd.read_excel`` when the bytes cannot be decoded as text.
    """
    if path.suffix.lower() in {".xlsx", ".xlsm", ".xls"}:
        return pd.read_excel(path)
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        # Mis-labelled binary workbook: same fallback the notebook used before.
        return pd.read_excel(path)


df_hub = _read_table(parent_dir / path_hub)
df_er = _read_table(parent_dir / path_er)


  warn("Workbook contains no default style, apply openpyxl's default")


In [124]:
def clean_email(df, email_col):
    """Return column ``email_col`` of ``df`` with whitespace trimmed and text lower-cased.

    The input frame is not modified; a new Series is returned.
    """
    raw_emails = df[email_col]
    trimmed = raw_emails.str.strip()
    return trimmed.str.lower()

def clean_phone(df, phone_col):
    """Normalize a phone-number column to a digits-only string with a leading 0.

    Each string value is processed in this order:

    1. surrounding whitespace is stripped;
    2. every non-digit character (``+``, spaces, dashes, ...) is removed;
    3. a leading ``62`` is replaced by ``0``
       (presumably the Indonesian country code -- confirm with the data owner);
    4. a leading ``8`` is prefixed with ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to clean; it is not modified.
    phone_col : str
        Name of the column. It must support the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        The cleaned values. Missing values stay missing.
    """
    return (df[phone_col]
            .str.strip()
            # r"\D" is the regex class for "any non-digit". The earlier pattern
            # "/D" only matched the literal two characters "/D", so punctuation
            # was never removed and "+62..." never reached the "^62" rule.
            .str.replace(r"\D", "", regex=True)
            .str.replace(r"^62", "0", regex=True)
            .str.replace(r"^8", "08", regex=True)
            )

In [127]:
def clean_df_hub(df):
    """Clean the HubSpot export: normalize headers and contacts, drop unusable rows.

    Steps:

    1. column names are lower-cased and spaces become underscores;
    2. ``email`` and ``phone_number`` are normalized with ``clean_email`` /
       ``clean_phone``;
    3. rows where ``email`` or ``phone_number`` is missing are removed;
    4. duplicate ``(email, phone_number)`` pairs are dropped, keeping the first.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export. After renaming it must contain ``email`` and
        ``phone_number`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; ``df`` itself is left untouched.
    """
    return (df
        .rename(columns=lambda c: c.lower().replace(" ", "_"))
        .assign(
            email=lambda df_: clean_email(df_, "email"),
            phone_number=lambda df_: clean_phone(df_, "phone_number"),
        )
        # NaN never compares equal to anything, so `col != np.nan` is True for
        # every row and filtered nothing. `.notna()` is the real missing test.
        .loc[lambda df_: df_["email"].notna() & df_["phone_number"].notna()]
        .drop_duplicates(subset=["email", "phone_number"])
    )


def clean_df_er(df):
    """Clean the CRM lead export: filter to digital leads and recode key columns.

    Steps:

    1. column names are lower-cased; spaces and ``/`` become ``_`` and ``?`` is
       removed; ``stage_display_name`` is then renamed to ``stage``;
    2. rows are kept only when ``source`` is ``Digital-Paid`` or
       ``Digital-Organic``, ``stage`` is not ``Renewal``, and both ``email``
       and ``phone`` are present;
    3. ``email`` / ``phone`` are normalized with ``clean_email`` /
       ``clean_phone``;
    4. ``stage`` and ``learning_preference`` are recoded through the lookup
       tables below (values absent from a table become NaN), and ``tmk_call``
       is mapped from ``True``/``False`` to ``1``/``0``;
    5. duplicate ``(email, phone)`` pairs are dropped, keeping the first.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export. After renaming it must contain ``source``, ``stage``,
        ``email``, ``phone``, ``learning_preference`` and ``tmk_call``.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; ``df`` itself is left untouched.
    """
    # Source stage value -> target stage label. Missing stages count as "Lead".
    map_stage = {
        "Appointment": "Sales qualified lead",
        "Show": "Opportunity",
        "Down Payment": "Customer",
        "Fully Paid": "Customer",
        "False": "Lead",
        "Lead": "Lead",
        np.nan: "Lead",
    }
    # Free-text answers -> Yes / No; a missing answer is labelled "Blank".
    map_lp = {
        "Ya": "Yes",
        "Mau!": "Yes",
        "Tidak": "No",
        "Engga, deh.": "No",
        np.nan: "Blank",
    }

    # Start from the `df` argument. The earlier body read the notebook-level
    # global `df_er`, so the function silently ignored whatever was passed in.
    return (df
        .rename(columns=lambda c: c.lower().replace(" ", "_").replace("/", "_").replace("?", ""))
        .rename(columns={
            "stage_display_name": "stage"
        })
        .loc[
            lambda df_: (
                df_["source"].isin(["Digital-Paid", "Digital-Organic"])
                & (df_["stage"] != "Renewal")
                # `!= np.nan` is always True; `.notna()` actually drops missing rows.
                & df_["email"].notna()
                & df_["phone"].notna()
            )
        ]
        .assign(
            email=lambda df_: clean_email(df_, "email"),
            phone=lambda df_: clean_phone(df_, "phone"),
            stage=lambda df_: df_["stage"].map(map_stage),
            learning_preference=lambda df_: df_["learning_preference"].map(map_lp),
            tmk_call=lambda df_: df_["tmk_call"].map({True: 1, False: 0})
        )
        .drop_duplicates(subset=["email", "phone"])
    )

In [129]:
# Apply each cleaner to its raw frame; the raw frames stay available unchanged.
df_hub_clean = df_hub.pipe(clean_df_hub)
df_er_clean = df_er.pipe(clean_df_er)