In [58]:
import importlib
from pathlib import Path

import numpy as np
import pandas as pd

import config
import tests
from modules import module

# Re-execute the project modules so that edits made on disk since the kernel
# first imported them are picked up without restarting.
#
# The names are bound by the imports directly above, so a NameError can no
# longer come from a missing module name. Catching it here would only hide a
# NameError raised *inside* a module while it is being reloaded, leaving the
# notebook running against stale code, so reload errors now propagate.
#
# NOTE(review): reload order is kept as module -> tests -> config. If `module`
# or `tests` read `config` at import time, `config` should probably be
# reloaded first - confirm.
for _project_module in (module, tests, config):
    importlib.reload(_project_module)

# Load DF

In [59]:
# Location of the raw export, taken from the project config.
path = Path(config.path_raw_file)

# Pick the reader from the file extension instead of relying on read_csv
# failing with a decode error when it is handed a binary workbook.
if path.suffix.lower() in {".xlsx", ".xlsm", ".xls"}:
    df = pd.read_excel(path)
else:
    try:
        df = pd.read_csv(path)
    except UnicodeDecodeError:
        # Previous behaviour, kept for files whose suffix does not reveal
        # that they are Excel workbooks.
        df = pd.read_excel(path)

# Clean DF

In [60]:
# Build the cleaned member table from the raw frame `df` in one method chain.
#
# The steps are order-dependent:
#   * inside an `assign`, later entries read columns produced by earlier ones
#     (`id` uses the parsed `dob`, `job_group` uses the cleaned `job`);
#   * `is_next_contract` is computed on the full frame, *before* the row
#     filters below are applied.
df_clean = (
    df
    .rename(columns={"from": "From", "to": "To"})
    # drop rows where either "From" or "To" is missing
    .dropna(subset=["From", "To"])
    # normalise column names, then apply the project rename mapping
    .rename(columns=module.clean_col_name)
    .rename(columns=module.to_rename)
    .assign(
        # clean city and create region
        city=lambda df_: df_["city"].str.title().str.strip(),
        region=lambda df_: module.create_region(df_["city"], df_["partner_street"]),
        # clean dob: where dob is blank, use dob2 instead
        dob=lambda df_: np.where(
            df_["dob"].isna(), pd.to_datetime(df_["dob2"]), pd.to_datetime(df_["dob"])
        ),
        # clean start_date, end_date, fp_date
        start_date=lambda df_: pd.to_datetime(df_["start_date"]),
        end_date=lambda df_: pd.to_datetime(df_["end_date"]),
        fp_date=lambda df_: pd.to_datetime(df_["fp_date"]),
        # clean income
        income=lambda df_: df_["income"].astype(module.income_cat),
        # clean job and create job group
        job=lambda df_: module.clean_job(df_),
        job_group=lambda df_: module.group_job(df_["job"]),
        # clean gender
        gender=lambda df_: module.clean_gender(df_["gender"]),
        # create age
        age=lambda df_: module.get_age(df_),
        # clean join reason, hobby, interest
        # (hobby / interest fall back to the partner_* column when blank)
        partner_join_reason=lambda df_: df_["partner_join_reason"].str.strip().str.lower(),
        hobby=lambda df_: df_["hobby"].fillna(df_["partner_hobby"]).str.strip().str.lower(),
        interest=lambda df_: df_["interest"].fillna(df_["partner_interest"]).str.strip().str.lower(),
        # create id: "<name> <dob as YYYY-MM-DD>"; missing when name or dob is missing
        id=lambda df_: (
            df_["name"] + " "
            + df_["dob"].dt.strftime("%Y-%m-%d")
        ),
        # create membership code
        membership_code=lambda df_: module.get_membership_code(df_["product"]),
        # create membership duration
        membership_duration=lambda df_: module.get_membership_duration(df_["product"]),
    )
    # merge with membership mapping to obtain membership
    .merge(
        right=pd.read_excel(Path("input/membership_mapping.xlsx")),
        on="membership_code",
        how="left",
    )
    .assign(
        # make sure that all corporate is mapped
        is_cpt=lambda df_: module.assert_cpt_catched(df_),
        # create student center and area
        center=lambda df_: module.clean_center(df_, "center", "is_cpt", "core_product"),
        area=lambda df_: module.clean_area(df_),
    )
    # create is next contract col
    # sort by id first, latest end_date first within each id
    .sort_values(["id", "end_date"], ascending=[True, False])
    .assign(
        # With the sort above, keep="last" leaves only the earliest-ending row
        # of each id unflagged. duplicated() treats missing ids as equal to
        # each other, so rows without an id are explicitly excluded: they
        # cannot be matched to another contract.
        is_next_contract=lambda df_: (
            df_["id"].duplicated(keep="last") & df_["id"].notna()
        )
    )
    # ! drop membership code na
    .loc[lambda df_: df_["membership_code"].notna()]
    # ! filter staff
    .loc[lambda df_: df_["contract_type"].str.title() != "Employee"]
    .loc[lambda df_: df_["core_product"].str.title() != "Staff"]
    # ! drop date too old (start_date was already parsed to datetime above)
    .loc[lambda df_: df_["start_date"].dt.year >= 2020]
    # ! filter cancelled member and free member and non member
    .loc[
        lambda df_: ~(
            df_["membership_status"].str.title().isin(
                ["Cancelled Member", "Free Member", "Non Member"]
            )
        )
    ]
    # ! filter frozen member
    .loc[lambda df_: ~df_["membership_code"].isin(["FREEZE"])]
    # ! filter member with null id, phone and email
    #   (a row is kept as long as at least one of the three is present)
    .loc[lambda df_: df_[["id", "email", "phone"]].notna().any(axis=1)]
    # drop unused cols
    .drop(columns=module.to_drop)
    # sort column
    .sort_index(axis=1)
)

# create activity columns: one `active_<month>` column per entry below, each
# computed by module.is_active from start_date / end_date and the date string
# "1 <mon> <year>" (e.g. "1 jan 2023")
date_columns = [
    "jan_2023", "feb_2023", "mar_2023", "apr_2023", "may_2023", "jun_2023",
    "jul_2023", "aug_2023", "sep_2023", "oct_2023", "nov_2023", "dec_2023",
    "jan_2024", "feb_2024", "mar_2024", "apr_2024", "may_2024"
]
activity_assignments = {}
for month in date_columns:
    # The date string is bound as a default argument so every callable keeps
    # its own month; a plain closure over `month` would see only the last
    # value once the callables are evaluated together below.
    activity_assignments[f"active_{month}"] = (
        lambda df_, as_of=f"1 {month.replace('_', ' ')}": module.is_active(
            df_, "start_date", "end_date", as_of
        )
    )
# A single assign() adds all columns in list order with one frame copy
# instead of one copy per month.
df_clean = df_clean.assign(**activity_assignments)

# Rows without a center are discarded only when they make up less than 1% of
# the frame; at 1% or more they are left in place.
center_missing = df_clean["center"].isna()
perc_nan = center_missing.sum() / len(df_clean) * 100
if perc_nan < 1:
    df_clean = df_clean.loc[~center_missing]


# sort columns: fixed presentation order for the output file
order = [
    "id", "dob", "age", "phone", "email", "gender", "income",
    "start_date", "end_date", "job", "job_group", "interest", "hobby", "partner_join_reason",
    "partner_street", "city", "region", "product", "membership_code", "core_product", "addon_1",
    "addon_2", "addon_3", "membership_duration", "contract_type", "membership_status", "center", "area",
    "fp_date", "is_cpt", "is_next_contract", "is_renewal",
    # activity columns are derived from date_columns so the two lists cannot drift apart
    *(f"active_{month}" for month in date_columns),
]
# The order list must name exactly the columns present: nothing missing,
# nothing extra. The message spells out the difference when it does not.
assert set(order) == set(df_clean.columns), (
    f"column mismatch - missing from df_clean: {sorted(set(order) - set(df_clean.columns))}, "
    f"not listed in order: {sorted(set(df_clean.columns) - set(order))}"
)
df_clean = df_clean.loc[:, order]
print(df_clean.shape)

(14214, 49)


# Tests and Assertions

In [61]:
# The first check is the only one that receives the raw frame.
tests.test_all_centers_mapped(df)

# Center / area value checks on the cleaned frame.
for check in (
    tests.test_all_centers_are_correct,
    tests.test_all_areas_are_correct,
):
    check(df_clean)

# Missing-value checks take a single column and a threshold of 0.0.
tests.test_na_center_low(df_clean["center"], threshold=0.0)
tests.test_na_area_low(df_clean["area"], threshold=0.0)

# Membership and CPT checks on the cleaned frame, in the original order.
for check in (
    tests.test_all_memberships_are_filled,
    tests.test_all_membership_mapped,
    tests.test_cpt_is_flagged,
    tests.test_cpt_in_cpt_center,
    tests.test_cpt_in_cpt_area,
    tests.test_noncpt_in_noncpt_center,
    tests.test_noncpt_in_noncpt_area,
):
    check(df_clean)

# Save DF

In [62]:
import os

# Write the cleaned frame to output/member_data_<raw file stem>.xlsx.
# An existing file is never overwritten; delete it first to regenerate.
file = df_clean
path = f"output/member_data_{Path(config.path_raw_file).stem}.xlsx"

if os.path.exists(path):
    print("File already exist.")
else:
    # Create the output folder on first use so to_excel does not fail on a
    # fresh checkout where it is missing.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    file.to_excel(path, index=False)
    print("File saved.")

# Experiment