In [29]:
import importlib
import os
from pathlib import Path

import numpy as np
import pandas as pd

import config
import module

# Re-execute the local `config` and `module` modules so that edits made to
# them after the kernel started are picked up without a restart.
# Both names are bound by the imports above, so a NameError guard is not
# needed; catching NameError here would only hide a genuine NameError raised
# by the modules' own code while they are being reloaded.
importlib.reload(config)
importlib.reload(module)

In [30]:
# Load every .xls export found in input/<config.month> and stack them into a
# single raw frame (`df_ori`).
folder_path = Path("input", config.month)

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame (and any later order-dependent step such as
# de-duplication with keep="first") stable between runs and machines.
excel_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".xls"))

if not excel_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No .xls files found in {folder_path}")

# skiprows=6: the first six rows of each sheet are skipped before the header
# row is read.
dfs = [pd.read_excel(folder_path / name, skiprows=6) for name in excel_files]

df_ori = pd.concat(dfs, ignore_index=True)



# Clean DF

In [45]:
# Build the cleaned member table from the raw concatenated export.
#
# NOTE: keyword order inside `.assign` matters. Items are evaluated in order,
# so `student_membership` and `is_cpt` receive a frame whose `student_code`
# has already been replaced by the "LAST FIRST - code" label built first.
df_clean = (df_ori
    # drop columns / rows that are completely empty
    .dropna(how="all", axis="columns")
    .dropna(how="all", axis="rows")
    # normalise headers: lower-case, spaces -> underscores
    .rename(columns=lambda c: c.lower().replace(" ", "_"))
    .assign(
        # display label: "<LAST NAME> <FIRST NAME> - <student code>"
        student_code=lambda df_: (
            df_["last_name"].str.upper()
            + " "
            + df_["first_name"].str.upper()
            + " - "
            + df_["student_code"].astype("str")
        ).str.strip(),
        student_membership=lambda df_: module.create_student_membership(df_),
        start_level=lambda df_: df_["start_level"].astype(float),
        current_level=lambda df_: df_["current_level"].astype(float),
        date_of_birth=lambda df_: pd.to_datetime(df_["date_of_birth"]),
        start_date=lambda df_: pd.to_datetime(df_["start_date"]),
        end_date=lambda df_: pd.to_datetime(df_["end_date"]),
        email=lambda df_: df_["email"].str.lower().str.strip(),
        # strip "-" and "+" from phone numbers
        mobile=lambda df_: (
            df_["mobile"]
            .astype(str)
            .str.replace("-", "", regex=False)
            .str.replace("+", "", regex=False)
            .str.strip()
            # astype(str) turns missing values into the literal text "nan";
            # mask them back to missing so "nan" is not written to the output.
            .where(df_["mobile"].notna())
            # NOTE(review): if the column is read as float, values render as
            # e.g. "628123.0" — confirm the source column is text.
        ),
        consultant=lambda df_: df_["consultant"].str.upper(),
        is_cpt=lambda df_: module.get_cpt(df_),
    )
    # ! drop ST: rows whose label contains "STREET TALK" or "STREETTALK"
    .loc[
        lambda df_: ~(
            df_["student_code"].str.contains("STREET TALK|STREETTALK", na=False)
        )
    ]
    # ! drop duplicated member based on student code and start date
    .drop_duplicates(subset=["student_code", "start_date"], keep="first")
    # ! drop unnecessary cols
    .drop(
        columns=[
            "gender",
            "home",
            "work",
            "end_level",
            "on_track",
            "course_status",
            "personal_tutor",
            "first_name",
            "last_name",
            "center_name",
        ]
    )
)

In [55]:
# Spot-check: show up to 10 members flagged as CPT together with their consultant.
# `is_cpt` is already a column of df_clean, so it is only filtered on here.
# NOTE(review): assumes `is_cpt` holds booleans — confirm against module.get_cpt.
(df_clean
    .loc[lambda df_: df_["is_cpt"] == True, ["student_code", "consultant"]]  # noqa: E712
    # cap n so the cell does not raise when fewer than 10 rows match;
    # the fixed seed keeps the displayed sample reproducible across runs
    .pipe(lambda df_: df_.sample(n=min(10, len(df_)), random_state=0))
)

Unnamed: 0,student_code,consultant
25241,SITUMORANG (GO CPT SILOAM HOSPITAL) RATNA WATI...,"AMALIA, S.T RINA"
24868,HIWANG N P A (GO CPT TIKET.COM) RIES - 2752,TEDJOKOESOEMO PUTRA PRATAMA
25143,WIBOWO (GO CPT TIKET.COM) SENTOT GALIH - 2765,TEDJOKOESOEMO PUTRA PRATAMA
24828,NOVITASARI (GO CPT BRI AGRI) DIAN - 3507,PUTRI HANDAYANI KUN ANDIKA
25224,FARIDA (GO CPT SILOAM HOSPITAL) NADA DINAR - 3630,"AMALIA, S.T RINA"
25025,EZRYANA (GO CPT SILOAMHOSPITAL) PRIMA - 1983,TEDJOKOESOEMO PUTRA PRATAMA
26358,DETHAN (GO CPT BTPN) JAYANTI - 2943,PUTRI HANDAYANI KUN ANDIKA
25309,HANDAYANI (GO CPT TIKET.COM) RULY SULIS - 3133,AIDIL MUNAWAR
24952,PUSPITHA (GO CPT BRI AGRI) ENDANG RESTIANA - 3505,PUTRI HANDAYANI KUN ANDIKA
24951,PUSPITHA (GO CPT BRI AGRI) ENDANG RESTIANA - 3505,PUTRI HANDAYANI KUN ANDIKA


# Assertion

In [4]:
# Sanity check: the cleaned data must contain exactly the three known
# membership types. Sorting with key=str keeps the check from crashing with a
# TypeError when an unexpected non-string value (e.g. NaN) is present, so the
# failure surfaces as an AssertionError that lists the offending values.
expected_memberships = ["Deluxe", "GO", "VIP"]
found_memberships = sorted(df_clean["student_membership"].unique().tolist(), key=str)
assert found_memberships == expected_memberships, (
    f"Unexpected student_membership values: {found_memberships}"
)

# Save DF

In [5]:
# Write the cleaned table to output/<folder_path.stem>/coco_member.xlsx,
# unless a file with that name already exists there.
# NOTE(review): `.stem` drops anything after the last dot of the folder name —
# confirm config.month never contains a dot.
filename = ("coco member.xlsx").replace(" ", "_")
output_path = Path("output", folder_path.stem)

if not (full_filepath := output_path / filename).exists():
    # parents=True also creates the top-level "output" folder on a first run
    # (os.mkdir fails when the parent is missing); exist_ok=True tolerates a
    # folder that is already there.
    output_path.mkdir(parents=True, exist_ok=True)
    df_clean.to_excel(full_filepath, engine="xlsxwriter", index=False)
    print("File saved.")
else:
    print("File already exist.")

File already exist.
