In [12]:
import importlib
import os
from pathlib import Path

import numpy as np
import pandas as pd

import config
import module

# Re-execute the local `config` and `module` modules so that edits made to them
# since the kernel started take effect without a kernel restart.
# Both names are bound by the import statements directly above, so a NameError
# guard around these calls is unnecessary; it would only hide a NameError raised
# inside the modules themselves while they reload.
importlib.reload(config)
importlib.reload(module)

In [13]:
# Load every .xls workbook found in this month's input folder and stack them
# into a single raw frame.
folder_path = Path("input", config.month)

# os.listdir returns names in arbitrary order. Sort them so the row order of the
# combined frame is identical on every run (the later keep="first"
# de-duplication depends on row order).
excel_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".xls"))
if not excel_files:
    # Fail here with a clear message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No .xls files found in {folder_path}")

# The first 6 rows of each sheet are skipped before the header row is read.
dfs = [pd.read_excel(folder_path / file, skiprows=6) for file in excel_files]

df_ori = pd.concat(dfs, ignore_index=True)



# Clean DF

In [14]:
def _snake_case(column_name):
    """Lower-case a column label and replace its spaces with underscores."""
    return column_name.lower().replace(" ", "_")


def _label_student(frame):
    """Combine upper-cased last name, first name and the student code into one label."""
    surname = frame["last_name"].str.upper()
    given_name = frame["first_name"].str.upper()
    code = frame["student_code"].astype("str")
    return (surname + " " + given_name + " - " + code).str.strip()


def _as_float(column):
    """Return an `assign` callable that casts `column` to float."""
    return lambda frame: frame[column].astype(float)


def _as_datetime(column):
    """Return an `assign` callable that parses `column` with `pd.to_datetime`."""
    return lambda frame: pd.to_datetime(frame[column])


def _strip_mobile_symbols(frame):
    """Render `mobile` as text with "-" and "+" removed and outer whitespace trimmed."""
    mobile = frame["mobile"].astype(str)
    for symbol in ("-", "+"):
        mobile = mobile.str.replace(symbol, "", regex=False)
    return mobile.str.strip()


def _is_not_street_talk(frame):
    """Boolean mask that is False for rows whose student label mentions Street Talk."""
    mentions_street_talk = frame["student_code"].str.contains(
        "STREET TALK|STREETTALK", na=False
    )
    return ~mentions_street_talk


# Columns that are removed from the cleaned frame.
_DROPPED_COLUMNS = [
    "gender",
    "home",
    "work",
    "end_level",
    "on_track",
    "course_status",
    "personal_tutor",
    "first_name",
    "last_name",
    "center_name",
]

df_clean = (
    df_ori
    # Discard columns and rows that are completely empty.
    .dropna(axis="columns", how="all")
    .dropna(axis="rows", how="all")
    .rename(columns=_snake_case)
    # Order matters: student_code is rebuilt first, so the membership helper
    # receives the frame that already carries the combined label.
    .assign(
        student_code=_label_student,
        student_membership=module.create_student_membership,
        start_level=_as_float("start_level"),
        current_level=_as_float("current_level"),
        date_of_birth=_as_datetime("date_of_birth"),
        start_date=_as_datetime("start_date"),
        end_date=_as_datetime("end_date"),
        email=lambda frame: frame["email"].str.lower().str.strip(),
        mobile=_strip_mobile_symbols,
    )
    # Drop Street Talk rows.
    .loc[_is_not_street_talk]
    # Keep only the first row for each (student_code, start_date) pair.
    .drop_duplicates(subset=["student_code", "start_date"], keep="first")
    .drop(columns=_DROPPED_COLUMNS)
)
df_clean.head(3)

Unnamed: 0,student_code,date_of_birth,mobile,email,service_type,consultant,start_date,end_date,start_level,current_level,contract_status,student_membership
0,MARIO (GO) ANDRI - 4046,1983-03-25 07:00:00,6281389227775,andri.digital@yahoo.com,Standard,Fajria Sahista Achadiarrohma,2023-11-08,2025-02-07 23:59:59,3.0,3.0,Active-Valid,GO
1,MUTIA (GO) SEIRA - 3319,1994-03-23 07:00:00,6281947687745,racemonk9@gmail.com,Standard,Danea Sindi Dini,2023-04-10,2024-10-09 23:59:59,2.0,2.0,InActive-Valid,GO
2,YANUARI (GO CPT CTI GROUP) DANIEL EKO - 3792,2000-01-17 07:00:00,6282335725604,daniel.yanuari@helios.id,Standard,"Amalia, S.T Rina",2023-08-25,2024-02-24 23:59:59,9.0,9.0,Active-Valid,GO


# Assertion

In [15]:
# The cleaned frame must contain exactly these three membership values.
# Comparing sets (rather than a sorted list) means a missing value (NaN) leads
# to a readable assertion failure instead of a TypeError from sorting mixed
# types, and the message names the values that differ.
expected_memberships = {"Deluxe", "GO", "VIP"}
found_memberships = set(df_clean["student_membership"].unique())
assert found_memberships == expected_memberships, (
    "student_membership mismatch; values not shared with the expected set: "
    f"{found_memberships ^ expected_memberships}"
)

# Save DF

In [17]:
# Write the cleaned frame to output/<input folder name>/coco_member.xlsx.
# A file that is already present at that path is left untouched.
filename = ("coco member.xlsx").replace(" ", "_")
# NOTE(review): .stem drops anything after the last "." in the folder name; if
# month folder names can contain a dot, .name is probably intended - confirm.
output_path = Path("output", folder_path.stem)

if not (full_filepath := output_path / filename).exists():
    # parents=True also creates the top-level "output" directory when it is
    # missing (a single-level mkdir raises FileNotFoundError in that case);
    # exist_ok=True makes an already existing directory a no-op.
    output_path.mkdir(parents=True, exist_ok=True)
    df_clean.to_excel(full_filepath, engine="xlsxwriter", index=False)
    print("File saved.")
else:
    print("File already exist.")

File saved.
