In [115]:
import glob
import importlib
import os

import pandas as pd

import config
import module

# Re-import the local helper files so edits made since the kernel started are
# picked up without restarting it.
# Both names are bound by the imports just above, so a NameError guard cannot
# protect the name lookup; the only NameError it could catch is one raised by
# the reloaded file's own top-level code, which must surface instead of
# silently leaving a half-reloaded module behind.
importlib.reload(module)
importlib.reload(config)

## Load DF

In [116]:
# Load the raw export only once per kernel session: `df_ori` is kept untouched
# so re-running this cell does not re-read the Excel files.
if "df_ori" not in locals():
    path = config.path / "raw"

    # data is in one file
    if not config.is_mutiple_files:
        # NOTE(review): `path` is treated as a directory in the multi-file
        # branch below but is passed straight to read_excel here — confirm it
        # points at a workbook when the single-file mode is used.
        skiprows = 2 if config.has_headers else None
        df_ori = pd.read_excel(path, skiprows=skiprows)

    # data is in multiple files (data from coco must have headers)
    else:
        # sorted: glob order is OS-dependent, and the later keep-first
        # de-duplication depends on row order
        all_files = sorted(glob.glob(os.path.join(path, "*.xlsx")))
        if not all_files:
            # pd.concat([]) would fail with an unhelpful "No objects to
            # concatenate"; name the real problem instead
            raise FileNotFoundError(f"No .xlsx files found in {path}")

        frames = [
            pd.read_excel(filename, index_col=None, skiprows=2)
            for filename in all_files
        ]
        df_ori = pd.concat(frames, axis=0, ignore_index=True)

    print("df loaded")

# df is already loaded, do not load again
else:
    print("df already exist")

# work on a copy so the raw frame stays available for later inspection
df = df_ori.copy()
print(df.shape)

df already exist
(352073, 14)


# DF Attendance

Row = single attendance

In [117]:
# Build the attendance-level frame (one row per student attendance) from the
# raw export copy `df`.
# Column cleaning and feature creation are delegated to helpers in `module`;
# their exact logic is not visible in this notebook, so the comments below only
# describe what each step reads and writes.
df_clean= (df
    # drop columns and rows that are entirely null
    .dropna(how= 'all', axis= 'columns').dropna(how= 'all', axis= 'rows')
    # clean col names: apply module.map_col first, then lower-case the names
    # and replace spaces with underscores
    .rename(columns= module.map_col) 
    .rename(columns= lambda c: c.lower().replace(' ', '_')) 
    
    # drop unnecessary cols
    .drop(columns= ['class_unit'])
    
    # keep the current month only: rows whose class_date month == module.month
    # (class_date is first passed through module.convert_to_gmt_plus_7 —
    # presumably a timezone conversion to GMT+7; confirm in module)
    .assign(class_date= lambda df_: module.convert_to_gmt_plus_7(df_, 'class_date'))
    .loc[lambda df_: df_['class_date'].dt.month == module.month]
    
    # drop duplicated student attendance rows: the attendance data was exported
    # several times, so the same attendance can appear more than once
    # note: assumes one student can only be in one class at a given date/time
    # NOTE(review): class_time is used here before the create_class_time step
    # below — presumably the column already exists after renaming; confirm
    .drop_duplicates(subset= ['student_code', 'class_time', 'class_date'])
    
    # keyword arguments of a single assign are evaluated in order, so later
    # entries see the columns produced by earlier ones
    .assign(
        student_name= lambda df_: df_['student_name'].str.upper(),
        # new code = name + code (uses the upper-cased name assigned above)
        student_code= lambda df_: (df_['student_name'] + 
                                   ' - ' + 
                                   df_['student_code'].astype('str')
                                   ),
        # mode = offline, online or GOC
        class_mode= lambda df_: module.create_class_mode(df_), 
        # membership = dlx, online or GO
        student_membership= lambda df_: module.create_student_membership(df_),
        # clean class type for the first time: remove the word 'Class',
        # title-case, strip whitespace, store as category
        class_type= lambda df_: (df_['class_type']
                                 .str.replace('Class', '', regex= False)
                                 .str.title()
                                 .str.strip()
                                 .astype('category')
                                ),  
        # clean student center: remove the literal 'IN: ', strip whitespace,
        # store as category
        student_center= lambda df_: (df_['student_center']
                                     .str.replace('IN: ', '', regex= False)
                                     .str.strip()
                                     .astype('category')
                                    ),
        # create class time if not exist
        class_time= lambda df_: module.create_class_time(df_),
        # clean class description: lower-case and strip
        # NOTE(review): astype('str') comes last, so missing descriptions
        # likely end up as the literal string 'nan' — confirm this is intended
        class_description= lambda df_:  (df_['class_description']
                                         .str.lower()
                                         .str.strip()
                                         .astype('str')
                                        ),
        # clean teacher name for some teachers that are duplicated in coco
        teacher= lambda df_: module.clean_teacher_name(df_).astype('str'),
        # create class duration if not exist
        class_duration= lambda df_: module.create_duration(df_).astype('float'),
        # whether the student attend or not
        student_attendance= lambda df_: module.create_attend(df_),
        # create class location from class description; anything the helper
        # leaves null is treated as 'Online'
        class_location= lambda df_: (module
                                     .create_class_location_1(df_)
                                     .fillna('Online')
                                    )
    )
    # note: may 2023 - replace class with shared account with its real ET
    # this is because of shared account problem
    # should be before merging with df_teacher, because the merge key is the
    # teacher name
    .assign(teacher= lambda df_: module.clean_shared_account_et(df_))

    # merge with df_teacher; the left join keeps every attendance row even
    # when the teacher is not found in the teacher table
    .merge(right= module.load_df_teacher(), on= 'teacher', how= 'left')

    # note: class location 2nd time to get class_location from teacher center
    .assign(class_location= lambda df_: module.create_class_location_2(df_),)
    # assert that online class location is online
    # assign area to each class
    .assign(
        class_location= lambda df_: module.assert_class_location_online(df_),
        class_area= lambda df_: module.create_class_location_area(df_),
    )
    
    # drop unnecessary columns and sort
    .drop(columns= ['student_name', 'student_result'])
    .sort_values(['class_date', 'class_time', 'student_code'])
    .sort_index(axis= 1) # sort columns alphabetically
    .reset_index(drop= True)
    # 1-based running number stored in a regular column named 'index'
    .assign(index= lambda df_: df_.index + 1) # create index column
)

## Check Assumptions

In [118]:
# check assumptions

# ! assert that class location match with area and mode
online_areas = df_clean.loc[
    df_clean["class_location"] == "Online", "class_area"
].unique()
assert (online_areas == "Online").all()

# area label -> collection of centers belonging to that area (from module)
centers_by_area = {
    "JKT 1": module.jkt_1,
    "JKT 2": module.jkt_2,
    "JKT 3": module.jkt_3,
    "BDG": module.bdg,
    "SBY": module.sby,
}

# the same center -> area rule is checked for two column pairs, in this order:
#   1. class_location  vs class_area
#   2. teacher_center  vs teacher_area
for center_col, area_col in (
    ("class_location", "class_area"),
    ("teacher_center", "teacher_area"),
):
    for area, center in centers_by_area.items():
        areas_found = df_clean.loc[
            df_clean[center_col].isin(center), area_col
        ].unique()
        assert (areas_found == area).all()


# ! assert that no class time is missing
assert not df_clean["class_time"].isna().any()

# ! assert that all online class location is online
# every row whose class type mentions "Online" must be located "Online"
filter_ = df_clean["class_type"].str.contains("Online")
online_class_locations = df_clean.loc[filter_, "class_location"]
assert not (online_class_locations != "Online").any()


# ! check if all ET in shared account is mapped
# Rows are taken from the raw frame `df`: a row counts as a shared account when
# its "Teacher" value contains "online" (case-insensitive) and the first number
# in it is <= 20.
teacher_contains_online = df["Teacher"].str.lower().str.contains("online")
# raw string: "\d" inside a normal string literal is an invalid escape sequence
# (a warning today, an error in a future Python); the regex itself is unchanged
lower_than_eq_20 = df["Teacher"].str.extract(r"(\d+)")[0].astype(float) <= 20

# the ET name is the text after the last "-" in "Description", stripped and
# lower-cased
list_et_shared = (
    df.loc[(teacher_contains_online & lower_than_eq_20), "Description"]
    .str.split("-")
    .str[-1]
    .str.strip()
    .str.lower()
    .unique()
)

map_et_shared = module.shared_acc_et_map.keys()

# print every ET name that has no entry in module.shared_acc_et_map, then fail
unmapped = [et for et in list_et_shared if et not in map_et_shared]
for et in unmapped:
    print(et)

if unmapped:
    raise Exception("Some ET in shared accouns are unmapped.")


# ! assert that no teacher has center / area / position == 'Shared Account'
teacher_info_cols = ["teacher_center", "teacher_area", "teacher_position"]
# True for rows where at least one of the three columns is "Shared Account"
mask = (df_clean[teacher_info_cols] == "Shared Account").any(axis=1)
shared_acc = df_clean[mask]

if len(shared_acc) != 0:
    # list the offending teachers (one line per row) before failing
    for teacher in shared_acc["teacher"]:
        print(teacher)
    raise Exception("Some teachers have center/area/position == Shared Account.")


# ! assert GOC is classified as GOC
# Every row whose student center is "Global Online Center" must have
# class_mode == "GOC".
if "Global Online Center" in df_clean["student_center"].unique():
    goc_modes = df_clean.loc[
        df_clean["student_center"] == "Global Online Center", "class_mode"
    ].unique()
    # .all() is required: asserting the bare comparison raises ValueError
    # (ambiguous truth value) instead of AssertionError as soon as more than
    # one distinct class_mode is present
    assert (goc_modes == "GOC").all()


# ! assert that no teacher is unregistered to center
teacher_center_na = df_clean["teacher_center"].isna()

unmapped = []
for et in df_clean.loc[teacher_center_na, "teacher"].unique():
    # teacher names containing "online" are skipped by this check
    if "online" not in et.lower():
        print(et)
        unmapped.append(et)

if unmapped:
    raise Exception("Some GOC ET are unmapped.")


# ! assert that no teacher pos is unmapped
teacher_pos_na = df_clean["teacher_position"].isna()
list_techer_pos_na = df_clean.loc[teacher_pos_na, "teacher"].unique()

if len(list_techer_pos_na) != 0:
    # one teacher name per line, then fail
    print(*list_techer_pos_na, sep="\n")
    raise Exception("Some teachers have null position.")


# ! assert that all student membership is mapped
# exactly these three memberships must be present, nothing else
expected_memberships = ["Deluxe", "GO", "VIP"]
assert sorted(df_clean["student_membership"].unique()) == expected_memberships

# ! sample class description and location
# (df_clean
#     .loc[df_clean['class_mode'] == 'Offline', ['class_description', 'class_location']]
#     .sample(10)
# )

# DF Session

Row = single session

In [119]:
# Collapse the attendance rows of df_clean into one row per session.
# A session is identified by these keys.
# assumes that one teacher can only teach one class at a time
session_keys = ["teacher", "class_date", "class_time", "class_type"]

df_session = (df_clean
    # sort first so the joined strings below are built in a stable order
    .sort_values(["teacher", "class_date", "class_time", "student_membership"])
    .assign(
        # transform attendance: every row of a session receives the
        # comma-joined values of all its students
        # observed=True: class_type is cast to category upstream, and the
        # implicit `observed` default for categorical keys is deprecated in
        # pandas; transform only returns values for existing rows, so the
        # result is the same
        student_attendance_grouped=lambda df_: (
            df_.groupby(session_keys, observed=True)["student_attendance"]
            .transform(lambda values: ", ".join(values))
        ),
        student_membership_grouped=lambda df_: (
            df_.groupby(session_keys, observed=True)["student_membership"]
            .transform(lambda values: ", ".join(values))
        ),
    )
    # ! drop column unique to student and drop duplicate
    # once the student-level columns are gone, rows of the same session are
    # identical and collapse to a single row
    .drop(
        columns=[
            "student_attendance",
            "student_center",
            "student_code",
            "student_membership",
            "index",
        ]
    )
    .drop_duplicates(keep="first")
    .assign(
        # create class type grouped
        class_type_grouped=lambda df_: module.create_class_type_grouped(df_),
        # create class service
        class_service=lambda df_: module.create_class_service(df_),
        # the number of people who books this class
        class_booking=lambda df_: module.create_class_booking(df_),
        # the number of people who actually attend
        class_attendance=lambda df_: module.create_class_attendance(df_),
        # delivered or not delivered
        class_status=lambda df_: module.create_class_status(df_),
    )
    # drop unused cols and arrange
    .drop(columns=["student_attendance_grouped", "student_membership_grouped"])
    .sort_index(axis=1)  # sort columns alphabetically
    .reset_index(drop=True)
    # 1-based running number stored in a regular column named "index"
    .assign(index=lambda df_: df_.index + 1)
)

  df_.groupby(["teacher", "class_date", "class_time", "class_type"])[
  df_.groupby(["teacher", "class_date", "class_time", "class_type"])[


## Check Assumptions

In [120]:
# check assumptions

# ! assert all online class is online in location
# a session counts as online when either its class type or its grouped class
# type mentions "Online"
filter_ = (
    df_session["class_type"].str.contains("Online")
    | df_session["class_type_grouped"].str.contains("Online")
)
assert not (df_session.loc[filter_, "class_location"] != "Online").any()


# ! booking >= attendance -> no session may have more attendees than bookings
assert not (df_session['class_booking'] < df_session['class_attendance']).any()


# ! vip sessions may only use the grouped class types listed here
vips= [
    # 'GOC',
    'One-on-one',
    'Online One-on-one',
    'Online VPG',
    'VPG']

# debugging aid: show the grouped class types actually used by VIP sessions
# print(sorted(df_session
#     .loc[
#         df_session['class_service'] == 'VIP', 'class_type_grouped'
#     ]
#     .unique()
# ))

vip_class_types = df_session.loc[
    df_session['class_service'] == 'VIP', 'class_type_grouped'
]
assert sorted(vip_class_types.unique()) == vips


# ! assert all class service are mapped
expected_services = ['Deluxe', 'Deluxe & Go', 'VIP']
assert sorted(df_session['class_service'].unique()) == expected_services


# ! vip one on one should be booked only by 1 person
# (df_session
#     .loc[(df_session['class_type_grouped'].str.contains('One-on-one')), 'class_booking']
#     .value_counts()
# )
# (df_session
#     .loc[
#         (df_session['class_type_grouped'].str.contains('One-on-one')) &\
#         (df_session['class_booking'] > 1) 
#     ]
# )


# ! match class_type with class_type_grouped
# (df_session
#     .groupby(['class_service', 'class_type', 'class_type_grouped'])
#     .agg(count= ('class_type_grouped', 'count'))
#     .loc[lambda df_: df_['count'] > 0]
# )


# ! sample class description
# df_session.loc[:, ['class_description', 'class_service', 'class_location', 'class_type', 'class_type_grouped']].sample(3)

# Save DF

`df_clean` is usually 19,000–22,000 rows long.
`df_session` is usually 3,300–3,600 rows long.

In [121]:
# row counts of both outputs, to compare against the usual sizes
print(f'session = {len(df_session)}')
print(f'attendance = {len(df_clean)}\n')

# ! class status value counts
status= df_session['class_status'].value_counts()
print(status)

# share of sessions with status 'Given', in percent, rounded to a whole number
given_pct= status['Given'] / status.sum() * 100
print(given_pct.round(0))

session = 4962
attendance = 21196

class_status
Given        4337
Not Given     625
Name: count, dtype: int64
87.0


In [122]:
# label used in the output file names, e.g. 'nov 2023': three-letter month name
# and year of the earliest class date, lower-cased
first_class_date= df_clean['class_date'].min()
date_range= f"{first_class_date.month_name()[:3]} {first_class_date.year}".lower()
date_range

'nov 2023'

## DF Session

In [123]:
# target of the session export: <config.path>/data session <date_range>.xlsx
file, folder= df_session, config.path
filename= f'data session {date_range}.xlsx'

filepath= folder / filename
filepath

PosixPath('data/11 nov 2023/data session nov 2023.xlsx')

In [124]:
# write the workbook only when it is not there yet; an existing file is never
# overwritten
if os.path.exists(filepath):
    print('File already exist.')
else:
    file.to_excel(filepath, engine='xlsxwriter', index= False)
    print('File saved.')

File already exist.


## DF Attendance

In [125]:
# target of the attendance export: <config.path>/data attendance <date_range>.xlsx
file, folder= df_clean, config.path
filename= f'data attendance {date_range}.xlsx'

filepath= folder / filename
filepath

PosixPath('data/11 nov 2023/data attendance nov 2023.xlsx')

In [126]:
# write the workbook only when it is not there yet; an existing file is never
# overwritten
if os.path.exists(filepath):
    print('File already exist.')
else:
    file.to_excel(filepath, engine='xlsxwriter', index= False)
    print('File saved.')

File already exist.


# Experiment

In [127]:
# desc_list= list(df_ori.loc[(df_ori['Description'].str.contains('at', regex= False, na= False)), 'Description'].unique())
# for i in desc_list:
#     print(i)

In [128]:
# (df_session
#     .loc[df_session['class_service'] == 'VIP']
#     .groupby(['class_type_grouped', 'class_type'])
#     .agg(count= ('class_type', 'size'))
#     .reset_index()
#     .pivot(index= 'class_type', columns= 'class_type_grouped')
# )

In [129]:
# (df_session
#     .loc[df_session['class_service'] == 'VIP', 'class_type']
#     .unique()
#     .to_list()
# )

In [130]:
# df_clean['class_type'].unique().to_list()

In [131]:
# check why is there duration 0
# df_ori['Duration'].value_counts(dropna= False)
# df.loc[df['Description'].str.lower() == 'advising session - altysa @go']
# df_clean.loc[df_clean['class_duration'] == 0]

In [132]:
# check GOC class
# df.loc[df['Teacher'] == 'D. Sughraa']
# df_clean.loc[df_clean['teacher'] == 'D. Sughraa']
# df_clean['teacher'].sample(5)

Online trainer accounts 1–20 = in-house ETs

Online trainer accounts 21–30 = ooolab

In [133]:
# def clean_shared_account_et(df: pd.DataFrame):
    
#     et_map= {   
#         'anna': 'Tinggogoy Anna Maria',
#         'ruth olivia': 'Pakpahan Ruth Olivia Angelina',
#         'daniel': 'Bradshaw Daniel',
#         'derek': 'Laurendeau Derek',
#         'imelda': 'Basuki Imelda',
#         'vivi': 'Hazisyah Alifia Nur',
#         'priscill': 'Priscilla Yokhebed',
#         'ryan b': 'Blasczyk Ryan',
#         'eka': 'Mustikawati Eka',
#         'rahul': 'Azhar Rahul Finaya',
#         'ade': 'Setiadi Sapto',
#         'oliv': 'Pakpahan Ruth Olivia Angelina',
#         'uzli': 'Ainiyah Uzlifatul',
#         'john lawrence': 'Lawrence Moore John',
#         'toby': 'Phillips Toby',
#         'risma': 'Handayani Khaerunisyah Risma',
#         'nadya': 'Nasarah Nadya',
#         'fairuz': 'Fairuz Muhammad',
#         'aurora': 'Rifani Aurora Nurhidayah',
#         'tri bekti': 'Hundoyo Tri Bekti',
#         'madeline': 'Jane Quinn Madeline',
#         'medi': 'Medianti Annisa',
#         'jason': 'Gereau Jason Jarett',
#         'priscilla': 'Priscilla Yokhebed',
#         'nova': 'Rahmadya Nova Ayu',
#         'jack jones': 'Jones Jack William Isaac',
#         'priscil': 'Priscilla Yokhebed',
#         'olive': 'Pakpahan Ruth Olivia Angelina',
#         'prettya': 'Kartikasari Prettya Nur',
#         'alex': 'Algar Sinclair Alexander John',
#         'shafira': 'Ayuningtyas Shafira',
#         'anggi': 'Ansyahputri Anggita Rizkiarachma',
#         'ryan': 'Blasczyk Ryan'
#     }

#     contains_online= (df['teacher'].str.contains('Online'))
#     more_than_20= (df['teacher'].str.extract('(\d+)')[0].astype(float) > 20)
#     lower_than_eq_20= (df['teacher'].str.extract('(\d+)')[0].astype(float) <= 20)

#     conditions= [
#         (contains_online & lower_than_eq_20),
#     ]
#     choices= [
#         (df
#             ['class_description']
#             .str.split('-').str[-1].str.strip()
#             .map(et_map)
#         ),
#     ]
#     return np.select(conditions, choices, default= df['teacher'])

# df_clean.assign(teacher= lambda df_: clean_shared_account_et(df_))

In [134]:
# check shared account class type

# df_session.loc[
#     df_session['class_description'].str.contains(' - priscill'),
#     ['class_service', 'class_description', 'class_type', 'class_type_grouped', 'teacher']
# ]

In [135]:
# # check comunity class
# (df_session
#     .loc[df_session['class_type_grouped'] == 'Offline Community']
# )

In [136]:
# check class session
# df_session.groupby(['class_type', 'class_type_grouped']).size().to_frame().reset_index().loc[lambda df_: df_[0] > 0]

In [137]:
# find community class
# (df_session
#     .loc[df_session['class_type'].isin(['Social Club', 'Online Social Club'])]
#     .loc[df_session['class_description'].str.contains('community'),'class_description']
#     .unique()
# )

In [138]:
# df.loc[(df['Description'].str.contains('FDN OFFLINE @SMB', na= False)) & (df['Teacher'] == 'BLASCZYK (SMB) RYAN')]

In [139]:
# df_clean.loc[df_clean['teacher'].str.contains('Bushey'), 'teacher'].unique()