In [184]:
import pandas as pd
from datetime import datetime
import numpy as np
import time
import os
from dotenv import load_dotenv

# Load Data

In [105]:
pd.set_option('display.max_columns', None)

In [106]:
data_local = pd.read_excel("../../analisa/2025/Summary_2025.xlsx")

In [107]:
data_local['Source'] = "Web Ads"
data_local.to_excel("../../analisa/2025/Summary_2025.xlsx")

Load data from sql

In [186]:
from sqlalchemy import create_engine
from urllib.parse import quote_plus

load_dotenv()

username_db = os.getenv("DB_USERNAME")
password_db = os.getenv("DB_PASSWORD")
host_db = os.getenv("DB_HOST")
port_db = os.getenv("DB_PORT")
database_db = os.getenv("DB_DATABASE")

# Database credentials

# URL-encode username & password in case they have special characters
username_enc = quote_plus(username_db)
password_enc = quote_plus(password_db)

# Create SQLAlchemy engine
engine = create_engine(
    f"mysql+pymysql://{username_enc}:{password_enc}@{host_db}:{port_db}/{database_db}"
)

# Run a query and load results into Pandas
query_donasi = """SELECT a.*,b.title
FROM wp_ajskr.tEZ3UbOt_dja_donate a
LEFT JOIN wp_ajskr.tEZ3UbOt_dja_campaign b ON b.campaign_id = a.campaign_id """
query_aff_submit = """SELECT *
FROM wp_ajskr.tEZ3UbOt_dja_aff_submit"""
query_aff_code = """SELECT *
FROM wp_ajskr.tEZ3UbOt_dja_aff_code"""
query_users = """SELECT *
FROM wp_ajskr.tEZ3UbOt_users"""

donasi_table = pd.read_sql(query_donasi, engine)
aff_submit_table = pd.read_sql(query_aff_submit, engine)
aff_code_table = pd.read_sql(query_aff_code, engine)
user_table = pd.read_sql(query_users, engine)

In [109]:
# Join Table
donasi_submit = donasi_table.merge(aff_submit_table, left_on='id', right_on='donate_id', how='left')
donasi_submit_code = donasi_submit.merge(aff_code_table, left_on='affcode_id', right_on='id', how='left')
donasi_users = donasi_submit_code.merge(user_table, left_on='user_id_y', right_on='ID', how='left')


# Choose only necessary columns
donasi_download = donasi_users[['campaign_id','invoice_id', 'name', 'whatsapp', 'email', 'ip', 'comment', 'title','nominal', 'payment_method', 'payment_number', 'payment_account', 'status', 'nominal_commission', 'display_name', 'aff_code', 'created_at_x', 'info_qurban', 'info_package2', 'info_zfitrah', 'info_donate', 'utm_source', 'utm_medium', 'utm_content', 'utm_campaign', 'utm_term', 'utm_id']]

In [110]:
# Mapping campaign and campaigner
campaign_map = {
    "dja3d0s0b0k":"pungky",
    "djah4o94i2w":"cekat",
    "djazb6xk1xx": "fahry",
    "djayr4jyoxw": "fahry",
    "djax0gtszee": "alif",
    "djauvpug3dk": "fahry",
    "djar8al0qg2": "ali",
    "djam1ptsz9a": "dedi",
    "djait62m3qv": "fatih",
    "djah4o94i2w": "cekat",
    "djabkim8p3t": "fredo",
    "dja9o7ftxxf": "cekat"
}

donasi_download["zisco"] = donasi_download["campaign_id"].map(campaign_map)  # returns "alif" or None if not found
donasi_download['zisco'].value_counts(dropna=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  donasi_download["zisco"] = donasi_download["campaign_id"].map(campaign_map)  # returns "alif" or None if not found


zisco
fredo     19980
fahry     18718
NaN       17954
alif      13494
fatih     11958
ali         955
dedi        319
cekat        15
pungky        4
Name: count, dtype: int64

# Feature engineering data from SQL

In [111]:
donasi_download['Day'] = donasi_download['created_at_x'].dt.strftime("%a")
donasi_download['Time'] = donasi_download['created_at_x'].dt.strftime("%H:%M:%S")
donasi_download = donasi_download.rename(columns={
    "invoice_id": "Invoice ID",
    "name": "Donatur",
    "whatsapp": "Whatsapp",
    "email": "Email",
    "comment": "Comment",
    "title": "Program",
    "nominal":"Total",
    "payment_method": "Payment Method",
    "payment_number": "Payment Number",
    "payment_account": "Payment Account",
    "status": "Payment Status",
    "nominal_commission":"Fundraiser Commision",
    "display_name":"Fundraiser Name",
    "created_at_x":"Date",
    "info_qurban":"Data Qurban",
    "info_package2":"Data Package-2",
    "info_zfitrah":"Data Zakat Fitrah",
    "info_donate":"Additional Data",
    "utm_source":"UTM Source",
    "utm_medium":"UTM Medium",
    "utm_content":"UTM Content",
    "utm_campaign":"UTM Campaign",
    "utm_term":"UTM Term",
    "utm_id":"UTM ID"    
})

donasi_download["Payment Status"] = np.where(donasi_download['Payment Status'] == 1, "Success", "Waiting")

donasi_download['Source'] = "Web Ads"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  donasi_download['Day'] = donasi_download['created_at_x'].dt.strftime("%a")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  donasi_download['Time'] = donasi_download['created_at_x'].dt.strftime("%H:%M:%S")


In [112]:
donasi_download.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83397 entries, 0 to 83396
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   campaign_id           81703 non-null  object        
 1   Invoice ID            83390 non-null  object        
 2   Donatur               83390 non-null  object        
 3   Whatsapp              83390 non-null  object        
 4   Email                 83390 non-null  object        
 5   ip                    83390 non-null  object        
 6   Comment               83390 non-null  object        
 7   Program               83390 non-null  object        
 8   Total                 83390 non-null  float64       
 9   Payment Method        83390 non-null  object        
 10  Payment Number        83390 non-null  object        
 11  Payment Account       83390 non-null  object        
 12  Payment Status        83397 non-null  object        
 13  Fundraiser Commi

In [113]:
# save data donation from database to local
donasi_download.to_excel("data_payment.xlsx")

data_from_web = pd.read_excel("Data_baru.xlsx")

data_from_web.to_excel("Data_baru.xlsx")

Load data sql that frequently updated by donasi download

In [114]:
def update_dataframe(main_df: pd.DataFrame, updated_df: pd.DataFrame, cols_to_check: list) -> pd.DataFrame:
    """
    Update main_df with changes from updated_df based on specified columns.
    - Rows with different values in cols_to_check will be updated.
    - New rows not present in main_df will be added.

    Parameters:
    -----------
    main_df : pd.DataFrame
        The original DataFrame (e.g. existing data from web or database).
    updated_df : pd.DataFrame
        The new DataFrame with possibly updated or new rows.
    cols_to_check : list
        List of columns to check for changes.

    Returns:
    --------
    pd.DataFrame
        Updated DataFrame with changes applied.
    """

    # # 1. Find common indices 
    common_ids = updated_df.index.intersection(main_df.index) 
    # 2. Identify rows that have differences 
    diff_mask = (updated_df.loc[common_ids, cols_to_check].ne(main_df.loc[common_ids, cols_to_check]).any(axis=1))
    # 3. Extract rows to update 
    rows_to_update = updated_df.loc[common_ids[diff_mask]] 
    # 4. Identify new rows (not in main_df) 
    new_rows = updated_df.loc[~updated_df.index.isin(main_df.index)] 
    # 5. Apply updates and add new rows 
    main_df.update(rows_to_update) 
    updated_main_df = pd.concat([main_df, new_rows]) 
    # Optional: sort index if needed 
    updated_main_df = updated_main_df.sort_index() 
    # Info 
    print("‚úÖ Excel updated successfully:") 
    print(f"- {len(new_rows)} new rows added") 
    print(f"- {len(rows_to_update)} rows updated based on {cols_to_check}") 
    return updated_main_df




def update_data_web(main_df: pd.DataFrame, updated_df: pd.DataFrame, key_col: str, cols_to_check: list) -> pd.DataFrame:
    import pandas as pd

    main_df = main_df.copy()
    updated_df = updated_df.copy()

    # --- Pastikan kolom kunci unik ---
    if main_df[key_col].duplicated().any():
        print(f"‚ö†Ô∏è Duplicate keys found in main_df on '{key_col}', keeping first.")
        main_df = main_df.drop_duplicates(subset=key_col, keep='first')

    if updated_df[key_col].duplicated().any():
        print(f"‚ö†Ô∏è Duplicate keys found in updated_df on '{key_col}', keeping first.")
        updated_df = updated_df.drop_duplicates(subset=key_col, keep='first')

    # --- Jadikan kolom kunci sebagai index ---
    main_df = main_df.set_index(key_col)
    updated_df = updated_df.set_index(key_col)

    # --- Ambil key yang sama di kedua dataframe ---
    common_ids = updated_df.index.intersection(main_df.index)

    # --- 1Ô∏è‚É£ Cari baris yang berbeda di kolom tertentu ---
    diff_mask = main_df.loc[common_ids, cols_to_check].ne(updated_df.loc[common_ids, cols_to_check]).any(axis=1)

    # --- 2Ô∏è‚É£ Ambil baris yang berbeda dari updated_df untuk overwrite ---
    rows_to_overwrite = updated_df.loc[common_ids[diff_mask]]

    # --- 3Ô∏è‚É£ Overwrite baris yang berbeda ---
    main_df.loc[rows_to_overwrite.index] = rows_to_overwrite

    # --- 4Ô∏è‚É£ Tambahkan baris baru yang belum ada ---
    new_rows = updated_df.loc[~updated_df.index.isin(main_df.index)]
    updated_main_df = pd.concat([main_df, new_rows])

    # --- 5Ô∏è‚É£ Reset index ke kolom kunci ---
    updated_main_df = updated_main_df.reset_index()

    # --- 6Ô∏è‚É£ Log hasil ---
    print("‚úÖ Overwrite update completed:")
    print(f"- {len(rows_to_overwrite)} rows overwritten (different in {cols_to_check})")
    print(f"- {len(new_rows)} new rows added")
    print(f"- Final total rows: {len(updated_main_df)}")

    return updated_main_df


In [115]:
data_from_web_updated = update_data_web(
    main_df=data_from_web,
    updated_df=donasi_download,
    key_col="Invoice ID",
    cols_to_check=["Payment Status"]
)

‚ö†Ô∏è Duplicate keys found in main_df on 'Invoice ID', keeping first.
‚ö†Ô∏è Duplicate keys found in updated_df on 'Invoice ID', keeping first.


 '081247104445' '085333738069' '0881311964250' '0881315816284'
 '082261488406' '082386662861' '085377999881' '08129279973' '08127256610'
 '087895162548' '081809429700' '085963008090' '08111011808' '081368253770'
 '081270892318' '081361183468' '081999228230' '085268274398'
 '081246086390' '085796730876' '081372156814' '085700909080'
 '081310266642' '085247072344' '082175141569' '085759464210'
 '085368250439' '081334633599' '081362434311' '089668010707'
 '089522977143' '085243860169' '081244721421' '081341296912'
 '085742446373' '085379966990' '08819123657' '081586812189' '085360395395'
 '082319540240' '081218856817' '0895402971394' '085263676604'
 '081373604781' '085704199409' '082129971844' '083139819619'
 '089673090117' '085355462555' '083129735042' '085343522121'
 '085852841471' '085888707679' '081241355483' '08156236206' '089604101566'
 '0895369954573' '08161724470' '081353424351' '085752321977'
 '081221902935' '081281801898' '081362226003' '085886936360'
 '085735421027' '0813833369

‚úÖ Overwrite update completed:
- 284 rows overwritten (different in ['Payment Status'])
- 24390 new rows added
- Final total rows: 83594


In [116]:
# df_final.reset_index(drop=True, inplace=True)

# üî• Also remove any "Unnamed" columns just in case
# df_final = df_final.loc[:, ~df_final.columns.str.contains("^Unnamed")]
data_from_web_updated = data_from_web_updated[['Invoice ID', 'Donatur', 'Whatsapp',
       'Email', 'ip', 'Comment', 'Program', 'Total', 'Payment Method',
       'Payment Number', 'Payment Account', 'Payment Status',
       'Fundraiser Commision','aff_code', 'Fundraiser Name','zisco', 'Date', 'Day', 'Time', 'Data Qurban',
       'Data Package-2', 'Data Zakat Fitrah', 'Additional Data', 'UTM Source',
       'UTM Medium', 'UTM Content', 'UTM Campaign', 'UTM Term', 'UTM ID','Source']]

data_from_web_updated.to_excel("checkpoint_files/data_from_web_updated.xlsx")

# Next Step (main)

In [117]:
data_local['Invoice ID'].duplicated().sum()

np.int64(0)

In [118]:
data_local_clean = data_local[~data_local["Invoice ID"].isin(data_from_web["Invoice ID"])]

In [119]:
data_all = pd.concat([data_local_clean, data_from_web_updated])
data_all

Unnamed: 0.49,Unnamed: 0.48,Unnamed: 0.47,Unnamed: 0.46,Unnamed: 0.45,Unnamed: 0.44,Unnamed: 0.43,Unnamed: 0.42,Unnamed: 0.41,Unnamed: 0.40,Unnamed: 0.39,Unnamed: 0.38,Unnamed: 0.37,Unnamed: 0.36,Unnamed: 0.35,Unnamed: 0.34,Unnamed: 0.33,Unnamed: 0.32,Unnamed: 0.31,Unnamed: 0.30,Unnamed: 0.29,Unnamed: 0.28,Unnamed: 0.27,Unnamed: 0.26,Unnamed: 0.25,Unnamed: 0.24,Unnamed: 0.23,Unnamed: 0.22,Unnamed: 0.21,Unnamed: 0.20,Unnamed: 0.19,Unnamed: 0.18,Unnamed: 0.17,Unnamed: 0.16,Unnamed: 0.15,Unnamed: 0.14,Unnamed: 0.13,Unnamed: 0.12,Unnamed: 0.11,Unnamed: 0.10,Unnamed: 0.9,Unnamed: 0.8,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,No,Invoice ID,Donatur,Sapaan,Nominal,Kode Unik,Total,Whatsapp,Email,Comment,Program,Payment Method,Payment Number,Payment Account,Payment Status,Fundraiser Commission,Fundraiser Name,Date,Day,Time,UTM Source,UTM Medium,Source,ip,Fundraiser Commision,aff_code,zisco,Data Qurban,Data Package-2,Data Zakat Fitrah,Additional Data,UTM Content,UTM Campaign,UTM Term,UTM ID
5,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,6.0,INV-2508040IVSI,Mayang,Bapak,100000.0,446.0,100446.0,82162426297.0,,,INFAK PALESTINA: BENTUK PEDULI KITA KEPADA SAU...,instant,https://m.dana.id/s/m7v7pwbn,DANA,Waiting,1004.0,Teman Baik 1,2025-08-04 00:00:00,Mon,13:36:00,fb,paid,Web Ads,,,,,,,,,,,,
6,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,7.0,INV-250804R2C2K,Isep Kamiludin,Bapak,100000.0,528.0,100528.0,87772391069.0,,Semoga Allah mengabulkqn doa kita semua..aamiin,BANTU PALESTINA SEKARANG,instant,https://m.dana.id/s/49unkdhk,DANA,Waiting,1005.0,Teman Baik 1,2025-08-04 00:00:00,Mon,13:36:00,fb,paid,Web Ads,,,,,,,,,,,,
54,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,55.0,INV-2508049SZY8,A.nabila,Ibu,100000.0,589.0,100589.0,812812335.0,,,SEDEKAH PALESTINA,instant,https://flip.id/pwf/transaction/consolidated?r...,QRIS,Waiting,1006.0,Teman Baik 1,2025-08-04 00:00:00,Mon,10:52:00,fb,paid,Web Ads,,,,,,,,,,,,
56,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,57.0,INV-250804LQMVG,A.nabila,Ibu,100000.0,209.0,100209.0,812812335.0,,,SEDEKAH PALESTINA,transfer,BSI 7772526274,BSI,Waiting,1002.0,Teman Baik 1,2025-08-04 00:00:00,Mon,10:49:00,fb,paid,Web Ads,,,,,,,,,,,,
72,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,72.0,73.0,INV-250804DZ01S,Didi sutardi.didi1968@gmail.com,Bapak,50000.0,640.0,50640.0,81324233883.0,didisutardi0568@gmail.com,Semoga Allah cepat akhiri penderitaan Rakyat d...,BANTU PALESTINA SEKARANG,instant,https://api.midtrans.com/v2/gopay/d1730895-e65...,GOPAY,Waiting,506.0,Teman Baik 1,2025-08-04 00:00:00,Mon,10:12:00,fb,paid,Web Ads,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83589,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,INV-251213C2J5V,Lanny,,,,100357.0,08164865534,Lannyhalim54@yahoo.com,Moga semua selamat dan cepat diatasi,BANGKITKAN SUMATERA,instant,https://flip.id/pwf/transaction/consolidated?r...,OVO,Success,,Teman Baik 1,2025-12-13 08:54:18,Sat,08:54:18,fb,Facebook_Stories,Web Ads,182.8.97.246,1004.0,alofv,,[],[],[],"{""Kode Unik"":""357""}",idp hku 7,idp hku 7,120239540522040706,120239540522020706
83590,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,INV-251213VMDUS,Adipurnawan,,,,50374.0,08954906702,adipurnawan676@gmail.com,,BANGKITKAN SUMATERA,instant,https://flip.id/pwf/transaction/consolidated?r...,QRIS,Success,,Teman Baik 1,2025-12-13 09:07:06,Sat,09:07:06,fb,Facebook_Mobile_Reels,Web Ads,114.79.22.184,504.0,alofv,,[],[],[],"{""Kode Unik"":""374""}",idp hku 7,idp hku 7,120239540522040706,120239540522020706
83591,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,INV-2512133TN4T,Tri Susatyo,,,,53598.0,0817828553,t.susatyo@yahoo.co.id,Allahu Akbar,BANGKITKAN SUMATERA,va,1900800026637688,BCA VA,Success,,Teman Baik 1,2025-12-13 09:18:20,Sat,09:18:20,fb,Facebook_Mobile_Reels,Web Ads,111.94.35.36,536.0,alofv,,[],[],[],"{""Kode Unik"":""268""}",Sumatera Berduka - Salin,Sumatera Berduka,120243299298100602,120243299298090602
83592,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,INV-251213H9YEZ,Bahtiar,,,,100404.0,082245283632,,,BANGKITKAN SUMATERA,transfer,BRI 115901000647561,Yayasan Teman Jalan Kebaikan,Waiting,,Teman Baik 1,2025-12-13 09:59:15,Sat,09:59:15,fb,Facebook_Stories,Web Ads,114.10.135.121,1004.0,alofv,,[],[],[],"{""Kode Unik"":""404""}",Sumatera Berduka - Salin,Sumatera Berduka,120243299298100602,120243299298090602


In [120]:
data_all = data_all.dropna(subset=["Invoice ID"])

In [121]:
data_all.to_excel("Data_transaksi.xlsx")

In [122]:
data_all['Whatsapp'] = pd.to_numeric(data_all['Whatsapp'], errors='coerce')
data_all = data_all.dropna(subset=['Whatsapp'])
data_all.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_all['Whatsapp'] = pd.to_numeric(data_all['Whatsapp'], errors='coerce')


<class 'pandas.core.frame.DataFrame'>
Index: 90225 entries, 5 to 83593
Data columns (total 84 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Unnamed: 0.48          6633 non-null   float64       
 1   Unnamed: 0.47          6633 non-null   float64       
 2   Unnamed: 0.46          6633 non-null   float64       
 3   Unnamed: 0.45          6633 non-null   float64       
 4   Unnamed: 0.44          6633 non-null   float64       
 5   Unnamed: 0.43          6633 non-null   float64       
 6   Unnamed: 0.42          6633 non-null   float64       
 7   Unnamed: 0.41          6633 non-null   float64       
 8   Unnamed: 0.40          6633 non-null   float64       
 9   Unnamed: 0.39          6633 non-null   float64       
 10  Unnamed: 0.38          6633 non-null   float64       
 11  Unnamed: 0.37          6633 non-null   float64       
 12  Unnamed: 0.36          6633 non-null   float64       
 13  Unname

In [123]:
# change data type and make a new necessary columns 
data_all['Whatsapp'] = data_all['Whatsapp'].astype(int)
data_all['Time'] = pd.to_datetime(data_all['Time'], format='%H:%M:%S')  # ‚Üê keep full datetime
data_all['hour'] = data_all['Time'].dt.hour
data_all['label_jam'] = data_all['hour'].fillna(0).apply(
    lambda h: f"{int(h):02d}-{(int(h) + 1) % 24:02d}"
)

In [124]:
# Standarize whatsapp number
def is_random_number(num):
    num = str(num)

    # Nomor harus hanya angka
    if not num.isdigit():
        return False

    # Harus valid dari segi panjang dan awalan
    starts_valid = num.startswith("08") or num.startswith("8") or num.startswith("62")
    length_valid = 8 < len(num) <= 14

    if not (starts_valid and length_valid):
        return False

    # Cek jika terlalu banyak angka sama (seperti 08888888888)
    if len(set(num)) <= 3:
        return False

    # Cek pola berulang (seperti 081234567812345678)
    if len(num) > 6 and num[:int(len(num)/2)] == num[int(len(num)/2):]:
        return False

    # Cek pola urutan (seperti 08123456789)
    if num[2:].isdigit() and num[2:] in '1234567890'*2:
        return False

    return True

data_all['is_random'] = data_all['Whatsapp'].apply(is_random_number)
data_all['kategori_nomor'] = data_all['is_random'].map({True: 'Nomor Acak', False: 'Nomor Pola/Tidak Valid'})
data_all[data_all['is_random'] == False].head()

Unnamed: 0.49,Unnamed: 0.48,Unnamed: 0.47,Unnamed: 0.46,Unnamed: 0.45,Unnamed: 0.44,Unnamed: 0.43,Unnamed: 0.42,Unnamed: 0.41,Unnamed: 0.40,Unnamed: 0.39,Unnamed: 0.38,Unnamed: 0.37,Unnamed: 0.36,Unnamed: 0.35,Unnamed: 0.34,Unnamed: 0.33,Unnamed: 0.32,Unnamed: 0.31,Unnamed: 0.30,Unnamed: 0.29,Unnamed: 0.28,Unnamed: 0.27,Unnamed: 0.26,Unnamed: 0.25,Unnamed: 0.24,Unnamed: 0.23,Unnamed: 0.22,Unnamed: 0.21,Unnamed: 0.20,Unnamed: 0.19,Unnamed: 0.18,Unnamed: 0.17,Unnamed: 0.16,Unnamed: 0.15,Unnamed: 0.14,Unnamed: 0.13,Unnamed: 0.12,Unnamed: 0.11,Unnamed: 0.10,Unnamed: 0.9,Unnamed: 0.8,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,No,Invoice ID,Donatur,Sapaan,Nominal,Kode Unik,Total,Whatsapp,Email,Comment,Program,Payment Method,Payment Number,Payment Account,Payment Status,Fundraiser Commission,Fundraiser Name,Date,Day,Time,UTM Source,UTM Medium,Source,ip,Fundraiser Commision,aff_code,zisco,Data Qurban,Data Package-2,Data Zakat Fitrah,Additional Data,UTM Content,UTM Campaign,UTM Term,UTM ID,hour,label_jam,is_random,kategori_nomor
7218,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,7218.0,508.0,INV-250720QFDBQ,Hamba Alloh,Bapak,50000.0,782.0,50782.0,81000000,,Aamiin,INFAK PALESTINA: BENTUK PEDULI KITA KEPADA SAU...,transfer,Mandiri 1410000665448,Mandiri,Waiting,508.0,Teman Baik 1,2025-07-20,Sun,1900-01-01 04:37:00,ig,paid,Web Ads,,,,,,,,,,,,,4,04-05,False,Nomor Pola/Tidak Valid
27138,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,27138.0,494.0,INV-2505242MJ04,Rafli Firyal,Bapak,100000.0,547.0,103877.0,87777777777,,,INFAK PALESTINA,va,1900800023227110,BCA VA,Success,1039.0,Teman Baik 1,2025-05-24,Sat,1900-01-01 02:11:00,ig,paid,Web Ads,,,,,,,,,,,,,2,02-03,False,Nomor Pola/Tidak Valid
27226,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,27226.0,582.0,INV-2505234S3PA,Hamba Allah,Ibu,100000.0,169.0,100169.0,123506328,,Bismillah hamba donasi karena Allah Barokallah...,INFAK PALESTINA,transfer,BSI 7772526274,BSI,Waiting,1002.0,Teman Baik 1,2025-05-23,Fri,1900-01-01 13:12:00,fb,paid,Web Ads,,,,,,,,,,,,,13,13-14,False,Nomor Pola/Tidak Valid
27287,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,27287.0,643.0,INV-250523EQSG2,Hambaallah,Bapak,100000.0,628.0,100628.0,999086705,,,INFAK PALESTINA,transfer,BSI 7772526274,BSI,Waiting,1006.0,Teman Baik 1,2025-05-23,Fri,1900-01-01 05:54:00,fb,paid,Web Ads,,,,,,,,,,,,,5,05-06,False,Nomor Pola/Tidak Valid
27439,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,27439.0,795.0,INV-250522IEGUE,Alwi nopriansyah,Bapak,100000.0,76.0,100076.0,8131191,,,INFAK PALESTINA,transfer,Mandiri 1410000665448,Mandiri,Waiting,1001.0,Teman Baik 1,2025-05-22,Thu,1900-01-01 02:05:00,fb,paid,Web Ads,,,,,,,,,,,,,2,02-03,False,Nomor Pola/Tidak Valid


In [125]:
# Standarisasi nomor agar diawali dengan 62
def format_nomor(nomor):
    nomor = str(nomor)
    if nomor.startswith('6'):
        return nomor
    elif nomor.startswith('8'):
        return '62' + nomor
    else:
        return nomor  # Bisa disesuaikan jika ada kondisi lain
    
data_all['Whatsapp'] = data_all['Whatsapp'].apply(format_nomor)

# Rekap himpunan

In [126]:
# load processed data from web
data_with_rekapan = pd.read_excel("checkpoint_files/data_with_updated_himpunan.xlsx")

# gsheet id each CRM rekap himpunan
cs_zein = "17mDDgZm5jrbOUZU2rhFMiC999FvhJuTtvooqs5gZAaY"
cs_vicky = "1e1uyBU_1MtbdktSrnqHj-f83t7DM17S8_LFGMqYcccQ"
cs_intan = "1CUC2hdiJ2CkKDv3d5vmLdtEI4aawz7ERV7E0ChwWjxM"
cs_diah = "12ARtZE3RVK87uKPvzNbPxqouzZnEB3cAtI6WVv3emiA"
cs_shania = "1LmfIz2ZARiROFnSOpiG1oRxYI_hV0ZdWPxeW1LJT1c8"
cs_firda = "10BLcY2mY904pTu5uAeKyFs4UhI6WAyz68zcW70Odw28"
cs_endah = "1hGYcODnzpNyzYfLiGE4UU8SNx7yV12NqmOOpEVthtC8"
cs_erni = "1vvRzfvm64b5yNLzt_4FKO2XpJp25vV66s8tbr3F3InE"
cs_agil = "1lSffBtyrLEa5qkoqzVdHxYCSqprpzM6EV4sGmIp0aC0"
cs_oliv = "1X9Sv4LbRHLFv5LLzKhPxlRHWUB_vA6byxdoWUNFE1Gw"



# rekap himpunan
rekap_himpunan_wa_Juni = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_Juni.xlsx")
rekap_himpunan_wa_7778 = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_7778.xlsx")
rekap_himpunan_wa_7778_Juli = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_Juli.xlsx")
rekap_himpunan_wa_2061 = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_2061.xlsx")
rekap_himpunan_wa_2062 = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_2062.xlsx")
rekap_himpunan_wa_2060_juli = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_2060_Juli.xlsx")

# agus
rekap_himpunan_wa_2060_agustus = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_2060_Agustus.xlsx")
rekap_himpunan_0056_0036_agustus = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_Agustus zein.xlsx")

# sept
rekap_himpunan_wa_2060_september = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_2060_September.xlsx")
rekap_himpunan_2056_2057_sept = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap_Himpunan_2056-2057_Sept.xlsx")
rekap_himpunan_0056_0036_sept = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_September zein.xlsx")
rekap_himpunan_7778_sept = pd.read_excel("../data_blast/rekap_himpunan/new/REKAP HIMPUNAN 7778_Sept.xlsx")
rekap_himpunan_1097_sept = pd.read_excel("../data_blast/rekap_himpunan/new/REKAP HIMPUNAN 0038_Sept.xlsx")

# okt
rekap_himpunan_0046_0047_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Shania Oktober.xlsx")
rekap_himpunan_0041_0051_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Diah Oktober.xlsx")
rekap_himpunan_2056_2057_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Himpunan 2056-2057 (okt).xlsx")
rekap_himpunan_wa_2060_oktober = pd.read_excel("../data_blast/rekap_himpunan/old/Rekap_Himpunan_2060_Oktober.xlsx")
rekap_himpunan_1097_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Oktiber 1097.xlsx")
rekap_himpunan_7778_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap_Himpunan_Oktober_7778.xlsx")
rekap_himpunan_31_32_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Erni Oktober.xlsx")
rekap_himpunan_1095_1096_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Himpunan Firda Oktober.xlsx")
rekap_himpunan_2058_2059_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Endah Oktober.xlsx")
rekap_himpunan_0034_8196_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Agil Oktober.xlsx")
rekap_himpunan_0056_0036_okt = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap_Himpunan_Oktober zein.xlsx")

# nov
rekap_himpunan_0046_0047_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Shania November.xlsx")
rekap_himpunan_0041_0051_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Diah November.xlsx")
rekap_himpunan_2056_2057_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Himpunan 2056-2057 (nov).xlsx")
rekap_himpunan_31_32_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Erni November.xlsx")
rekap_himpunan_1095_1096_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Himpunan Firda November.xlsx")
rekap_himpunan_2058_2059_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Endah November.xlsx")
rekap_himpunan_0034_8196_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap Himpunan Agil November.xlsx")
rekap_himpunan_0056_0036_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Rekap_Himpunan_Cut_Off_Nopember.xlsx")
rekap_himpunan_8194_8195_nov = pd.read_excel("../data_blast/rekap_himpunan/new/Himpunan Oliv November.xlsx")
rekap_himpunan_1097_nov = pd.read_excel("../data_blast/rekap_himpunan/new/REKAP HIMPUNAN NOVEMBER 1097.xlsx")
rekap_himpunan_7778_nov = pd.read_excel("../data_blast/rekap_himpunan/new/REKAP HIMPUNAN NOVEMBER 7778.xlsx")



# collected himpunan each CRM
rekap_himpunan_wa_2060 = pd.concat([rekap_himpunan_wa_2060_juli,rekap_himpunan_wa_2060_agustus,rekap_himpunan_wa_2060_september,rekap_himpunan_wa_2060_oktober])
rekap_himpunan_wa_0056_0036 = pd.concat([rekap_himpunan_0056_0036_agustus, rekap_himpunan_0056_0036_sept, rekap_himpunan_0056_0036_okt, rekap_himpunan_8194_8195_nov])
rekap_himpunan_wa_2056_2057 = pd.concat([rekap_himpunan_2056_2057_sept,rekap_himpunan_2056_2057_okt, rekap_himpunan_2056_2057_nov])
rekap_himpunan_wa_7778_1097 = pd.concat([rekap_himpunan_wa_7778, rekap_himpunan_wa_7778_Juli,rekap_himpunan_7778_sept,rekap_himpunan_1097_sept, rekap_himpunan_7778_okt,rekap_himpunan_1097_okt, rekap_himpunan_7778_nov,rekap_himpunan_1097_nov])
rekap_himpunan_wa_41_51 = pd.concat([rekap_himpunan_0041_0051_okt, rekap_himpunan_0041_0051_nov])
rekap_himpunan_wa_46_47 = pd.concat([rekap_himpunan_0046_0047_okt,rekap_himpunan_0046_0047_nov])
rekap_himpunan_wa_1095_1096 = pd.concat([rekap_himpunan_1095_1096_okt, rekap_himpunan_1095_1096_nov])
rekap_himpunan_wa_2058_2059 = pd.concat([rekap_himpunan_2058_2059_okt, rekap_himpunan_2058_2059_nov])
rekap_himpunan_wa_31_32 = pd.concat([rekap_himpunan_31_32_okt, rekap_himpunan_31_32_nov])
rekap_himpunan_wa_0034_8196 = pd.concat([rekap_himpunan_0034_8196_okt, rekap_himpunan_0034_8196_nov])
rekap_himpunan_8194_8195 = pd.concat([rekap_himpunan_8194_8195_nov])




# all rekap himpunan
rekap_himpunan = pd.concat([rekap_himpunan_wa_2061,rekap_himpunan_wa_2062, rekap_himpunan_wa_Juni,
                            rekap_himpunan_wa_2060, 
                            rekap_himpunan_wa_0056_0036,
                            rekap_himpunan_wa_2056_2057,
                            rekap_himpunan_wa_7778_1097,
                            rekap_himpunan_wa_41_51,
                            rekap_himpunan_wa_46_47,
                            rekap_himpunan_wa_1095_1096,
                            rekap_himpunan_wa_2058_2059,
                            rekap_himpunan_wa_31_32,
                            rekap_himpunan_wa_0034_8196,
                            rekap_himpunan_8194_8195])


rekap_himpunan['Source'] = "Web Ads"


rekap_himpunan = rekap_himpunan.dropna(subset=['Total'])
rekap_himpunan['Date'] = rekap_himpunan['Date'].fillna(rekap_himpunan['Date Blast']).fillna(rekap_himpunan['Date Donation'])

In [127]:
# CTWA
folder_path = '../data_blast/rekap_himpunan/old/Oktober'

all_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]

# Read and combine
ctwa_raw = pd.concat(
    [pd.read_excel(os.path.join(folder_path, f)) for f in all_files],
    ignore_index=True
)
ctwa_raw['Source'] = "CTWA"

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [128]:
# rekap_himpunan_all['Whatsapp'] = rekap_himpunan_all['Whatsapp'].replace(['', 'NaN', 'None'], pd.NA)
# Drop rows where Whatsapp is null
ctwa_all = ctwa_raw[['CRM','Tanggal','Nama Donatur','Whatsapps','Payment Status','Bank','Donasi','Akad','Source']].dropna(subset=['Donasi'])
# rekap_himpunan_all
ctwa = ctwa_all.rename(columns={'Whatsapps':'Whatsapp', 'Nama Donatur':'Donatur','Donasi':'Total','Akad':'Program','Tanggal':'Date'})

In [129]:
# make sure whatsapp dtype is the same (int)
ctwa['Whatsapp'] = ctwa['Whatsapp'].astype(int)
rekap_himpunan['Whatsapp'] = rekap_himpunan['Whatsapp'].fillna(0).astype(int)

#  fillna date with date blast
rekap_himpunan['Date'] = rekap_himpunan['Date'].fillna(rekap_himpunan['Date Blast'])

# concat ctwa and rekap himpunan
rekap_himpunan_all = pd.concat([ctwa,rekap_himpunan])

In [130]:
rekap_himpunan_all['Date'] = rekap_himpunan_all['Date'].apply(
    lambda x: pd.to_datetime(x, errors='coerce') if not isinstance(x, pd.Timestamp) else x
)

# choose only necessary columns
rekap_himpunan_all = rekap_himpunan_all[['Donatur', 'Whatsapp', 'Payment Account', 'Date', 'Program',
       'Total', 'Funnel', 'Date Blast', 'Source']]


# get a day column
rekap_himpunan_all['Day'] = rekap_himpunan_all['Date'].dt.strftime('%a')

  lambda x: pd.to_datetime(x, errors='coerce') if not isinstance(x, pd.Timestamp) else x
  lambda x: pd.to_datetime(x, errors='coerce') if not isinstance(x, pd.Timestamp) else x


Standarisasi Payment Account

In [131]:
def payment(akun,number):
    number = str(number)
    if akun == "Yayasan Teman Jalan Kebaikan":
        if ' ' not in number:
            return "BSI"
        else: 
            return number.split()[0]
    else:
        return akun
    
data_all['Payment'] = data_all.apply(lambda row: payment(row['Payment Account'], row['Payment Number']), axis=1)

data_all.drop('Payment Account', axis=1, inplace=True)

In [132]:
def move_column(df, col_name, new_pos):
    cols = list(df.columns)
    cols.insert(new_pos, cols.pop(cols.index(col_name)))
    return df[cols]

data_all = move_column(data_all,"Payment", 13)


data_all.rename(columns={'Payment': 'Payment Account'}, inplace=True)

In [133]:
df = data_all[(data_all['is_random'] == True)]
# df = df[df['Payment Status'] == "Success"]
# df = data_all[data_all['Payment Status'] == "Success"]

Menggabungkan data dari web dengan rekap himpunan

In [134]:
rekap_himpunan_all['Program'] = rekap_himpunan_all['Program'].str.upper()

rekap_himpunan_all['Whatsapp'] = rekap_himpunan_all['Whatsapp'].astype(str)

rekap_himpunan_all = rekap_himpunan_all.drop_duplicates()

In [135]:
df = df.loc[:, ~df.columns.str.contains("Unnamed")]
df

Unnamed: 0,Payment Account,No,Invoice ID,Donatur,Sapaan,Nominal,Kode Unik,Total,Whatsapp,Email,Comment,Program,Payment Method,Payment Number,Payment Status,Fundraiser Commission,Fundraiser Name,Date,Day,Time,UTM Source,UTM Medium,Source,ip,Fundraiser Commision,aff_code,zisco,Data Qurban,Data Package-2,Data Zakat Fitrah,Additional Data,UTM Content,UTM Campaign,UTM Term,UTM ID,hour,label_jam,is_random,kategori_nomor
5,DANA,6.0,INV-2508040IVSI,Mayang,Bapak,100000.0,446.0,100446.0,6282162426297,,,INFAK PALESTINA: BENTUK PEDULI KITA KEPADA SAU...,instant,https://m.dana.id/s/m7v7pwbn,Waiting,1004.0,Teman Baik 1,2025-08-04 00:00:00,Mon,1900-01-01 13:36:00,fb,paid,Web Ads,,,,,,,,,,,,,13,13-14,True,Nomor Acak
6,DANA,7.0,INV-250804R2C2K,Isep Kamiludin,Bapak,100000.0,528.0,100528.0,6287772391069,,Semoga Allah mengabulkqn doa kita semua..aamiin,BANTU PALESTINA SEKARANG,instant,https://m.dana.id/s/49unkdhk,Waiting,1005.0,Teman Baik 1,2025-08-04 00:00:00,Mon,1900-01-01 13:36:00,fb,paid,Web Ads,,,,,,,,,,,,,13,13-14,True,Nomor Acak
54,QRIS,55.0,INV-2508049SZY8,A.nabila,Ibu,100000.0,589.0,100589.0,62812812335,,,SEDEKAH PALESTINA,instant,https://flip.id/pwf/transaction/consolidated?r...,Waiting,1006.0,Teman Baik 1,2025-08-04 00:00:00,Mon,1900-01-01 10:52:00,fb,paid,Web Ads,,,,,,,,,,,,,10,10-11,True,Nomor Acak
56,BSI,57.0,INV-250804LQMVG,A.nabila,Ibu,100000.0,209.0,100209.0,62812812335,,,SEDEKAH PALESTINA,transfer,BSI 7772526274,Waiting,1002.0,Teman Baik 1,2025-08-04 00:00:00,Mon,1900-01-01 10:49:00,fb,paid,Web Ads,,,,,,,,,,,,,10,10-11,True,Nomor Acak
72,GOPAY,73.0,INV-250804DZ01S,Didi sutardi.didi1968@gmail.com,Bapak,50000.0,640.0,50640.0,6281324233883,didisutardi0568@gmail.com,Semoga Allah cepat akhiri penderitaan Rakyat d...,BANTU PALESTINA SEKARANG,instant,https://api.midtrans.com/v2/gopay/d1730895-e65...,Waiting,506.0,Teman Baik 1,2025-08-04 00:00:00,Mon,1900-01-01 10:12:00,fb,paid,Web Ads,,,,,,,,,,,,,10,10-11,True,Nomor Acak
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83589,OVO,,INV-251213C2J5V,Lanny,,,,100357.0,628164865534,Lannyhalim54@yahoo.com,Moga semua selamat dan cepat diatasi,BANGKITKAN SUMATERA,instant,https://flip.id/pwf/transaction/consolidated?r...,Success,,Teman Baik 1,2025-12-13 08:54:18,Sat,1900-01-01 08:54:18,fb,Facebook_Stories,Web Ads,182.8.97.246,1004.0,alofv,,[],[],[],"{""Kode Unik"":""357""}",idp hku 7,idp hku 7,120239540522040706,120239540522020706,8,08-09,True,Nomor Acak
83590,QRIS,,INV-251213VMDUS,Adipurnawan,,,,50374.0,628954906702,adipurnawan676@gmail.com,,BANGKITKAN SUMATERA,instant,https://flip.id/pwf/transaction/consolidated?r...,Success,,Teman Baik 1,2025-12-13 09:07:06,Sat,1900-01-01 09:07:06,fb,Facebook_Mobile_Reels,Web Ads,114.79.22.184,504.0,alofv,,[],[],[],"{""Kode Unik"":""374""}",idp hku 7,idp hku 7,120239540522040706,120239540522020706,9,09-10,True,Nomor Acak
83591,BCA VA,,INV-2512133TN4T,Tri Susatyo,,,,53598.0,62817828553,t.susatyo@yahoo.co.id,Allahu Akbar,BANGKITKAN SUMATERA,va,1900800026637688,Success,,Teman Baik 1,2025-12-13 09:18:20,Sat,1900-01-01 09:18:20,fb,Facebook_Mobile_Reels,Web Ads,111.94.35.36,536.0,alofv,,[],[],[],"{""Kode Unik"":""268""}",Sumatera Berduka - Salin,Sumatera Berduka,120243299298100602,120243299298090602,9,09-10,True,Nomor Acak
83592,BRI,,INV-251213H9YEZ,Bahtiar,,,,100404.0,6282245283632,,,BANGKITKAN SUMATERA,transfer,BRI 115901000647561,Waiting,,Teman Baik 1,2025-12-13 09:59:15,Sat,1900-01-01 09:59:15,fb,Facebook_Stories,Web Ads,114.10.135.121,1004.0,alofv,,[],[],[],"{""Kode Unik"":""404""}",Sumatera Berduka - Salin,Sumatera Berduka,120243299298100602,120243299298090602,9,09-10,True,Nomor Acak


In [136]:
import re

data_web_himpunan = pd.concat([df,rekap_himpunan_all])

data_web_himpunan['Total'] = data_web_himpunan['Total'].apply(
    lambda x: int(re.sub(r'[^0-9]', '', x)) if isinstance(x, str) else x
)

In [137]:
# Format tanggal dan ambil bulan
data_web_himpunan["Tanggal"] = pd.to_datetime(data_web_himpunan["Date"], dayfirst=True)
data_web_himpunan["Bulan"] = data_web_himpunan["Date"].dt.month
data_web_himpunan["Tahun"] = data_web_himpunan["Date"].dt.year
data_web_himpunan["Bulan_Nama"] = data_web_himpunan["Date"].dt.strftime('%B')

# Bulan pertama donasi
first_donation = data_web_himpunan.groupby("Whatsapp")["Date"].min().reset_index()
first_donation["Bulan_Pertama"] = first_donation["Date"].dt.month
first_donation["Tahun_Pertama"] = first_donation["Date"].dt.year

# Gabung ke df utama
data_web_himpunan = data_web_himpunan.merge(first_donation[["Whatsapp", "Bulan_Pertama"]], on="Whatsapp")
data_web_himpunan = data_web_himpunan[data_web_himpunan["Bulan"] >= data_web_himpunan["Bulan_Pertama"]]

# Tracking

In [138]:
data_success_only = data_web_himpunan[data_web_himpunan['Payment Status'] == "Success"]

agg_success = data_success_only.groupby(["Whatsapp", "Bulan"]).agg(
    Donatur=("Donatur", "last"),
    Total=("Total", "sum"),
    Date=("Date", "min"),
    Payment=("Payment Account", "first"),
    Program=("Program", lambda x: list(x)),
    Time=('label_jam', lambda x: list(x)),
    Day=("Day", lambda x: x.mode().iloc[0] if not x.mode().empty else None)
).reset_index()

all_donor = data_web_himpunan[["Whatsapp", "Bulan"]].drop_duplicates()

all_agg = all_donor.merge(agg_success, on=["Whatsapp", "Bulan"], how="left")

In [139]:
final_programs = []
previous_programs = {}

for _, row in all_agg.iterrows():
    key = row["Whatsapp"]
    val = row["Program"]

    # --- Normalisasi Program agar selalu list (no NaN mix) --- #
    if isinstance(val, float) and pd.isna(val):
        current_programs = []
    elif isinstance(val, list):
        current_programs = [x for x in val if not (isinstance(x, float) and pd.isna(x))]
        current_programs = list(dict.fromkeys(current_programs))
    else:
        current_programs = [val]

    prev = previous_programs.get(key, [])

    different_programs = [p for p in current_programs if p not in prev]

    if not prev:
        final_programs.append(current_programs[0] if current_programs else None)
    elif different_programs:
        final_programs.append(different_programs[0])
    else:
        final_programs.append(current_programs[0] if current_programs else None)

    previous_programs[key] = current_programs

In [140]:
from collections import Counter
import pandas as pd
import ast
import math

def custom_mode_from_list(time_list):
    # if the value is nan then return none
    if isinstance(time_list, float) and math.isnan(time_list):
        return None

    # change the value to list
    if not isinstance(time_list, list):
        time_list = [time_list]

    # remove nan from the list
    cleaned = [x for x in time_list if not (isinstance(x, float) and math.isnan(x))]

    if len(cleaned) == 0:
        return None
    if len(cleaned) == 1:
        return cleaned[0]

    counter = Counter(cleaned)
    mc = counter.most_common()

    # if there is more than one and the top freq is tie then return the last value
    if len(mc) > 1 and mc[0][1] == mc[1][1]:
        return cleaned[-1]

    return mc[0][0]  # mode tunggal

# if column value contains string list change to list
all_agg['Time'] = all_agg['Time'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

all_agg['Label_Jam'] = all_agg['Time'].apply(custom_mode_from_list)

In [141]:
# input the final program from the previous processing
all_agg["Final_Program"] = final_programs

# Persiapan output format lebar (wide)
bulan_dict = {
    1: "Januari", 2: "Februari", 3: "Maret", 4: "April", 5: "Mei", 6: "Juni",
    7: "Juli", 8: "Agustus", 9: "September", 10: "Oktober", 11: "November", 12: "Desember"
}

# Output awal
output_test = pd.DataFrame(all_agg["Whatsapp"].drop_duplicates())

for bulan_num in range(1, 13):
    bulan_nama = bulan_dict[bulan_num]
    sub = all_agg[all_agg["Bulan"] == bulan_num][["Whatsapp", "Final_Program", "Total", "Date", "Day", "Label_Jam", "Payment"]]
    sub.columns = ["Whatsapp", f"{bulan_nama}_Program", f"{bulan_nama}_Donasi", f"{bulan_nama}_Tanggal", f"{bulan_nama}_Day", f"{bulan_nama}_Time", f"{bulan_nama}_Payment"]
    output_test = output_test.merge(sub, on="Whatsapp", how="left")

# Tambah Nama
output_test = output_test.merge(data_web_himpunan[["Whatsapp", "Donatur"]].drop_duplicates("Whatsapp"), on="Whatsapp", how="left")

# Tambah bulan pertama donasi untuk pengurutan
output_test = output_test.merge(first_donation[["Whatsapp", "Bulan_Pertama"]], on="Whatsapp")
output_test = output_test.sort_values("Bulan_Pertama").drop(columns=["Bulan_Pertama"])

In [142]:
# Urutkan kolom
cols = output_test.columns.tolist()
cols = ["Whatsapp", "Donatur"] + [col for col in cols if col not in ["Whatsapp", "Donatur"]]
output_test = output_test[cols]

# Hapus nilai 0
output_test.replace(0, "", inplace=True)
# Ambil semua kolom yang mengandung '_Program'
program_cols = [col for col in output_test.columns if '_Program' in col]

time_cols = [col for col in output_test.columns if '_Time' in col]

# Total donasi
output_test['Total'] = output_test[[col for col in output_test.columns if '_Donasi' in col]].sum(axis=1)

# Frekuensi donasi (jumlah kolom donasi yang tidak kosong)
output_test['Frekuensi'] = output_test[[col for col in output_test.columns if '_Donasi' in col]].notna().sum(axis=1)

# output['Total_Preferensi'] = (output[[col for col in output.columns if '_Preferensi' in col]].sum(axis=1) / output[[col for col in output.columns if '_Preferensi' in col]].notna().sum(axis=1)).astype(int)

# Hitung preferensi: jumlah program unik per baris
output_test['Preferensi'] = output_test[program_cols].apply(lambda row: row.dropna().nunique(), axis=1)

# Hitung program yang paling sering muncul per baris
output_test['Program'] = output_test[program_cols].apply(lambda row: row.dropna().mode().iloc[0] if not row.dropna().empty else None, axis=1)

# Hitung range time paling sering muncul per baris
output_test['Label_Jam'] = output_test[time_cols].apply(lambda row: row.dropna().mode().iloc[0] if not row.dropna().empty else None, axis=1)

In [143]:
# choose donation column
donasi_cols = [col for col in output_test.columns if '_Donasi' in col]

# 1) cleanse and convert every donation column to numeric(non-numeric -> NaN)
for c in donasi_cols:
    # change to string first, delete character beside digit, minus, and dot (ex: "50.000" -> "50.000", "50,000" -> "50000")
    # and also handle thousand sign and whitespace
    cleaned = output_test[c].astype(str).str.replace(r'[^\d\.\-]', '', regex=True)

    # if empty string "" after cleansing, treat it as a NaN
    cleaned = cleaned.replace('', np.nan)

    # konversi ke numeric
    output_test[c] = pd.to_numeric(cleaned, errors='coerce')

# 2) Hitung mean per baris (abaikan NaN)
mean_series = output_test[donasi_cols].mean(axis=1, skipna=True)

# 3) Bulatkan dan ubah NaN -> None, non-NaN -> int
output_test['Rata - rata'] = mean_series.round(0).apply(lambda x: None if pd.isna(x) else int(x))
# Change total dtype to int
output_test['Total'] = output_test['Total'].astype(int)

  cleaned = cleaned.replace('', np.nan)


In [144]:
# output_test
date_and_day = data_web_himpunan.groupby('Whatsapp').agg(
    Donasi_terakhir=("Total","last"),
    Status_payment=("Payment Status", lambda x: list(x)),
    Day_Mode=("Day", lambda x: x.mode().iloc[0]),
    Tanggal_terakhir_donasi=("Tanggal", max),
    Date=("Tanggal", list),
    Source=("Source","first")
).reset_index()

  date_and_day = data_web_himpunan.groupby('Whatsapp').agg(


In [145]:
def classify_cutoff(dates):
    # Konversi ke pandas Series dan ambil hari (tanggal) saja
    days = pd.Series(dates).dt.day
    
    # Hitung modus
    modus = days.mode().iloc[0]
    
    # Klasifikasi berdasarkan rentang tanggal cut-off
    return 'Masa Gajian' if (modus >= 26 or modus <= 5) else 'Bukan Gajian'

date_and_day['Date_Category'] = date_and_day['Date'].apply(classify_cutoff)
date_and_day['Date_Category'] = np.where(date_and_day['Day_Mode'] == 'Fri', 'Jumat', date_and_day['Date_Category'])
date_and_day = date_and_day.drop(columns=['Date'])

date_and_day['Status'] = date_and_day['Status_payment'].apply(
    lambda lst: 'Pernah Sukses' if any(x == 'Success' for x in lst if pd.notna(x)) 
                else 'Tetap Waiting'
)


In [146]:
# merge date and day and also first donation
output_test = pd.merge(output_test,date_and_day, how='left', on='Whatsapp')
output_test = output_test.merge(first_donation, how='left', on='Whatsapp')

# change the year and month from date column
output_test['Bulan'] = output_test['Date'].dt.strftime('%B')
output_test['Tahun'] = output_test['Date'].dt.year
# 
output_test['klasifikasi_program'] = np.where(
    output_test['Program'].str.lower().str.contains("palestina|gaza"),
    "PALESTINA",
    np.where(
        output_test['Program'].str.lower().str.contains("sudan"),
        "SUDAN",
        np.where(output_test['Program'].str.lower().str.contains("sumatera"),
        "SUMATERA",
            output_test['Program']
        )
    )
)

In [147]:
# Tambahkan kolom 'Kategori' berdasarkan nilai 'Frekuensi'
output_test['Kategori'] = output_test['Frekuensi'].apply(
    lambda x: 'Pasif' if x < 1 else ('Aktif' if x < 4.5 else 'Loyal')
)

# Tambahkan kolom 'Badge' berdasarkan nilai 'Total'
output_test['Badge'] = output_test['Total'].apply(
    lambda x: 'BRONZE' if x < 100000 else ('SILVER' if x <= 1000000 else ('GOLD' if x < 5000000 else 'PLATINUM'))
)

# Tambahkan kolom 'Avg Kategori' berdasarkan nilai 'Rata - rata'
output_test['Avg Kategori'] = output_test['Rata - rata'].apply(
    lambda x: "< 100.000" if x < 100000 else ("100.000 - 1000.000" if x <= 1000000 else ("1000.000 - 5000.000" if x < 5000000 else "> 5000.000"))
)

In [148]:
def update_data_himpunan(main_df, updated_df, cols_to_check, key_col="Whatsapp"):

    # --- 0Ô∏è‚É£ Remove unnamed columns to prevent alignment issues
    main_df = main_df.loc[:, ~main_df.columns.str.contains('^Unnamed')]
    updated_df = updated_df.loc[:, ~updated_df.columns.str.contains('^Unnamed')]

    # --- 1Ô∏è‚É£ Ensure the key column exists
    if key_col not in main_df.columns or key_col not in updated_df.columns:
        raise KeyError(f"‚ùå Key column '{key_col}' not found in one of the DataFrames")

    # --- 2Ô∏è‚É£ Set index to key column
    main_df = main_df.set_index(key_col, drop=False)
    updated_df = updated_df.set_index(key_col, drop=False)

    # --- 3Ô∏è‚É£ Keep only common columns
    updated_df = updated_df.loc[:, updated_df.columns.intersection(main_df.columns)]

    # --- 4Ô∏è‚É£ Align index based on key column
    common_ids = updated_df.index.intersection(main_df.index)

    # --- 5Ô∏è‚É£ Match dtypes
    for col in main_df.columns.intersection(updated_df.columns):
        if main_df[col].dtype != updated_df[col].dtype:
            try:
                updated_df[col] = updated_df[col].astype(main_df[col].dtype)
            except Exception:
                if pd.api.types.is_datetime64_any_dtype(main_df[col]):
                    updated_df[col] = pd.to_datetime(updated_df[col], errors='coerce')
                elif pd.api.types.is_numeric_dtype(main_df[col]):
                    updated_df[col] = pd.to_numeric(updated_df[col], errors='coerce')
                else:
                    updated_df[col] = updated_df[col].astype(str)

    # --- 6Ô∏è‚É£ Detect differences
    diff_mask = updated_df.loc[common_ids, cols_to_check].ne(
        main_df.loc[common_ids, cols_to_check]
    ).any(axis=1)

    # --- 7Ô∏è‚É£ Update only changed rows
    cols_to_update = updated_df.columns.intersection(main_df.columns)
    main_df.loc[common_ids[diff_mask], cols_to_update] = updated_df.loc[common_ids[diff_mask], cols_to_update].values

    # --- 8Ô∏è‚É£ Add new rows (new WhatsApp IDs)
    new_ids = updated_df.index.difference(main_df.index)
    main_df = pd.concat([main_df, updated_df.loc[new_ids, cols_to_update]])

    print("‚úÖ Excel updated successfully:")
    print(f"- {len(new_ids)} new rows added")
    print(f"- {diff_mask.sum()} rows updated based on {cols_to_check}")

    return main_df


In [149]:
data_with_rekapan["Whatsapp"] = data_with_rekapan["Whatsapp"].astype(str)

data_with_rekapan_updated = update_data_web(
    main_df=data_with_rekapan,
    updated_df=output_test,
    key_col="Whatsapp",
    cols_to_check=["Total","Date_Category","Source"]
)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan na

‚úÖ Overwrite update completed:
- 293 rows overwritten (different in ['Total', 'Date_Category', 'Source'])
- 785 new rows added
- Final total rows: 67768


In [150]:
duplicates = data_with_rekapan_updated[data_with_rekapan_updated['Whatsapp'].duplicated(keep=False)]

if not duplicates.empty:
    print("‚ùå Duplicate rows found:\n", duplicates)
    raise SystemExit("Stopping cell due to duplicates.")

In [151]:
data_with_rekapan_updated = data_with_rekapan_updated[['Whatsapp', 'Donatur',
       'Januari_Program', 'Januari_Donasi', 'Januari_Tanggal', 'Januari_Day',
       'Januari_Time', 'Januari_Payment', 'Februari_Program',
       'Februari_Donasi', 'Februari_Tanggal', 'Februari_Day', 'Februari_Time',
       'Februari_Payment', 'Maret_Program', 'Maret_Donasi', 'Maret_Tanggal',
       'Maret_Day', 'Maret_Time', 'Maret_Payment', 'April_Program',
       'April_Donasi', 'April_Tanggal', 'April_Day', 'April_Time',
       'April_Payment', 'Mei_Program', 'Mei_Donasi', 'Mei_Tanggal', 'Mei_Day',
       'Mei_Time', 'Mei_Payment', 'Juni_Program', 'Juni_Donasi',
       'Juni_Tanggal', 'Juni_Day', 'Juni_Time', 'Juni_Payment', 'Juli_Program',
       'Juli_Donasi', 'Juli_Tanggal', 'Juli_Day', 'Juli_Time', 'Juli_Payment',
       'Agustus_Program', 'Agustus_Donasi', 'Agustus_Tanggal', 'Agustus_Day',
       'Agustus_Time', 'Agustus_Payment', 'September_Program',
       'September_Donasi', 'September_Tanggal', 'September_Day',
       'September_Time', 'September_Payment', 'Oktober_Program',
       'Oktober_Donasi', 'Oktober_Tanggal', 'Oktober_Day', 'Oktober_Time',
       'Oktober_Payment', 'November_Program', 'November_Donasi',
       'November_Tanggal', 'November_Day', 'November_Time', 'November_Payment',
       'Desember_Program', 'Desember_Donasi', 'Desember_Tanggal',
       'Desember_Day', 'Desember_Time', 'Desember_Payment', 'Total',
       'Frekuensi', 'Preferensi', 'Program','klasifikasi_program','Label_Jam', 'Rata - rata',
       'Day_Mode', 'Date_Category', 'Date', 'Bulan_Pertama', 'Tahun_Pertama',
       'Bulan', 'Tahun', 'Kategori','Status', 'Badge', 'Avg Kategori',"Tanggal_terakhir_donasi","Donasi_terakhir", "Source"]]

In [152]:
data_with_rekapan_updated.to_excel("checkpoint_files/data_with_updated_himpunan.xlsx", index=False)

In [153]:
output_test.to_excel("tracking_test_with_waiting.xlsx")

# Check number validity

In [154]:
# data with number validity (registered and not)
data_with_validity = pd.read_excel("checkpoint_files/data_with_number_validity_updated.xlsx")
data_with_validity = data_with_validity[["Whatsapp", "validity"]]

# crm data
df_crm = pd.read_excel("checkpoint_files/data_each_crm_updated.xlsx")
df_crm = df_crm[['Whatsapp','CRM']]

# category each crm
df_category = pd.read_excel("checkpoint_files/category_each_donor.xlsx")

# make sure whatsapp dtype is int on both dataframes
df_crm['Whatsapp'] = df_crm['Whatsapp'].astype(int)
data_with_rekapan_updated['Whatsapp'] = data_with_rekapan_updated['Whatsapp'].astype(int)
data_with_validity['Whatsapp'] = data_with_validity['Whatsapp'].astype(int)

# merge to get a number validity
df_tracking_valid = data_with_rekapan_updated.merge(data_with_validity, how="left", on="Whatsapp")


df_with_crm = df_tracking_valid.merge(df_crm,how='left',on='Whatsapp')

In [155]:
df_tracking_valid = df_tracking_valid[['Whatsapp', 'Donatur', 'Bulan', 'Tahun', 'Januari_Program', 'Januari_Donasi',
       'Januari_Tanggal', 'Januari_Day', 'Januari_Time', 'Januari_Payment',
       'Februari_Program', 'Februari_Donasi', 'Februari_Tanggal',
       'Februari_Day', 'Februari_Time', 'Februari_Payment', 'Maret_Program',
       'Maret_Donasi', 'Maret_Tanggal', 'Maret_Day', 'Maret_Time',
       'Maret_Payment', 'April_Program', 'April_Donasi', 'April_Tanggal',
       'April_Day', 'April_Time', 'April_Payment', 'Mei_Program', 'Mei_Donasi',
       'Mei_Tanggal', 'Mei_Day', 'Mei_Time', 'Mei_Payment', 'Juni_Program',
       'Juni_Donasi', 'Juni_Tanggal', 'Juni_Day', 'Juni_Time', 'Juni_Payment',
       'Juli_Program', 'Juli_Donasi', 'Juli_Tanggal', 'Juli_Day', 'Juli_Time',
       'Juli_Payment', 'Agustus_Program', 'Agustus_Donasi', 'Agustus_Tanggal',
       'Agustus_Day', 'Agustus_Time', 'Agustus_Payment', 'September_Program',
       'September_Donasi', 'September_Tanggal', 'September_Day',
       'September_Time', 'September_Payment', 'Oktober_Program',
       'Oktober_Donasi', 'Oktober_Tanggal', 'Oktober_Day', 'Oktober_Time',
       'Oktober_Payment', 'November_Program', 'November_Donasi',
       'November_Tanggal', 'November_Day', 'November_Time', 'November_Payment',
       'Desember_Program', 'Desember_Donasi', 'Desember_Tanggal',
       'Desember_Day', 'Desember_Time', 'Desember_Payment', 'Total',
       'Frekuensi', 'Preferensi', 'Program','klasifikasi_program' ,'Label_Jam', 'Rata - rata',
       'Day_Mode', 'Date_Category', 'Date', 'Bulan_Pertama', 'Tahun_Pertama',
       'Kategori', 'Badge','Status', 'Avg Kategori','Tanggal_terakhir_donasi',"Donasi_terakhir", 'validity', 'Source']]

In [156]:
duplicates = df_tracking_valid[df_tracking_valid['Whatsapp'].duplicated(keep=False)]

if not duplicates.empty:
    print("‚ùå Duplicate rows found:\n", duplicates)
    raise SystemExit("Stopping cell due to duplicates.")

In [157]:
df_with_crm = df_tracking_valid.merge(df_crm,how='left',on='Whatsapp')

321789732189

In [158]:
df_with_crm = df_with_crm[['Whatsapp', 'Donatur',  'CRM', 'Bulan', 'Tahun', 'Januari_Program',
       'Januari_Donasi', 'Januari_Tanggal', 'Januari_Day', 'Januari_Time',
       'Januari_Payment', 'Februari_Program', 'Februari_Donasi',
       'Februari_Tanggal', 'Februari_Day', 'Februari_Time', 'Februari_Payment',
       'Maret_Program', 'Maret_Donasi', 'Maret_Tanggal', 'Maret_Day',
       'Maret_Time', 'Maret_Payment', 'April_Program', 'April_Donasi',
       'April_Tanggal', 'April_Day', 'April_Time', 'April_Payment',
       'Mei_Program', 'Mei_Donasi', 'Mei_Tanggal', 'Mei_Day', 'Mei_Time',
       'Mei_Payment', 'Juni_Program', 'Juni_Donasi', 'Juni_Tanggal',
       'Juni_Day', 'Juni_Time', 'Juni_Payment', 'Juli_Program', 'Juli_Donasi',
       'Juli_Tanggal', 'Juli_Day', 'Juli_Time', 'Juli_Payment',
       'Agustus_Program', 'Agustus_Donasi', 'Agustus_Tanggal', 'Agustus_Day',
       'Agustus_Time', 'Agustus_Payment', 'September_Program',
       'September_Donasi', 'September_Tanggal', 'September_Day',
       'September_Time', 'September_Payment', 'Oktober_Program',
       'Oktober_Donasi', 'Oktober_Tanggal', 'Oktober_Day', 'Oktober_Time',
       'Oktober_Payment', 'November_Program', 'November_Donasi',
       'November_Tanggal', 'November_Day', 'November_Time', 'November_Payment',
       'Desember_Program', 'Desember_Donasi', 'Desember_Tanggal',
       'Desember_Day', 'Desember_Time', 'Desember_Payment', 'Total',
       'Frekuensi', 'Preferensi', 'Program', 'klasifikasi_program',
       'Label_Jam', 'Rata - rata', 'Day_Mode', 'Date_Category', 'Date',
       'Bulan_Pertama', 'Tahun_Pertama', 'Kategori', 'Badge', 'Avg Kategori',
       'Tanggal_terakhir_donasi',"Donasi_terakhir", 'Status', 'validity', 'Source']]

In [159]:
category_update = pd.read_excel("checkpoint_files/category_each_donor.xlsx")

In [160]:
df_output_final = df_with_crm.merge(category_update, how='left', on='Whatsapp')

In [161]:
def update_category(df):
    # if Kategori_y isna take Kategori_x
    if pd.isna(df['kategori_update']):
        return df['Kategori']
    
    # if Kategori_x and Kategori_y are different take Kategori_y
    elif df['kategori_update'] == "Aktif":
        return df['Kategori']
    elif df['kategori_update'] in ['Eliminasi', 'Invalid']:
        return df['kategori_update']
    
    # if both are the same take Kategori_x
    else:
        return df['Kategori']

df_output_final['Kategori'] = df_output_final.apply(update_category, axis=1)

In [162]:
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
df_output_final['Bulan'] = pd.Categorical(df_output_final['Bulan'], categories=month_order, ordered=True)
df_output_final = df_output_final.sort_values('Bulan')

In [163]:
duplicates = df_output_final[df_output_final['Whatsapp'].duplicated(keep=False)]

if not duplicates.empty:
    print("‚ùå Duplicate rows found:\n", duplicates)
    raise SystemExit("Stopping cell due to duplicates.")

In [164]:
df_output_final['validity'].value_counts(dropna=False)

validity
NaN                                                                                                                                                40070
Number registered                                                                                                                                  24150
Number not registered                                                                                                                               3529
Error: 504 Server Error: Gateway Time-out for url: https://api.starsender.online/api/check-number                                                     13
Error: 502 Server Error: Bad Gateway for url: https://api.starsender.online/api/check-number                                                           3
Error: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))        2
Timeout                                                                  

In [165]:
df_output_final['CRM'].value_counts()

CRM
WA_2060             10673
WA_7778              2961
WA_0036              2504
WA_0056              2003
WA_1097              1500
Eliminated           1477
WA_2057              1469
WA_2056              1431
WA_1095               891
WA_0047               873
WA_0051               871
WA_0046               870
WA_0041               869
WA_1094               868
WA_8194               868
WA_8195               867
WA_0032               864
WA_0034               864
WA_1096               864
WA_8271               863
WA_8272               863
WA_2058               863
WA_8204               863
WA_2059               863
WA_8203               863
WA_1092               863
WA_8197               863
WA_0031               863
refill_WA_0031        641
refill_WA_0032        640
Refill_WA_0046        637
Refill_WA_8196        637
refill_WA_0041        637
refill_WA_0051        636
Refill_WA_0047        636
Refill_WA_2056        570
Refill_WA_2057        535
refill_WA_0041_2      500
Refill_W

In [166]:
# tracking data
df_output_final.to_excel("Tracking_temanbaik.xlsx")

In [167]:
df_summary_database = df_output_final[['Whatsapp',
 'Donatur',
 'Bulan',
 'Tahun',
 'CRM',
 'Source',
 'Total',
 'Frekuensi',
 'klasifikasi_program',
 'Preferensi',
 'Label_Jam',
 'Rata - rata',
 'Day_Mode',
 'Date_Category',
 'Tahun_Pertama',
 'Kategori',
 'Badge',
 'Avg Kategori','Tanggal_terakhir_donasi',"Donasi_terakhir",'Status', 'validity']]

df_summary_database.to_excel("Database.xlsx")

In [168]:
# tracking data sukses
data_sukses = df_output_final[df_output_final['Status'] == "Pernah Sukses"]

# tracking data waiting
data_waiting = df_output_final[df_output_final['Status'] == 'Tetap Waiting']

In [169]:
df_summary_database_sukses = data_sukses[['Whatsapp',
 'Donatur',
 'Bulan',
 'Tahun',
 'CRM',
 'Source',
 'Total',
 'Frekuensi',
 'klasifikasi_program',
 'Preferensi',
 'Label_Jam',
 'Rata - rata',
 'Day_Mode',
 'Date_Category',
 'Tahun_Pertama',
 'Kategori',
 'Badge',
 'Avg Kategori','Tanggal_terakhir_donasi',"Donasi_terakhir",'Status', 'validity']]

df_summary_database_waiting = data_waiting[['Whatsapp',
 'Donatur',
 'Bulan',
 'Tahun',
 'CRM',
 'Source',
 'Total',
 'Frekuensi',
 'klasifikasi_program',
 'Preferensi',
 'Label_Jam',
 'Rata - rata',
 'Day_Mode',
 'Date_Category',
 'Tahun_Pertama',
 'Kategori',
 'Badge',
 'Avg Kategori','Tanggal_terakhir_donasi',"Donasi_terakhir",'Status', 'validity']]

In [170]:
df = df_summary_database_sukses[(df_summary_database_sukses['CRM'].isna()) & (df_summary_database_sukses['Source'] == "Web Ads")]
df['validity'].value_counts(dropna=False)

validity
NaN                      4859
Number not registered    2866
Number registered        2674
Name: count, dtype: int64

In [171]:
# database sukses
df_summary_database_sukses.to_excel('Database_sukses.xlsx')

# database waiting
df_summary_database_waiting.to_excel('Database_waiting.xlsx')

In [172]:
data_with_rekapan_updated['Katasdfkljalskdjfri'].value_sadcounts(dropna=False)

KeyError: 'Katasdfkljalskdjfri'

# backup data

In [187]:
username_backup = os.getenv("USERNAME_BU")
password_backup = os.getenv("PASSWORD_BU")
host_backup = os.getenv("HOST_BU")
port_backup = os.getenv("PORT_BU")
database_backup = os.getenv("DATABASE_BU")

# URL-encode username & password in case they have special characters
username_enc = quote_plus(username_backup)
password_enc = quote_plus(password_backup)

# Create SQLAlchemy engine
engine = create_engine(
    f"mysql+pymysql://{username_enc}:{password_enc}@{host_backup}:{port_backup}/{database_backup}"
)
query_backup = """SELECT *
FROM temanbaik_backup.tb_backup_26_mei_25_july"""

data_backup = pd.read_sql(query_backup, engine)

In [None]:
start_date = data_backup['created_at_x'].min().strftime('%Y-%m-%d')
end_date = (pd.Timestamp.today() - pd.DateOffset(months=1)).strftime('%Y-%m-%d')
data_backup_tb = donasi_download[(donasi_download['created_at_x'] > start_date) & (donasi_download['created_at_x'] <= end_date)]
data_backup_teman_baik = pd.concat([data_backup_tb, data_backup_tb], ignore_index=True)
data_backup_teman_baik.to_sql("tb_backup_26_mei_25_july", con=engine, if_exists="append", index=False)

# TRACKING DATA

In [None]:
# Gabungkan data donasi per bulan per Whatsapp
# agg = data_web_himpunan.groupby(["Whatsapp", "Bulan"]).agg({
#     "Donatur": "first",
#     "Total": "sum",
#     "Date": "min",
#     "Payment Account": "first",
#     "Program": lambda x: list(x),
#     "Day":lambda x: x.mode().iloc[0]
# }).reset_index()

agg = data_web_himpunan.groupby(["Whatsapp",  "Bulan"]).agg(
    Donatur=("Donatur", "last"),
    Total=("Total", "sum"),
    Date=("Date", "min"),
    Payment=("Payment Account", "first"),
    Program=("Program", lambda x: list(x)),
    Time=('label_jam', lambda x: list(x)),
    Day=("Day", lambda x: x.mode().iloc[0])
).reset_index()


# Urutkan data untuk pemrosesan berurutan
agg = agg.sort_values(by=["Whatsapp", "Bulan"]).reset_index(drop=True)

In [None]:
# Tambahkan program hasil pemrosesan
final_programs = []
previous_programs = {}

for _, row in agg.iterrows():
    key = row["Whatsapp"]
    current_programs = list(dict.fromkeys(row["Program"]))  # unik, urut
    prev = previous_programs.get(key, [])
    
    # Cari program yang berbeda dari sebelumnya
    different_programs = [p for p in current_programs if p not in prev]

    if not prev:  # bulan pertama
        final_programs.append(current_programs[0])
    elif different_programs:
        final_programs.append(different_programs[0])  # tampilkan program baru
    else:
        final_programs.append(current_programs[0])  # sama semua, tampilkan salah satu

    previous_programs[key] = current_programs

In [None]:
from collections import Counter

# Example: assuming df is your DataFrame
def custom_mode_from_list(time_list):
    if not time_list or len(time_list) == 0:
        return None
    if len(time_list) == 1:
        return time_list[0]
    
    counter = Counter(time_list)
    most_common = counter.most_common()
    
    if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
        return time_list[-1]  # Tie: return last value
    return most_common[0][0]  # Clear mode

# Make sure the "Time" column is a list (not a string)
# If it's stringified, convert it using ast.literal_eval
import ast
agg['Time'] = agg['Time'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Apply the function to each row
agg['Label_Jam'] = agg['Time'].apply(custom_mode_from_list)

agg["Final_Program"] = final_programs

In [None]:
# Persiapan output format lebar (wide)
bulan_dict = {
    1: "Januari", 2: "Februari", 3: "Maret", 4: "April", 5: "Mei", 6: "Juni",
    7: "Juli", 8: "Agustus", 9: "September", 10: "Oktober", 11: "November", 12: "Desember"
}

# Output awal
output = pd.DataFrame(agg["Whatsapp"].drop_duplicates())

for bulan_num in range(1, 13):
    bulan_nama = bulan_dict[bulan_num]
    sub = agg[agg["Bulan"] == bulan_num][["Whatsapp", "Final_Program", "Total", "Date", "Day", "Label_Jam", "Payment"]]
    sub.columns = ["Whatsapp", f"{bulan_nama}_Program", f"{bulan_nama}_Donasi", f"{bulan_nama}_Tanggal", f"{bulan_nama}_Day", f"{bulan_nama}_Time", f"{bulan_nama}_Payment"]
    output = output.merge(sub, on="Whatsapp", how="left")

# Tambah Nama
output = output.merge(data_web_himpunan[["Whatsapp", "Donatur"]].drop_duplicates("Whatsapp"), on="Whatsapp", how="left")

# Tambah bulan pertama donasi untuk pengurutan
output = output.merge(first_donation[["Whatsapp", "Bulan_Pertama"]], on="Whatsapp")
output = output.sort_values("Bulan_Pertama").drop(columns=["Bulan_Pertama"])

# Urutkan kolom
cols = output.columns.tolist()
cols = ["Whatsapp", "Donatur"] + [col for col in cols if col not in ["Whatsapp", "Donatur"]]
output = output[cols]

# Hapus nilai 0
output.replace(0, "", inplace=True)

In [None]:
# Ambil semua kolom yang mengandung '_Program'
program_cols = [col for col in output.columns if '_Program' in col]

time_cols = [col for col in output.columns if '_Time' in col]

# Total donasi
output['Total'] = output[[col for col in output.columns if '_Donasi' in col]].sum(axis=1)

# Frekuensi donasi (jumlah kolom donasi yang tidak kosong)
output['Frekuensi'] = output[[col for col in output.columns if '_Donasi' in col]].notna().sum(axis=1)

# output['Total_Preferensi'] = (output[[col for col in output.columns if '_Preferensi' in col]].sum(axis=1) / output[[col for col in output.columns if '_Preferensi' in col]].notna().sum(axis=1)).astype(int)

# Hitung preferensi: jumlah program unik per baris
output['Preferensi'] = output[program_cols].apply(lambda row: row.dropna().nunique(), axis=1)

# Hitung program yang paling sering muncul per baris
output['Program'] = output[program_cols].apply(lambda row: row.dropna().mode().iloc[0] if not row.dropna().empty else None, axis=1)

# Hitung range time paling sering muncul per baris
output['Label_Jam'] = output[time_cols].apply(lambda row: row.dropna().mode().iloc[0] if not row.dropna().empty else None, axis=1)

# Rata-rata donasi, dibulatkan ke bilangan bulat
output['Rata - rata'] = output[[col for col in output.columns if '_Donasi' in col]].mean(axis=1).round(0).astype(int)

# Change total dtype to int
output['Total'] = output['Total'].astype(int)

find day mode

In [None]:
date_and_day = data_web_himpunan.groupby('Whatsapp').agg(
    Donasi_terakhir=("Total","last"),
    Status_payment=("Payment Status", lambda x: list(x)),
    Day_Mode=("Day", lambda x: x.mode().iloc[0]),
    Tanggal_terakhir_donasi=("Tanggal", max),
    Date=("Tanggal", list),
    Source=("Source","first")
).reset_index()

  date_and_day = data_web_himpunan.groupby('Whatsapp').agg(


In [None]:
def classify_cutoff(dates):
    # Konversi ke pandas Series dan ambil hari (tanggal) saja
    days = pd.Series(dates).dt.day
    
    # Hitung modus
    modus = days.mode().iloc[0]
    
    # Klasifikasi berdasarkan rentang tanggal cut-off
    return 'Masa Gajian' if (modus >= 26 or modus <= 5) else 'Bukan Gajian'

date_and_day['Date_Category'] = date_and_day['Date'].apply(classify_cutoff)
date_and_day['Date_Category'] = np.where(date_and_day['Day_Mode'] == 'Fri', 'Jumat', date_and_day['Date_Category'])
date_and_day = date_and_day.drop(columns=['Date'])



date_and_day['Status'] = date_and_day['Status_payment'].apply(
    lambda lst: 'Pernah Sukses' if any(x == 'Success' for x in lst if pd.notna(x)) 
                else 'Tetap Waiting'
)

In [None]:
# merge date and day and also first donation
output = pd.merge(output,date_and_day, how='left', on='Whatsapp')
output = output.merge(first_donation, how='left', on='Whatsapp')

# change the year and month from date column
output['Bulan'] = output['Date'].dt.strftime('%B')
output['Tahun'] = output['Date'].dt.year

In [None]:
# 
output['klasifikasi_program'] = np.where(
    output['Program'].str.lower().str.contains("palestina|gaza"),
    "PALESTINA",
    np.where(
        output['Program'].str.lower().str.contains("sudan"),
        "SUDAN",
        output['Program']
    )
)

# Tambahkan kolom 'Kategori' berdasarkan nilai 'Frekuensi'
output['Kategori'] = output['Frekuensi'].apply(
    lambda x: 'Pasif' if x < 1 else ('Aktif' if x < 4.5 else 'Loyal')
)

# Tambahkan kolom 'Badge' berdasarkan nilai 'Total'
output['Badge'] = output['Total'].apply(
    lambda x: 'BRONZE' if x < 100000 else ('SILVER' if x <= 1000000 else ('GOLD' if x < 5000000 else 'PLATINUM'))
)

# Tambahkan kolom 'Avg Kategori' berdasarkan nilai 'Rata - rata'
output['Avg Kategori'] = output['Rata - rata'].apply(
    lambda x: "< 100.000" if x < 100000 else ("100.000 - 1000.000" if x <= 1000000 else ("1000.000 - 5000.000" if x < 5000000 else "> 5000.000"))
)

In [None]:
output['Status'].value_counts()

Status
Pernah Sukses    49832
Tetap Waiting    11356
Name: count, dtype: int64

In [None]:
def update_data_himpunan(main_df, updated_df, cols_to_check, key_col="Whatsapp"):

    # --- 0Ô∏è‚É£ Remove unnamed columns to prevent alignment issues
    main_df = main_df.loc[:, ~main_df.columns.str.contains('^Unnamed')]
    updated_df = updated_df.loc[:, ~updated_df.columns.str.contains('^Unnamed')]

    # --- 1Ô∏è‚É£ Ensure the key column exists
    if key_col not in main_df.columns or key_col not in updated_df.columns:
        raise KeyError(f"‚ùå Key column '{key_col}' not found in one of the DataFrames")

    # --- 2Ô∏è‚É£ Set index to key column
    main_df = main_df.set_index(key_col, drop=False)
    updated_df = updated_df.set_index(key_col, drop=False)

    # --- 3Ô∏è‚É£ Keep only common columns
    updated_df = updated_df.loc[:, updated_df.columns.intersection(main_df.columns)]

    # --- 4Ô∏è‚É£ Align index based on key column
    common_ids = updated_df.index.intersection(main_df.index)

    # --- 5Ô∏è‚É£ Match dtypes
    for col in main_df.columns.intersection(updated_df.columns):
        if main_df[col].dtype != updated_df[col].dtype:
            try:
                updated_df[col] = updated_df[col].astype(main_df[col].dtype)
            except Exception:
                if pd.api.types.is_datetime64_any_dtype(main_df[col]):
                    updated_df[col] = pd.to_datetime(updated_df[col], errors='coerce')
                elif pd.api.types.is_numeric_dtype(main_df[col]):
                    updated_df[col] = pd.to_numeric(updated_df[col], errors='coerce')
                else:
                    updated_df[col] = updated_df[col].astype(str)

    # --- 6Ô∏è‚É£ Detect differences
    diff_mask = updated_df.loc[common_ids, cols_to_check].ne(
        main_df.loc[common_ids, cols_to_check]
    ).any(axis=1)

    # --- 7Ô∏è‚É£ Update only changed rows
    cols_to_update = updated_df.columns.intersection(main_df.columns)
    main_df.loc[common_ids[diff_mask], cols_to_update] = updated_df.loc[common_ids[diff_mask], cols_to_update].values

    # --- 8Ô∏è‚É£ Add new rows (new WhatsApp IDs)
    new_ids = updated_df.index.difference(main_df.index)
    main_df = pd.concat([main_df, updated_df.loc[new_ids, cols_to_update]])

    print("‚úÖ Excel updated successfully:")
    print(f"- {len(new_ids)} new rows added")
    print(f"- {diff_mask.sum()} rows updated based on {cols_to_check}")

    return main_df


In [None]:
data_with_rekapan["Whatsapp"] = data_with_rekapan["Whatsapp"].astype(str)

# data_with_rekapan_updated = update_dataframe(
#     main_df=data_with_rekapan,
#     updated_df=output,
#     cols_to_check=["Total","Badge"]
# )

data_with_rekapan_updated = update_data_web(
    main_df=data_with_rekapan,
    updated_df=output,
    key_col="Whatsapp",
    cols_to_check=["Total","Date_Category","Source","Status","Donasi_terakhir"]
)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan na

‚úÖ Overwrite update completed:
- 376 rows overwritten (different in ['Total', 'Date_Category', 'Source', 'Status', 'Donasi_terakhir'])
- 519 new rows added
- Final total rows: 61219


In [None]:
duplicates = data_with_rekapan_updated[data_with_rekapan_updated['Whatsapp'].duplicated(keep=False)]

if not duplicates.empty:
    print("‚ùå Duplicate rows found:\n", duplicates)
    raise SystemExit("Stopping cell due to duplicates.")

In [None]:
data_with_rekapan_updated = data_with_rekapan_updated[['Whatsapp', 'Donatur',
       'Januari_Program', 'Januari_Donasi', 'Januari_Tanggal', 'Januari_Day',
       'Januari_Time', 'Januari_Payment', 'Februari_Program',
       'Februari_Donasi', 'Februari_Tanggal', 'Februari_Day', 'Februari_Time',
       'Februari_Payment', 'Maret_Program', 'Maret_Donasi', 'Maret_Tanggal',
       'Maret_Day', 'Maret_Time', 'Maret_Payment', 'April_Program',
       'April_Donasi', 'April_Tanggal', 'April_Day', 'April_Time',
       'April_Payment', 'Mei_Program', 'Mei_Donasi', 'Mei_Tanggal', 'Mei_Day',
       'Mei_Time', 'Mei_Payment', 'Juni_Program', 'Juni_Donasi',
       'Juni_Tanggal', 'Juni_Day', 'Juni_Time', 'Juni_Payment', 'Juli_Program',
       'Juli_Donasi', 'Juli_Tanggal', 'Juli_Day', 'Juli_Time', 'Juli_Payment',
       'Agustus_Program', 'Agustus_Donasi', 'Agustus_Tanggal', 'Agustus_Day',
       'Agustus_Time', 'Agustus_Payment', 'September_Program',
       'September_Donasi', 'September_Tanggal', 'September_Day',
       'September_Time', 'September_Payment', 'Oktober_Program',
       'Oktober_Donasi', 'Oktober_Tanggal', 'Oktober_Day', 'Oktober_Time',
       'Oktober_Payment', 'November_Program', 'November_Donasi',
       'November_Tanggal', 'November_Day', 'November_Time', 'November_Payment',
       'Desember_Program', 'Desember_Donasi', 'Desember_Tanggal',
       'Desember_Day', 'Desember_Time', 'Desember_Payment', 'Total',
       'Frekuensi', 'Preferensi', 'Program','klasifikasi_program','Label_Jam', 'Rata - rata',
       'Day_Mode', 'Date_Category', 'Date', 'Bulan_Pertama', 'Tahun_Pertama',
       'Bulan', 'Tahun', 'Kategori','Status', 'Badge', 'Avg Kategori',"Tanggal_terakhir_donasi","Donasi_terakhir", "Source"]]

data_with_rekapan_updated.to_excel("checkpoint_files/data_with_updated_himpunan.xlsx", index=False)

# Data each crm

In [None]:
data_with_rekapan_updated['Kategori'].value_counts(dropna=False)

In [None]:
tracking_with_crm = pd.read_excel("checkpoint_files/data_tracking_with_crm.xlsx")

In [None]:
# df_7778_Intan = pd.read_excel("../data_blast/data_each_crm/data_WA_7778_Intan.xlsx")
# df_1097_Intan = pd.read_excel("../data_blast/data_each_crm/data_WA_1097_Intan.xlsx")
# df_0036_zein = pd.read_excel("../data_blast/data_each_crm/data_WA_0036_Zein.xlsx")
# df_0056_zein = pd.read_excel("../data_blast/data_each_crm/data_WA_0056_Zein.xlsx")
# df_0041_Diah = pd.read_excel("../data_blast/data_each_crm/data_WA_0041_Diah.xlsx")
# df_0051_Diah = pd.read_excel("../data_blast/data_each_crm/data_WA_0051_Diah.xlsx")
# df_0047_Shania = pd.read_excel("../data_blast/data_each_crm/data_WA_0047_Shania.xlsx")
# df_0046_Shania = pd.read_excel("../data_blast/data_each_crm/data_WA_0046_Shania.xlsx")
# df_1096_Firda = pd.read_excel("../data_blast/data_each_crm/data_WA_1096_Firda.xlsx")
# df_2059_Endah = pd.read_excel("../data_blast/data_each_crm/data_WA_2059_Endah.xlsx")
# df_2058_Endah = pd.read_excel("../data_blast/data_each_crm/data_WA_2058_Endah.xlsx")
# df_1095_Firda = pd.read_excel("../data_blast/data_each_crm/data_WA_1095_Firda.xlsx")
# df_2057_vicky = pd.read_excel("../data_blast/data_each_crm/data_WA_2057_Vicky.xlsx")
# df_2056_vicky = pd.read_excel("../data_blast/data_each_crm/data_WA_2056_Vicky.xlsx")
# df_0031_Erni = pd.read_excel("../data_blast/data_each_crm/data_WA_0031_Erni.xlsx")
# df_0032_Erni = pd.read_excel("../data_blast/data_each_crm/data_WA_0032_Erni.xlsx")
# df_0034_Agil = pd.read_excel("../data_blast/data_each_crm/data_WA_0034_Agil.xlsx")
# df_1094_Agil = pd.read_excel("../data_blast/data_each_crm/data_WA_1094_Agil.xlsx")
# df_2062 = pd.read_excel("../data_blast/data_each_crm/data_WA_2062.xlsx")
# df_2061 = pd.read_excel("../data_blast/data_each_crm/data_WA_2061.xlsx")
# df_2060 = pd.read_excel("../data_blast/data_each_crm/data_WA_2060.xlsx")
# df_nothing = pd.read_excel("../data_blast/data_each_crm/data_harus_tanpa_CRM.xlsx")
# refill_1097 = pd.read_excel("../data_blast/data_each_crm/Intan/refill_data_WA_1097_Intan.xlsx")
# refill_2056 = pd.read_excel("../data_blast/data_each_crm/Vicky/refill_data_WA_2056_Vicky.xlsx")
# refill_2057 = pd.read_excel("../data_blast/data_each_crm/Vicky/refill_data_WA_2057_Vicky.xlsx")
# df_olivia_1 = pd.read_excel("../data_blast/data_each_crm/Olivia/data_WA_Olivia_1.xlsx")
# df_olivia_2 = pd.read_excel("../data_blast/data_each_crm/Olivia/data_WA_Olivia_2.xlsx")

In [None]:
# df_crm = pd.concat([df_7778_Intan, df_2062, df_2061, df_2060, df_nothing, df_0036_zein, df_0056_zein, df_2056_vicky, df_2057_vicky, df_1097_Intan, df_0041_Diah, df_0051_Diah, df_1095_Firda, df_1096_Firda, df_2058_Endah, df_2059_Endah, df_0046_Shania, df_0047_Shania, df_0031_Erni, df_0032_Erni,df_0034_Agil,df_1094_Agil])

In [None]:
df_crm = pd.read_excel("checkpoint_files/data_each_crm_updated.xlsx")

In [None]:
df_crm['Whatsapp'] = df_crm['Whatsapp'].astype(str)
df_crm['CRM'].value_counts()

merge data on tracking with data each crm

In [None]:
df_crm = df_crm[['Whatsapp','CRM',
       'Donatur']]

In [None]:
data_with_rekapan_updated['Whatsapp'] = data_with_rekapan_updated['Whatsapp'].astype(str)

In [None]:
df_crm['Whatsapp'] = df_crm['Whatsapp'].astype(str)

In [None]:
df_final = data_with_rekapan_updated.merge(df_crm, how='left', on='Whatsapp')

In [None]:
df_final['CRM'].value_counts()

In [None]:
# Contoh: pindahkan kolom 'email' ke posisi ke-1 (indeks 0 berarti paling depan)
df_final = move_column(df_final, 'Bulan', 2)
df_final = move_column(df_final, 'Tahun', 3)
df_final = move_column(df_final, "CRM", 4)

In [None]:
df_final['klasifikasi_program'] = np.where(df_final['Program'].str.lower().str.contains("palestina|gaza"), "PALESTINA", df_final['Program'])

In [None]:
df_final = df_final.rename(columns={"Donatur_x":"Donatur"})

In [None]:
df_final = df_final.drop(['Date','Bulan_Pertama','Donatur_y', 'Program'], axis=1)

In [None]:
df_final = move_column(df_final, "klasifikasi_program", 79)

In [None]:
df_final['Whatsapp'] = df_final['Whatsapp'].astype(int)

In [None]:
# def update_crm(main_df, updated_df, cols_to_check, log_limit=10):
#     """
#     Update main_df with updated_df using index as ID (Whatsapp).
#     - Only updates selected columns (cols_to_check)
#     - Does NOT overwrite with NaN
#     - Inserts new rows if not exist
#     - Prints log of changes (limited)
#     """
#     main_df = main_df.copy()
#     updated_df = updated_df.copy()

#     # Pastikan kolom yang dicek ada di kedua dataframe
#     for col in cols_to_check:
#         if col not in updated_df.columns or col not in main_df.columns:
#             raise KeyError(f"Kolom '{col}' tidak ada di kedua dataframe.")

#     # Temukan index yang sama
#     common_ids = updated_df.index.intersection(main_df.index)

#     # Update rows jika ada perbedaan
#     changes_log = []
#     for idx in common_ids:
#         for col in cols_to_check:
#             old_val = main_df.at[idx, col]
#             new_val = updated_df.at[idx, col]

#             # Hanya update kalau beda dan new_val TIDAK NaN
#             if pd.notna(new_val) and old_val != new_val:
#                 main_df.at[idx, col] = new_val
#                 changes_log.append(f"[UPDATE] {idx} | {col}: '{old_val}' ‚Üí '{new_val}'")

#     # Insert rows baru
#     new_rows = updated_df.loc[~updated_df.index.isin(main_df.index)]
#     main_df = pd.concat([main_df, new_rows], axis=0)

#     # Print log perubahan
#     print("===== LOG PERUBAHAN DATA =====")
#     if changes_log:
#         if len(changes_log) > log_limit:
#             print("\n".join(changes_log[:log_limit]))
#             print(f"... and {len(changes_log) - log_limit} more changes")
#         else:
#             print("\n".join(changes_log))
#     else:
#         print("Tidak ada data yang berubah.")
#     print(f"\nBaris baru ditambahkan: {len(new_rows)}")

#     return main_df
def update_status_dataframe(main_df: pd.DataFrame, updated_df: pd.DataFrame, key_col: str, cols_to_update: list) -> pd.DataFrame:
    """
    Update main_df using updated_df based on a key column and selected columns.
    - Updates values for matching rows.
    - Adds new rows if key not found in main_df.
    - Adds new columns from updated_df if they don't exist in main_df.
    """

    # --- 0Ô∏è‚É£ Copy data to avoid modifying originals ---
    main_df = main_df.copy()
    updated_df = updated_df.copy()

    # --- 1Ô∏è‚É£ Handle duplicate keys ---
    if main_df[key_col].duplicated().any():
        print(f"‚ö†Ô∏è Warning: Duplicate keys found in main_df on column '{key_col}' ‚Äî keeping first occurrence.")
        main_df = main_df.drop_duplicates(subset=key_col, keep='first')

    if updated_df[key_col].duplicated().any():
        print(f"‚ö†Ô∏è Warning: Duplicate keys found in updated_df on column '{key_col}' ‚Äî keeping first occurrence.")
        updated_df = updated_df.drop_duplicates(subset=key_col, keep='first')

    # --- 2Ô∏è‚É£ Ensure new columns from updated_df exist in main_df ---
    new_columns = [col for col in updated_df.columns if col not in main_df.columns]
    if new_columns:
        print(f"üÜï Found new columns in updated_df: {new_columns}")
        for col in new_columns:
            main_df[col] = None  # initialize with None (or pd.NA)

    # --- 3Ô∏è‚É£ Set index for efficient update ---
    main_df = main_df.set_index(key_col)
    updated_df = updated_df.set_index(key_col)

    # --- 4Ô∏è‚É£ Identify common and new keys ---
    common_ids = updated_df.index.intersection(main_df.index)
    new_rows = updated_df.loc[~updated_df.index.isin(main_df.index)]

    # --- 5Ô∏è‚É£ Update selected columns ---
    for col in cols_to_update:
        if col in updated_df.columns:
            main_df.loc[common_ids, col] = updated_df.loc[common_ids, col]
        else:
            print(f"‚ö†Ô∏è Column '{col}' not found in updated_df ‚Äî skipped.")

    # --- 6Ô∏è‚É£ Add new rows ---
    updated_main_df = pd.concat([main_df, new_rows])

    # --- 7Ô∏è‚É£ Reset index back ---
    updated_main_df = updated_main_df.reset_index()

    # --- 8Ô∏è‚É£ Log summary ---
    print("‚úÖ Update process completed successfully:")
    print(f"- {len(common_ids)} matched rows updated ({cols_to_update})")
    print(f"- {len(new_rows)} new rows added")
    print(f"- {len(new_columns)} new columns added")
    print(f"- Final total rows: {len(updated_main_df)}")

    return updated_main_df



In [None]:
df_crm['Whatsapp'] = df_crm['Whatsapp'].astype(int)

In [None]:
tracking_with_crm_updated = update_status_dataframe(
    main_df=tracking_with_crm,
    updated_df=df_final,
    key_col="Whatsapp",
    cols_to_update=["Tanggal_terakhir_donasi",'Kategori']
)

In [None]:
tracking_with_crm_updated = tracking_with_crm_updated.reset_index(drop=True)

In [None]:
duplicates = tracking_with_crm_updated[tracking_with_crm_updated['Whatsapp'].duplicated(keep=False)]

if not duplicates.empty:
    print("‚ùå Duplicate rows found:\n", duplicates)
    raise SystemExit("Stopping cell due to duplicates.")

In [None]:
tracking_with_crm_updated = tracking_with_crm_updated[['Whatsapp', 'Donatur', 'Bulan', 'Tahun', 'CRM',
       'Januari_Program', 'Januari_Donasi', 'Januari_Tanggal', 'Januari_Day',
       'Januari_Time', 'Januari_Payment', 'Februari_Program',
       'Februari_Donasi', 'Februari_Tanggal', 'Februari_Day', 'Februari_Time',
       'Februari_Payment', 'Maret_Program', 'Maret_Donasi', 'Maret_Tanggal',
       'Maret_Day', 'Maret_Time', 'Maret_Payment', 'April_Program',
       'April_Donasi', 'April_Tanggal', 'April_Day', 'April_Time',
       'April_Payment', 'Mei_Program', 'Mei_Donasi', 'Mei_Tanggal', 'Mei_Day',
       'Mei_Time', 'Mei_Payment', 'Juni_Program', 'Juni_Donasi',
       'Juni_Tanggal', 'Juni_Day', 'Juni_Time', 'Juni_Payment', 'Juli_Program',
       'Juli_Donasi', 'Juli_Tanggal', 'Juli_Day', 'Juli_Time', 'Juli_Payment',
       'Agustus_Program', 'Agustus_Donasi', 'Agustus_Tanggal', 'Agustus_Day',
       'Agustus_Time', 'Agustus_Payment', 'September_Program',
       'September_Donasi', 'September_Tanggal', 'September_Day',
       'September_Time', 'September_Payment', 'Oktober_Program',
       'Oktober_Donasi', 'Oktober_Tanggal', 'Oktober_Day', 'Oktober_Time',
       'Oktober_Payment', 'November_Program', 'November_Donasi',
       'November_Tanggal', 'November_Day', 'November_Time', 'November_Payment',
       'Desember_Program', 'Desember_Donasi', 'Desember_Tanggal',
       'Desember_Day', 'Desember_Time', 'Desember_Payment', 'Total',
       'Frekuensi', 'klasifikasi_program', 'Preferensi', 'Label_Jam',
       'Rata - rata', 'Day_Mode', 'Date_Category', 'Tahun_Pertama', 'Kategori',
       'Badge', 'Avg Kategori',"Tanggal_terakhir_donasi"]]
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']
tracking_with_crm_updated['Bulan'] = pd.Categorical(tracking_with_crm_updated['Bulan'], categories=month_order, ordered=True)
tracking_with_crm_updated = tracking_with_crm_updated.sort_values('Bulan')

In [None]:
tracking_with_crm_updated.to_excel("checkpoint_files/data_tracking_with_crm.xlsx")

# Daily activity each crm

In [None]:
tracking_with_status = pd.read_excel("checkpoint_files/tracking_with_updated_category_daily_activity.xlsx")

In [None]:
status_donor = tracking_with_status[['Whatsapp','Kategori']]

here is to add the updated daily activity to update the donor category

In [None]:
status_donor = tracking_with_crm_updated.merge(status_donor,how='left',on='Whatsapp')

In [None]:
tracking_baru = status_donor
status_donor = status_donor[["Whatsapp","Kategori_y"]]

In [None]:
daily_1097 = pd.read_excel("../data_blast/daily_activity/filled/new/Daily_Activity_1097.xlsx")
daily_2056_bonding = pd.read_excel("../data_blast/daily_activity/filled/new/Daily Activity 2056 (BONDING).xlsx")
daily_2056_maintain = pd.read_excel("../data_blast/daily_activity/filled/new/Daily Activity 2056 (MAINTAIN).xlsx")
daily_2057_bonding = pd.read_excel("../data_blast/daily_activity/filled/new/Daily Activity 2057 (BONDING).xlsx")
daily_2057_maintain = pd.read_excel("../data_blast/daily_activity/filled/new/Daily Activity 2057 (MAINTAIN).xlsx")

In [None]:
daily_2056 = pd.concat([daily_2056_bonding, daily_2056_maintain])
daily_2057 = pd.concat([daily_2057_bonding, daily_2057_maintain])

 - daily that has been tracked

In [None]:
daily_activity_all = pd.concat([daily_1097, daily_2056, daily_2057])
daily_activity_all = daily_activity_all.dropna(subset=['Whatsapp'])

In [None]:
daily_activity_all.rename(columns={
    "Next Action":"kategori_update"
},inplace=True)

In [None]:
df_daily = status_donor.merge(daily_activity_all, how="left", on="Whatsapp")

In [None]:
status_donor = status_donor.rename(columns={"Kategori_y":"Kategori"})

In [None]:
df_daily = df_daily[["Whatsapp","Kategori","kategori_update"]]

In [None]:
df_daily['kategori_update'].value_counts()

In [None]:
def label_status(x):
    status = x['kategori_update']
    kategori = x['Kategori']

    if status in ['Eliminasi', 'Invalid']:
        return 'INVALID'
    else:
        return kategori  # keep original kategori

df_daily['Kategori'] = df_daily.apply(label_status, axis=1)

In [None]:
df_daily['Kategori'].value_counts()

- update data each crm,. if it's invalid then remove from data each crm

Final output for tracking data

In [None]:
# def update_dataframe_status(main_df: pd.DataFrame, updated_df: pd.DataFrame, cols_to_check: list) -> pd.DataFrame:
#     common_ids = updated_df.index.intersection(main_df.index)

#     # Use .ne() to detect NaN differences correctly
#     diff_mask = updated_df.loc[common_ids, cols_to_check].ne(main_df.loc[common_ids, cols_to_check]).any(axis=1)

#     rows_to_update = updated_df.loc[common_ids[diff_mask]]
#     new_rows = updated_df.loc[~updated_df.index.isin(main_df.index)]

#     # ‚úÖ Overwrite rows explicitly instead of using .update()
#     main_df.loc[rows_to_update.index, cols_to_check] = rows_to_update[cols_to_check]

#     # Add new rows
#     updated_main_df = pd.concat([main_df, new_rows])
#     updated_main_df = updated_main_df.sort_index()

#     print("‚úÖ Excel updated successfully:")
#     print(f"- {len(new_rows)} new rows added")
#     print(f"- {len(rows_to_update)} rows updated based on {cols_to_check}")

#     return updated_main_df

# FUNCTION FOR CHANGING ONLY CHOSEN COLUMN BASED ON COLUMN CHANGES

In [None]:
tracking_with_updated_status = update_status_dataframe(
    main_df=tracking_with_crm_updated,
    updated_df=df_daily,
    key_col="Whatsapp",
    cols_to_update=["Kategori"]
)

In [None]:
df_crm['Whatsapp'] = df_crm['Whatsapp'].astype(int)

In [None]:
# tracking_with_updated_status = tracking_with_updated_status.drop_duplicates(subset=["Whatsapp"], keep="first")
tracking_with_updated_status['Kategori'].value_counts(dropna=False)

In [None]:
duplicates = tracking_with_updated_status[tracking_with_updated_status['Whatsapp'].duplicated(keep=False)]

if not duplicates.empty:
    print("‚ùå Duplicate rows found:\n", duplicates)
    raise SystemExit("Stopping cell due to duplicates.")

In [None]:
tracking_with_updated_status = tracking_with_updated_status[['Whatsapp', 'Donatur', 'Bulan', 'Tahun', 'CRM', 'Januari_Program',
       'Januari_Donasi', 'Januari_Tanggal', 'Januari_Day', 'Januari_Time',
       'Januari_Payment', 'Februari_Program', 'Februari_Donasi',
       'Februari_Tanggal', 'Februari_Day', 'Februari_Time', 'Februari_Payment',
       'Maret_Program', 'Maret_Donasi', 'Maret_Tanggal', 'Maret_Day',
       'Maret_Time', 'Maret_Payment', 'April_Program', 'April_Donasi',
       'April_Tanggal', 'April_Day', 'April_Time', 'April_Payment',
       'Mei_Program', 'Mei_Donasi', 'Mei_Tanggal', 'Mei_Day', 'Mei_Time',
       'Mei_Payment', 'Juni_Program', 'Juni_Donasi', 'Juni_Tanggal',
       'Juni_Day', 'Juni_Time', 'Juni_Payment', 'Juli_Program', 'Juli_Donasi',
       'Juli_Tanggal', 'Juli_Day', 'Juli_Time', 'Juli_Payment',
       'Agustus_Program', 'Agustus_Donasi', 'Agustus_Tanggal', 'Agustus_Day',
       'Agustus_Time', 'Agustus_Payment', 'September_Program',
       'September_Donasi', 'September_Tanggal', 'September_Day',
       'September_Time', 'September_Payment', 'Oktober_Program',
       'Oktober_Donasi', 'Oktober_Tanggal', 'Oktober_Day', 'Oktober_Time',
       'Oktober_Payment', 'November_Program', 'November_Donasi',
       'November_Tanggal', 'November_Day', 'November_Time', 'November_Payment',
       'Desember_Program', 'Desember_Donasi', 'Desember_Tanggal',
       'Desember_Day', 'Desember_Time', 'Desember_Payment', 'Total',
       'Frekuensi', 'klasifikasi_program', 'Preferensi', 'Label_Jam',
       'Rata - rata', 'Day_Mode', 'Date_Category', 'Tahun_Pertama', 'Kategori',
       'Badge', 'Avg Kategori',"Tanggal_terakhir_donasi"]]

In [None]:
tracking_with_updated_status

In [None]:
tracking_with_updated_status.to_excel("checkpoint_files/tracking_with_updated_category_daily_activity.xlsx")

# Data Waiting that has never been successful before

In [None]:
df_waiting = data_all[(data_all['is_random'] == True)]
df_waiting = df_waiting[df_waiting['Payment Status'] == 'Waiting']
df_waiting.head()

In [None]:
agg_waiting = df_waiting.groupby("Whatsapp").agg(
    Donatur=("Donatur", "last"),
    Total=("Total", "sum"),
    Date=("Date", "min"),
    Payment=("Payment Account", "first"),
    Program=("Program", lambda x: list(x)),
    Time=('label_jam', lambda x: list(x)),
    Day=("Day", lambda x: x.mode().iloc[0])
).reset_index().sort_values(by='Total', ascending=False)

In [None]:
agg_waiting['Whatsapp'] = agg_waiting['Whatsapp'].astype(int)

df_merge = pd.merge(agg_waiting,df_summary_database, how='left', on='Whatsapp')

In [None]:
df_merge = df_merge[df_merge['CRM'].isna()]

In [None]:
df_merge

In [None]:
only_waiting = df_merge.copy()

waiting_final = only_waiting[['Whatsapp','Donatur_x','Sapaan','Program','Date','Total_x', "Day", "Date"]]

In [None]:
waiting_final['Bulan_Angka'] = pd.to_datetime(waiting_final['Date']).dt.month
waiting_final['Nama_Bulan'] = pd.to_datetime(waiting_final['Date']).dt.strftime('%B')
waiting_final['Nama_Hari'] = pd.to_datetime(waiting_final['Date']).dt.strftime('%A')
waiting_final['Date'] = pd.to_datetime(waiting_final['Date']).dt.date

In [None]:
waiting_final = waiting_final.rename(columns={
    "Donatur_x": "Donatur",
    "Total_x": "Total"
})

In [None]:
# Tambahkan program hasil pemrosesan
final_programs = []
previous_programs = {}

for _, row in waiting_final.iterrows():
    key = row["Whatsapp"]
    current_programs = list(dict.fromkeys(row["Program"]))  # unik, urut
    prev = previous_programs.get(key, [])
    
    # Cari program yang berbeda dari sebelumnya
    different_programs = [p for p in current_programs if p not in prev]

    if not prev:  # bulan pertama
        final_programs.append(current_programs[0])
    elif different_programs:
        final_programs.append(different_programs[0])  # tampilkan program baru
    else:
        final_programs.append(current_programs[0])  # sama semua, tampilkan salah satu

    previous_programs[key] = current_programs

In [None]:
waiting_final["Final_Program"] = final_programs

In [None]:
waiting_final

In [None]:
# Persiapan output_waiting format lebar (wide)
bulan_dict = {
    1: "Januari", 2: "Februari", 3: "Maret", 4: "April", 5: "Mei", 6: "Juni",
    7: "Juli", 8: "Agustus", 9: "September", 10: "Oktober", 11: "November", 12: "Desember"
}

# Output awal
output_waiting = pd.DataFrame(waiting_final["Whatsapp"].drop_duplicates())

for bulan_num in range(1, 13):
    bulan_nama = bulan_dict[bulan_num]
    sub = waiting_final[waiting_final["Nama_Bulan"] == bulan_num][["Whatsapp", "Final_Program", "Total", "Date", "Day", "Payment"]]
    sub.columns = ["Whatsapp", f"{bulan_nama}_Program", f"{bulan_nama}_Donasi", f"{bulan_nama}_Tanggal", f"{bulan_nama}_Day", f"{bulan_nama}_Payment"]
    output_waiting = output_waiting.merge(sub, on="Whatsapp", how="left")

# Tambah Nama
output_waiting = output_waiting.merge(data_web_himpunan[["Whatsapp", "Donatur"]].drop_duplicates("Whatsapp"), on="Whatsapp", how="left")

# Tambah bulan pertama donasi untuk pengurutan
output_waiting = output_waiting.merge(first_donation[["Whatsapp", "Bulan_Pertama"]], on="Whatsapp")
output_waiting = output_waiting.sort_values("Bulan_Pertama").drop(columns=["Bulan_Pertama"])

# Urutkan kolom
cols = output_waiting.columns.tolist()
cols = ["Whatsapp", "Donatur"] + [col for col in cols if col not in ["Whatsapp", "Donatur"]]
output_waiting = output_waiting[cols]

# Hapus nilai 0
output_waiting.replace(0, "", inplace=True)

In [None]:
# Ambil semua kolom yang mengandung '_Program'
program_cols = [col for col in output.columns if '_Program' in col]

time_cols = [col for col in output.columns if '_Time' in col]

# Total donasi
output['Total'] = output[[col for col in output.columns if '_Donasi' in col]].sum(axis=1)

# Frekuensi donasi (jumlah kolom donasi yang tidak kosong)
output['Frekuensi'] = output[[col for col in output.columns if '_Donasi' in col]].notna().sum(axis=1)

# output['Total_Preferensi'] = (output[[col for col in output.columns if '_Preferensi' in col]].sum(axis=1) / output[[col for col in output.columns if '_Preferensi' in col]].notna().sum(axis=1)).astype(int)

# Hitung preferensi: jumlah program unik per baris
output['Preferensi'] = output[program_cols].apply(lambda row: row.dropna().nunique(), axis=1)

# Hitung program yang paling sering muncul per baris
output['Program'] = output[program_cols].apply(lambda row: row.dropna().mode().iloc[0] if not row.dropna().empty else None, axis=1)

# Hitung range time paling sering muncul per baris
output['Label_Jam'] = output[time_cols].apply(lambda row: row.dropna().mode().iloc[0] if not row.dropna().empty else None, axis=1)

# Rata-rata donasi, dibulatkan ke bilangan bulat
output['Rata - rata'] = output[[col for col in output.columns if '_Donasi' in col]].mean(axis=1).round(0).astype(int)

# Change total dtype to int
output['Total'] = output['Total'].astype(int)

In [None]:
waiting_final.to_excel('Data_waiting.xlsx')

sdafasdfasdf

In [None]:
# df_database = pd.read_excel('Tracking_temanbaik.xlsx')
df_database = df_tracking.copy()

In [None]:
df_database['Bulan'].value_counts(dropna=False)

In [None]:
df_database[df_database['Label_Jam'].isna()]