In [1]:
import os 
import pandas as pd

# Compilation of raw/2024 AND raw/2025 datasets

Note: Manually replace the year of the year to be processed

In [111]:
import pandas as pd
from pathlib import Path
import os

year = '2025'

# 📁 Define the base directory
base_dir = Path(f'../../dataset/raw/{year}/')

def discover_header(file_path, sheet_name='Member Consumers Data'):
    # Temporarily read with no header to inspect rows
    temp_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None, usecols='A:N')

    for i, row in temp_df.iterrows():
        normalized = [str(cell).strip().lower() for cell in row]
        if any('name' in cell for cell in normalized):
            print(f"✅ {file_path}: Header found at row {i}")
            return i

    print("❌ Header not found — fallback to default row")
    return Exception("Header not found")

# 🧹 Sanitize sheet reader
def read_sheet(file_path, is_first):
    try:
        header_row = discover_header(file_path)
        return pd.read_excel(
            file_path,
            sheet_name='Member Consumers Data',
            skiprows=header_row if is_first else 6,
            usecols='A:N',
            header=None if not is_first else 0
        )
    except Exception as e:
        print(f"Failed to read {file_path}: {e}")
        return pd.DataFrame()

# 🚦 Main loop
output_folder = 'staging'
os.makedirs(base_dir / output_folder, exist_ok=True)
for folder in base_dir.iterdir():
    if not folder.is_dir():
        continue

    compiled_df = pd.DataFrame()
    try:
        for subfolder in folder.iterdir():
            if not subfolder.is_dir():
                continue

            # Filter valid Excel files
            excel_files = [f for f in subfolder.iterdir()
                           if f.suffix in ('.xlsm', '.xlsx') and not f.name.startswith('~')]

            for i, file_path in enumerate(excel_files):
                new_df = read_sheet(file_path, is_first=(i == 0))

                # ✅ Validate new_df before appending
                if not new_df.empty and new_df.dropna(axis=1, how='all').shape[1] > 0:
                    new_df.dropna(thresh=5, inplace=True, axis=0)
                    compiled_df = pd.concat([compiled_df, new_df], ignore_index=True)

        # 📤 Export compiled data
        month = folder.name[0:3].upper()
        output_file = base_dir / output_folder / f'{month}_{year}.csv'
        compiled_df.to_csv(output_file, index=False)
        print(f"✅ Compiled: {output_file}")

    except Exception as e:
        print(f"Error accessing {folder.name}: {e}")

✅ ..\..\dataset\raw\2025\April 2025\HAZEL-Altarejos, Lique St\BILLING FORM_ALTAREJOS ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\HAZEL-Altarejos, Lique St\BILLING FORM_AMICAN ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\HAZEL-Altarejos, Lique St\BILLING FORM_LIQUE ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_BARTOLABAC ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_GRAFILO ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_VILLAMOR ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JOSHUA-Amican St., Burgos\BILLING FORM_BINALIW, BURGOS.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JOSHUA-Amican St., Burgos\BILLING FORM_NHA DUPLEX.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025

In [39]:
# Handle the conversion of xlsx to csv for datasets already in compiled form prior to preprocessing
import os 

def discover_header(file_path, sheet_name='Sheet1'):
    # Temporarily read with no header to inspect rows
    temp_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None, usecols='A:H')

    for i, row in temp_df.iterrows():
        normalized = [str(cell).strip().lower() for cell in row]
        if any('name' in cell for cell in normalized):
            print(f"✅ {file_path}: Header found at row {i}")
            return i

    print("❌ Header not found — fallback to default row")
    return Exception("Header not found")

base_dir = Path(f'../../dataset/raw/{year}/')

files = [f for f in base_dir.iterdir() if f.suffix in ('.xlsm', '.xlsx') and not f.name.startswith('~')]

output_dir = base_dir / 'compiled'
os.makedirs(output_dir, exist_ok=True)

for file in files:
    print(file.name)
    month = file.name[0:3].upper()

    header_row = discover_header(file)
    df = pd.read_excel(file, sheet_name = 'Sheet1', skiprows=header_row, usecols='A:H', header = 0)
    df.to_csv(output_dir / f'{month}_{year}.csv', index=False)

In [114]:
# Final dataset preparation
base_dir = Path(f'../../dataset/raw/{year}/{output_folder}')
files = [f for f in base_dir.iterdir() if f.is_file() and not(f.name == 'compiled' or f.name.startswith('COM') or f.name.startswith('STA'))]

for file in files:
    print(file.name)
    df = pd.read_csv(file)
    df.dropna(how='all', inplace=True, axis=1)
    print(df.info())

APR_2025.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2164 entries, 0 to 2163
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Control No.           629 non-null    float64
 1   Account Name          629 non-null    object 
 2   Service Address       629 non-null    object 
 3   Type                  629 non-null    object 
 4   Meter Serial No.      397 non-null    object 
 5   Previous Reading      629 non-null    object 
 6   Present Reading       627 non-null    object 
 7   Last Present Reading  550 non-null    object 
 8   Consumption           629 non-null    object 
 9   Status                600 non-null    object 
 10  0                     1535 non-null   float64
 11  2                     1535 non-null   object 
 12  3                     1535 non-null   object 
 13  4                     1535 non-null   object 
 14  5                     944 non-null    object 
 15  6       

# Dataset Final Preparations for Months of July 2024 to May 2025
Should be run manually

| Column | Description |
|--------|-------------|
| `df['0']` | Control numbers |
| `df['2']` | Account names |
| `df['3']` | Service addresses |
| `df['4']` | Types |
| `df['5']` | Meter serial numbers |
| `df['6']` | Previous readings |
| `df['7']` | Present readings |
| `df['8']` | Last present readings |
| `df['9']` | Consumptions |
| `df['13']` | Statuses |

In [123]:
# Final 2024 dataset preparation
from pathlib import Path
import pandas as pd

source_folder = 'staging'
output_folder = 'compiled'
month = 'JAN'
year = '2025'
base_dir = Path(f'../../dataset/raw/{year}/{source_folder}/{month}_{year}.csv')

df = pd.read_csv(base_dir)
display(df.head())
display(df.columns)
display(df.info())


has_invalid_columns = False
if any(col_name in df.columns for col_name in ['0', '2', '3', '4', '5', '6', '7', '8', '9', '13']):
    has_invalid_columns = True

Unnamed: 0,Control No.,SC,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Last Present Reading,Consumption,...,4,5,6,7,8,9,10,11,12,13
0,500118.0,,"Abejero, Ernesto Jr.","Altarejos St., Dist. 3",R,,7714,7729,7714,15,...,,,,,,,,,,
1,501026.0,,"Barrun, Ana","Altarejos St., Dist. 3",R,,1082,1090,1082,8,...,,,,,,,,,,
2,500810.0,,"Almerol, Nilo","Altarejos St., Dist. 3",C,,641,647,641,6,...,,,,,,,,,,
3,501534.0,,"Almiñe, Nixon","Altarejos St., Dist. 3",R,,443,455,443,12,...,,,,,,,,,,
4,500447.0,,"Almodiel, Fe","Altarejos St., Dist. 3",R,O151660,nr,nr,nr,nr,...,,,,,,,,,,


Index(['Control No.', 'SC', 'Account Name', 'Service Address', 'Type',
       'Meter Serial No.', 'Previous Reading', 'Present Reading',
       'Last Present Reading', 'Consumption', 'Maintenance Service Charge',
       'Other Charges & Penalty', 'Previous Unpaid Balance', 'Status', '0',
       '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2156 entries, 0 to 2155
Data columns (total 28 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Control No.                 685 non-null    float64
 1   SC                          0 non-null      float64
 2   Account Name                685 non-null    object 
 3   Service Address             685 non-null    object 
 4   Type                        684 non-null    object 
 5   Meter Serial No.            119 non-null    object 
 6   Previous Reading            685 non-null    object 
 7   Present Reading             681 non-null    object 
 8   Last Present Reading        594 non-null    object 
 9   Consumption                 685 non-null    object 
 10  Maintenance Service Charge  0 non-null      float64
 11  Other Charges & Penalty     0 non-null      float64
 12  Previous Unpaid Balance     0 non-null      float64
 13  Status                      646 n

None

In [None]:
if has_invalid_columns:
    temp_df = df[['0', '2', '3', '4', '5', '6', '7', '8', '9', '13']].copy()
    temp_df.dropna(how='all', inplace=True, axis=0)
    display(temp_df.head())
    display(temp_df.info())
else:
    final_df = df.copy()
    display(final_df.info())

Unnamed: 0,0,2,3,4,5,6,7,8,9,13
81,501400.0,"Alcantara, Sheena","Lique St., Dist. 1",C,,1219,1233,1219,14,active
82,500256.0,"Almodal, Evelyn","Lique St., Dist. 1",C,,6561,6580,6561,19,active
83,501966.0,"Almodal, Mabel","Lique St., Dist. 4",C,,1215,1262,1215,47,active
84,500506.0,"Almiñe, Filben","Lique St., Dist. 1",R,028200-02,380,392,380,12,active
85,501705.0,"Almodiel, Ullysis","Lique St., Dist. 4",R,,1175,1192,1175,17,active


<class 'pandas.core.frame.DataFrame'>
Index: 1471 entries, 81 to 2155
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1471 non-null   float64
 1   2       1471 non-null   object 
 2   3       1471 non-null   object 
 3   4       1471 non-null   object 
 4   5       185 non-null    object 
 5   6       1471 non-null   object 
 6   7       1466 non-null   object 
 7   8       1209 non-null   object 
 8   9       1471 non-null   object 
 9   13      1393 non-null   object 
dtypes: float64(1), object(9)
memory usage: 126.4+ KB


None

In [118]:
if has_invalid_columns:
    df.columns = df.columns.str.strip()
    new_df = df[['Control No.', 'Account Name', 'Service Address', 'Type', 'Meter Serial No.', 'Previous Reading', 'Present Reading', 'Consumption', 'Status']].copy()
    display(new_df.head())
    display(new_df.info())

else:
    final_df.info()

Unnamed: 0,Control No.,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Consumption,Status
0,500118.0,"Abejero, Ernesto Jr.","Altarejos St., Dist. 3",R,,7714,7729,15,active
1,501026.0,"Barrun, Ana","Altarejos St., Dist. 3",R,,1082,1090,8,active
2,500810.0,"Almerol, Nilo","Altarejos St., Dist. 3",C,,641,647,6,active
3,501534.0,"Almiñe, Nixon","Altarejos St., Dist. 3",R,,443,455,12,active
4,500447.0,"Almodiel, Fe","Altarejos St., Dist. 3",R,O151660,nr,nr,nr,active


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2156 entries, 0 to 2155
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Control No.       685 non-null    float64
 1   Account Name      685 non-null    object 
 2   Service Address   685 non-null    object 
 3   Type              684 non-null    object 
 4   Meter Serial No.  119 non-null    object 
 5   Previous Reading  685 non-null    object 
 6   Present Reading   681 non-null    object 
 7   Consumption       685 non-null    object 
 8   Status            646 non-null    object 
dtypes: float64(1), object(8)
memory usage: 151.7+ KB


None

In [119]:
if has_invalid_columns:
    new_col_names = {'0': 'Control No.', '2': 'Account Name', '3': 'Service Address', '4': 'Type', '5': 'Meter Serial No.', '6': 'Previous Reading', '7': 'Present Reading', '9': 'Consumption', '13': 'Status'}
    temp_df.rename(columns=new_col_names, inplace=True)
    temp_df = temp_df[['Control No.', 'Account Name', 'Service Address', 'Type', 'Meter Serial No.', 'Previous Reading', 'Present Reading', 'Consumption', 'Status']].copy()
    
    display(temp_df.info())
    display(temp_df.head())

else:
    final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1471 entries, 81 to 2155
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Control No.       1471 non-null   float64
 1   Account Name      1471 non-null   object 
 2   Service Address   1471 non-null   object 
 3   Type              1471 non-null   object 
 4   Meter Serial No.  185 non-null    object 
 5   Previous Reading  1471 non-null   object 
 6   Present Reading   1466 non-null   object 
 7   Consumption       1471 non-null   object 
 8   Status            1393 non-null   object 
dtypes: float64(1), object(8)
memory usage: 114.9+ KB


None

Unnamed: 0,Control No.,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Consumption,Status
81,501400.0,"Alcantara, Sheena","Lique St., Dist. 1",C,,1219,1233,14,active
82,500256.0,"Almodal, Evelyn","Lique St., Dist. 1",C,,6561,6580,19,active
83,501966.0,"Almodal, Mabel","Lique St., Dist. 4",C,,1215,1262,47,active
84,500506.0,"Almiñe, Filben","Lique St., Dist. 1",R,028200-02,380,392,12,active
85,501705.0,"Almodiel, Ullysis","Lique St., Dist. 4",R,,1175,1192,17,active


In [120]:
if has_invalid_columns:
    final_df = pd.concat([new_df, temp_df], axis=0, ignore_index=True)
    final_df.dropna(how='all', inplace=True, axis=0)
    display(final_df.head())
    display(final_df.info())

else:
    final_df.info()
    print("Year not supported")

Unnamed: 0,Control No.,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Consumption,Status
0,500118.0,"Abejero, Ernesto Jr.","Altarejos St., Dist. 3",R,,7714,7729,15,active
1,501026.0,"Barrun, Ana","Altarejos St., Dist. 3",R,,1082,1090,8,active
2,500810.0,"Almerol, Nilo","Altarejos St., Dist. 3",C,,641,647,6,active
3,501534.0,"Almiñe, Nixon","Altarejos St., Dist. 3",R,,443,455,12,active
4,500447.0,"Almodiel, Fe","Altarejos St., Dist. 3",R,O151660,nr,nr,nr,active


<class 'pandas.core.frame.DataFrame'>
Index: 2156 entries, 0 to 3626
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Control No.       2156 non-null   float64
 1   Account Name      2156 non-null   object 
 2   Service Address   2156 non-null   object 
 3   Type              2155 non-null   object 
 4   Meter Serial No.  304 non-null    object 
 5   Previous Reading  2156 non-null   object 
 6   Present Reading   2147 non-null   object 
 7   Consumption       2156 non-null   object 
 8   Status            2039 non-null   object 
dtypes: float64(1), object(8)
memory usage: 168.4+ KB


None

In [122]:
subset = ['Account Name', 'Control No.', 'Service Address']
duplicated = final_df[final_df.duplicated(subset=subset, keep=False)].copy()

display(duplicated.sort_values('Account Name'))
display(len(duplicated))

if len(duplicated) > 0:
    dropped_rows = final_df.duplicated(subset=subset, keep='first')
    final_df = final_df.drop_duplicates(subset=subset, keep='first').copy()

    print(f"Dropped {len(dropped_rows)} rows")

    display(final_df.info())
    display(final_df.head())

print('Final dataset length:', len(final_df))

Unnamed: 0,Control No.,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Consumption,Status


0

Final dataset length: 2156


In [124]:
final_df.to_csv(f'../../dataset/raw/{year}/{output_folder}/{month}_{year}.csv', index=False)

# Used for setting standard directory for compiled csv files

In [82]:
base_dir = Path('../../dataset/raw/2020/')
compiled_dir = base_dir / 'compiled'
os.makedirs(compiled_dir, exist_ok=True)

In [83]:
csv_files = [f for f in os.listdir(base_dir) if f.endswith('.csv')]
print(csv_files)

for file in csv_files:
    os.replace(base_dir / file, base_dir / 'compiled' / file)

['APR2020.csv', 'AUG2020.csv', 'DEC2020.csv', 'FEB2020.csv', 'JAN2020.csv', 'JUL2020.csv', 'JUN2020.csv', 'MAR2020.csv', 'MAR_APR2020.csv', 'MAY2020.csv', 'NOV2020.csv', 'OCT2020.csv', 'SEP2020.csv']


In [31]:
import re
from pathlib import Path

year = '2023'
base_dir = Path(f'../../dataset/raw/{year}/compiled/')
files = [f for f in base_dir.iterdir() if f.is_file()]
files.sort()

for f in files:
    orig_name = f.name
    new_name = re.sub(r'(\d)', r'_\1', orig_name, count=1)
    if orig_name != new_name:
        new_path = base_dir / new_name
        f.rename(new_path)
        print(f"Renamed: {orig_name} → {new_name}")
    else:
        print(f"Skipped (no digit): {orig_name}")

Renamed: APR2023.csv → APR_2023.csv
Renamed: AUG2023.csv → AUG_2023.csv
Renamed: DEC2023.csv → DEC_2023.csv
Renamed: FEB2023.csv → FEB_2023.csv
Renamed: JAN2023.csv → JAN_2023.csv
Renamed: JUL2023.csv → JUL_2023.csv
Renamed: JUN2023.csv → JUN_2023.csv
Renamed: MAR2023.csv → MAR_2023.csv
Renamed: MAY2023.csv → MAY_2023.csv
Renamed: OCT2023.csv → OCT_2023.csv
Renamed: SEP2023.csv → SEP_2023.csv
