In [1]:
import os 
import pandas as pd

# Compilation of raw/2024 AND raw/2025 datasets

Note: Manually replace the year of the year to be processed

Run this for excel files that have invalid column names such as 0, 2, 3, 4, 5, 6, 7, 8, 9, 13

In [176]:
import pandas as pd
from pathlib import Path
import os

year = '2025'

# 📁 Define the base directory
base_dir = Path(f'../../dataset/raw/{year}/')

def discover_header(file_path, sheet_name='Member Consumers Data'):
    # Temporarily read with no header to inspect rows
    temp_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None, usecols='A:N')

    for i, row in temp_df.iterrows():
        normalized = [str(cell).strip().lower() for cell in row]
        if any('name' in cell for cell in normalized):
            print(f"✅ {file_path}: Header found at row {i}")
            return i

    print("❌ Header not found — fallback to default row")
    return Exception("Header not found")

# 🧹 Sanitize sheet reader
def read_sheet(file_path, is_first):
    try:
        header_row = discover_header(file_path)
        return pd.read_excel(
            file_path,
            sheet_name='Member Consumers Data',
            skiprows=header_row if is_first else 6,
            usecols='A:N',
            header=None if not is_first else 0
        )
    except Exception as e:
        print(f"Failed to read {file_path}: {e}")
        return pd.DataFrame()

# 🚦 Main loop
output_folder = 'staging'
os.makedirs(base_dir / output_folder, exist_ok=True)
columns_to_exclude = ["Water Meter Serial #", "Meter Serial No."]

for folder in base_dir.iterdir():
    if not folder.is_dir():
        continue

    compiled_df = pd.DataFrame()
    try:
        for subfolder in folder.iterdir():
            if not subfolder.is_dir():
                continue

            # Filter valid Excel files
            excel_files = [f for f in subfolder.iterdir()
                           if f.suffix in ('.xlsm', '.xlsx') and not f.name.startswith('~')]

            for i, file_path in enumerate(excel_files):
                new_df = read_sheet(file_path, is_first=(i == 0))

                # ✅ Validate new_df before appending
                if not new_df.empty and new_df.dropna(axis=1, how='all').shape[1] > 0:
                    new_df.dropna(thresh=5, inplace=True, axis=0)
                    compiled_df = pd.concat([compiled_df, new_df], ignore_index=True)
                    compiled_df = compiled_df.loc[:, ~compiled_df.columns.isin(columns_to_exclude)].copy()
                    compiled_df.dropna(how='all', inplace=True, axis=1)

        # 📤 Export compiled data
        month = folder.name[0:3].upper()
        output_file = base_dir / output_folder / f'{month}_{year}.csv'
        compiled_df.to_csv(output_file, index=False)
        print(f"✅ Compiled: {output_file}")

    except Exception as e:
        print(f"Error accessing {folder.name}: {e}")

✅ ..\..\dataset\raw\2025\April 2025\HAZEL-Altarejos, Lique St\BILLING FORM_ALTAREJOS ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\HAZEL-Altarejos, Lique St\BILLING FORM_AMICAN ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\HAZEL-Altarejos, Lique St\BILLING FORM_LIQUE ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_BARTOLABAC ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_GRAFILO ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_VILLAMOR ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JOSHUA-Amican St., Burgos\BILLING FORM_BINALIW, BURGOS.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025\JOSHUA-Amican St., Burgos\BILLING FORM_NHA DUPLEX.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2025\April 2025

Run this for excel files which are already compiled

In [266]:
# Handle the conversion of xlsx to csv for datasets already in compiled form prior to preprocessing
import os 
from pathlib import Path
import pandas as pd

year = '2020'
output_folder = 'compiled'

def discover_header(file_path, sheet_name='Sheet1'):
    # Temporarily read with no header to inspect rows
    temp_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None, usecols='A:H')

    for i, row in temp_df.iterrows():
        normalized = [str(cell).strip().lower() for cell in row]
        if any('name' in cell for cell in normalized):
            print(f"✅ {file_path}: Header found at row {i}")
            return i

    print("❌ Header not found — fallback to default row")
    return Exception("Header not found")

base_dir = Path(f'../../dataset/raw/{year}/')

files = [f for f in base_dir.iterdir() if f.suffix in ('.xlsm', '.xlsx') and not f.name.startswith('~')]

output_dir = base_dir / output_folder
os.makedirs(output_dir, exist_ok=True)

columns_to_exclude = ["Water Meter Serial #", "Meter Serial No."]

for file in files:
    print(file.name)
    month = file.name[0:3].upper()

    header_row = discover_header(file)
    df = pd.read_excel(file, sheet_name = 'Sheet1', skiprows=header_row, usecols='A:H', header = 0)
    final_df = df.loc[:, ~df.columns.isin(columns_to_exclude)].copy()
    final_df.dropna(how='all', inplace=True, axis=1)
    final_df.to_csv(output_dir / f'{month}_{year}.csv', index=False)

AUGUST 10-SEPTEMBER 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\AUGUST 10-SEPTEMBER 10, 2020.xlsx: Header found at row 2
DECEMBER 10, 2019-JANUARY 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\DECEMBER 10, 2019-JANUARY 10, 2020.xlsx: Header found at row 0
FEBRUARY 10-MARCH 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\FEBRUARY 10-MARCH 10, 2020.xlsx: Header found at row 0
JANUARY 10-FEBRUARY 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\JANUARY 10-FEBRUARY 10, 2020.xlsx: Header found at row 0
JULY 10-AUGUST 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\JULY 10-AUGUST 10, 2020.xlsx: Header found at row 2
JUNE 10-JULY 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\JUNE 10-JULY 10, 2020.xlsx: Header found at row 2
MARCH 10-MAY 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\MARCH 10-MAY 10, 2020.xlsx: Header found at row 0
MAY 10-JUNE 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\MAY 10-JUNE 10, 2020.xlsx: Header found at row 0
NOVEMBER 10-DECEMBER 10, 2020.xlsx
✅ ..\..\dataset\raw\2020\NOVEMBER 10-DECEMBER 10, 2020.xlsx: Header found at row 0


In [177]:
# Final dataset view
base_dir = Path(f'../../dataset/raw/{year}/{output_folder}')
files = [f for f in base_dir.iterdir() if f.is_file() and not(f.name == 'compiled' or f.name.startswith('COM') or f.name.startswith('STA'))]

for file in files:
    print(file.name)
    df = pd.read_csv(file)
    print(df.info())

APR_2025.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2164 entries, 0 to 2163
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Control No.           629 non-null    float64
 1   Account Name          629 non-null    object 
 2   Service Address       629 non-null    object 
 3   Type                  629 non-null    object 
 4   Previous Reading      629 non-null    object 
 5   Present Reading       627 non-null    object 
 6   Last Present Reading  550 non-null    object 
 7   Consumption           629 non-null    object 
 8   Status                600 non-null    object 
 9   0                     1535 non-null   float64
 10  2                     1535 non-null   object 
 11  3                     1535 non-null   object 
 12  4                     1535 non-null   object 
 13  5                     944 non-null    object 
 14  6                     1535 non-null   object 
 15  7       

# Dataset Final Preparations for Months of July 2024 to May 2025
Should be run manually

| Column | Description |
|--------|-------------|
| `df['0']` | Control numbers |
| `df['2']` | Account names |
| `df['3']` | Service addresses |
| `df['4']` | Types |
| `df['5']` | Meter serial numbers |
| `df['6']` | Previous readings |
| `df['7']` | Present readings |
| `df['8']` | Last present readings |
| `df['9']` | Consumptions |
| `df['13']` | Statuses |

In [262]:
# Final 2024 dataset preparation
from pathlib import Path
import pandas as pd

source_folder = 'staging'
output_folder = 'compiled'
month = 'MAY'
year = '2025'
base_dir = Path(f'../../dataset/raw/{year}/{source_folder}/{month}_{year}.csv')

df = pd.read_csv(base_dir)
final_df = pd.DataFrame()

if any(col_name in df.columns for col_name in ['0', '2', '3', '4', '5', '6', '7', '8', '9', '13']):
    temp_df = df[['0', '2', '3', '4', '5', '6', '7', '8', '9', '13']].copy()
    temp_df.dropna(how='all', inplace=True, axis=0)

    columns = df.columns.str.strip()
    print(columns)
    new_df = df[['Control No.', 'Account Name', 'Service Address', 'Type', 'Previous Reading', 'Present Reading', 'Consumption', 'Status']].copy()
    new_col_names = {'0': 'Control No.', '2': 'Account Name', '3': 'Service Address', '4': 'Type', '6': 'Previous Reading', '7': 'Present Reading', '9': 'Consumption', '13': 'Status'}
    temp_df.rename(columns=new_col_names, inplace=True)
    temp_df = temp_df[['Control No.', 'Account Name', 'Service Address', 'Type', 'Previous Reading', 'Present Reading', 'Consumption', 'Status']].copy()
    
    display(temp_df.info())
    display(temp_df.head())

    final_df = pd.concat([new_df, temp_df], axis=0, ignore_index=True)

else:
    final_df = df.copy()

final_df.dropna(how='all', inplace=True, axis=0)
display(final_df.head())
display(final_df.info())

Index(['Control No.', 'Account Name', 'Service Address', 'Type',
       'Previous Reading', 'Present Reading', 'Last Present Reading',
       'Consumption', 'Status', '0', '2', '3', '4', '5', '6', '7', '8', '9',
       '13'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 1539 entries, 81 to 2167
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Control No.       1539 non-null   float64
 1   Account Name      1539 non-null   object 
 2   Service Address   1539 non-null   object 
 3   Type              1539 non-null   object 
 4   Previous Reading  1539 non-null   object 
 5   Present Reading   1530 non-null   object 
 6   Consumption       1539 non-null   object 
 7   Status            1463 non-null   object 
dtypes: float64(1), object(7)
memory usage: 108.2+ KB


None

Unnamed: 0,Control No.,Account Name,Service Address,Type,Previous Reading,Present Reading,Consumption,Status
81,501224.0,"Almerol, Salvador","Amican St., Dist. 3",R,8099,8197,98,active
82,501581.0,"Almine, Jimmy","Amican St., Dist. 3",R,2054,2076,22,active
83,501220.0,"Almocera, Francisco Jr.","Amican St., Dist. 3",R,1060,1060,0,
84,501898.0,"Almodiel, Jenny","Amican St., Dist. 3",R,317,318,1,active
85,501899.0,"Almoete, Joey","Amican St., Dist. 3",R,nr,nr,nr,active


Unnamed: 0,Control No.,Account Name,Service Address,Type,Previous Reading,Present Reading,Consumption,Status
0,500118.0,"Abejero, Ernesto Jr.","Altarejos St., Dist. 3",R,7782,moist,moist,active
1,500072.0,"A.R/ Alcantara, Jorge","Altarejos St., Dist. 3",R,255,265,10,active
2,500810.0,"Almerol, Nilo","Altarejos St., Dist. 3",C,684,696,12,active
3,501534.0,"Almiñe, Nixon","Altarejos St., Dist. 3",R,527,546,19,active
4,500447.0,"Almodiel, Fe","Altarejos St., Dist. 3",R,nr,nr,nr,active


<class 'pandas.core.frame.DataFrame'>
Index: 2168 entries, 0 to 3706
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Control No.       2168 non-null   float64
 1   Account Name      2168 non-null   object 
 2   Service Address   2168 non-null   object 
 3   Type              2168 non-null   object 
 4   Previous Reading  2168 non-null   object 
 5   Present Reading   2156 non-null   object 
 6   Consumption       2168 non-null   object 
 7   Status            2069 non-null   object 
dtypes: float64(1), object(7)
memory usage: 152.4+ KB


None

In [263]:
subset = ['Account Name', 'Control No.', 'Service Address']
duplicated = final_df[final_df.duplicated(subset=subset, keep=False)].copy()

display(duplicated.sort_values('Account Name'))
display(len(duplicated))

if len(duplicated) > 0:
    dropped_rows = final_df.duplicated(subset=subset, keep='first')
    final_df = final_df.drop_duplicates(subset=subset, keep='first').copy()

    print(f"Dropped {len(dropped_rows)} rows")

    display(final_df.info())
    display(final_df.head())

print('Final dataset length:', len(final_df))

to_save = input("Save dataset? (y/n)")
if to_save.lower() == 'y':
    final_df.to_csv(f'../../dataset/raw/{year}/{output_folder}/{month}_{year}.csv', index=False)
    print(f"✅ Saved to {month}_{year}.csv")

Unnamed: 0,Control No.,Account Name,Service Address,Type,Previous Reading,Present Reading,Consumption,Status
3374,501615.0,"D.J/ Espenilla, Ronnie","Fabmar St., Dist. 2",R,0,,0,
3479,501615.0,"D.J/ Espenilla, Ronnie","Fabmar St., Dist. 2",R,985,985.0,0,


2

Dropped 2168 rows
<class 'pandas.core.frame.DataFrame'>
Index: 2167 entries, 0 to 3706
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Control No.       2167 non-null   float64
 1   Account Name      2167 non-null   object 
 2   Service Address   2167 non-null   object 
 3   Type              2167 non-null   object 
 4   Previous Reading  2167 non-null   object 
 5   Present Reading   2155 non-null   object 
 6   Consumption       2167 non-null   object 
 7   Status            2069 non-null   object 
dtypes: float64(1), object(7)
memory usage: 152.4+ KB


None

Unnamed: 0,Control No.,Account Name,Service Address,Type,Previous Reading,Present Reading,Consumption,Status
0,500118.0,"Abejero, Ernesto Jr.","Altarejos St., Dist. 3",R,7782,moist,moist,active
1,500072.0,"A.R/ Alcantara, Jorge","Altarejos St., Dist. 3",R,255,265,10,active
2,500810.0,"Almerol, Nilo","Altarejos St., Dist. 3",C,684,696,12,active
3,501534.0,"Almiñe, Nixon","Altarejos St., Dist. 3",R,527,546,19,active
4,500447.0,"Almodiel, Fe","Altarejos St., Dist. 3",R,nr,nr,nr,active


Final dataset length: 2167
✅ Saved to MAY_2025.csv


# Used for setting standard directory for compiled csv files

In [82]:
base_dir = Path('../../dataset/raw/2020/')
compiled_dir = base_dir / 'compiled'
os.makedirs(compiled_dir, exist_ok=True)

In [83]:
csv_files = [f for f in os.listdir(base_dir) if f.endswith('.csv')]
print(csv_files)

for file in csv_files:
    os.replace(base_dir / file, base_dir / 'compiled' / file)

['APR2020.csv', 'AUG2020.csv', 'DEC2020.csv', 'FEB2020.csv', 'JAN2020.csv', 'JUL2020.csv', 'JUN2020.csv', 'MAR2020.csv', 'MAR_APR2020.csv', 'MAY2020.csv', 'NOV2020.csv', 'OCT2020.csv', 'SEP2020.csv']


In [31]:
import re
from pathlib import Path

year = '2023'
base_dir = Path(f'../../dataset/raw/{year}/compiled/')
files = [f for f in base_dir.iterdir() if f.is_file()]
files.sort()

for f in files:
    orig_name = f.name
    new_name = re.sub(r'(\d)', r'_\1', orig_name, count=1)
    if orig_name != new_name:
        new_path = base_dir / new_name
        f.rename(new_path)
        print(f"Renamed: {orig_name} → {new_name}")
    else:
        print(f"Skipped (no digit): {orig_name}")

Renamed: APR2023.csv → APR_2023.csv
Renamed: AUG2023.csv → AUG_2023.csv
Renamed: DEC2023.csv → DEC_2023.csv
Renamed: FEB2023.csv → FEB_2023.csv
Renamed: JAN2023.csv → JAN_2023.csv
Renamed: JUL2023.csv → JUL_2023.csv
Renamed: JUN2023.csv → JUN_2023.csv
Renamed: MAR2023.csv → MAR_2023.csv
Renamed: MAY2023.csv → MAY_2023.csv
Renamed: OCT2023.csv → OCT_2023.csv
Renamed: SEP2023.csv → SEP_2023.csv
