In [1]:
import os 
import pandas as pd

# Compilation of raw/2024 datasets

In [155]:
import pandas as pd
from pathlib import Path

# 📁 Define the base directory
base_dir = Path('../../dataset/raw/2024/')

def discover_header(file_path, sheet_name='Member Consumers Data'):
    # Temporarily read with no header to inspect rows
    temp_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None, usecols='A:N')

    for i, row in temp_df.iterrows():
        normalized = [str(cell).strip().lower() for cell in row]
        if any('name' in cell for cell in normalized):
            print(f"✅ {file_path}: Header found at row {i}")
            return i

    print("❌ Header not found — fallback to default row")
    return Exception("Header not found")

# 🧹 Sanitize sheet reader
def read_sheet(file_path, is_first):
    try:
        header_row = discover_header(file_path)
        return pd.read_excel(
            file_path,
            sheet_name='Member Consumers Data',
            skiprows=header_row if is_first else 6,
            usecols='A:N',
            header=None if not is_first else 0
        )
    except Exception as e:
        print(f"Failed to read {file_path}: {e}")
        return pd.DataFrame()

# 🚦 Main loop
for folder in base_dir.iterdir():
    if not folder.is_dir():
        continue

    compiled_df = pd.DataFrame()
    try:
        for subfolder in folder.iterdir():
            if not subfolder.is_dir():
                continue

            # Filter valid Excel files
            excel_files = [f for f in subfolder.iterdir()
                           if f.suffix in ('.xlsm', '.xlsx') and not f.name.startswith('~')]

            for i, file_path in enumerate(excel_files):
                new_df = read_sheet(file_path, is_first=(i == 0))

                # ✅ Validate new_df before appending
                if not new_df.empty and new_df.dropna(axis=1, how='all').shape[1] > 0:
                    new_df.dropna(thresh=5, inplace=True, axis=0)
                    compiled_df = pd.concat([compiled_df, new_df], ignore_index=True)

        # 📤 Export compiled data
        output_file = base_dir /'compiled' / f'{folder.name}.csv'
        compiled_df.to_csv(output_file, index=False)
        print(f"✅ Compiled: {output_file}")

    except Exception as e:
        print(f"Error accessing {folder.name}: {e}")

✅ ..\..\dataset\raw\2024\AUG_2024\HAZEL-Altarejos, Lique St\BILLING FORM_ALTAREJOS ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\AUG_2024\HAZEL-Altarejos, Lique St\BILLING FORM_LIQUE ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\AUG_2024\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_BARTOLABAC ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\AUG_2024\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_GRAFILO ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\AUG_2024\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_VILLAMOR ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\AUG_2024\JOSHUA-Amican St., Burgos\BILLING FORM_AMICAN ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\AUG_2024\JOSHUA-Amican St., Burgos\BILLING FORM_BINALIW, BURGOS.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\AUG_2024\JOSHUA-Amican St., Burgos\BILLING FORM_NHA DUPLEX.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\AUG_2024\JOSHUA-Amican St.

  compiled_df = pd.concat([compiled_df, new_df], ignore_index=True)


✅ ..\..\dataset\raw\2024\SEP_2024\JINKY-Bartolabac, Grafilo, Villamor St\BILLING FORM_VILLAMOR ST..xlsm: Header found at row 5


  compiled_df = pd.concat([compiled_df, new_df], ignore_index=True)


✅ ..\..\dataset\raw\2024\SEP_2024\JOSHUA-Amican St., Burgos\BILLING FORM_AMICAN ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\SEP_2024\JOSHUA-Amican St., Burgos\BILLING FORM_BINALIW, BURGOS.xlsm: Header found at row 5


  compiled_df = pd.concat([compiled_df, new_df], ignore_index=True)


✅ ..\..\dataset\raw\2024\SEP_2024\JOSHUA-Amican St., Burgos\BILLING FORM_NHA DUPLEX.xlsm: Header found at row 5


  compiled_df = pd.concat([compiled_df, new_df], ignore_index=True)


✅ ..\..\dataset\raw\2024\SEP_2024\JOSHUA-Amican St., Burgos\BILLING FORM_TALABA, BURGOS.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\SEP_2024\JOSHUA-Amican St., Burgos\BILLING FORM_TANGNAN, BURGOS.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\SEP_2024\JOSHUA-Amican St., Burgos\BILLING FORM_UPPER BLISS.xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\SEP_2024\JUVY-Bailon, Balcavem, Jones St\BILLING FORM_BAILON ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\SEP_2024\JUVY-Bailon, Balcavem, Jones St\BILLING FORM_BALCAVEM ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\SEP_2024\JUVY-Bailon, Balcavem, Jones St\BILLING FORM_JONES ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\SEP_2024\LUVILYN-Balintong, Don Juan, Flores, Moyot St\BILLING FORM_BALINTONG ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\2024\SEP_2024\LUVILYN-Balintong, Don Juan, Flores, Moyot St\BILLING FORM_DON JUAN REJUSO ST..xlsm: Header found at row 5
✅ ..\..\dataset\raw\202

In [156]:
# Handle the conversion of xlsx to csv

def discover_header(file_path, sheet_name='Sheet1'):
    # Temporarily read with no header to inspect rows
    temp_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None, usecols='A:H')

    for i, row in temp_df.iterrows():
        normalized = [str(cell).strip().lower() for cell in row]
        if any('name' in cell for cell in normalized):
            print(f"✅ {file_path}: Header found at row {i}")
            return i

    print("❌ Header not found — fallback to default row")
    return Exception("Header not found")

year = '2024'
base_dir = Path(f'../../dataset/raw/{year}/')

files = [f for f in base_dir.iterdir() if f.suffix in ('.xlsm', '.xlsx') and not f.name.startswith('~')]

output_dir = base_dir / 'compiled'
os.makedirs(output_dir, exist_ok=True)

for file in files:
    print(file.name)
    month = file.name[0:3].upper()

    header_row = discover_header(file)
    df = pd.read_excel(file, sheet_name = 'Sheet1', skiprows=header_row, usecols='A:H', header = 0)
    df.to_csv(output_dir / f'{month}_{year}.csv', index=False)

APRIL 1-30, 2024.xlsx
✅ ..\..\dataset\raw\2024\APRIL 1-30, 2024.xlsx: Header found at row 0
FEBRUARY 1-29, 2024.xlsx
✅ ..\..\dataset\raw\2024\FEBRUARY 1-29, 2024.xlsx: Header found at row 0
JANUARY 1-30, 2024.xlsx
✅ ..\..\dataset\raw\2024\JANUARY 1-30, 2024.xlsx: Header found at row 0
June 1-30, 2024.xlsx
✅ ..\..\dataset\raw\2024\June 1-30, 2024.xlsx: Header found at row 1
MARCH 1-31, 2024.xlsx
✅ ..\..\dataset\raw\2024\MARCH 1-31, 2024.xlsx: Header found at row 0
MAY 1-31, 2024.xlsx
✅ ..\..\dataset\raw\2024\MAY 1-31, 2024.xlsx: Header found at row 0


In [157]:
# Final 2024 dataset preparation
base_dir = Path('../../dataset/raw/2024/compiled/')
files = [f for f in base_dir.iterdir() if f.is_file()]

for file in files:
    print(file.name)
    df = pd.read_csv(file)
    df.dropna(how='all', inplace=True, axis=1)
    display(df.info())

APR_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Control Number   2129 non-null   object
 1   Consumer's Name  2129 non-null   object
 2   Address          2129 non-null   object
 3   Previous         2081 non-null   object
 4   Present          2082 non-null   object
 5   Cons.            2065 non-null   object
 6   Amount           1892 non-null   object
dtypes: object(7)
memory usage: 116.6+ KB


None

AUG_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Control No.           684 non-null    float64
 1   Account Name          684 non-null    object 
 2   Service Address       684 non-null    object 
 3   Type                  682 non-null    object 
 4   Meter Serial No.      90 non-null     object 
 5   Previous Reading      684 non-null    object 
 6   Present Reading       683 non-null    object 
 7   Last Present Reading  593 non-null    object 
 8   Consumption           684 non-null    object 
 9   Status                657 non-null    object 
 10  0                     1455 non-null   float64
 11  2                     1455 non-null   object 
 12  3                     1455 non-null   object 
 13  4                     1455 non-null   object 
 14  5                     154 non-null    object 
 15  6       

None

DEC_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2159 entries, 0 to 2158
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Control No.           684 non-null    float64
 1   Account Name          684 non-null    object 
 2   Service Address       684 non-null    object 
 3   Type                  683 non-null    object 
 4   Meter Serial No.      115 non-null    object 
 5   Previous Reading      684 non-null    object 
 6   Present Reading       683 non-null    object 
 7   Last Present Reading  599 non-null    object 
 8   Consumption           684 non-null    object 
 9   Status                653 non-null    object 
 10  0                     1474 non-null   float64
 11  2                     1475 non-null   object 
 12  3                     1475 non-null   object 
 13  4                     1475 non-null   object 
 14  5                     183 non-null    object 
 15  6       

None

FEB_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2112 entries, 0 to 2111
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Control Number   2112 non-null   object
 1   Consumer's Name  2112 non-null   object
 2   Address          2112 non-null   object
 3   Previous         2069 non-null   object
 4   Present          2073 non-null   object
 5   Cons.            2058 non-null   object
 6   Amount           1875 non-null   object
dtypes: object(7)
memory usage: 115.6+ KB


None

JAN_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2115 entries, 0 to 2114
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Control Number   2114 non-null   object
 1   Consumer's Name  2115 non-null   object
 2   Address          2115 non-null   object
 3   Previous         2058 non-null   object
 4   Present          2056 non-null   object
 5   Cons.            2045 non-null   object
 6   Amount           1848 non-null   object
dtypes: object(7)
memory usage: 115.8+ KB


None

JUL_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2122 entries, 0 to 2121
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Control No.           609 non-null    float64
 1   Account Name          609 non-null    object 
 2   Service Address       609 non-null    object 
 3   Type                  604 non-null    object 
 4   Meter Serial No.      90 non-null     object 
 5   Previous Reading      609 non-null    object 
 6   Present Reading       609 non-null    object 
 7   Last Present Reading  609 non-null    object 
 8   Consumption           609 non-null    object 
 9   Status                583 non-null    object 
 10  0                     1513 non-null   float64
 11  2                     1513 non-null   object 
 12  3                     1460 non-null   object 
 13  4                     1513 non-null   object 
 14  5                     153 non-null    object 
 15  6       

None

JUN_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2133 entries, 0 to 2132
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Control Number   2132 non-null   float64
 1   Consumer's Name  2132 non-null   object 
 2   Address          2132 non-null   object 
 3   Previous         2106 non-null   object 
 4   Present          2108 non-null   object 
 5   Cons.            2133 non-null   object 
 6   Amount           1918 non-null   object 
dtypes: float64(1), object(6)
memory usage: 116.8+ KB


None

MAR_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2113 entries, 0 to 2112
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Control Number   2112 non-null   object
 1   Consumer's Name  2113 non-null   object
 2   Address          2113 non-null   object
 3   Previous         2078 non-null   object
 4   Present          2059 non-null   object
 5   Cons.            2047 non-null   object
 6   Amount           1870 non-null   object
dtypes: object(7)
memory usage: 115.7+ KB


None

MAY_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2131 entries, 0 to 2130
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Control Number   2131 non-null   int64 
 1   Consumer's Name  2131 non-null   object
 2   Address          2131 non-null   object
 3   Previous         2087 non-null   object
 4   Present          2092 non-null   object
 5   Cons.            2076 non-null   object
 6   Amount           1905 non-null   object
dtypes: int64(1), object(6)
memory usage: 116.7+ KB


None

NOV_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Control No.           685 non-null    float64
 1   Account Name          685 non-null    object 
 2   Service Address       685 non-null    object 
 3   Type                  684 non-null    object 
 4   Meter Serial No.      115 non-null    object 
 5   Previous Reading      685 non-null    object 
 6   Present Reading       685 non-null    object 
 7   Last Present Reading  600 non-null    object 
 8   Consumption           685 non-null    object 
 9   Status                656 non-null    object 
 10  0                     1475 non-null   float64
 11  2                     1475 non-null   object 
 12  3                     1475 non-null   object 
 13  4                     1475 non-null   object 
 14  5                     183 non-null    object 
 15  6       

None

OCT_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2153 entries, 0 to 2152
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Control No.           685 non-null    float64
 1   Account Name          685 non-null    object 
 2   Service Address       685 non-null    object 
 3   Type                  685 non-null    object 
 4   Meter Serial No.      115 non-null    object 
 5   Previous Reading      685 non-null    object 
 6   Present Reading       684 non-null    object 
 7   Last Present Reading  594 non-null    object 
 8   Consumption           685 non-null    object 
 9   Status                659 non-null    object 
 10  0                     1468 non-null   float64
 11  2                     1468 non-null   object 
 12  3                     1468 non-null   object 
 13  4                     1468 non-null   object 
 14  5                     184 non-null    object 
 15  6       

None

SEP_2024.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2145 entries, 0 to 2144
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Control No.           682 non-null    float64
 1   Account Name          682 non-null    object 
 2   Service Address       682 non-null    object 
 3   Type                  682 non-null    object 
 4   Meter Serial No.      90 non-null     object 
 5   Previous Reading      682 non-null    object 
 6   Present Reading       682 non-null    object 
 7   Last Present Reading  596 non-null    object 
 8   Consumption           682 non-null    object 
 9   Status                651 non-null    object 
 10  0                     1462 non-null   float64
 11  2                     1462 non-null   object 
 12  3                     1463 non-null   object 
 13  4                     1460 non-null   object 
 14  5                     161 non-null    object 
 15  6       

None

# Dataset Final Preparations for Months of Jul-Dec 2024
Should be run manually

| Column | Description |
|--------|-------------|
| `df['0']` | Control numbers |
| `df['2']` | Account names |
| `df['3']` | Service addresses |
| `df['4']` | Types |
| `df['5']` | Meter serial numbers |
| `df['6']` | Previous readings |
| `df['7']` | Present readings |
| `df['8']` | Last present readings |
| `df['9']` | Consumptions |
| `df['13']` | Statuses |

In [21]:
# Final 2024 dataset preparation
from pathlib import Path
import pandas as pd

month = 'DEC'
year = '2024'
base_dir = Path(f'../../dataset/raw/{year}/compiled/{month}_{year}.csv')
col_name = 13

df = pd.read_csv(base_dir)
display(df.head())
display(df.columns)

df[f'{col_name}'].value_counts()

Unnamed: 0,Control No.,SC,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Last Present Reading,Consumption,...,4,5,6,7,8,9,10,11,12,13
0,500118.0,,"Abejero, Ernesto Jr.","Altarejos St., Dist. 3",R,,7699,7714,7699,15,...,,,,,,,,,,
1,501026.0,,"Barrun, Ana","Altarejos St., Dist. 3",R,,1051,1082,1051,31,...,,,,,,,,,,
2,500810.0,,"Almerol, Nilo","Altarejos St., Dist. 3",C,,621,641,621,20,...,,,,,,,,,,
3,501534.0,,"Almiñe, Nixon","Altarejos St., Dist. 3",R,,412,443,412,31,...,,,,,,,,,,
4,500447.0,,"Almodiel, Fe","Altarejos St., Dist. 3",R,O151660,nr,nr,nr,nr,...,,,,,,,,,,


Index(['Control No.', 'SC', 'Account Name', 'Service Address', 'Type',
       'Meter Serial No.', 'Previous Reading', 'Present Reading',
       'Last Present Reading', 'Consumption', 'Maintenance Service Charge',
       'Other Charges & Penalty', 'Previous Unpaid Balance', 'Status', '0',
       '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13'],
      dtype='object')

13
active           1070
disconnected      324
meter deffect       1
Name: count, dtype: int64

In [22]:
temp_df = df[['0', '2', '3', '4', '5', '6', '7', '8', '9', '13']].copy()
temp_df.dropna(how='all', inplace=True, axis=0)
display(temp_df.head())
display(temp_df.info())

Unnamed: 0,0,2,3,4,5,6,7,8,9,13
82,501400.0,"Alcantara, Sheena","Lique St., Dist. 1",C,,1184,1219,1184,35,active
83,500256.0,"Almodal, Evelyn","Lique St., Dist. 1",C,,6522,6561,6522,39,active
84,501966.0,"Almodal, Mabel","Lique St., Dist. 4",C,,1146,1215,1146,69,active
85,500506.0,"Almiñe, Filben","Lique St., Dist. 1",R,028200-02,357,380,357,23,active
86,501705.0,"Almodiel, Ullysis","Lique St., Dist. 4",R,,1144,1175,1144,31,active


<class 'pandas.core.frame.DataFrame'>
Index: 1475 entries, 82 to 2158
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       1474 non-null   float64
 1   2       1475 non-null   object 
 2   3       1475 non-null   object 
 3   4       1475 non-null   object 
 4   5       183 non-null    object 
 5   6       1475 non-null   object 
 6   7       1470 non-null   object 
 7   8       1216 non-null   object 
 8   9       1475 non-null   object 
 9   13      1395 non-null   object 
dtypes: float64(1), object(9)
memory usage: 126.8+ KB


None

In [23]:
df.columns
new_df = df[['Control No.', 'Account Name', 'Service Address', 'Type', 'Meter Serial No.', 'Previous Reading', 'Present Reading', 'Consumption', 'Status']].copy()
display(new_df.head())

Unnamed: 0,Control No.,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Consumption,Status
0,500118.0,"Abejero, Ernesto Jr.","Altarejos St., Dist. 3",R,,7699,7714,15,active
1,501026.0,"Barrun, Ana","Altarejos St., Dist. 3",R,,1051,1082,31,active
2,500810.0,"Almerol, Nilo","Altarejos St., Dist. 3",C,,621,641,20,active
3,501534.0,"Almiñe, Nixon","Altarejos St., Dist. 3",R,,412,443,31,active
4,500447.0,"Almodiel, Fe","Altarejos St., Dist. 3",R,O151660,nr,nr,nr,active


In [24]:
new_col_names = {'0': 'Control No.', '2': 'Account Name', '3': 'Service Address', '4': 'Type', '5': 'Meter Serial No.', '6': 'Previous Reading', '7': 'Present Reading', '9': 'Consumption', '13': 'Status'}
temp_df.rename(columns=new_col_names, inplace=True)
temp_df = temp_df[['Control No.', 'Account Name', 'Service Address', 'Type', 'Meter Serial No.', 'Previous Reading', 'Present Reading', 'Consumption', 'Status']].copy()

display(temp_df.head())

Unnamed: 0,Control No.,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Consumption,Status
82,501400.0,"Alcantara, Sheena","Lique St., Dist. 1",C,,1184,1219,35,active
83,500256.0,"Almodal, Evelyn","Lique St., Dist. 1",C,,6522,6561,39,active
84,501966.0,"Almodal, Mabel","Lique St., Dist. 4",C,,1146,1215,69,active
85,500506.0,"Almiñe, Filben","Lique St., Dist. 1",R,028200-02,357,380,23,active
86,501705.0,"Almodiel, Ullysis","Lique St., Dist. 4",R,,1144,1175,31,active


In [25]:
concat_df = pd.concat([new_df, temp_df], axis=0, ignore_index=True)
concat_df.head()

Unnamed: 0,Control No.,Account Name,Service Address,Type,Meter Serial No.,Previous Reading,Present Reading,Consumption,Status
0,500118.0,"Abejero, Ernesto Jr.","Altarejos St., Dist. 3",R,,7699,7714,15,active
1,501026.0,"Barrun, Ana","Altarejos St., Dist. 3",R,,1051,1082,31,active
2,500810.0,"Almerol, Nilo","Altarejos St., Dist. 3",C,,621,641,20,active
3,501534.0,"Almiñe, Nixon","Altarejos St., Dist. 3",R,,412,443,31,active
4,500447.0,"Almodiel, Fe","Altarejos St., Dist. 3",R,O151660,nr,nr,nr,active


In [None]:
concat_df.to_csv(f'../../dataset/raw/{year}/compiled/{month}{year}.csv', index=False)

# Used for setting standard directory for compiled csv files

In [82]:
base_dir = Path('../../dataset/raw/2020/')
compiled_dir = base_dir / 'compiled'
os.makedirs(compiled_dir, exist_ok=True)

In [83]:
csv_files = [f for f in os.listdir(base_dir) if f.endswith('.csv')]
print(csv_files)

for file in csv_files:
    os.replace(base_dir / file, base_dir / 'compiled' / file)

['APR2020.csv', 'AUG2020.csv', 'DEC2020.csv', 'FEB2020.csv', 'JAN2020.csv', 'JUL2020.csv', 'JUN2020.csv', 'MAR2020.csv', 'MAR_APR2020.csv', 'MAY2020.csv', 'NOV2020.csv', 'OCT2020.csv', 'SEP2020.csv']


In [31]:
import re
from pathlib import Path

year = '2023'
base_dir = Path(f'../../dataset/raw/{year}/compiled/')
files = [f for f in base_dir.iterdir() if f.is_file()]
files.sort()

for f in files:
    orig_name = f.name
    new_name = re.sub(r'(\d)', r'_\1', orig_name, count=1)
    if orig_name != new_name:
        new_path = base_dir / new_name
        f.rename(new_path)
        print(f"Renamed: {orig_name} → {new_name}")
    else:
        print(f"Skipped (no digit): {orig_name}")

Renamed: APR2023.csv → APR_2023.csv
Renamed: AUG2023.csv → AUG_2023.csv
Renamed: DEC2023.csv → DEC_2023.csv
Renamed: FEB2023.csv → FEB_2023.csv
Renamed: JAN2023.csv → JAN_2023.csv
Renamed: JUL2023.csv → JUL_2023.csv
Renamed: JUN2023.csv → JUN_2023.csv
Renamed: MAR2023.csv → MAR_2023.csv
Renamed: MAY2023.csv → MAY_2023.csv
Renamed: OCT2023.csv → OCT_2023.csv
Renamed: SEP2023.csv → SEP_2023.csv
