# Record Cleaning

**Metadata Summary**  
- 📅 **Date of Retrieval:** JULY 1, 2025  
- 🌐 **Source of Data:** LGU San Jacinto Treasury Records
- 📄 **License/Permission:**  
- 🧑‍💼 **Prepared by:** MARK JUNE E. ALMOJUELA

# PREPARATIONS

In [None]:
# Initial Library Import
import pandas as pd
import numpy as np
import os

In [3]:
# File Exploration

data_dir_path = "../../dataset/raw/"
try:
    years = [year for year in os.listdir(data_dir_path) if os.path.isdir(os.path.join(data_dir_path, year))]
    print("Available years: ", years, '\n')

    # List xlsx files in the directory
    for year in years:
        csv_files = [f for f in os.listdir(f'{data_dir_path}{year}') if f.endswith(".csv")]
        print(f"{year}: {csv_files}; \nCOUNT: {len(csv_files)}")
except FileNotFoundError:
    print("Directory not found")
except NotADirectoryError:
    print("Path is not a directory")
except PermissionError:
    print("Permission denied")
except Exception as e:
    print(f"An error occurred: {e}")

Available years:  ['2020', '2021', '2022', '2023', '2024', '2025'] 

2020: ['APR2020.csv', 'AUG2020.csv', 'DEC2020.csv', 'FEB2020.csv', 'JAN2020.csv', 'JUL2020.csv', 'JUN2020.csv', 'MAR2020.csv', 'MAR_APR2020.csv', 'MAY2020.csv', 'NOV2020.csv', 'OCT2020.csv', 'SEP2020.csv']; 
COUNT: 13
2021: []; 
COUNT: 0
2022: ['APR2022.csv', 'DEC2022.csv', 'FEB2022.csv', 'JAN2022.csv', 'JUL2022.csv', 'JUN2022.csv', 'MAR2022.csv', 'MAY2022.csv', 'NOV2022.csv', 'OCT2022.csv']; 
COUNT: 10
2023: []; 
COUNT: 0
2024: []; 
COUNT: 0
2025: []; 
COUNT: 0


# INITIAL NORMALIZATION

In [33]:
# Initialization
%pip install dash 
%pip install missingno

import dash
from dash import dcc, html
import plotly.express as px
import pandas as pd
import re
import os
from dash import dcc, html, dash_table
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import missingno as msno
from dash.dependencies import Input, Output
import io
import base64
from PIL import Image
import matplotlib.pyplot as plt

# NOTE: Run this cell only once

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
year = "2020"
month = "JAN"

# Paths and Filenames
raw_data_dir_path = "../../dataset/raw/"
cleaned_data_dir_path = "../../dataset/preprocessed/"


In [35]:
# Dataset Exploration

data_dir_path = raw_data_dir_path

raw_df = pd.read_csv(f"{data_dir_path}{year}/{month}{year}.csv", encoding="latin-1")

def check_instance(x):
    if isinstance(x, str) and re.search(r"[a-zA-Z]", x):
        non_numeric_value_list.append(x)
    else:   
        pass

print(f"Shape of {month}{year}:", raw_df.shape)
display(raw_df.head())
print(f"\nDuplicate records in {month}{year}:", raw_df.duplicated().sum())
print(f"Duplicate control number count in {month}{year}:", raw_df["Control Number"].duplicated().sum())

print(f"\nMissing values in {month}{year}:\n", raw_df.isnull().sum())

non_numeric_value_list = []

raw_df["Present"].apply(lambda x: check_instance(x))
raw_df["Previous"].apply(lambda x: check_instance(x))

non_numeric_value_list = set(non_numeric_value_list)

print(f"\nNon-numeric values in {month}{year}: {non_numeric_value_list}")
print(f"Non-numeric values count in {month}{year}: {len(non_numeric_value_list)}")

display(raw_df[raw_df["Previous"].isin(non_numeric_value_list)])


Shape of JAN2020: (1579, 8)


Unnamed: 0,Control Number,Consumer's Name,Address,Water Meter Serial #,Previous,Present,Cons.,Amount
0,501549.0,"Albaño, Lilane",Alicante St.,,198,207,9.0,60.0
1,500750.0,"Aljecera, Marcelino",Alicante St.,,DEFECT,3023,,60.0
2,500990.0,"Almiñana, Irus",Alicante St.,,437,471,34.0,204.0
3,500505.0,"Almiñe, Edison",Alicante St.,95022096.0,DEFECT,DEFECT,,60.0
4,501542.0,"Almiñe, Filben",Alicante St.,,3211,3252,41.0,246.0



Duplicate records in JAN2020: 0
Duplicate control number count in JAN2020: 18

Missing values in JAN2020:
 Control Number            7
Consumer's Name           0
Address                   0
Water Meter Serial #    642
Previous                160
Present                 241
Cons.                   539
Amount                  379
dtype: int64

Non-numeric values in JAN2020: {'DI21', 'DI61', 'NR', 'DISC.', 'DI07', 'DI43', 'DE12', 'TEMP. CLOSED', 'DEFECT'}
Non-numeric values count in JAN2020: 9


Unnamed: 0,Control Number,Consumer's Name,Address,Water Meter Serial #,Previous,Present,Cons.,Amount
1,500750.0,"Aljecera, Marcelino",Alicante St.,,DEFECT,3023,,60.00
3,500505.0,"Almiñe, Edison",Alicante St.,95022096,DEFECT,DEFECT,,60.00
5,500431.0,"Almiñe, Franchie",Alicante St.,121006093,DISC.,,,
6,500263.0,"Almodal, Arna",Alicante St.,9588526,DEFECT,5185,,60.00
23,501232.0,"Anabe, Antonio",Alicante St.,08-82569,DEFECT,1087,,60.00
...,...,...,...,...,...,...,...,...
1499,500716.0,"Espenilla, Analyn",Moyot St.,028118-02,DEFECT,DEFECT,,60.00
1501,500211.0,"Espenilla, Dominador",Moyot St.,,DEFECT,DEFECT,,60.00
1508,501389.0,"Grencio, Joel",Moyot St.,,DEFECT,DEFECT,,60.00
1509,501179.0,Giray Josephine,Moyot St.,10087010,DEFECT,DEFECT,,60.00


In [36]:
# Dataset Cleaning Script: Billing Data Preprocessor (JAN 2020)
# Author: [Your Name or Team]
# Purpose: Detect anomalies, correct missing values, and export a cleaned dataset
# ----------------------------------------------------------------------------------

# Known Anomaly Codes (Reading Flags)
invalid_reading_codes = {
    'DE12': 'Invalid Reading',
    'NR': 'Invalid Reading',
    'DI61': 'Invalid Reading',
    'DI21': 'Invalid Reading',
    'DI43': 'Invalid Reading',
    'DI07': 'Invalid Reading',
    'DISC.': 'Disconnected',
    'DEFECT': 'Defect',
    'TEMP. CLOSED': 'Temporarily Closed'
}

# Helper Function: Check if Value Is Numeric
def is_numeric(value):
    try:
        if pd.isna(value):
            return False
        float(value)
        return True
    except (ValueError, TypeError):
        return False

# Core Cleaning Function: Processes Raw Billing Data
def clean_billing_df(df):
    # Initialize output columns
    df["Cleaned Previous"] = None
    df["Cleaned Present"] = None
    df["Connection Status"] = None
    df["Record Status"] = None

    # Row-wise cleansing logic
    for index, row in df.iterrows():
        present_val = row["Present"]
        previous_val = row["Previous"]

        present = str(present_val) if pd.notna(present_val) else ""
        previous = str(previous_val) if pd.notna(previous_val) else ""

        # Case 1: Anomaly code detected in Present or Previous
        if present in invalid_reading_codes or previous in invalid_reading_codes:
            code = present if present in invalid_reading_codes else previous
            df.at[index, "Connection Status"] = invalid_reading_codes[code]

            # Retain usable numeric data if available
            if pd.isna(row["Previous"]) and pd.isna(row["Present"]):
                df.at[index, "Record Status"] = "Empty"
            else:
                df.at[index, "Record Status"] = "Corrected"
            
            df.at[index, "Cleaned Previous"] = row["Previous"] if is_numeric(row["Previous"]) else (
                row["Present"] if is_numeric(row["Present"]) else None)
            df.at[index, "Cleaned Present"] = row["Present"] if is_numeric(row["Present"]) else (
                row["Previous"] if is_numeric(row["Previous"]) else None)

            # Sanitize raw values for downstream use
            df.at[index, "Previous"] = None if previous in invalid_reading_codes else row["Previous"]
            df.at[index, "Present"] = None if present in invalid_reading_codes else row["Present"]

        # Case 2: Missing Present but valid Previous
        elif pd.isna(row["Present"]) and is_numeric(row["Previous"]):
            df.at[index, "Cleaned Previous"] = row["Previous"]
            df.at[index, "Cleaned Present"] = row["Previous"]
            df.at[index, "Record Status"] = "Corrected"
            df.at[index, "Connection Status"] = "Active"

        # Case 3: Missing Previous but valid Present
        elif is_numeric(row["Present"]) and pd.isna(row["Previous"]):
            df.at[index, "Cleaned Previous"] = row["Present"]
            df.at[index, "Cleaned Present"] = row["Present"]
            df.at[index, "Record Status"] = "Corrected"
            df.at[index, "Connection Status"] = "Active"

        # Case 4: Both values are valid and numeric
        elif is_numeric(row["Present"]) and is_numeric(row["Previous"]):
            df.at[index, "Cleaned Previous"] = row["Previous"]
            df.at[index, "Cleaned Present"] = row["Present"]
            df.at[index, "Record Status"] = "Unchanged"
            df.at[index, "Connection Status"] = "Active"

        # Case 5: Both values missing
        elif pd.isna(row["Present"]) and pd.isna(row["Previous"]):
            df.at[index, "Connection Status"] = "Unknown"
            df.at[index, "Cleaned Previous"] = None
            df.at[index, "Cleaned Present"] = None
            df.at[index, "Record Status"] = "Empty"

        # Case 6: Fallback condition
        else:
            df.at[index, "Connection Status"] = "Unknown"

    return df

# 📐 Desired Output Column Order (Report-Ready)
ordered_columns = [
    "Control Number",
    "Consumer's Name",
    "Address",
    "Water Meter Serial #",
    "Previous",
    "Present",
    "Cleaned Previous",
    "Cleaned Present",
    "Record Status",
    "Connection Status"
]

# 📦 Load Raw CSV
file_path = f"{raw_data_dir_path}{year}/{month}{year}.csv"
raw_df = pd.read_csv(file_path, encoding="latin-1")

# 🧼 Run Cleaning Logic
cleaned_df = clean_billing_df(raw_df)

# 🧾 Reorder Columns for Consistent Output
final_columns = [col for col in ordered_columns if col in cleaned_df.columns]
ordered_cleaned_df = cleaned_df[final_columns]

# 👀 Preview Top 5 Cleaned Records
print(f"\n📊 Cleaned Preview: {month} {year}")
display(ordered_cleaned_df.head())

# 📈 Status Summary
print("\n🧾 Record Status Summary:")
print(ordered_cleaned_df["Record Status"].value_counts())

print("\n🧾 Dataset Shape: ")
print(ordered_cleaned_df.shape)

# 💾 Save Cleaned Dataset
cleaned_file_path = f"{cleaned_data_dir_path}{year}/{month}{year}_preprocessed.csv"
os.makedirs(f"{cleaned_data_dir_path}/{year}", exist_ok=True)
ordered_cleaned_df.to_csv(cleaned_file_path, index=False)

print(f"\n✅ Cleaned data exported successfully to: {cleaned_file_path}")


📊 Cleaned Preview: JAN 2020


Unnamed: 0,Control Number,Consumer's Name,Address,Water Meter Serial #,Previous,Present,Cleaned Previous,Cleaned Present,Record Status,Connection Status
0,501549.0,"Albaño, Lilane",Alicante St.,,198.0,207.0,198.0,207.0,Unchanged,Active
1,500750.0,"Aljecera, Marcelino",Alicante St.,,,3023.0,3023.0,3023.0,Corrected,Defect
2,500990.0,"Almiñana, Irus",Alicante St.,,437.0,471.0,437.0,471.0,Unchanged,Active
3,500505.0,"Almiñe, Edison",Alicante St.,95022096.0,,,,,Corrected,Defect
4,501542.0,"Almiñe, Filben",Alicante St.,,3211.0,3252.0,3211.0,3252.0,Unchanged,Active



🧾 Record Status Summary:
Record Status
Unchanged    1086
Corrected     368
Empty         125
Name: count, dtype: int64

🧾 Dataset Shape: 
(1579, 10)

✅ Cleaned data exported successfully to: ../../dataset/preprocessed/2020/JAN2020_preprocessed.csv


In [38]:
# Load your cleaned dataset
df = ordered_cleaned_df

# 1. Dataset Overview
shape_info = f"Dataset Shape: {df.shape[0]} rows × {df.shape[1]} columns"

# 2. Create figures with white background
# Common layout settings
white_bg = dict(
    plot_bgcolor='white',
    paper_bgcolor='white',
    font=dict(color='black'),
    xaxis=dict(showgrid=False, linecolor='black', showline=True),
    yaxis=dict(showgrid=False, linecolor='black', showline=True)
)

# Record Status Distribution
status_counts = df["Record Status"].value_counts().reset_index()
status_counts.columns = ["Status", "Count"]
status_fig = px.pie(status_counts, names="Status", values="Count", 
                   title="Record Status Distribution",
                   color_discrete_sequence=px.colors.qualitative.Set3)
status_fig.update_layout(white_bg)

# Connection Status Distribution
connection_counts = df["Connection Status"].value_counts().reset_index()
connection_counts.columns = ["Status", "Count"]
connection_fig = px.bar(connection_counts, x="Status", y="Count", 
                       title="Connection Status Distribution",
                       color="Status",
                       color_discrete_sequence=px.colors.qualitative.Pastel)
connection_fig.update_layout(white_bg)

# Summary Statistics for Selected Columns
selected_columns = ['Present', 'Previous', 'Cleaned Present', 'Cleaned Previous']
summary_stats = df[selected_columns].describe().reset_index()

# Initialize the app
app = dash.Dash(__name__)

# App layout
app.layout = html.Div(style={'backgroundColor': 'white'}, children=[
    html.Div([
        html.H1(f"{month} {year} Billing Data Analysis", 
               style={"textAlign": "center", "margin-bottom": "20px", "color": "black"}),
        
        # Dataset Overview Card
        html.Div([
            html.H3("📊 Dataset Overview", style={"color": "black"}),
            html.P(shape_info, style={"color": "black"}),
            html.Hr(),
            
            # Summary Statistics for Selected Columns
            html.H4("Summary Statistics (Numeric Columns)", style={"color": "black", "marginTop": "20px"}),
            dash_table.DataTable(
                columns=[{"name": i, "id": i} for i in summary_stats.columns],
                data=summary_stats.to_dict('records'),
                style_table={'overflowX': 'auto'},
                style_cell={
                    'textAlign': 'left', 
                    'padding': '8px',
                    'color': 'black',
                    'backgroundColor': 'white',
                    'border': '1px solid #ddd'
                },
                style_header={
                    'backgroundColor': 'rgb(230, 230, 230)',
                    'fontWeight': 'bold',
                    'color': 'black'
                },
                style_data_conditional=[
                    {
                        'if': {'row_index': 'odd'},
                        'backgroundColor': 'rgb(248, 248, 248)',
                    }
                ]
            )
        ], style={
            'padding': '20px', 
            'margin': '10px', 
            'border': '1px solid #ddd', 
            'border-radius': '5px',
            'backgroundColor': 'white'
        }),
        
        # First Row: Status Distributions
        html.Div([
            html.Div([
                dcc.Graph(figure=status_fig)
            ], className="six columns"),
            
            html.Div([
                dcc.Graph(figure=connection_fig)
            ], className="six columns")
        ], className="row"),
        
        # Missing Values Analysis
        html.Div([
            html.H3("Missing Values Analysis", style={"color": "black"}),
            html.Div([
                html.H4("Missing Values by Column", style={"color": "black"}),
                dash_table.DataTable(
                    columns=[
                        {"name": "Column", "id": "Column"},
                        {"name": "Missing Values", "id": "Missing Values"},
                        {"name": "Percentage", "id": "Percentage"}
                    ],
                    data=[
                        {
                            "Column": col,
                            "Missing Values": df[col].isna().sum(),
                            "Percentage": f"{(df[col].isna().sum() / len(df)) * 100:.2f}%"
                        }
                        for col in selected_columns
                    ],
                    style_table={'overflowX': 'auto'},
                    style_cell={
                        'textAlign': 'left',
                        'padding': '8px',
                        'color': 'black',
                        'backgroundColor': 'white',
                        'border': '1px solid #ddd'
                    },
                    style_header={
                        'backgroundColor': 'rgb(230, 230, 230)',
                        'fontWeight': 'bold',
                        'color': 'black'
                    },
                    style_data_conditional=[
                        {
                            'if': {'row_index': 'odd'},
                            'backgroundColor': 'rgb(248, 248, 248)',
                        }
                    ]
                )
            ])
        ], style={
            'padding': '20px', 
            'margin': '10px', 
            'border': '1px solid #ddd', 
            'border-radius': '5px',
            'backgroundColor': 'white'
        }),
        
        # Data Sample
        html.Div([
            html.H3("Data Sample (First 10 Rows)", style={"color": "black"}),
            dash_table.DataTable(
                data=df[selected_columns].head(10).to_dict('records'),
                columns=[{"name": i, "id": i} for i in selected_columns],
                page_size=10,
                style_table={'overflowX': 'auto'},
                style_cell={
                    'textAlign': 'left',
                    'padding': '8px',
                    'maxWidth': '180px',
                    'textOverflow': 'ellipsis',
                    'whiteSpace': 'normal',
                    'color': 'black',
                    'backgroundColor': 'white',
                    'border': '1px solid #ddd'
                },
                style_header={
                    'backgroundColor': 'rgb(230, 230, 230)',
                    'fontWeight': 'bold',
                    'color': 'black'
                },
                style_data_conditional=[
                    {
                        'if': {'row_index': 'odd'},
                        'backgroundColor': 'rgb(248, 248, 248)',
                    }
                ],
                tooltip_data=[{
                    column: {'value': str(value), 'type': 'markdown'}
                    for column, value in row.items()
                } for row in df[selected_columns].head(10).to_dict('records')],
                tooltip_duration=None
            )
        ], style={
            'padding': '20px', 
            'margin': '10px', 
            'border': '1px solid #ddd', 
            'border-radius': '5px',
            'backgroundColor': 'white'
        })
    ])
])

# Add some basic styling
app.layout.children.insert(0, html.Div(style={
    'fontFamily': 'Arial, sans-serif', 
    'margin': '0 auto', 
    'maxWidth': '1200px',
    'backgroundColor': 'white',
    'color': 'black'
}))

if __name__ == "__main__":
    app.run(debug=True)