## Data Assessment of Waterflow Historical Data

**Metadata Summary**  
- 📅 **Date of Retrieval:** JULY 1, 2025  
- 🌐 **Source of Data:** LGU San Jacinto Treasury Records
- 📄 **License/Permission:** _TODO — not yet recorded; confirm usage terms with the data owner._  
- 🧑‍💼 **Prepared by:** MARK JUNE E. ALMOJUELA

# Overview of the Datasets

In [7]:
# Initialization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 

In [None]:
# Dataset Availability and Compatibility Check
#
# Treats every sub-directory of the raw-data root as a "year" folder and
# prints, for each one, the CSV files it contains together with their count.

data_dir_path = "../../dataset/raw/"
try:
    # os.listdir() returns entries in arbitrary order; sort so the report is
    # stable across runs and platforms.
    years = sorted(
        entry for entry in os.listdir(data_dir_path)
        if os.path.isdir(os.path.join(data_dir_path, entry))
    )
    print("Available years: ", years, '\n')

    # List CSV files in each year directory
    for year in years:
        # Build the path with os.path.join (as in the isdir check above)
        # instead of relying on data_dir_path ending with a slash.
        year_dir = os.path.join(data_dir_path, year)
        csv_files = sorted(f for f in os.listdir(year_dir) if f.endswith(".csv"))
        print(f"{year}: {csv_files}; \nCOUNT: {len(csv_files)}")
except FileNotFoundError:
    print("Directory not found")
except NotADirectoryError:
    print("Path is not a directory")
except PermissionError:
    print("Permission denied")
except Exception as e:
    # Last-resort report so the notebook keeps running; the message carries
    # the underlying error text.
    print(f"An error occurred: {e}")

Available years:  ['2020', '2021', '2022', '2023', '2024', '2025'] 

2020: ['APR2020.csv', 'AUG2020.csv', 'DEC2020.csv', 'FEB2020.csv', 'JAN2020.csv', 'JUL2020.csv', 'JUN2020.csv', 'MAR2020.csv', 'MAY2020.csv', 'NOV2020.csv', 'OCT2020.csv', 'SEP2020.csv']; 
COUNT: 12
2021: []; 
COUNT: 0
2022: []; 
COUNT: 0
2023: []; 
COUNT: 0
2024: []; 
COUNT: 0
2025: []; 
COUNT: 0


In [32]:
import os
import pandas as pd

# Per-file profiling report for one year of raw CSV data.
#
# Prompts for a year, then for every CSV in that year's folder prints shape,
# columns, dtypes, missing-value counts, descriptive statistics, duplicate
# counts, a head() sample, and the DataFrame.info() summary.

# ANSI styles for terminal color (optional: remove if not needed)
HEADER = "\033[95m"
OKBLUE = "\033[94m"
OKGREEN = "\033[92m"
WARNING = "\033[93m"
FAIL = "\033[91m"
RESET = "\033[0m"
BOLD = "\033[1m"

# Lift pandas display limits so the printed reports are not truncated.
# NOTE: these are global options and stay in effect for later cells.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Input and path setup
# strip(): stray whitespace around the typed year would otherwise produce a
# non-existent directory path.
year = input("📅 Enter year: ").strip()
data_dir_path = f"../../dataset/raw/{year}/"

# File discovery (sorted: os.listdir() order is arbitrary)
csv_files = sorted(f for f in os.listdir(data_dir_path) if f.endswith(".csv"))

# Column names used for the duplicate checks. Keeping them in variables avoids
# a backslash escape inside an f-string replacement field, which is a
# SyntaxError on Python versions before 3.12.
consumer_name_col = "Consumer's Name"
control_number_col = "Control Number"

for csv_file in csv_files:
    print(f"\n{BOLD}{OKGREEN}📂 Processing File: {csv_file}{RESET}")

    file_path = os.path.join(data_dir_path, csv_file)
    df = pd.read_csv(file_path, encoding='latin-1')

    print(f"{OKBLUE}📌 Shape:{RESET} {df.shape}")
    print(f"{OKBLUE}🧾 Columns:{RESET} {df.columns.tolist()}")
    print(f"{OKBLUE}🔍 Data Types:{RESET}\n{df.dtypes}")
    print(f"{OKBLUE}❓ Missing Values:{RESET}\n{df.isnull().sum()}")
    print(f"{OKBLUE}📊 Descriptive Stats:{RESET}\n{df.describe(include='all')}")
    print(f"{WARNING}Duplicated Records:{RESET} {df.duplicated().sum()}")
    print(f"{WARNING}Duplicated Consumer Name:{RESET} {df.duplicated(consumer_name_col).sum()}")
    print(f"{WARNING}Duplicated Control Number:{RESET} {df.duplicated(control_number_col).sum()}")

    print(f"{OKBLUE}👀 Sample Data:{RESET}")
    print(df.head())

    print(f"{OKBLUE}🧠 Info Summary:{RESET}")
    # info() writes directly to stdout and returns None, so it is not wrapped in print().
    df.info()


[1m[92m📂 Processing File: APR2020.csv[0m
[94m📌 Shape:[0m (1633, 8)
[94m🧾 Columns:[0m ['Control Number', "Consumer's Name", 'Address', 'Water Meter Serial #', 'Previous', 'Present', 'Cons.', 'Amount']
[94m🔍 Data Types:[0m
Control Number          float64
Consumer's Name          object
Address                  object
Water Meter Serial #     object
Previous                  int64
Present                   int64
Cons.                     int64
Amount                    int64
dtype: object
[94m❓ Missing Values:[0m
Control Number            1
Consumer's Name           0
Address                   0
Water Meter Serial #    698
Previous                  0
Present                   0
Cons.                     0
Amount                    0
dtype: int64
[94m📊 Descriptive Stats:[0m
        Control Number Consumer's Name    Address Water Meter Serial #  \
count      1632.000000            1633       1633                  935   
unique             NaN            1553         24        

# Duplicate value anomaly

In [40]:
# List every row of one monthly file whose consumer name occurs more than once.
year = "2020"
month = "JUN"
column_name = "Consumer's Name"
data_dir_path = f"../../dataset/raw/{year}/"

# Load the selected month's raw CSV
csv_path = os.path.join(data_dir_path, f"{month}{year}.csv")
test_df = pd.read_csv(csv_path, encoding='latin-1')

# keep=False flags every member of a duplicated group, first occurrence included
duplicates_mask = test_df[column_name].duplicated(keep=False)
all_duplicates = test_df.loc[duplicates_mask].sort_values(by=column_name)
all_duplicates

Unnamed: 0,Control Number,Consumer's Name,Address,Water Meter Serial #,Previous,Present,Cons.,Amount
681,501114.0,"Albaño, Emie",Don Juan St.,,DISC.,DISC.,,
1471,501385.0,"Albaño, Emie",Moyot St.,,837,837,,60.0
302,501201.0,"Alcantara, Sheena",Bailon St.,28949,1983,1983,,60.0
1396,501400.0,"Alcantara, Sheena",Lique St.,14Y125171,807,818,11.0,66.0
1092,500795.0,"Almiñe, Edwin",Jones St.,51662,86,92,6.0,60.0
566,500994.0,"Almiñe, Edwin",Bartolabac St.,027452-02,DISC.,DISC.,,
10,500544.0,"Almodal, Noe",Alicante St.,,2418,,,
306,500442.0,"Almodal, Noe",Bailon St.,,1082,1128,46.0,276.0
568,500441.0,"Almodal, Noe",Bartolabac St.,02865-02,569,602,33.0,198.0
1095,500653.0,"Almodal, Noe",Jones St.,,1703,1707,4.0,60.0


In [39]:
# List every row of one monthly file whose control number occurs more than once.
year = "2020"
month = "JUN"
data_dir_path = f"../../dataset/raw/{year}/"

# Load the selected month's raw CSV
csv_path = os.path.join(data_dir_path, f"{month}{year}.csv")
test_df = pd.read_csv(csv_path, encoding='latin-1')

# keep=False flags every member of a duplicated group, first occurrence included
duplicates_mask = test_df['Control Number'].duplicated(keep=False)
all_duplicates = test_df.loc[duplicates_mask].sort_values(by='Control Number')
all_duplicates

Unnamed: 0,Control Number,Consumer's Name,Address,Water Meter Serial #,Previous,Present,Cons.,Amount
1595,500134.0,"Sola, Manuel",Moyot St.,,2311,2327,16.0,96.0
1389,500134.0,"Sola, Manuel",Letada St.,028455-02,DISC.,DISC.,,
746,500593.0,"Almonte, Edgar",Esparrago St.,121537,,,,
1337,500593.0,"Almonte, Edgar Sr.",Letada St.,,2952,2964,12.0,72.0
1242,500701.0,"Genova, Noemi",Jones St.,11195,DISC.,DISC.,,
483,500701.0,"Genova, Noemi",Balcavem St.,,133,145,12.0,72.0
318,500756.0,"Altiche, Lolita",Bailon St.,2200103,5248,5314,66.0,396.0
809,500756.0,"Orteza, Blas",Esparrago St.,,DISC.,DISC.,,
1132,500849.0,"Barsaga, Josefa A",Jones St.,,DISC.,DISC.,,
1113,500849.0,"Anos, Benito",Jones St.,017982-02,1767,1783,16.0,96.0
