# Data Profiling of Input CSVs

This notebook performs an automated exploratory data analysis (EDA) on all CSV files found in the `../data/input` directory.

For each file, it provides:
*   File name and shape (rows, columns)
*   Data types and non-null counts (`.info()`)
*   A summary of missing values for columns that have them
*   Descriptive statistics for numerical columns
*   Descriptive statistics for categorical columns
*   A preview of the first 5 rows

In [6]:
import pandas as pd
from pathlib import Path

# Set pandas display options for better viewing in the notebook
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 120)

In [8]:
# Define the path to the raw data.
# This assumes the notebook is in 'notebooks/' and the data is in 'data/input/'.
input_dir = Path('../input')

# Check if the directory exists
if not input_dir.is_dir():
    print(f"Error: Directory not found at '{input_dir.resolve()}'")
    print("Please make sure your CSV files are in a 'data/input' folder relative to the project root.")
else:
    # Find all CSV files in the input directory, sorted for consistent order
    csv_files = sorted(list(input_dir.glob('*.csv')))

    if not csv_files:
        print(f"No CSV files found in '{input_dir.resolve()}'")
    else:
        print(f"Found {len(csv_files)} CSV file(s) to profile.\n")

        # Loop through each CSV and perform profiling
        for file_path in csv_files:
            print(f"--- PROFILING: {file_path.name} ---", end='\n' + '='*80 + '\n')
            try:
                df = pd.read_csv(file_path)

                print(f"Shape: {df.shape[0]} rows, {df.shape[1]} columns\n")

                print("Column Info & Data Types:")
                # .info() prints directly to stdout
                df.info()

                print(f"\nMissing Values (count per column):")
                missing_values = df.isnull().sum()
                if not missing_values[missing_values > 0].empty:
                    print(missing_values[missing_values > 0].to_string())
                else:
                    print("No missing values found.")

                print("\nDescriptive Statistics (Numerical):")
                # Use display() for rich notebook output of DataFrames
                display(df.describe())

                print("\nDescriptive Statistics (Categorical):")
                display(df.describe(include=['object', 'category']))

                print("\nFirst 5 Rows:")
                display(df.head())

                print('\n' + '='*80 + '\n\n')

            except Exception as e:
                print(f"Could not read or process file: {file_path.name}")
                print(f"Error: {e}\n")

Found 4 CSV file(s) to profile.

--- PROFILING: DATE_DIM.csv ---
Shape: 10958 rows, 8 columns

Column Info & Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10958 entries, 0 to 10957
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ADJ_DATE               10958 non-null  object
 1   DAY_TYPE_QLD           10958 non-null  object
 2   QLD_PUBLIC_HOLIDAY     10958 non-null  bool  
 3   TAS_PUBLIC_HOLIDAY     4833 non-null   object
 4   NSW_PUBLIC_HOLIDAY     4833 non-null   object
 5   VIC_PUBLIC_HOLIDAY     4833 non-null   object
 6   NSW_PUBLIC_HOLIDAY.1   4833 non-null   object
 7   IS_QLD_SCHOOL_HOLIDAY  312 non-null    object
dtypes: bool(1), object(7)
memory usage: 610.1+ KB

Missing Values (count per column):
TAS_PUBLIC_HOLIDAY        6125
NSW_PUBLIC_HOLIDAY        6125
VIC_PUBLIC_HOLIDAY        6125
NSW_PUBLIC_HOLIDAY.1      6125
IS_QLD_SCHOOL_HOLIDAY    10646

Descriptive Statis

Unnamed: 0,ADJ_DATE,DAY_TYPE_QLD,QLD_PUBLIC_HOLIDAY,TAS_PUBLIC_HOLIDAY,NSW_PUBLIC_HOLIDAY,VIC_PUBLIC_HOLIDAY,NSW_PUBLIC_HOLIDAY.1,IS_QLD_SCHOOL_HOLIDAY
count,10958,10958,10958,4833,4833,4833,4833,312
unique,10958,2,2,2,2,2,2,2
top,2020-04-09,WD,False,False,False,False,False,True
freq,1,7767,10898,4631,4593,4571,4593,308



Descriptive Statistics (Categorical):


Unnamed: 0,ADJ_DATE,DAY_TYPE_QLD,TAS_PUBLIC_HOLIDAY,NSW_PUBLIC_HOLIDAY,VIC_PUBLIC_HOLIDAY,NSW_PUBLIC_HOLIDAY.1,IS_QLD_SCHOOL_HOLIDAY
count,10958,10958,4833,4833,4833,4833,312
unique,10958,2,2,2,2,2,2
top,2020-04-09,WD,False,False,False,False,True
freq,1,7767,4631,4593,4571,4593,308



First 5 Rows:


Unnamed: 0,ADJ_DATE,DAY_TYPE_QLD,QLD_PUBLIC_HOLIDAY,TAS_PUBLIC_HOLIDAY,NSW_PUBLIC_HOLIDAY,VIC_PUBLIC_HOLIDAY,NSW_PUBLIC_HOLIDAY.1,IS_QLD_SCHOOL_HOLIDAY
0,2020-04-09,WD,False,False,False,False,False,
1,2020-04-20,WD,False,False,False,False,False,
2,2020-04-27,WD,False,False,False,False,False,
3,2020-05-01,WD,False,False,False,False,False,
4,2020-05-18,WD,False,False,False,False,False,





--- PROFILING: DISPATCHREGIONSUM.csv ---
Shape: 1287860 rows, 11 columns

Column Info & Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1287860 entries, 0 to 1287859
Data columns (total 11 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   SETTLEMENTDATE          1287860 non-null  object 
 1   RUNNO                   1287860 non-null  int64  
 2   REGIONID                1287860 non-null  object 
 3   TOTALDEMAND             1287860 non-null  float64
 4   AVAILABLEGENERATION     1287860 non-null  float64
 5   AVAILABLELOAD           1287860 non-null  float64
 6   DISPATCHABLEGENERATION  1287860 non-null  float64
 7   DISPATCHABLELOAD        1287860 non-null  float64
 8   NETINTERCHANGE          1287860 non-null  float64
 9   INITIALSUPPLY           1287860 non-null  float64
 10  CLEAREDSUPPLY           1287860 non-null  float64
dtypes: float64(8), int64(1), object(2)
memory usage: 108.1+ MB

Miss

Unnamed: 0,RUNNO,TOTALDEMAND,AVAILABLEGENERATION,AVAILABLELOAD,DISPATCHABLEGENERATION,DISPATCHABLELOAD,NETINTERCHANGE,INITIALSUPPLY,CLEAREDSUPPLY
count,1287860.0,1287860.0,1287860.0,1287860.0,1287860.0,1287860.0,1287860.0,1287860.0,1287860.0
mean,1.0,4152.477,7137.574,230.9666,4228.072,55.71933,19.88319,4225.197,4228.057
std,0.0,2733.424,4131.74,209.9928,2783.312,134.9474,600.6854,2763.746,2765.283
min,1.0,-231.82,1278.638,0.0,75.71,0.0,-2556.44,-215.242,-194.92
25%,1.0,1277.81,2404.61,56.0,1219.82,0.0,-417.95,1290.727,1292.53
50%,1.0,4438.64,8619.615,167.0,5137.485,0.0,-29.94,4593.674,4596.66
75%,1.0,6393.432,10351.14,383.0,6490.41,28.0,398.87,6506.779,6509.5
max,1.0,13763.96,17301.88,1226.0,13620.93,1151.66,2582.67,13675.5,13737.89



Descriptive Statistics (Categorical):


Unnamed: 0,SETTLEMENTDATE,REGIONID
count,1287860,1287860
unique,257530,5
top,2023-02-03 18:15:00.000,VIC1
freq,10,257572



First 5 Rows:


Unnamed: 0,SETTLEMENTDATE,RUNNO,REGIONID,TOTALDEMAND,AVAILABLEGENERATION,AVAILABLELOAD,DISPATCHABLEGENERATION,DISPATCHABLELOAD,NETINTERCHANGE,INITIALSUPPLY,CLEAREDSUPPLY
0,2025-06-01 08:00:00.000,1,VIC1,5495.03,9676.8142,638.0,6407.22,0.0,912.19,5468.73814,5517.76
1,2025-06-01 11:55:00.000,1,VIC1,3089.52,9903.74327,832.0,4635.06,370.0,1175.54,3494.16936,3520.32
2,2025-06-01 05:50:00.000,1,SA1,1352.35,3220.20471,607.0,1329.2,0.0,-23.15,1367.55534,1357.95
3,2025-06-01 13:10:00.000,1,QLD1,4678.27,10366.66881,942.0,5874.84,583.0,613.56,5265.0232,5274.96
4,2025-06-01 10:05:00.000,1,VIC1,4170.22,9906.69155,822.0,5910.17,5.0,1734.95,4283.06899,4236.63





--- PROFILING: TRADINGPRICE.csv ---
Shape: 2099090 rows, 5 columns

Column Info & Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2099090 entries, 0 to 2099089
Data columns (total 5 columns):
 #   Column          Dtype  
---  ------          -----  
 0   SETTLEMENTDATE  object 
 1   RUNNO           int64  
 2   REGIONID        object 
 3   PERIODID        int64  
 4   RRP             float64
dtypes: float64(1), int64(2), object(2)
memory usage: 80.1+ MB

Missing Values (count per column):
No missing values found.

Descriptive Statistics (Numerical):


Unnamed: 0,RUNNO,PERIODID,RRP
count,2099090.0,2099090.0,2099090.0
mean,1.0,135.7169,104.7862
std,0.0,86.00462,396.6349
min,1.0,1.0,-1000.0
25%,1.0,55.0,30.18
50%,1.0,133.0,75.27
75%,1.0,211.0,130.0
max,1.0,288.0,17500.0



Descriptive Statistics (Categorical):


Unnamed: 0,SETTLEMENTDATE,REGIONID
count,2099090,2099090
unique,419818,5
top,2023-10-27 06:00:00.000,QLD1
freq,5,419818



First 5 Rows:


Unnamed: 0,SETTLEMENTDATE,RUNNO,REGIONID,PERIODID,RRP
0,2023-10-27 06:00:00.000,1,QLD1,72,56.86
1,2023-10-27 06:30:00.000,1,SA1,78,21.63
2,2023-10-27 01:30:00.000,1,TAS1,18,37.65
3,2023-10-27 02:45:00.000,1,VIC1,33,52.6
4,2023-10-27 02:45:00.000,1,SA1,33,42.34





--- PROFILING: TRAINING_INDEPENDENT_INPUT.csv ---
Shape: 393551 rows, 23 columns

Column Info & Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393551 entries, 0 to 393550
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   STATION_NAME            393551 non-null  object 
 1   YEAR                    393551 non-null  int64  
 2   MONTH                   393551 non-null  int64  
 3   DAY_TYPE                393551 non-null  object 
 4   PERIOD_HH               393551 non-null  int64  
 5   DATE_TIME_HH            393551 non-null  object 
 6   IS_WORKDAY              393551 non-null  bool   
 7   IS_SCHOOL_HOLIDAY       393551 non-null  bool   
 8   AIR_TEMP                392624 non-null  float64
 9   AIR_TEMP_LAG_1          392623 non-null  float64
 10  AIR_TEMP_LAG_2          392622 non-null  float64
 11  AIR_TEMP_LAG_3          392621 non-null  float64
 12  AIR_TEMP_LAG_4   

Unnamed: 0,YEAR,MONTH,PERIOD_HH,AIR_TEMP,AIR_TEMP_LAG_1,AIR_TEMP_LAG_2,AIR_TEMP_LAG_3,AIR_TEMP_LAG_4,HUMIDITY,DEW_POINT_TEMP,DEW_POINT_LAG_1,DEW_POINT_LAG_2,DEW_POINT_LAG_3,DEW_POINT_LAG_4,WIND_SPEED,PV_POWER,APPARENT_TEMP
count,393551.0,393551.0,393551.0,392624.0,392623.0,392622.0,392621.0,392620.0,392624.0,392624.0,392623.0,392622.0,392621.0,392620.0,393550.0,112609.0,392623.0
mean,2013.728947,6.457349,24.499356,20.275404,20.275418,20.275435,20.275456,20.27548,70.242741,14.005145,14.00517,14.005196,14.005223,14.00525,3.461076,702.521829,19.390723
std,6.483232,3.45215,13.853875,5.662011,5.662012,5.662009,5.662001,5.661988,18.679924,5.473149,5.473133,5.473114,5.473096,5.473077,2.190652,997.301424,6.428816
min,2002.0,1.0,1.0,0.4,0.4,0.4,0.4,0.4,6.0,-15.7,-15.7,-15.7,-15.7,-15.7,0.0,0.0,-3.9
25%,2008.0,3.0,12.0,16.5,16.5,16.5,16.5,16.5,56.0,10.7,10.7,10.7,10.7,10.7,2.11,0.0,15.1
50%,2014.0,6.0,24.0,20.8,20.8,20.8,20.8,20.8,73.0,14.9,14.9,14.9,14.9,14.9,3.11,12.217,20.0
75%,2019.0,9.0,37.0,24.3,24.3,24.3,24.3,24.3,86.0,18.1,18.1,18.1,18.1,18.1,4.722222,1300.284,24.1
max,2025.0,12.0,48.0,42.4,42.4,42.4,42.4,42.4,101.0,26.8,26.8,26.8,26.8,26.8,30.31,4604.08,44.0



Descriptive Statistics (Categorical):


Unnamed: 0,STATION_NAME,DAY_TYPE,DATE_TIME_HH
count,393551,393551,393551
unique,1,2,393551
top,Archerfield,WD,2025-06-13 10:30:00.000
freq,393551,280368,1



First 5 Rows:


Unnamed: 0,STATION_NAME,YEAR,MONTH,DAY_TYPE,PERIOD_HH,DATE_TIME_HH,IS_WORKDAY,IS_SCHOOL_HOLIDAY,AIR_TEMP,AIR_TEMP_LAG_1,AIR_TEMP_LAG_2,AIR_TEMP_LAG_3,AIR_TEMP_LAG_4,HUMIDITY,DEW_POINT_TEMP,DEW_POINT_LAG_1,DEW_POINT_LAG_2,DEW_POINT_LAG_3,DEW_POINT_LAG_4,WIND_SPEED,PV_POWER,APPARENT_TEMP,IS_CALCULATED_APP_TEMP
0,Archerfield,2025,6,WD,21,2025-06-13 10:30:00.000,True,False,15.0,13.5,12.0,10.8,10.0,48.0,4.1,3.6,3.5,3.4,3.5,5.277778,,10.1,False
1,Archerfield,2025,6,WD,20,2025-06-13 10:00:00.000,True,False,13.5,12.0,10.8,10.0,8.5,51.0,3.6,3.5,3.4,3.5,3.1,5.277778,,8.5,False
2,Archerfield,2025,6,WD,19,2025-06-13 09:30:00.000,True,False,12.0,10.8,10.0,8.5,6.3,56.0,3.5,3.4,3.5,3.1,2.8,5.277778,,7.0,False
3,Archerfield,2025,6,WD,18,2025-06-13 09:00:00.000,True,False,10.8,10.0,8.5,6.3,4.5,60.0,3.4,3.5,3.1,2.8,1.7,4.722222,,6.1,False
4,Archerfield,2025,6,WD,17,2025-06-13 08:30:00.000,True,False,10.0,8.5,6.3,4.5,4.5,64.0,3.5,3.1,2.8,1.7,1.5,4.166667,,5.7,False





