In [6]:
import pandas as pd
from sktime.datasets import load_tsf_to_dataframe

In [7]:
df, metadata = load_tsf_to_dataframe('../../raw_data/australian_electricity_demand_dataset.tsf')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,series_value
series_name,state,timestamp,Unnamed: 3_level_1
T1,NSW,2002-01-01 00:00:00,5714.045004
T1,NSW,2002-01-01 00:30:00,5360.189078
T1,NSW,2002-01-01 01:00:00,5014.835118
T1,NSW,2002-01-01 01:30:00,4602.755516
T1,NSW,2002-01-01 02:00:00,4285.179828


In [8]:
# Flattening multi-index if needed
df = df.reset_index()

# Filter for New South Wales
df = df[df['state'] == 'NSW']

# Keep only relevant columns and rename timestamp
df = df[['timestamp', 'series_value']].rename(columns={'timestamp': 'ts'})

# Convert to datetime
df['ts'] = pd.to_datetime(df['ts'])

# Drop duplicates based on timestamp
df = df.drop_duplicates(subset='ts')

# Sort by timestamp and set as index
df = df.sort_values('ts').set_index('ts')

# Resample to regular 30-minute intervals
df = df.resample('30min').mean()

# Interpolate missing values (method='time' requires datetime index!)
df = df.interpolate(method='time')

# Reset index to move timestamp back to column
df = df.reset_index()


In [9]:
# Diagnostics and checks
missing_values = df.isnull().sum()
time_deltas = df['ts'].diff()
most_common_freq = time_deltas.mode()[0]
duplicate_timestamps = df['ts'].duplicated().sum()
irregular_intervals = df[time_deltas != most_common_freq]

# Print summary of issues
print("Missing values per column:\n", missing_values)
print("Most common frequency between timestamps:", most_common_freq)
print("Number of duplicate timestamps:", duplicate_timestamps)
print("Number of irregular intervals:", irregular_intervals.shape[0])

Missing values per column:
 ts              0
series_value    0
dtype: int64
Most common frequency between timestamps: 0 days 00:30:00
Number of duplicate timestamps: 0
Number of irregular intervals: 1


In [10]:
df.to_csv('../../transformed_data/cleaned_australian_electricity_demand.csv', index=False)