In [2]:
import pandas as pd

# Read merged_features.csv from the sibling clean_data directory.
merged_features = pd.read_csv('../clean_data/merged_features.csv')

# Dimensions go first, on a single line.
print("Shape:", merged_features.shape)

# Remaining overview sections as (heading, zero-argument callable) pairs.
# Using callables defers each computation until after its heading has been
# printed, so the heading-then-summary order is kept for every section.
overview_sections = [
    ("\nFirst few rows:", merged_features.head),
    ("\nData types:", lambda: merged_features.dtypes),
    ("\nMissing values:", lambda: merged_features.isnull().sum()),
    ("\nBasic statistics:", merged_features.describe),
]
for heading, build_summary in overview_sections:
    print(heading)
    print(build_summary())

Shape: (35040, 47)

First few rows:
  interval_start_local          load             datetime  temp   TEMP_qc  \
0  2021-01-01 00:00:00  43530.833333  2021-01-01 00:00:00  3.30  3.000000   
1  2021-01-01 01:00:00  43192.500000  2021-01-01 01:00:00  3.20  5.000000   
2  2021-01-01 02:00:00  43059.916667  2021-01-01 02:00:00  3.30  5.000000   
3  2021-01-01 03:00:00  43245.666667  2021-01-01 03:00:00  3.25  4.333333   
4  2021-01-01 04:00:00  43783.500000  2021-01-01 04:00:00  2.90  5.000000   

      DEW_C    DEW_qc  SLP_hPa    SLP_qc  WIND_DIR_deg  ...  SLP_qc_ramp_3h  \
0  2.800000  3.000000  1002.30  3.000000     25.000000  ...             NaN   
1  2.866667  5.000000  1003.20  7.666667    243.333333  ...             NaN   
2  3.175000  5.000000  1003.00  8.000000    312.500000  ...             NaN   
3  2.916667  4.333333  1003.15  7.000000    316.666667  ...        4.000000   
4  2.900000  5.000000  1002.50  7.000000    325.000000  ...        0.666667   

   WIND_DIR_deg_ramp_1h  W

In [4]:
# List every column name, one per line, beneath a heading.
# A single print call with a newline separator emits the heading and then
# each name on its own line.
print("\nColumn names:", *merged_features.columns, sep="\n")


Column names:
interval_start_local
load
datetime
temp
TEMP_qc
DEW_C
DEW_qc
SLP_hPa
SLP_qc
WIND_DIR_deg
WIND_DIR_qc
WIND_SPD_ms
WIND_SPD_qc
hour
dayofweek
is_weekend
season
is_holiday
load_lag_1h
load_lag_24h
load_lag_168h
temp_sq
temp_x_hour
cooling_degree_hours
heating_degree_hours
temp_ramp_1h
temp_ramp_3h
temp_rollstd_6h
TEMP_qc_ramp_1h
TEMP_qc_ramp_3h
DEW_C_ramp_1h
DEW_C_ramp_3h
DEW_qc_ramp_1h
DEW_qc_ramp_3h
SLP_hPa_ramp_1h
SLP_hPa_ramp_3h
SLP_qc_ramp_1h
SLP_qc_ramp_3h
WIND_DIR_deg_ramp_1h
WIND_DIR_deg_ramp_3h
WIND_DIR_qc_ramp_1h
WIND_DIR_qc_ramp_3h
WIND_SPD_ms_ramp_1h
WIND_SPD_ms_ramp_3h
WIND_SPD_qc_ramp_1h
WIND_SPD_qc_ramp_3h
has_full_lags


In [5]:
# Tally how many rows carry each value of the holiday flag.
holiday_flags = merged_features['is_holiday']
holiday_flags.value_counts()

is_holiday
False    33984
True      1056
Name: count, dtype: int64

In [6]:
# Summarise the distribution of the `temp` column (count, mean, std, quartiles).
print("\nTemperature statistics:")
temp_summary = merged_features['temp'].describe()
print(temp_summary)


Temperature statistics:
count    35033.000000
mean        20.362597
std          9.780091
min        -17.750000
25%         13.300000
50%         21.700000
75%         27.800000
max         42.800000
Name: temp, dtype: float64


In [8]:
# Select rows whose temperature is below -10 or above 120.
# Rows with a missing temperature fail both comparisons and are not selected.
# NOTE(review): a sibling column is named DEW_C and the `temp` summary in this
# notebook peaks at 42.8, so `temp` is presumably in degrees Celsius; the 120
# upper bound then looks like a Fahrenheit figure that can never match — confirm
# the intended threshold.
temp_values = merged_features['temp']
is_extreme_temp = temp_values.lt(-10) | temp_values.gt(120)
extreme_temps = merged_features.loc[is_extreme_temp]

print("\nExtreme temperature records:")
print(extreme_temps)


Extreme temperature records:
      interval_start_local          load             datetime   temp  TEMP_qc  \
1078   2021-02-14 22:00:00  66296.916667  2021-02-14 22:00:00 -10.60      5.0   
1079   2021-02-14 23:00:00  65548.833333  2021-02-14 23:00:00 -10.85      5.0   
1080   2021-02-15 00:00:00  64477.666667  2021-02-15 00:00:00 -11.55      5.0   
1081   2021-02-15 01:00:00  59322.916667  2021-02-15 01:00:00 -11.70      5.0   
1082   2021-02-15 02:00:00  54252.500000  2021-02-15 02:00:00 -11.70      5.0   
...                    ...           ...                  ...    ...      ...   
26631  2024-01-15 15:00:00  70917.583333  2024-01-15 15:00:00 -11.46      4.2   
26632  2024-01-15 16:00:00  71249.583333  2024-01-15 16:00:00 -10.30      5.0   
26651  2024-01-16 11:00:00  72148.333333  2024-01-16 11:00:00 -10.60      5.0   
26652  2024-01-16 12:00:00  70151.666667  2024-01-16 12:00:00 -10.85      3.0   
26653  2024-01-16 13:00:00  67693.833333  2024-01-16 13:00:00 -11.10      5.0  

In [16]:
# Select rows whose load is negative or exceeds 85000 (units are not stated here).
# Rows with a missing load fail both comparisons and are not selected.
load_values = merged_features['load']
is_load_outlier = load_values.lt(0) | load_values.gt(85000)
load_outliers = merged_features.loc[is_load_outlier]

print("\nLoad outliers:")
print(load_outliers)


Load outliers:
      interval_start_local          load             datetime  temp  TEMP_qc  \
22840  2023-08-10 16:00:00  85186.000000  2023-08-10 16:00:00  35.6      5.0   
22841  2023-08-10 17:00:00  85424.333333  2023-08-10 17:00:00  36.7      5.0   
23080  2023-08-20 16:00:00  85110.500000  2023-08-20 16:00:00  40.0      5.0   
31841  2024-08-19 17:00:00  85276.583333  2024-08-19 17:00:00  38.3      5.0   
31864  2024-08-20 16:00:00  85431.583333  2024-08-20 16:00:00  33.9      5.0   
31865  2024-08-20 17:00:00  85517.750000  2024-08-20 17:00:00  35.0      5.0   
31888  2024-08-21 16:00:00  85059.583333  2024-08-21 16:00:00  30.6      5.0   

       DEW_C  DEW_qc  SLP_hPa  SLP_qc  WIND_DIR_deg  ...  SLP_qc_ramp_3h  \
22840   22.8     5.0   1008.8     5.0         220.0  ...             0.0   
22841   22.2     5.0   1008.2     5.0           NaN  ...             0.0   
23080   17.8     5.0   1016.2     5.0         360.0  ...             0.0   
31841   20.0     5.0   1014.9     5.0  