Correlation assumes a stable relationship so the correlation heatmaps are constrained to the last 24 hours and the pair grid to 7 days.

Over longer periods, there's a risk of including changes in the underlying conditions, such as:

- Occupancy vs empty periods
- High/low pressure weather systems
- Seasonal daylight changes

These result in:

- Diluted or misleading correlations
- Hiding of short-term cause/effect relationships
- VOC <-> humidity <-> temperature relationships flattening out

The data is also constrained to daytime only (between `DAY_START` and `DAY_END`), because overnight the following would be observed:

- Flat or near-flat illuminance
- Minimal VOC generation
- Slow drift in temperature & humidity
- Long stretches of “nothing happening”

This adds data points but not much information, and so weakens meaningful correlations.

In [None]:
# Look-back window, in hours, used to build the short-term data frame (df_24h)
CORRELATION_PERIOD_HOURS = 24

# Look-back window, in days, used to build the longer-term data frame (df_7d)
PAIR_GRID_DAYS = 7

# Daytime bounds ("HH:MM") handed to between_time(); readings outside them are excluded
DAY_START, DAY_END = "06:00", "22:00"

In [None]:
%run ../pathutils.ipynb
%run ../database.ipynb
%run ../export.ipynb
%run health.ipynb
%run database.ipynb
%run utils.ipynb

In [None]:
# The longest analysis window (expressed in hours) dictates how much history is loaded.
# Note that true division is used, so "days" is a float
required_hours = max(CORRELATION_PERIOD_HOURS, PAIR_GRID_DAYS * 24)
days = required_hours / 24

# Load the readings for each sensor over that period
bme280_df = load_sensor_readings("bme280", days)
veml7700_df = load_sensor_readings("veml7700", days)
sgp40_df = load_sensor_readings("sgp40", days)

# Merge the per-sensor frames into a single data frame
sensor_frames = [bme280_df, veml7700_df, sgp40_df]
combined_df = merge_sensor_readings(sensor_frames)

# Preview the first few rows of the merged data
combined_df.head()

## Source Data Preparation

In [None]:
import pandas as pd

# Strip the timezone from the timestamp, as this will cause the export to spreadsheet to fail (Excel can't
# handle dates with timezone information)
combined_df.index = combined_df.index.tz_localize(None)

# Extract daytime readings only
daytime_df = combined_df.between_time(DAY_START, DAY_END)

# With no daytime readings the window bounds below would be NaT, so stop here with a clear message
# rather than carrying on to produce empty correlations and charts
if daytime_df.empty:
    raise ValueError(
        f"No readings between {DAY_START} and {DAY_END}: unable to build the correlation data"
    )

# Define the columns of interest
correlation_columns = [
    "temperature",
    "humidity",
    "pressure",
    "illuminance",
    "vocindex",
]

# Both windows end at the most recent daytime reading
end = daytime_df.index.max()

# Extract the short window (CORRELATION_PERIOD_HOURS hours). A boolean mask is used instead of a label
# slice because slicing a non-monotonic index by a label that is not present raises a KeyError. As "end"
# is the latest timestamp in the index, only the lower bound needs to be tested
start_24h = end - pd.Timedelta(hours=CORRELATION_PERIOD_HOURS)
df_24h = daytime_df.loc[daytime_df.index >= start_24h, correlation_columns].copy()

# Extract the long window (PAIR_GRID_DAYS days) in the same way
start_7d = end - pd.Timedelta(days=PAIR_GRID_DAYS)
df_7d = daytime_df.loc[daytime_df.index >= start_7d, correlation_columns].copy()

# Extract source frames for the correlations with numeric values and rows with empty values dropped.
# The frames above already hold only the columns of interest, so no further column selection is needed
df_corr_24h = df_24h.apply(pd.to_numeric, errors="coerce").dropna()
df_corr_7d = df_7d.apply(pd.to_numeric, errors="coerce").dropna()

# Preview the data
display(df_corr_24h.head())
display(df_corr_7d.head())

## Correlation Analysis

In [None]:
# The two cleaned, numeric frames: short window first, long window second
correlation_sources = (df_corr_24h, df_corr_7d)

# Pearson correlation matrix for each window
pearson_24h, pearson_7d = (source.corr(method="pearson") for source in correlation_sources)

# Spearman correlation matrix for each window
spearman_24h, spearman_7d = (source.corr(method="spearman") for source in correlation_sources)

In [None]:
# Get the export folder path
export_folder_path = get_export_folder_path("analysis")

# Base file names (without extension); the chart cells further down reuse them
pearson_24h_filename = f"all_all_pearson_correlation_heatmap_{CORRELATION_PERIOD_HOURS}_hour"
pearson_7d_filename = f"all_all_pearson_correlation_heatmap_{PAIR_GRID_DAYS}_day"
spearman_24h_filename = f"all_all_spearman_correlation_heatmap_{CORRELATION_PERIOD_HOURS}_hour"
spearman_7d_filename = f"all_all_spearman_correlation_heatmap_{PAIR_GRID_DAYS}_day"
pair_grid_filename = f"all_all_correlation_pair_grid_{CORRELATION_PERIOD_HOURS}_hour"

# One entry per workbook: the base file name and a mapping of sheet name to the frame written to it.
# The heatmap workbooks carry the source data alongside the matrix; the pair grid workbook holds only
# its source data
workbooks = [
    (pearson_24h_filename, {
        f"Data - {CORRELATION_PERIOD_HOURS} hour": df_24h,
        f"Pearson - {CORRELATION_PERIOD_HOURS} hour": pearson_24h,
    }),
    (pearson_7d_filename, {
        f"Data - {PAIR_GRID_DAYS} day": df_7d,
        f"Pearson - {PAIR_GRID_DAYS} day": pearson_7d,
    }),
    (spearman_24h_filename, {
        f"Data - {CORRELATION_PERIOD_HOURS} hour": df_24h,
        f"Spearman - {CORRELATION_PERIOD_HOURS} hour": spearman_24h,
    }),
    (spearman_7d_filename, {
        f"Data - {PAIR_GRID_DAYS} day": df_7d,
        f"Spearman - {PAIR_GRID_DAYS} day": spearman_7d,
    }),
    (pair_grid_filename, {
        f"Data - {CORRELATION_PERIOD_HOURS} hour": df_24h,
    }),
]

# Export the data to spreadsheets, in the order listed above
for workbook_name, sheets in workbooks:
    export_to_spreadsheet(export_folder_path, f"{workbook_name}.xlsx", sheets)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set_theme(style="whitegrid")

# The matrix is symmetric, so hide the diagonal and everything above it
upper_triangle = np.triu(np.ones(pearson_24h.shape, dtype=bool))

# Fixed -1..1 colour scale centred on zero, with each cell annotated to two decimal places
heatmap_options = {
    "mask": upper_triangle,
    "annot": True,
    "fmt": ".2f",
    "vmin": -1,
    "vmax": 1,
    "center": 0,
    "square": True,
    "linewidths": 0.5,
    "cbar_kws": {"shrink": 0.8},
}

# Draw the short-window Pearson matrix on an explicitly created figure and axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pearson_24h, ax=ax, **heatmap_options)

ax.set_title(f"Pearson Correlation Heatmap ({CORRELATION_PERIOD_HOURS} hours)")
plt.tight_layout()

# Export to PNG or PDF, if required
export_chart(export_folder_path, pearson_24h_filename, "png")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set_theme(style="whitegrid")

# The matrix is symmetric, so hide the diagonal and everything above it
upper_triangle = np.triu(np.ones(pearson_7d.shape, dtype=bool))

# Fixed -1..1 colour scale centred on zero, with each cell annotated to two decimal places
heatmap_options = {
    "mask": upper_triangle,
    "annot": True,
    "fmt": ".2f",
    "vmin": -1,
    "vmax": 1,
    "center": 0,
    "square": True,
    "linewidths": 0.5,
    "cbar_kws": {"shrink": 0.8},
}

# Draw the long-window Pearson matrix on an explicitly created figure and axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pearson_7d, ax=ax, **heatmap_options)

ax.set_title(f"Pearson Correlation Heatmap ({PAIR_GRID_DAYS} Day)")
plt.tight_layout()

# Export to PNG or PDF, if required
export_chart(export_folder_path, pearson_7d_filename, "png")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set_theme(style="whitegrid")

# The matrix is symmetric, so hide the diagonal and everything above it
upper_triangle = np.triu(np.ones(spearman_24h.shape, dtype=bool))

# Fixed -1..1 colour scale centred on zero, with each cell annotated to two decimal places
heatmap_options = {
    "mask": upper_triangle,
    "annot": True,
    "fmt": ".2f",
    "vmin": -1,
    "vmax": 1,
    "center": 0,
    "square": True,
    "linewidths": 0.5,
    "cbar_kws": {"shrink": 0.8},
}

# Draw the short-window Spearman matrix on an explicitly created figure and axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(spearman_24h, ax=ax, **heatmap_options)

ax.set_title(f"Spearman Correlation Heatmap ({CORRELATION_PERIOD_HOURS} Hours)")
plt.tight_layout()

# Export to PNG or PDF, if required
export_chart(export_folder_path, spearman_24h_filename, "png")

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

sns.set_theme(style="whitegrid")

# The matrix is symmetric, so hide the diagonal and everything above it
upper_triangle = np.triu(np.ones(spearman_7d.shape, dtype=bool))

# Fixed -1..1 colour scale centred on zero, with each cell annotated to two decimal places
heatmap_options = {
    "mask": upper_triangle,
    "annot": True,
    "fmt": ".2f",
    "vmin": -1,
    "vmax": 1,
    "center": 0,
    "square": True,
    "linewidths": 0.5,
    "cbar_kws": {"shrink": 0.8},
}

# Draw the long-window Spearman matrix on an explicitly created figure and axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(spearman_7d, ax=ax, **heatmap_options)

ax.set_title(f"Spearman Correlation Heatmap ({PAIR_GRID_DAYS} Day)")
plt.tight_layout()

# Export to PNG or PDF, if required
export_chart(export_folder_path, spearman_7d_filename, "png")

plt.show()

In [None]:
# Import the plotting libraries here, as the heatmap cells do, so that this cell does not depend on
# one of them having been run first
import matplotlib.pyplot as plt
import seaborn as sns

# Pair grid of every column of interest against every other.
# NOTE(review): the notebook introduction and the PAIR_GRID_DAYS setting suggest a 7-day pair grid,
# but this cell plots df_24h and pair_grid_filename is built from CORRELATION_PERIOD_HOURS - confirm
# which window is intended
g = sns.PairGrid(df_24h, diag_sharey=False)

# upper: light scatter
g.map_upper(
    sns.scatterplot,
    s=18,
    alpha=0.4,
)

# lower: regression with strong line colour
g.map_lower(
    sns.regplot,
    scatter_kws={"s": 16, "alpha": 0.3},
    line_kws={
        "color": "crimson",
        "linewidth": 2.5,
    },
)

# diagonal: distribution of each column as a histogram with a KDE overlay
g.map_diag(
    sns.histplot,
    kde=True,
)

# Place the title slightly above the top of the grid
g.fig.suptitle(
    "Pair Grid with Regression Lines (Lower) and Scatter Plots (Upper)",
    y=1.02
)

# Export to PNG or PDF, if required
export_chart(export_folder_path, pair_grid_filename, "png")

plt.show()