In [1]:
import pandas as pd
import numpy as np
from scipy import stats

  from pandas.core import (


In [2]:
# Load the data to get cluster 4 IDs
demo_df = pd.read_csv(r"C:\Users\sonis\Downloads\comp_socio_df.csv")
cluster4_ids = demo_df[demo_df['Cluster_Comp06_k07'] == 4]['ANON_ID'].tolist()

In [3]:
# Load the electricity data 
print("Electricity data")
elec_df = pd.read_csv(r"C:\Users\sonis\carbon_full_day.csv.gz", compression='gzip')
print(f"Total electricity records: {len(elec_df)}")

Electricity data
Total electricity records: 24246420


In [8]:
# Filter for cluster 4 customers only
cluster4_elec = elec_df[elec_df['ANON_ID'].isin(cluster4_ids)]
print(f"Cluster 4 electricity records: {len(cluster4_elec):,}")
print(f"Unique cluster 4 customers found: {cluster4_elec['ANON_ID'].nunique()}")

# Fix the copy warning and prepare data
cluster4_elec = cluster4_elec.copy()
cluster4_elec['DateTime'] = pd.to_datetime(cluster4_elec['DateTime'])

# Filter for April-December analysis periods only
cluster4_elec['month'] = cluster4_elec['DateTime'].dt.month
cluster4_elec['year'] = cluster4_elec['DateTime'].dt.year
cluster4_elec['hour'] = cluster4_elec['DateTime'].dt.hour
cluster4_elec['day_of_week'] = cluster4_elec['DateTime'].dt.dayofweek  # 0=Monday, 6=Sunday

# Filter for April-December in both years
analysis_data = cluster4_elec[cluster4_elec['month'].isin([4,5,6,7,8,9,10,11,12])].copy()

print("Cluster  4 Analysis")
print(f"Analysis period data: {len(analysis_data):,} records")

Cluster 4 electricity records: 2,672,251
Unique cluster 4 customers found: 98
Cluster  4 Analysis
Analysis period data: 2,257,094 records


In [20]:
# Check for missing values 
print("Current missing values:")
print(f"Missing kWh: {cluster4_elec['ELEC_KWH'].isna().sum()}")
print(f"Missing CO2: {cluster4_elec['carbon_emissions'].isna().sum()}")
print(f"Missing intensity: {cluster4_elec['actual_intensity'].isna().sum()}")

# Check the specific dates
print(f"\nChecking your specific problem dates:")
problem_dates = ['2023-10-20', '2023-10-21', '2024-06-11', '2024-06-12']

for date in problem_dates:
    date_data = cluster4_elec[cluster4_elec['DateTime'].dt.date == pd.to_datetime(date).date()]
    unique_hours = date_data['DateTime'].dt.hour.nunique()
    total_records = len(date_data)
    print(f"  {date}: {total_records} records, {unique_hours} unique hours")
    
    if total_records > 0:
        print(f"    Time range: {date_data['DateTime'].min()} to {date_data['DateTime'].max()}")

Current missing values:
Missing kWh: 0
Missing CO2: 0
Missing intensity: 0

Checking your specific problem dates:
  2023-10-20: 4302 records, 22 unique hours
    Time range: 2023-10-20 00:00:00 to 2023-10-20 21:30:00
  2023-10-21: 0 records, 0 unique hours
  2024-06-11: 4325 records, 23 unique hours
    Time range: 2024-06-11 00:00:00 to 2024-06-11 22:30:00
  2024-06-12: 1767 records, 10 unique hours
    Time range: 2024-06-12 14:30:00 to 2024-06-12 23:30:00


These dates were already removed in previous file by removing both carbon and electricity data where missing so they don't affect the current analysis. 

In [9]:
# WHOLE DAY kWh ANALYSIS 
print("\nWHOLE DAY kWh ANALYSIS")

# Use all data 
kwh_data = analysis_data.copy()
print(f"Whole day data: {len(kwh_data):,} records")

# Calculate daily kWh totals per customer (all 48 intervals)
daily_kwh_totals = kwh_data.groupby([
    'ANON_ID', 'treatment', 'period', 
    kwh_data['DateTime'].dt.date
]).agg({
    'ELEC_KWH': 'sum'  # Daily kWh total (all day)
}).reset_index()

daily_kwh_totals.columns = ['ANON_ID', 'treatment', 'period', 'date', 'daily_total_kwh']
daily_kwh_totals['date'] = pd.to_datetime(daily_kwh_totals['date'])
daily_kwh_totals['year_month'] = daily_kwh_totals['date'].dt.to_period('M')

print(f"Daily kWh totals calculated: {len(daily_kwh_totals):,} customer-days")

# Calculate monthly averages for whole day
monthly_kwh_avg = daily_kwh_totals.groupby([
    'ANON_ID', 'treatment', 'period', 'year_month'
]).agg({
    'daily_total_kwh': 'mean'
}).reset_index()

print(f"Monthly kWh averages calculated: {len(monthly_kwh_avg):,} customer-months")


WHOLE DAY kWh ANALYSIS
Whole day data: 2,257,094 records
Daily kWh totals calculated: 47,813 customer-days
Monthly kWh averages calculated: 1,711 customer-months


In [10]:
# FULL DAY CO2 ANALYSIS (All 48 intervals)

print("\nFULL DAY CO2 ANALYSIS")

# Calculate daily CO2 totals per customer (all day)
daily_co2_totals = analysis_data.groupby([
    'ANON_ID', 'treatment', 'period', 
    analysis_data['DateTime'].dt.date
]).agg({
    'carbon_emissions': 'sum'  # Daily CO2 total (all 48 intervals)
}).reset_index()

daily_co2_totals.columns = ['ANON_ID', 'treatment', 'period', 'date', 'daily_co2_grams']
daily_co2_totals['date'] = pd.to_datetime(daily_co2_totals['date'])
daily_co2_totals['year_month'] = daily_co2_totals['date'].dt.to_period('M')

print(f"Daily CO2 totals calculated: {len(daily_co2_totals):,} customer-days")

# Calculate monthly averages for CO2
monthly_co2_avg = daily_co2_totals.groupby([
    'ANON_ID', 'treatment', 'period', 'year_month'
]).agg({
    'daily_co2_grams': 'mean'
}).reset_index()

print(f"Monthly CO2 averages calculated: {len(monthly_co2_avg):,} customer-months")


FULL DAY CO2 ANALYSIS
Daily CO2 totals calculated: 47,813 customer-days
Monthly CO2 averages calculated: 1,711 customer-months


In [12]:
# WHOLE DAY kWh GROUP ANALYSIS
# Calculate monthly group averages for WHOLE DAY kWh
monthly_group_avg = monthly_kwh_avg.groupby(['treatment', 'period', 'year_month']).agg({
    'daily_total_kwh': ['mean', 'std', 'count']
}).reset_index()

monthly_group_avg.columns = ['treatment', 'period', 'year_month', 'mean_total_kwh', 'std_total_kwh', 'count_total_kwh']

# Create comparison dataset for whole day kWh
pre_total_data = monthly_group_avg[monthly_group_avg['period'] == 'Pre'].copy()
post_total_data = monthly_group_avg[monthly_group_avg['period'] == 'Post'].copy()

total_comparison = pd.merge(
    pre_total_data[['treatment', 'year_month', 'mean_total_kwh']],
    post_total_data[['treatment', 'year_month', 'mean_total_kwh']],
    on=['treatment'],
    suffixes=('_pre', '_post')
)

# Calculate whole day kWh changes
total_comparison['total_kwh_change'] = total_comparison['mean_total_kwh_post'] - total_comparison['mean_total_kwh_pre']
total_comparison['total_kwh_pct_change'] = (total_comparison['total_kwh_change'] / total_comparison['mean_total_kwh_pre']) * 100

# Calculate whole day intervention effect
control_total_changes = total_comparison[total_comparison['treatment'] == 'Control']
intervention_total_changes = total_comparison[total_comparison['treatment'] == 'Intervention']

total_effects = pd.DataFrame({
    'month_pre': control_total_changes['year_month_pre'].values,
    'month_post': intervention_total_changes['year_month_post'].values,
    'control_total_kwh_change': control_total_changes['total_kwh_change'].values,
    'intervention_total_kwh_change': intervention_total_changes['total_kwh_change'].values,
})

total_effects['net_total_kwh_reduction'] = (
    total_effects['intervention_total_kwh_change'] - total_effects['control_total_kwh_change']
)

control_baseline_total_kwh = control_total_changes['mean_total_kwh_pre'].values
total_effects['net_total_kwh_pct_reduction'] = (
    total_effects['net_total_kwh_reduction'] / control_baseline_total_kwh
) * 100

print("\nWHOLE DAY kWh RESULTS")
print(f"Average daily TOTAL kWh reduction: {total_effects['net_total_kwh_reduction'].mean():.3f} kWh/day")
print(f"Average percentage TOTAL kWh reduction: {total_effects['net_total_kwh_pct_reduction'].mean():.2f}%")


WHOLE DAY kWh RESULTS
Average daily TOTAL kWh reduction: -0.251 kWh/day
Average percentage TOTAL kWh reduction: -2.33%


In [13]:
# FULL DAY CO2 GROUP ANALYSIS

# Calculate monthly group averages for FULL DAY CO2
monthly_co2_group_avg = monthly_co2_avg.groupby(['treatment', 'period', 'year_month']).agg({
    'daily_co2_grams': ['mean', 'std', 'count']
}).reset_index()

monthly_co2_group_avg.columns = ['treatment', 'period', 'year_month', 'mean_co2', 'std_co2', 'count_co2']

# Create comparison dataset for CO2
pre_co2_data = monthly_co2_group_avg[monthly_co2_group_avg['period'] == 'Pre'].copy()
post_co2_data = monthly_co2_group_avg[monthly_co2_group_avg['period'] == 'Post'].copy()

co2_comparison = pd.merge(
    pre_co2_data[['treatment', 'year_month', 'mean_co2']],
    post_co2_data[['treatment', 'year_month', 'mean_co2']],
    on=['treatment'],
    suffixes=('_pre', '_post')
)

# Calculate CO2 changes
co2_comparison['co2_change'] = co2_comparison['mean_co2_post'] - co2_comparison['mean_co2_pre']
co2_comparison['co2_pct_change'] = (co2_comparison['co2_change'] / co2_comparison['mean_co2_pre']) * 100

# Calculate CO2 intervention effect
control_co2_changes = co2_comparison[co2_comparison['treatment'] == 'Control']
intervention_co2_changes = co2_comparison[co2_comparison['treatment'] == 'Intervention']

co2_effects = pd.DataFrame({
    'month_pre': control_co2_changes['year_month_pre'].values,
    'month_post': intervention_co2_changes['year_month_post'].values,
    'control_co2_change': control_co2_changes['co2_change'].values,
    'intervention_co2_change': intervention_co2_changes['co2_change'].values,
})

co2_effects['net_co2_reduction'] = (
    co2_effects['intervention_co2_change'] - co2_effects['control_co2_change']
)

control_baseline_co2 = control_co2_changes['mean_co2_pre'].values
co2_effects['net_co2_pct_reduction'] = (
    co2_effects['net_co2_reduction'] / control_baseline_co2
) * 100

print("\nFULL DAY CO2 RESULTS")
print(f"Average daily CO2 reduction: {co2_effects['net_co2_reduction'].mean():.1f} grams/day")
print(f"Average percentage CO2 reduction: {co2_effects['net_co2_pct_reduction'].mean():.2f}%")


FULL DAY CO2 RESULTS
Average daily CO2 reduction: -28.0 grams/day
Average percentage CO2 reduction: -1.73%


In [14]:
# Statistical significance testing
total_kwh_tstat, total_kwh_pval = stats.ttest_1samp(total_effects['net_total_kwh_reduction'], 0)
co2_tstat, co2_pval = stats.ttest_1samp(co2_effects['net_co2_reduction'], 0)

print(f"WHOLE DAY kWh reduction significance: t={total_kwh_tstat:.3f}, p={total_kwh_pval:.4f}")
print(f"FULL DAY CO2 reduction significance: t={co2_tstat:.3f}, p={co2_pval:.4f}")

if total_kwh_pval < 0.05:
    print("WHOLE DAY kWh reduction is statistically significant (p < 0.05)")
else:
    print("WHOLE DAY kWh reduction is not statistically significant (p ≥ 0.05)")
    
if co2_pval < 0.05:
    print("FULL DAY CO2 reduction is statistically significant (p < 0.05)")
else:
    print("FULL DAY CO2 reduction is not statistically significant (p ≥ 0.05)")

# SUMMARY
# Calculate monthly reductions
monthly_total_reduction = total_effects['net_total_kwh_reduction'].mean() * 30.44
monthly_co2_reduction = co2_effects['net_co2_reduction'].mean() * 30.44

# Calculate annual reductions  
annual_total_reduction = monthly_total_reduction * 12
annual_co2_reduction = monthly_co2_reduction * 12

# Calculate baselines
monthly_baseline_total = control_baseline_total_kwh.mean() * 30.44
annual_baseline_total = control_baseline_total_kwh.mean() * 365.25 

monthly_baseline_co2 = control_baseline_co2.mean() * 30.44  
annual_baseline_co2 = control_baseline_co2.mean() * 365.25

# Calculate percentage reductions
monthly_total_pct = (monthly_total_reduction / monthly_baseline_total) * 100
monthly_co2_pct = (monthly_co2_reduction / monthly_baseline_co2) * 100

# Display results
print(f"\nMONTHLY REDUCTIONS per household:")
print(f"  Whole Day kWh: {monthly_total_reduction:.1f} kWh/month ({monthly_total_pct:.2f}%)")
print(f"  Full Day CO2: {monthly_co2_reduction/1000:.2f} kg CO2/month ({monthly_co2_pct:.2f}%)")

print(f"\nANNUAL REDUCTIONS per household:")
print(f"  Whole Day kWh: {annual_total_reduction:.1f} kWh/year")
print(f"  Full Day CO2: {annual_co2_reduction/1000:.1f} kg CO2/year")

WHOLE DAY kWh reduction significance: t=-3.739, p=0.0003
FULL DAY CO2 reduction significance: t=-2.492, p=0.0148
WHOLE DAY kWh reduction is statistically significant (p < 0.05)
FULL DAY CO2 reduction is statistically significant (p < 0.05)

MONTHLY REDUCTIONS per household:
  Whole Day kWh: -7.6 kWh/month (-2.65%)
  Full Day CO2: -0.85 kg CO2/month (-1.93%)

ANNUAL REDUCTIONS per household:
  Whole Day kWh: -91.7 kWh/year
  Full Day CO2: -10.2 kg CO2/year


Earlier peak hour kwh reductions were proportionally larger (-8.6%)

In [16]:
# Best and worst performing months
print(f"\nBEST & WORST PERFORMING MONTHS")

best_total_month = total_effects.loc[total_effects['net_total_kwh_reduction'].idxmin()]
worst_total_month = total_effects.loc[total_effects['net_total_kwh_reduction'].idxmax()]

best_co2_month = co2_effects.loc[co2_effects['net_co2_reduction'].idxmin()] 
worst_co2_month = co2_effects.loc[co2_effects['net_co2_reduction'].idxmax()]

print(f"Whole Day kWh Reduction:")
print(f"  Best month: {best_total_month['month_post']} ({best_total_month['net_total_kwh_reduction']:.3f} kWh/day)")
print(f"  Worst month: {worst_total_month['month_post']} ({worst_total_month['net_total_kwh_reduction']:.3f} kWh/day)")

print(f"\nCO2 Reduction:")
print(f"  Best month: {best_co2_month['month_post']} ({best_co2_month['net_co2_reduction']:.1f} grams/day)")
print(f"  Worst month: {worst_co2_month['month_post']} ({worst_co2_month['net_co2_reduction']:.1f} grams/day)")


BEST & WORST PERFORMING MONTHS
Whole Day kWh Reduction:
  Best month: 2024-07 (-1.653 kWh/day)
  Worst month: 2024-09 (0.789 kWh/day)

CO2 Reduction:
  Best month: 2024-07 (-224.1 grams/day)
  Worst month: 2024-08 (153.5 grams/day)
