# Carbon Data

### Dates with missing carbon intensity data

October 20, 2023: Missing: 22:00, 22:30, 23:00, 23:30 (4 slots)

October 21, 2023: Missing: ENTIRE DAY (all 48 half-hour slots from 00:00 to 23:30)

June 11, 2024: Missing: 23:00, 23:30 (2 slots)

June 12, 2024: Missing: 00:00 through 14:00 (29 slots); available: 14:30 through 23:30 (19 slots)

In [68]:
import os
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [69]:
def fetch_range_chunked(start_date, end_date, max_days=30, timeout=30):
    """Download intensity records for a date range, one HTTP request per chunk.

    The range is split into consecutive chunks of at most ``max_days`` calendar
    days. Each chunk is requested from
    ``https://api.carbonintensity.org.uk/intensity/{from}/{to}`` with ``from``
    set to 00:00Z on the chunk's first day and ``to`` set to 23:30Z on its last
    day. One progress line is printed per request.

    Parameters
    ----------
    start_date, end_date : str
        First and last day of the range (both inclusive), as ``YYYY-MM-DD``.
    max_days : int, default 30
        Maximum number of days covered by a single request.
    timeout : float or tuple, default 30
        Forwarded to ``requests.get`` so a stalled connection raises instead of
        blocking forever.

    Returns
    -------
    pandas.DataFrame
        One row per returned record with columns ``from`` and ``to`` (UTC
        datetimes), ``forecast``, ``actual``, ``index`` and ``day_of_week``
        (day name of ``to``). If the range is empty or no records come back,
        an empty frame with the same columns is returned.

    Raises
    ------
    ValueError
        If a date does not match ``YYYY-MM-DD`` or ``max_days`` is below 1.
    requests.HTTPError
        If a response carries an error status (via ``raise_for_status``).
    """
    if max_days < 1:
        # A non-positive chunk size would never advance `start` in the loop below.
        raise ValueError("max_days must be at least 1")

    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    columns = ['from', 'to', 'forecast', 'actual', 'index']
    records = []

    while start <= end:
        chunk_end = min(start + timedelta(days=max_days - 1), end)
        s = start.strftime("%Y-%m-%dT00:00Z")
        e = chunk_end.strftime("%Y-%m-%dT23:30Z")

        print(f"Fetching data from {s} to {e}...")
        url = f"https://api.carbonintensity.org.uk/intensity/{s}/{e}"
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Keep the raw timestamp strings here; they are parsed in one
        # vectorised pass after the loop instead of once per record.
        for item in response.json()['data']:
            intensity = item['intensity']
            records.append({
                'from': item['from'],
                'to': item['to'],
                'forecast': intensity['forecast'],
                'actual': intensity['actual'],
                'index': intensity['index'],
            })

        start = chunk_end + timedelta(days=1)

    # Passing `columns` keeps the frame well-formed even when `records` is empty.
    df_all = pd.DataFrame(records, columns=columns)
    df_all['from'] = pd.to_datetime(df_all['from'], utc=True)
    df_all['to'] = pd.to_datetime(df_all['to'], utc=True)
    df_all['day_of_week'] = df_all['to'].dt.day_name()
    return df_all

In [70]:
# Download the carbon intensity series covering the entire study period.
print("Fetching continuous carbon data for entire study period...")
carbon_continuous = fetch_range_chunked("2023-01-01", "2025-01-01")

# Summarise the download: row count, covered time span, and gaps in 'actual'.
n_fetched = len(carbon_continuous)
earliest_from = carbon_continuous['from'].min()
latest_to = carbon_continuous['to'].max()
n_missing_actual = carbon_continuous['actual'].isna().sum()
print(f"Total carbon records fetched: {n_fetched}")
print(f"Date range: {earliest_from} to {latest_to}")
print(f"Missing actual values: {n_missing_actual}")

Fetching continuous carbon data for entire study period...
Fetching data from 2023-01-01T00:00Z to 2023-01-30T23:30Z...
Fetching data from 2023-01-31T00:00Z to 2023-03-01T23:30Z...
Fetching data from 2023-03-02T00:00Z to 2023-03-31T23:30Z...
Fetching data from 2023-04-01T00:00Z to 2023-04-30T23:30Z...
Fetching data from 2023-05-01T00:00Z to 2023-05-30T23:30Z...
Fetching data from 2023-05-31T00:00Z to 2023-06-29T23:30Z...
Fetching data from 2023-06-30T00:00Z to 2023-07-29T23:30Z...
Fetching data from 2023-07-30T00:00Z to 2023-08-28T23:30Z...
Fetching data from 2023-08-29T00:00Z to 2023-09-27T23:30Z...
Fetching data from 2023-09-28T00:00Z to 2023-10-27T23:30Z...
Fetching data from 2023-10-28T00:00Z to 2023-11-26T23:30Z...
Fetching data from 2023-11-27T00:00Z to 2023-12-26T23:30Z...
Fetching data from 2023-12-27T00:00Z to 2024-01-25T23:30Z...
Fetching data from 2024-01-26T00:00Z to 2024-02-24T23:30Z...
Fetching data from 2024-02-25T00:00Z to 2024-03-25T23:30Z...
Fetching data from 2024-03

In [71]:
# Drop rows whose 'actual' intensity is missing; the mask is computed once and reused.
missing_actual = carbon_continuous['actual'].isna()
if missing_actual.sum() > 0:
    print(f"Removing {missing_actual.sum()} records with missing carbon intensity...")
    carbon_continuous = carbon_continuous.loc[~missing_actual].copy()
    print(f"Clean carbon records: {len(carbon_continuous)}")

Removing 5 records with missing carbon intensity...
Clean carbon records: 35009


In [72]:
# Save the continuous dataset. The filename is defined once so the file that
# is written and the name reported in the message cannot drift apart.
carbon_csv_path = "carbon_continuous.csv"
carbon_continuous.to_csv(carbon_csv_path, index=False)
print(f"Saved continuous carbon data to '{carbon_csv_path}'")

# Display sample data
print("\nSample of continuous carbon data:")
print(carbon_continuous.head())

Saved continuous carbon data to 'carbon_continuous.csv

Sample of continuous carbon data:
                       from                        to  forecast  actual index  \
0 2022-12-31 23:30:00+00:00 2023-01-01 00:00:00+00:00        75    65.0   low   
1 2023-01-01 00:00:00+00:00 2023-01-01 00:30:00+00:00        73    72.0   low   
2 2023-01-01 00:30:00+00:00 2023-01-01 01:00:00+00:00        63    80.0   low   
3 2023-01-01 01:00:00+00:00 2023-01-01 01:30:00+00:00        71    72.0   low   
4 2023-01-01 01:30:00+00:00 2023-01-01 02:00:00+00:00        76    65.0   low   

  day_of_week  
0      Sunday  
1      Sunday  
2      Sunday  
3      Sunday  
4      Sunday  


# Merging the data

In [73]:
# Step 0.1: Date ranges, each given as a (first instant, last instant) pair.
# "Pre" window: start of 1 Feb 2023 through the last second of 31 Jan 2024.
pre_start, pre_end = (
    pd.Timestamp("2023-02-01 00:00:00"),
    pd.Timestamp("2024-01-31 23:59:59"),
)

# "Post" window: start of 1 Apr 2024 through the last second of 31 Dec 2024.
post_start, post_end = (
    pd.Timestamp("2024-04-01 00:00:00"),
    pd.Timestamp("2024-12-31 23:59:59"),
)

In [74]:
def process_carbon_data_for_all_hours(carbon_df, periods=None):
    """Reduce carbon records to a timestamp/intensity table for selected windows.

    The ``from`` column is parsed to datetimes and stripped of its timezone so
    it can be compared with timezone-naive timestamps. Rows whose ``from``
    falls inside any of the requested windows (bounds inclusive) are kept.
    The input frame is not modified.

    Parameters
    ----------
    carbon_df : pandas.DataFrame
        Must contain a ``from`` column (datetime-like or parseable strings)
        and an ``actual`` column.
    periods : iterable of (start, end) pairs, optional
        Windows to keep. Defaults to 2023-02-02 00:00:00 .. 2024-01-31 23:59:59
        and 2024-04-01 00:00:00 .. 2024-12-31 23:59:59, the windows this
        function previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime_30min`` (timezone-naive ``from``) and
        ``actual_intensity`` (the ``actual`` values), keeping the index labels
        of the retained input rows.
    """
    if periods is None:
        periods = [
            (pd.Timestamp("2023-02-02 00:00:00"), pd.Timestamp("2024-01-31 23:59:59")),
            (pd.Timestamp("2024-04-01 00:00:00"), pd.Timestamp("2024-12-31 23:59:59")),
        ]

    timestamps = pd.to_datetime(carbon_df['from']).dt.tz_localize(None)

    # A row is kept if it lies in at least one window; `between` includes both bounds.
    in_any_window = np.zeros(len(timestamps), dtype=bool)
    for window_start, window_end in periods:
        in_any_window |= timestamps.between(window_start, window_end).to_numpy()

    carbon_30min = carbon_df[['from', 'actual']].copy()
    carbon_30min['from'] = timestamps
    carbon_30min = carbon_30min[in_any_window]

    return carbon_30min.rename(
        columns={'from': 'datetime_30min', 'actual': 'actual_intensity'}
    )

In [75]:
def preprocess_electricity_with_carbon_full_day(file_name, start_date, end_date, treatment, period, carbon_data):
    """Process electricity data and merge with carbon data for all hours"""
    df = pd.read_csv(file_name)
    if 'Unnamed: 0' in df.columns:
        df = df.rename(columns={'Unnamed: 0': 'Time'})
    else:
        df.rename(columns={df.columns[0]: 'Time'}, inplace=True)
    
    df['Time'] = pd.to_datetime(df['Time'], format="%Y-%m-%d %H:%M:%S")
    print(f"Processing {file_name} -> Min Time: {df['Time'].min()}, Max Time: {df['Time'].max()}")
    
    # Filter by time range
    df = df[(df['Time'] >= start_date) & (df['Time'] <= end_date)]
    
    # Convert from wide to long format
    df_long = df.melt(id_vars=['Time'], var_name='ANON_ID', value_name='ELEC_KWH')
    df_long.drop_duplicates(subset=['Time', 'ANON_ID'], inplace=True)
    
    print(f"  Electricity records: {len(df_long):,}")
    print(f"  Missing electricity values: {df_long['ELEC_KWH'].isna().sum():,} ({df_long['ELEC_KWH'].isna().mean()*100:.2f}%)")
    
    df_long['datetime_30min'] = df_long['Time']
    
    # Use inner join to keep only records where BOTH electricity and carbon data exist
    df_with_carbon = pd.merge(df_long, carbon_data, on='datetime_30min', how='inner')
    
    print(f"  Records after inner join: {len(df_with_carbon):,}")
    print(f"  Records removed due to missing carbon data: {len(df_long) - len(df_with_carbon):,}")
    
    df_with_carbon['carbon_emissions'] = df_with_carbon['ELEC_KWH'] * df_with_carbon['actual_intensity']
    df_with_carbon['treatment'] = treatment
    df_with_carbon['period'] = period
    df_with_carbon['DateTime'] = df_with_carbon['Time']
    
    final_columns = ['ANON_ID', 'DateTime', 'ELEC_KWH', 'actual_intensity', 'carbon_emissions', 'treatment', 'period']
    df_final = df_with_carbon[final_columns].copy()
    
    return df_final

In [76]:
# Reload the saved carbon dataset from disk and reduce it to the
# timestamp/intensity table used for merging.
carbon_continuous = pd.read_csv('carbon_continuous.csv')
all_carbon = process_carbon_data_for_all_hours(carbon_continuous)
processed_count = len(all_carbon)
print(f"Carbon data processed: {processed_count:,} records")

Carbon data processed: 30,550 records


In [77]:
# Process Electricity Data
# Raw string literal: in a normal string the backslashes before "L", "C" and
# "F" are invalid escape sequences, which newer Python versions warn about.
# The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider making it configurable.
electricity_folder = r"D:\LSE\Capstone Project\FLASH data"

# One entry per input file: (file name, window start, window end, treatment, period).
file_configs = [
    ("controlePreConsolide.csv", pre_start, pre_end, "Control", "Pre"),
    ("controleConsolide.csv", post_start, post_end, "Control", "Post"),
    ("interventionPreConsolide.csv", pre_start, pre_end, "Intervention", "Pre"),
    ("interventionConsolide.csv", post_start, post_end, "Intervention", "Post")
]

# Output names, in the same order as file_configs.
dataset_names = ["df_control_pre", "df_control_post",
                 "df_intervention_pre", "df_intervention_post"]

# zip() stops at the shorter list, so make a length mismatch fail loudly.
assert len(dataset_names) == len(file_configs), "dataset_names and file_configs must pair up"

processed_datasets = {}
for dataset_name, (filename, start_date, end_date, treatment, period) in zip(dataset_names, file_configs):
    file_path = os.path.join(electricity_folder, filename)

    processed_datasets[dataset_name] = preprocess_electricity_with_carbon_full_day(
        file_path, start_date, end_date, treatment, period, all_carbon)

# Extract datasets
df_control_pre = processed_datasets["df_control_pre"]
df_control_post = processed_datasets["df_control_post"]
df_intervention_pre = processed_datasets["df_intervention_pre"]
df_intervention_post = processed_datasets["df_intervention_post"]

print("\nDataset shapes:")
for name, df in processed_datasets.items():
    print(f"{name}: {df.shape}")

Processing D:\LSE\Capstone Project\FLASH data\controlePreConsolide.csv -> Min Time: 2023-02-02 00:00:00, Max Time: 2024-01-31 23:30:00
  Electricity records: 5,540,480
  Missing electricity values: 51,770 (0.93%)
  Records after inner join: 5,511,360
  Records removed due to missing carbon data: 29,120
Processing D:\LSE\Capstone Project\FLASH data\controleConsolide.csv -> Min Time: 2024-04-01 00:00:00, Max Time: 2024-12-31 23:30:00
  Electricity records: 5,706,668
  Missing electricity values: 481,188 (8.43%)
  Records after inner join: 5,690,982
  Records removed due to missing carbon data: 15,686
Processing D:\LSE\Capstone Project\FLASH data\interventionPreConsolide.csv -> Min Time: 2023-02-02 00:00:00, Max Time: 2024-01-31 23:30:00
  Electricity records: 6,722,577
  Missing electricity values: 72,256 (1.07%)
  Records after inner join: 6,687,360
  Records removed due to missing carbon data: 35,217
Processing D:\LSE\Capstone Project\FLASH data\interventionConsolide.csv -> Min Time: 2

In [78]:
# Stack the four treatment/period frames into one table, skipping empty ones.
all_datasets = []
for candidate in (df_control_pre, df_control_post, df_intervention_pre, df_intervention_post):
    if not candidate.empty:
        all_datasets.append(candidate)
final_df = pd.concat(all_datasets, ignore_index=True)

print(f"\nFinal dataset shape: {final_df.shape}")
print(f"Unique households: {final_df['ANON_ID'].nunique()}")

# Keep only rows that have an electricity reading and report how many were dropped.
initial_records = len(final_df)
has_reading = final_df['ELEC_KWH'].notna()
final_df_clean = final_df[has_reading].copy()
records_removed = initial_records - len(final_df_clean)

if records_removed <= 0:
    print(f"\nNo missing data found - dataset is clean: {len(final_df_clean):,} records")
else:
    print(f"\nRemoved {records_removed:,} records with missing electricity data")
    print(f"Final clean dataset: {len(final_df_clean):,} records")


Final dataset shape: (25366884, 7)
Unique households: 1173

Removed 1,120,464 records with missing electricity data
Final clean dataset: 24,246,420 records


In [79]:
# Replace each ANON_ID with a compact label H0, H1, ... numbered in order of
# first appearance in final_df_clean.
unique_ids = final_df_clean['ANON_ID'].unique()
short_labels = map("H{}".format, range(len(unique_ids)))
id_mapping = dict(zip(unique_ids, short_labels))
final_df_clean['ANON_ID'] = final_df_clean['ANON_ID'].map(id_mapping)

In [82]:
# Preview the first rows, then write the cleaned table to an uncompressed CSV.
preview = final_df_clean.head()
print("\nSample of final data:")
print(preview)

final_df_clean.to_csv("carbon_full_day_clean.csv", index=False)
print("\nSaved to 'carbon_full_day_clean.csv'")


Sample of final data:
  ANON_ID            DateTime  ELEC_KWH  actual_intensity  carbon_emissions  \
0      H0 2023-02-02 00:00:00     0.140              64.0             8.960   
1      H0 2023-02-02 00:30:00     0.138              64.0             8.832   
2      H0 2023-02-02 01:00:00     0.128              66.0             8.448   
3      H0 2023-02-02 01:30:00     0.149              65.0             9.685   
4      H0 2023-02-02 02:00:00     0.137              66.0             9.042   

  treatment period  
0   Control    Pre  
1   Control    Pre  
2   Control    Pre  
3   Control    Pre  
4   Control    Pre  

Saved to 'carbon_full_day_clean.csv'


In [86]:
# Write a second, gzip-compressed copy of the cleaned table.
final_df_clean.to_csv(
    "carbon_full_day_clean.csv.gz",
    index=False,
    compression='gzip',
)