In [26]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression   
# Convert the MM-DD HH:MM format to a datetime index for easier processing
# The data spans multiple years (2021-2024), we'll use 2023 as the base year
import datetime

In [27]:
training_pd = pd.read_csv("analysis/epoch_30/job_details_enhanced.csv")

In [28]:
training_pd

Unnamed: 0,job_id,submit_time,submit_datetime,request_time,request_processors,carbon_consideration,queue_length_at_submission,submission_timestamp,submission_datetime,user_id,...,completion_time,completion_datetime,wait_time,actual_runtime,action1,action2,scheduled,completed,carbon_emissions,carbon_reward
0,7500,5889054,2023-04-13T19:00:00,7028,1,0.373879,1,5889054,2023-04-13T19:00:00,-1,...,5898282,2023-04-13T21:33:48,2200.0,7028,0.0,7.0,True,True,164.257667,-0.000614
1,7501,5890305,2023-04-13T19:20:51,1107,1,0.694510,1,5891254,2023-04-13T19:36:40,-1,...,5913961,2023-04-14T01:55:07,22549.0,1107,0.0,10.0,True,True,25.439475,-0.000177
2,7514,5903854,2023-04-13T23:06:40,21512,256,0.888709,4,5912854,2023-04-14T01:36:40,-1,...,5970814,2023-04-14T17:42:40,45448.0,21512,0.0,8.0,True,True,124965.559467,-1.110580
3,7515,5903891,2023-04-13T23:07:17,13741,32,0.571370,4,5912854,2023-04-14T01:36:40,-1,...,5949302,2023-04-14T11:44:08,31670.0,13741,1.0,10.0,True,True,9943.302489,-0.056813
4,7524,5905986,2023-04-13T23:42:12,11287,128,0.465253,4,5912854,2023-04-14T01:36:40,-1,...,5925248,2023-04-14T05:03:14,7975.0,11287,2.0,1.0,True,True,34917.253333,-0.162454
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,8519,6708487,2023-04-23T06:37:13,62,4,0.353738,0,6708487,2023-04-23T06:37:13,-1,...,6708549,2023-04-23T06:38:15,0.0,62,,,True,True,5.109144,-0.000018
1020,8520,6708512,2023-04-23T06:37:38,324,4,0.433960,0,6708512,2023-04-23T06:37:38,-1,...,6708836,2023-04-23T06:43:02,0.0,324,,,True,True,26.699400,-0.000116
1021,8521,6708732,2023-04-23T06:41:18,57,1,0.359237,0,6708732,2023-04-23T06:41:18,-1,...,6708789,2023-04-23T06:42:15,0.0,57,,,True,True,1.174279,-0.000004
1022,8522,6708906,2023-04-23T06:44:12,16,1,0.870514,0,6708906,2023-04-23T06:44:12,-1,...,6708922,2023-04-23T06:44:28,0.0,16,,,True,True,0.329622,-0.000003


### Creating carbon intensity features

In [40]:
# Load carbon intensity data
carbon_data = pd.read_csv("data/DK-DK2_hourly_carbon_intensity_noFeb29.csv")


def parse_carbon_timestamp(row):
    """Convert MM-DD HH:MM format to datetime objects for each year"""
    date_time_str = row['MM-DD HH:MM (UTC)']
    month_day, hour_min = date_time_str.split(' ')
    month, day = map(int, month_day.split('-'))
    hour, minute = map(int, hour_min.split(':'))
    
    results = {}
    for year in [2021, 2022, 2023, 2024]:
        try:
            dt = datetime.datetime(year, month, day, hour, minute)
            results[year] = dt
        except ValueError:
            # Handle Feb 29 for non-leap years
            if month == 2 and day == 29:
                dt = datetime.datetime(year, 2, 28, hour, minute)
                results[year] = dt
            else:
                raise
    return results

# Create a more usable carbon intensity dataframe
carbon_intensity_data = []
for idx, row in carbon_data.iterrows():
    timestamps = parse_carbon_timestamp(row)
    for year in [2021, 2022, 2023, 2024]:
        carbon_intensity_data.append({
            'timestamp': timestamps[year],
            'carbon_intensity': row[str(year)],
            'year': year
        })

carbon_df = pd.DataFrame(carbon_intensity_data)
carbon_df['timestamp'] = pd.to_datetime(carbon_df['timestamp'])
carbon_df = carbon_df.sort_values('timestamp').reset_index(drop=True)

In [41]:
# Check if the dataset already has datetime columns (from updated validate.py)
if 'submit_datetime' in training_pd.columns and pd.notna(training_pd['submit_datetime'].iloc[0]):
    # Convert ISO format strings to datetime objects
    training_pd['submit_datetime'] = pd.to_datetime(training_pd['submit_datetime'])
else:
    # For old format, we need to convert simulation time to real datetime
    # Using 2023-01-01 as base date (same as updated validate.py)
    base_datetime = datetime.datetime(2023, 1, 1, 0, 0, 0)
    min_submit_time = training_pd['submit_time'].min()
    
    # Convert simulation time to datetime by treating as seconds offset from base
    training_pd['submit_datetime'] = pd.to_datetime(base_datetime) + pd.to_timedelta(
        training_pd['submit_time'] - min_submit_time, unit='s'
    )

# Extract year from submit datetime to determine which carbon intensity data to use
training_pd['submit_year'] = training_pd['submit_datetime'].dt.year

# Function to get carbon intensity for a specific timestamp
def get_carbon_intensity_at_time(submit_datetime, hours_offset=0):
    """Get carbon intensity at submit_datetime + hours_offset"""
    target_time = submit_datetime + pd.Timedelta(hours=hours_offset)
    
    # Find the closest carbon intensity record
    # First filter by year
    year = target_time.year
    year_data = carbon_df[carbon_df['year'] == year].copy()
    
    if len(year_data) == 0:
        # If no data for that year, use 2023 as default
        year_data = carbon_df[carbon_df['year'] == 2023].copy()
    
    # Create target timestamp for the same year
    target_time_same_year = datetime.datetime(
        year_data.iloc[0]['year'], 
        target_time.month, 
        target_time.day, 
        target_time.hour, 
        0  # Round to nearest hour
    )
    
    # Handle edge cases like Feb 29 in non-leap years
    try:
        target_timestamp = pd.Timestamp(target_time_same_year)
    except ValueError:
        if target_time.month == 2 and target_time.day == 29:
            target_timestamp = pd.Timestamp(year_data.iloc[0]['year'], 2, 28, target_time.hour, 0)
        else:
            raise
    # Find closest match
    time_diffs = abs(year_data['timestamp'] - target_timestamp)
    closest_idx = time_diffs.idxmin()
    
    return year_data.loc[closest_idx, 'carbon_intensity']

# Test the function
test_submit_time = training_pd['submit_datetime'].iloc[0]
test_carbon = get_carbon_intensity_at_time(test_submit_time, 1)


In [42]:
# Create 48 carbon intensity variables

# Create all 48 variables in batches for better performance
batch_size = 8
for batch_start in range(1, 49, batch_size):
    batch_end = min(batch_start + batch_size, 49)
    
    for hour in range(batch_start, batch_end):
        col_name = f'carbon_intensity_hour_{hour}'
        training_pd[col_name] = training_pd['submit_datetime'].apply(
            lambda x: get_carbon_intensity_at_time(x, hour)
        )


# Display the first few rows with some of the new variables
display_cols = ['job_id', 'submit_datetime'] + [f'carbon_intensity_hour_{i}' for i in range(1, 6)]
training_pd[display_cols].head()


Unnamed: 0,job_id,submit_datetime,carbon_intensity_hour_1,carbon_intensity_hour_2,carbon_intensity_hour_3,carbon_intensity_hour_4,carbon_intensity_hour_5
0,7500,2023-04-13 19:00:00,91.73,83.48,83.0,78.89,80.23
1,7501,2023-04-13 19:20:51,91.73,83.48,83.0,78.89,80.23
2,7514,2023-04-13 23:06:40,80.23,80.79,81.94,80.55,87.23
3,7515,2023-04-13 23:07:17,80.23,80.79,81.94,80.55,87.23
4,7524,2023-04-13 23:42:12,80.23,80.79,81.94,80.55,87.23


### Carbon emission model

ce = carbon emission
ci = carbon consideration

#### Model specification

ce = carbon consideration + carbon consideration * node hours + carbon intensity_baselines + queue_length_at_submission


In [68]:
training_pd['node_hours'] = training_pd['request_processors'] * training_pd['request_time']
training_pd['node_hours_x_carbon_consideration'] = training_pd['node_hours'] * training_pd['carbon_consideration']
training_pd

Unnamed: 0,job_id,submit_time,submit_datetime,request_time,request_processors,carbon_consideration,queue_length_at_submission,submission_timestamp,submission_datetime,user_id,...,carbon_intensity_hour_42,carbon_intensity_hour_43,carbon_intensity_hour_44,carbon_intensity_hour_45,carbon_intensity_hour_46,carbon_intensity_hour_47,carbon_intensity_hour_48,node_hours,node_hours_x_carbon_connsideration,node_hours_x_carbon_consideration
0,7500,5889054,2023-04-13 19:00:00,7028,1,0.373879,1,5889054,2023-04-13T19:00:00,-1,...,47.47,46.96,48.12,52.95,54.14,54.03,53.45,7028,2.627622e+03,2.627622e+03
1,7501,5890305,2023-04-13 19:20:51,1107,1,0.694510,1,5891254,2023-04-13T19:36:40,-1,...,47.47,46.96,48.12,52.95,54.14,54.03,53.45,1107,7.688226e+02,7.688226e+02
2,7514,5903854,2023-04-13 23:06:40,21512,256,0.888709,4,5912854,2023-04-14T01:36:40,-1,...,54.14,54.03,53.45,52.05,53.68,53.50,54.82,5507072,4.894184e+06,4.894184e+06
3,7515,5903891,2023-04-13 23:07:17,13741,32,0.571370,4,5912854,2023-04-14T01:36:40,-1,...,54.14,54.03,53.45,52.05,53.68,53.50,54.82,439712,2.512382e+05,2.512382e+05
4,7524,5905986,2023-04-13 23:42:12,11287,128,0.465253,4,5912854,2023-04-14T01:36:40,-1,...,54.14,54.03,53.45,52.05,53.68,53.50,54.82,1444736,6.721678e+05,6.721678e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,8519,6708487,2023-04-23 06:37:13,62,4,0.353738,0,6708487,2023-04-23T06:37:13,-1,...,89.45,84.29,82.36,86.86,86.35,82.40,85.79,248,8.772702e+01,8.772702e+01
1020,8520,6708512,2023-04-23 06:37:38,324,4,0.433960,0,6708512,2023-04-23T06:37:38,-1,...,89.45,84.29,82.36,86.86,86.35,82.40,85.79,1296,5.624122e+02,5.624122e+02
1021,8521,6708732,2023-04-23 06:41:18,57,1,0.359237,0,6708732,2023-04-23T06:41:18,-1,...,89.45,84.29,82.36,86.86,86.35,82.40,85.79,57,2.047651e+01,2.047651e+01
1022,8522,6708906,2023-04-23 06:44:12,16,1,0.870514,0,6708906,2023-04-23T06:44:12,-1,...,89.45,84.29,82.36,86.86,86.35,82.40,85.79,16,1.392822e+01,1.392822e+01


In [69]:
# Show statistics for the carbon intensity variables
carbon_intensity_cols = [f'carbon_intensity_hour_{i}' for i in range(1, 49)]
# Define relevant features for regression
job_features = [
    'node_hours',
    'node_hours_x_carbon_consideration',
    'request_processors',    
    'carbon_consideration',   
    'queue_length_at_submission',  
]

# Add all 48 carbon intensity variables
carbon_features = [f'carbon_intensity_hour_{i}' for i in range(1, 49)]

# Combine all features
feature_columns = job_features + carbon_features

In [72]:
X = training_pd[feature_columns]

In [82]:
# Find rows in training_pd where 'carbon_emissions' is NA
# Assign these rows to a variable for inspection or further processing
rows_with_na_carbon_emissions = training_pd[training_pd['carbon_emissions'].isna()]

# Note: This code replaces the original logic which prepared 'y' for model training
# by removing rows with NA carbon_emissions. The subsequent cell (model.fit(X, y))
# will likely fail because 'y' is not defined in this selection, and the original
# filtering of NA values from the target variable is no longer performed here.
rows_with_na_carbon_emissions['carbon_emissions']

6     NaN
10    NaN
13    NaN
14    NaN
15    NaN
       ..
472   NaN
473   NaN
474   NaN
476   NaN
477   NaN
Name: carbon_emissions, Length: 369, dtype: float64

In [80]:
model = LinearRegression()
model.fit(X, y)  # Train the model

ValueError: Found input variables with inconsistent numbers of samples: [1024, 655]