In [1]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Raw competition files (all ';'-separated).
# Training data and the submission template use their first column as the index.
train = pd.read_csv(
    'power-laws-forecasting-energy-consumption-training-data.csv',
    index_col=0,
    sep=';',
)
test = pd.read_csv(
    'power-laws-forecasting-energy-consumption-submission-format.csv',
    index_col=0,
    sep=';',
)

# Weather and site metadata keep pandas' default integer index.
weather = pd.read_csv(
    'power-laws-forecasting-energy-consumption-weather.csv',
    sep=';',
)
meta = pd.read_csv(
    'power-laws-forecasting-energy-consumption-metadata.csv',
    sep=';',
)

  mask |= (ar1 == a)


In [2]:
# Extract features and convert the time into cyclical variables
def process_time(df):
    """Add calendar features and their cyclical (sin/cos) encodings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Timestamp' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'Timestamp' as the first column (converted to
        datetime), followed by the original columns and the added features:
        'wday', 'mday', 'yday', 'month', 'year', 'time' (hour of day as a
        float, e.g. 6.5 for 06:30) and the sin/cos pairs for 'wday', 'yday',
        'month' and 'time'. The input frame is not modified.
    """
    # Work on a copy so the caller's frame is not mutated by the conversion
    df = df.copy()

    # Convert timestamp into a pandas datetime object
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df = df.set_index('Timestamp')

    # Extract units of time from the timestamp
    df['wday'] = df.index.dayofweek
    df['mday'] = df.index.day
    df['yday'] = df.index.dayofyear
    df['month'] = df.index.month
    df['year'] = df.index.year

    # Create a time of day to represent hours and minutes
    df['time'] = np.asarray(df.index.hour) + np.asarray(df.index.minute) / 60

    # Cyclical variable transformations

    # wday takes the 7 values 0..6, so its period is 7; with a period of 6,
    # Monday (0) and Sunday (6) would map to exactly the same point.
    df['wday_sin'] = np.sin(2 * np.pi * df['wday'] / 7)
    df['wday_cos'] = np.cos(2 * np.pi * df['wday'] / 7)

    # yday has period of 365
    df['yday_sin'] = np.sin(2 * np.pi * df['yday'] / 365)
    df['yday_cos'] = np.cos(2 * np.pi * df['yday'] / 365)

    # month has period of 12
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

    # time has period of 24
    df['time_sin'] = np.sin(2 * np.pi * df['time'] / 24)
    df['time_cos'] = np.cos(2 * np.pi * df['time'] / 24)

    # turn the index back into a column
    df = df.reset_index(level=0)

    return df

In [3]:
# Feature engineering of the time for train and test.
# Each name is rebound to the frame returned by process_time, so from here on
# `train` and `test` refer to the processed frames, not the raw CSV contents.
train = process_time(train)
test = process_time(test)

In [4]:
# Function to add weather information into a dataset
def add_weather(df, weather):
    """Attach the closest weather measurement to every row of ``df``.

    Weather timestamps are rounded to the nearest 15 minutes and, for each
    (Timestamp, SiteId) pair, only the row with the smallest 'Distance' is
    kept. That reduced table is then left-joined onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Building data with a datetime 'Timestamp' column and a 'SiteId' column.
    weather : pandas.DataFrame
        Weather data with 'Timestamp', 'SiteId' and 'Distance' columns.
        This frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the weather columns added, sorted by 'Timestamp' and
        'SiteId'. Rows without a matching measurement get NaN weather values.

    Raises
    ------
    AssertionError
        If the merged frame does not have the same number of rows as ``df``.
    """
    # Keep track of the original length of the dataset
    original_length = len(df)

    # Round the weather data to the nearest 15 minutes; `assign` builds a new
    # frame so the caller's weather table keeps its original Timestamp column
    weather = weather.assign(
        Timestamp=pd.to_datetime(weather['Timestamp']).dt.round('15min')
    )

    # Keep one measurement per (Timestamp, SiteId): the closest location.
    # De-duplicating the weather side BEFORE the merge means the left join can
    # never multiply rows of df, and rows of df that legitimately share a
    # (Timestamp, SiteId) key are never dropped.
    weather = weather.sort_values(['Timestamp', 'SiteId', 'Distance'], kind='mergesort')
    weather = weather.drop_duplicates(['Timestamp', 'SiteId'], keep='first')

    # Merge the building data with the weather data
    df = pd.merge(df, weather, how='left', on=['Timestamp', 'SiteId'])

    # Return the rows ordered by time and site (stable sort keeps the original
    # relative order of rows that share both keys)
    df = df.sort_values(['Timestamp', 'SiteId'], kind='mergesort')

    # Check to make sure the length of the dataset has not changed
    new_length = len(df)
    assert original_length == new_length, 'New Length must match original length'

    return df

In [5]:
# Get weather information for both train and test data.
# The same `weather` table is passed to both calls; `train` and `test` are
# rebound to the merged frames returned by add_weather.
train = add_weather(train, weather)
test = add_weather(test, weather)

In [6]:
# Distinct site ids found in the metadata (a set, despite the name), and an
# empty frame with the target columns that will hold the per-weekday days off
id_list = {site_id for site_id in meta['SiteId']}
all_meta = pd.DataFrame(columns=['SiteId', 'wday', 'off'])

In [7]:
# Build one (SiteId, wday, off) row per site and weekday from the metadata.
#
# This replaces a per-site loop that grew `all_meta` with DataFrame.append
# (removed in pandas 2.0) and called float() on single-element Series
# (deprecated); the reshape below produces the same rows in one pass.

# Metadata columns in pandas `dayofweek` order: Monday = 0 ... Sunday = 6
day_off_columns = [
    'MondayIsDayOff',
    'TuesdayIsDayOff',
    'WednesdayIsDayOff',
    'ThursdayIsDayOff',
    'FridayIsDayOff',
    'SaturdayIsDayOff',
    'SundayIsDayOff',
]

# Each site must have exactly one metadata row; duplicates would otherwise
# duplicate rows when all_meta is merged on ['SiteId', 'wday'] later on.
assert meta['SiteId'].is_unique, 'Expected exactly one metadata row per SiteId'

# Wide -> long: one row per (SiteId, weekday column)
all_meta = meta.melt(
    id_vars='SiteId',
    value_vars=day_off_columns,
    var_name='wday',
    value_name='off',
)

# Replace the column name with its weekday number and store the flag as float
all_meta['wday'] = all_meta['wday'].map(
    {column: wday for wday, column in enumerate(day_off_columns)}
)
all_meta['off'] = all_meta['off'].astype(float)

# Deterministic row order and a clean 0..n-1 index
all_meta = all_meta.sort_values(['SiteId', 'wday']).reset_index(drop=True)

In [8]:
# Attach the day-off flag to every row by matching on site and weekday
train = pd.merge(train, all_meta, on=['SiteId', 'wday'], how='left')
test = pd.merge(test, all_meta, on=['SiteId', 'wday'], how='left')

# Persist the prepared frames (without the index) for the modelling notebook
train.to_csv('train_corrected.csv', index=False)
test.to_csv('test_corrected.csv', index=False)

In [1]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np

# Imputing missing values in temp and value
from sklearn.impute import SimpleImputer

# Best practice to scale features
from sklearn.preprocessing import MinMaxScaler

# Models used for prediction
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

from itertools import chain

# Turn off pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): this is a session-wide option, so it also silences the warning
# for every later cell — confirm that is intended.
pd.options.mode.chained_assignment = None

In [2]:
# Load the prepared training and testing frames
train = pd.read_csv('train_corrected.csv')
test = pd.read_csv('test_corrected.csv')

# The CSV round-trip stores timestamps as text; parse them back to datetimes
train = train.assign(Timestamp=pd.to_datetime(train['Timestamp']))
test = test.assign(Timestamp=pd.to_datetime(test['Timestamp']))

In [3]:
# Takes in a forecast id and returns a formatted training and testing set
def process(site):
    """Build scaled features and labels for a single ForecastId.

    Reads the module-level ``train`` and ``test`` frames.

    Parameters
    ----------
    site : scalar
        Value matched against the 'ForecastId' column (the parameter keeps its
        historical name; it is a forecast id, not a SiteId).

    Returns
    -------
    train_df : pandas.DataFrame
        Min-max scaled training features (rows strictly before the first test
        timestamp).
    train_labels : pandas.Series
        Training 'Value' column with missing entries imputed.
    test_df : pandas.DataFrame
        Test features transformed with the scaler fitted on the training data.

    Raises
    ------
    ValueError
        If no training rows precede the test period for this forecast id.
    """
    # Testing data: one row per timestamp, keeping the smallest Distance.
    # Explicit copies so the column assignments below never write into a view
    # of the module-level frames.
    test_df = test[test['ForecastId'] == site].sort_values(['Timestamp', 'Distance'])
    test_df = test_df.drop_duplicates(['Timestamp'], keep='first').copy()

    # Training data: same treatment
    train_df = train[train['ForecastId'] == site].sort_values(['Timestamp', 'Distance'])
    train_df = train_df.drop_duplicates(['Timestamp'], keep='first')

    # Only use past training data
    train_df = train_df[train_df['Timestamp'] < test_df['Timestamp'].min()].copy()

    # Fail early with a readable message instead of an error from deep inside
    # the imputer / min() on an empty frame
    if train_df.empty:
        raise ValueError('No training rows precede the test period for ForecastId %s' % site)

    # If all temperatures are missing in EITHER the training or the testing
    # data, drop the temperature from both
    if train_df['Temperature'].isna().all() or test_df['Temperature'].isna().all():
        train_df = train_df.drop(labels='Temperature', axis=1)
        test_df = test_df.drop(labels='Temperature', axis=1)

    # Otherwise impute the missing temperatures with the training median
    else:
        temp_median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        temp_median_imputer.fit(train_df[['Temperature']])
        train_df['Temperature'] = temp_median_imputer.transform(train_df[['Temperature']]).ravel()
        test_df['Temperature'] = temp_median_imputer.transform(test_df[['Temperature']]).ravel()

    # Impute the missing target values; the imputer is only fitted when there
    # is at least one observed value to take a median of
    if train_df['Value'].isna().all():
        train_df['Value'] = 0
    else:
        value_median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        train_df['Value'] = value_median_imputer.fit_transform(train_df[['Value']]).ravel()

    # Find the minimum date for converting timestamp to numeric
    min_date = train_df['Timestamp'].min()

    # Convert timestamp to seconds elapsed since the first training timestamp
    train_df['Timestamp'] = (train_df['Timestamp'] - min_date).dt.total_seconds()
    test_df['Timestamp'] = (test_df['Timestamp'] - min_date).dt.total_seconds()

    # Interval between consecutive measurements (0 for the first row of each set)
    train_df['time_diff'] = train_df['Timestamp'].diff().fillna(0)
    test_df['time_diff'] = test_df['Timestamp'].diff().fillna(0)

    # Extract labels
    train_labels = train_df['Value']

    # Drop identifier columns and the target from the features
    dropped_columns = ['Distance', 'SiteId', 'ForecastId', 'Value']
    train_df = train_df.drop(columns=dropped_columns)
    test_df = test_df.drop(columns=dropped_columns)

    # Scale the features between 0 and 1 (range learned from training data).
    # New frames are built from the scaled arrays rather than assigning through
    # .loc[:, :], which would write floats into integer columns.
    scaler = MinMaxScaler()
    train_df = pd.DataFrame(scaler.fit_transform(train_df),
                            columns=train_df.columns, index=train_df.index)
    test_df = pd.DataFrame(scaler.transform(test_df),
                           columns=test_df.columns, index=test_df.index)

    return train_df, train_labels, test_df

In [4]:
# Trains and predicts for all datasets, makes predictions one forecast id at a time
def predict(trees_list=None, random_state=None):
    """Train per-forecast ensembles and return all test predictions as one flat list.

    For every ForecastId in the module-level ``train`` frame, a random forest
    and an extra-trees regressor are fitted for each ensemble size in
    ``trees_list`` and their test predictions are averaged.

    Parameters
    ----------
    trees_list : iterable of int, optional
        Ensemble sizes to fit. Defaults to 50, 75, ..., 175.
    random_state : int or None, optional
        Passed to both regressors. ``None`` (the default) leaves the models
        unseeded, so results differ from run to run; pass an int for
        reproducible predictions.

    Returns
    -------
    list
        Predictions concatenated in ascending ForecastId order and, within a
        forecast id, in the row order returned by ``process``.
        NOTE(review): downstream code assigns this list positionally to the
        submission file — confirm that file is ordered the same way.

    Raises
    ------
    ValueError
        If ``trees_list`` is empty.
    """
    # List of trees to use in the random forest and extra trees model
    if trees_list is None:
        trees_list = list(range(50, 176, 25))
    else:
        trees_list = list(trees_list)
    if not trees_list:
        raise ValueError('trees_list must contain at least one ensemble size')

    # Forecast ids in sorted order: iterating a set gives no ordering
    # guarantee, and the flat output is only meaningful in a known order
    site_list = sorted(train['ForecastId'].unique())

    predictions = []

    # Total used for the progress message
    number = len(site_list)

    # Iterate through every forecast id
    for count, site in enumerate(site_list, start=1):

        # Features and labels
        train_x, train_y, test_x = process(site)

        # Make sure only training on past data
        assert train_x['Timestamp'].max() < test_x['Timestamp'].min(), 'Training Data Must Come Before Testing Data'

        # Running sum of the predictions of every fitted model
        site_predictions = np.zeros(len(test_x))

        # Iterate through the number of trees
        for tree in trees_list:

            # Create a random forest and extra trees model with the number of trees
            model1 = RandomForestRegressor(n_estimators=tree, n_jobs=-1, random_state=random_state)
            model2 = ExtraTreesRegressor(n_estimators=tree, n_jobs=-1, random_state=random_state)

            # Fitting the model
            model1.fit(train_x, train_y)
            model2.fit(train_x, train_y)

            # Make predictions with each model
            site_predictions += model1.predict(test_x)
            site_predictions += model2.predict(test_x)

        # Average over the two models fitted per ensemble size
        site_predictions = site_predictions / (len(trees_list) * 2)

        # Append this forecast id's predictions to the flat result
        predictions.extend(site_predictions)

        # Keep track of number of forecast ids processed so far
        if count % 100 == 0:
            print('Percentage Complete: {:.1f}%.'.format(100 * count / number))

    return predictions

In [5]:
# Make a submission file given the list of predictions and name for the submission
def make_submission_file(predictions, name,
                         template_path='power-laws-forecasting-energy-consumption-submission-format.csv'):
    """Write ``predictions`` into the 'Value' column of the submission template.

    Parameters
    ----------
    predictions : sequence
        One prediction per row of the template, in the template's row order.
    name : str
        Output file name without extension; the file is saved as '<name>.csv'.
    template_path : str, optional
        Path of the ';'-separated submission template. Defaults to the
        competition's submission-format file in the working directory.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of template rows.
        Nothing is written in that case.
    """
    # Read in the submission dataframe
    submit_df = pd.read_csv(template_path, sep=';')

    # Predictions are assigned by position, so the counts have to agree
    if len(predictions) != len(submit_df):
        raise ValueError(
            'Got %d predictions for %d submission rows' % (len(predictions), len(submit_df))
        )

    # Assign the predictions as the value
    submit_df['Value'] = predictions

    # Save the submission (written with pandas' default ',' separator)
    submit_df.to_csv('%s.csv' % name, index=False)
    print('Predictions saved to %s.csv' % name)

In [6]:
# Make predictions for every forecast id (long-running: progress is printed
# every 100 forecast ids); the result is one flat list of predicted values
predictions = predict()

Percentage Complete: 1.4%.
Percentage Complete: 2.9%.
Percentage Complete: 4.3%.
Percentage Complete: 5.7%.
Percentage Complete: 7.2%.
Percentage Complete: 8.6%.
Percentage Complete: 10.0%.
Percentage Complete: 11.5%.
Percentage Complete: 12.9%.
Percentage Complete: 14.3%.
Percentage Complete: 15.8%.
Percentage Complete: 17.2%.
Percentage Complete: 18.6%.
Percentage Complete: 20.1%.
Percentage Complete: 21.5%.
Percentage Complete: 22.9%.
Percentage Complete: 24.4%.
Percentage Complete: 25.8%.
Percentage Complete: 27.2%.
Percentage Complete: 28.7%.
Percentage Complete: 30.1%.
Percentage Complete: 31.5%.
Percentage Complete: 33.0%.
Percentage Complete: 34.4%.
Percentage Complete: 35.8%.
Percentage Complete: 37.3%.
Percentage Complete: 38.7%.
Percentage Complete: 40.1%.
Percentage Complete: 41.6%.
Percentage Complete: 43.0%.
Percentage Complete: 44.5%.
Percentage Complete: 45.9%.
Percentage Complete: 47.3%.
Percentage Complete: 48.8%.
Percentage Complete: 50.2%.
Percentage Complete: 51.6%

In [7]:
# Save predictions; the second argument is the output file name without the
# '.csv' extension ('submission_filename' is a placeholder — pick a sensible name)
make_submission_file(predictions, 'submission_filename')

Predictions saved to submission_filename.csv
