In [1]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import catboost as cated

In [2]:
# %load process_cat
#!/usr/bin/env python3
import os
import pandas as pd
import numpy as np


def pro_cat(targets, observed, estimated, test):
    """Build training features, test features and targets for the CatBoost model.

    The weather frames are averaged to hourly resolution, ``date_calc`` is
    re-attached to the estimated and test frames (it is lost by the numeric
    mean), a fixed list of columns is dropped, and ``delta`` stacks observed
    and estimated weather. The stacked weather is inner-joined to ``targets``
    on ``time == date_forecast``, ``clock_feat`` adds calendar columns, rows
    whose ``pv_measurement`` is zero are discarded, and repeated-value
    stretches are pruned first by ``consec`` and then by a diff-based block
    filter.

    Args:
        targets: frame with ``time`` and ``pv_measurement`` columns.
        observed: weather frame with a ``date_forecast`` column.
        estimated: weather frame with ``date_forecast`` and ``date_calc``.
        test: weather frame with ``date_forecast`` and ``date_calc``.

    Returns:
        Tuple ``(filtered, test_re, trgts)``: the training feature frame, the
        test feature frame, and the ``time`` / ``pv_measurement`` columns of
        the rows kept in ``filtered``.
    """

    def _hourly_mean(frame):
        # hourly mean of the numeric columns; hours with no values at all are dropped
        by_hour = frame.set_index('date_forecast').resample('H').mean(numeric_only=True)
        return by_hour.dropna(how='all').reset_index()

    def _hourly_date_calc(frame):
        # first date_calc seen within each hour, as a one-column frame indexed by hour
        return frame.set_index('date_forecast')['date_calc'].resample('H').first().to_frame()

    # resample observed, estimated, and test data to 1 hour
    observed_re = _hourly_mean(observed)
    estimated_re = _hourly_mean(estimated).merge(
        _hourly_date_calc(estimated), left_on='date_forecast', right_index=True)
    test_re = _hourly_mean(test).merge(
        _hourly_date_calc(test), left_on='date_forecast', right_index=True)

    # dropped due to redundancy, inconsistency, lack of relevance or for further processing
    unwanted = [
        'wind_speed_v_10m:ms',
        'wind_speed_w_1000hPa:ms',
        'wind_speed_u_10m:ms',
        'snow_drift:idx',
        'snow_density:kgm3',
        'elevation:m',
    ]
    observed_re, estimated_re, test_re = [
        frame.drop(columns=unwanted) for frame in (observed_re, estimated_re, test_re)
    ]

    # fuse observed and estimated data, then attach the target values
    weather_data, test_re = delta(observed_re, estimated_re, test_re)
    joined = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')
    fused = clock_feat(joined, 'time')
    test_re = clock_feat(test_re, 'date_forecast')

    # zero measurements are discarded before the repeated-value pruning
    nonzero = fused[fused['pv_measurement'] != 0]
    fused = consec(nonzero)

    # row-to-row change of the target; the first row has no predecessor and counts as 0
    step = fused['pv_measurement'].diff().fillna(0)
    # 1 where the target did not change from the previous row
    is_flat = (step == 0).astype(int)
    # a new block id starts whenever the flat/non-flat indicator flips
    block_id = (is_flat.diff() != 0).astype(int).cumsum()
    # number of flat rows per block; blocks with more than 2 are removed entirely
    flat_per_block = is_flat.groupby(block_id).sum()
    flat_blocks = flat_per_block[flat_per_block > 2].index
    filtered = fused[~block_id.isin(flat_blocks)]

    trgts = filtered[['time', 'pv_measurement']]

    # drop non-feature columns
    filtered = filtered.drop(columns=['time', 'date_forecast', 'pv_measurement', 'date_calc'])
    test_re = test_re.drop(columns=['date_forecast', 'date_calc'])

    return filtered, test_re, trgts


def clock_feat(df, col):
    """Add calendar columns derived from a timestamp column.

    ``df[col]`` is converted with ``pd.to_datetime`` and written back, then
    ``hour``, ``month`` and ``year`` columns are added from it. The frame is
    modified in place.

    Args:
        df: frame to extend.
        col: name of the column holding the timestamps.

    Returns:
        The same ``df`` object that was passed in.
    """
    stamps = pd.to_datetime(df[col])
    df[col] = stamps

    # use as features in the model
    for part in ('hour', 'month', 'year'):
        df[part] = getattr(stamps.dt, part)

    return df


def delta(observed, estimated, test):
    """Tag the weather frames with a lead time and an estimated flag, and stack the training ones.

    ``time_delta`` is ``date_calc - date_forecast`` expressed in hours for the
    estimated and test frames, and ``0`` for the observed frame.
    ``is_estimated`` is ``1`` for estimated and test rows and ``0`` for
    observed rows.

    The input frames are not modified; new frames are returned.

    Args:
        observed: frame with a ``date_forecast`` column.
        estimated: frame with ``date_forecast`` and ``date_calc`` datetime columns.
        test: frame with ``date_forecast`` and ``date_calc`` datetime columns.

    Returns:
        Tuple ``(df, test)``: ``df`` is observed and estimated stacked row-wise
        and sorted by ``date_forecast``; ``test`` is the tagged test frame.
    """

    def _lead_hours(frame):
        # signed difference in hours; negative when date_calc precedes date_forecast
        return (frame['date_calc'] - frame['date_forecast']).dt.total_seconds() / 3600

    # assign() returns new frames, so the caller's frames keep their original columns
    estimated = estimated.assign(time_delta=_lead_hours(estimated), is_estimated=1)
    # observed data is not forecasting ahead, hence a zero lead time
    observed = observed.assign(time_delta=0, is_estimated=0)
    test = test.assign(time_delta=_lead_hours(test), is_estimated=1)

    df = pd.concat([observed, estimated], axis=0).sort_values(by='date_forecast')

    return df, test


def consec(df, threshold=4, threshold_zerko=15, threshold_zerko_no_rad=20):
    """Drop rows that belong to long runs of identical ``pv_measurement`` values.

    A run is a maximal stretch of consecutive rows with the same
    ``pv_measurement``. Three rules mark rows for removal:

    * positive runs of at least ``threshold`` rows: every row except the
      first row of the run;
    * zero runs of at least ``threshold_zerko`` rows: rows whose
      ``direct_rad:W`` is above 10;
    * zero runs of at least ``threshold_zerko_no_rad`` rows: rows whose
      ``direct_rad:W`` is below 10.

    Rows with ``direct_rad:W`` exactly equal to 10 match neither zero rule.

    The input frame is not modified.

    Args:
        df: frame with ``pv_measurement`` and ``direct_rad:W`` columns.
        threshold: minimum length of a positive run to prune.
        threshold_zerko: minimum length of a zero run to prune where
            ``direct_rad:W`` > 10.
        threshold_zerko_no_rad: minimum length of a zero run to prune where
            ``direct_rad:W`` < 10.

    Returns:
        A new frame without the removed rows and without the
        ``direct_rad:W`` column, re-indexed from 0.
    """
    pv = df['pv_measurement']
    rad = df['direct_rad:W']

    # a run starts wherever the value differs from the previous row
    run_start = pv != pv.shift(1)
    run_id = run_start.cumsum()
    # length of the run each row belongs to (count of non-null measurements)
    run_len = pv.groupby(run_id).transform('count')

    # The first row of a run is identified by run_start itself. Comparing run
    # lengths between neighbouring rows misses the boundary between two
    # adjacent runs of equal length and would drop the second run entirely.
    is_zero = pv == 0
    drop_repeat = (run_len >= threshold) & (pv > 0) & ~run_start
    drop_zero_rad = (run_len >= threshold_zerko) & is_zero & (rad > 10)
    drop_zero_no_rad = (run_len >= threshold_zerko_no_rad) & is_zero & (rad < 10)
    keep = ~(drop_repeat | drop_zero_rad | drop_zero_no_rad)

    return df.loc[keep].drop(columns=['direct_rad:W']).reset_index(drop=True)


In [3]:
# Train one CatBoost regressor per location and write all predictions,
# location after location, to a single submission CSV.
locations = ['A', 'B', 'C']

# columns passed to CatBoost as categorical features; they are cast to int below
feat = ['dew_or_rime:idx', 'is_in_shadow:idx']

params = {
    "loss_function": 'MAE',
    "learning_rate": 0.1,
    "silent": True,
    "cat_features": feat,
}

cat_all = []

# a distinct loop variable keeps the list of locations intact for re-runs
for loc in locations:
    train = pd.read_parquet(f'./data/{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'./data/{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'./data/{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'./data/{loc}/X_test_estimated.parquet')

    X_train_cat, X_test_cat, targets_cat = pro_cat(train, X_train_observed, X_train_estimated, X_test_estimated)
    # consec() removes this column from the training features, so drop it from the test features too
    X_test_cat = X_test_cat.drop(["direct_rad:W"], axis=1)

    int_cast = {name: int for name in feat}
    X_train_cat = X_train_cat.astype(int_cast)
    X_test_cat = X_test_cat.astype(int_cast)

    model_cat = cated.CatBoostRegressor(**params)
    model_cat.fit(X_train_cat, targets_cat['pv_measurement'], plot=True)

    # negative predictions are clipped to zero
    cat_preds = np.clip(model_cat.predict(X_test_cat), 0, None)
    cat_all.append(cat_preds)

# concatenate rather than np.array(...).flatten(): the latter only yields a flat
# numeric array when every location produces the same number of predictions
kot_preds = np.concatenate(cat_all)
df = pd.DataFrame({'prediction': kot_preds}).rename_axis('id').reset_index()
df = df[['id', 'prediction']]
df.to_csv('yet_another_cat.csv', index=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [20]:
# Read back the submission file written by the training cell (columns: id, prediction).
# Note: despite the name, these are model predictions, not ground-truth targets.
y_test = pd.read_csv('yet_another_cat.csv')