## Import libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBRegressor

np.random.seed(42)

## Load source datasets

In [2]:
train = pd.read_csv("../input/mh-wsmlc/train.csv")
print(f"train: {train.shape}")
train.head()

train: (175296, 18)


Unnamed: 0,Year,Month,Day,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag
0,2009,1,1,0,0,0,0,0,0,0.0,5.0,1010,75.34,106.15,0.499,346.1,3.1,0
1,2009,1,1,0,30,0,0,0,0,1.0,5.0,1010,80.81,112.28,0.49,346.1,3.1,0
2,2009,1,1,1,0,0,0,0,4,0.0,5.0,1010,78.27,118.5,0.482,347.9,3.2,0
3,2009,1,1,1,30,0,0,0,4,0.0,4.0,1010,78.27,124.78,0.478,347.9,3.1,0
4,2009,1,1,2,0,0,0,0,4,0.0,4.0,1010,76.45,131.12,0.475,350.0,3.0,0


In [3]:
test = pd.read_csv("../input/mh-wsmlc/test.csv")
print(f"test: {test.shape}")
test.head()

test: (17520, 18)


Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,2019,1,1,0,0,7,18.4,18.8,1008,97.7,106.23,3.5,190,2.3,0,,,
1,2019,1,1,0,30,3,18.4,18.6,1008,98.92,112.36,3.5,187,2.5,0,,,
2,2019,1,1,1,0,3,18.2,18.5,1008,98.35,118.58,3.5,184,2.8,0,,,
3,2019,1,1,1,30,3,18.2,18.3,1008,99.58,124.86,3.5,185,3.0,0,,,
4,2019,1,1,2,0,0,18.0,18.0,1008,99.71,131.2,3.6,186,3.1,0,,,


## Feature Engineering

In [4]:
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data

In [5]:
train['Date'] = train.apply(lambda row: pd.to_datetime(str(row['Year']).split('.')[0] + \
                                                       str(row['Month']).split('.')[0].zfill(2) + \
                                                       str(row['Day']).split('.')[0].zfill(2) + ' ' + \
                                                       str(row['Hour']).split('.')[0].zfill(2) + ':' + \
                                                       str(row['Minute']).split('.')[0].zfill(2), format='%Y%m%d %H:%M'), axis=1)

train['Quarter'] = train['Date'].apply(lambda x: pd.to_datetime(x).quarter)
train['Week'] = train['Date'].apply(lambda x: pd.to_datetime(x).week)
train['DayofWeek'] = train['Date'].apply(lambda x: pd.to_datetime(x).dayofweek)
train['isWeekend'] = np.where(train['DayofWeek'].isin([5,6]),1,0)
train['season'] = train['Month'].apply(lambda x: 0 if x in [2,3] else 1 if x in [4,5,6] else 2 if x in [7,8] else 3 if x in [9,10,11] else 4)

train['Time Elapsed'] = train['Date'].apply(lambda x: (pd.to_datetime(x, format='%Y-%m-%d %H:%M') - \
                                                       pd.to_datetime(pd.to_datetime(x).strftime("%Y-%m-01 00:00"), 
                                                                      format='%Y-%m-%d %H:%M')).seconds / 60.0)

train.drop(['Date'], axis=1, inplace=True)
train.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,...,Precipitable Water,Wind Direction,Wind Speed,Fill Flag,Quarter,Week,DayofWeek,isWeekend,season,Time Elapsed
0,2009,1,1,0,0,0,0,0,0,0.0,...,0.499,346.1,3.1,0,1,1,3,0,4,0.0
1,2009,1,1,0,30,0,0,0,0,1.0,...,0.49,346.1,3.1,0,1,1,3,0,4,30.0
2,2009,1,1,1,0,0,0,0,4,0.0,...,0.482,347.9,3.2,0,1,1,3,0,4,60.0
3,2009,1,1,1,30,0,0,0,4,0.0,...,0.478,347.9,3.1,0,1,1,3,0,4,90.0
4,2009,1,1,2,0,0,0,0,4,0.0,...,0.475,350.0,3.0,0,1,1,3,0,4,120.0


In [6]:
test['Date'] = test.apply(lambda row: pd.to_datetime(str(row['Year']).split('.')[0] + \
                                                     str(row['Month']).split('.')[0].zfill(2) + \
                                                     str(row['Day']).split('.')[0].zfill(2) + ' ' + \
                                                     str(row['Hour']).split('.')[0].zfill(2) + ':' + \
                                                     str(row['Minute']).split('.')[0].zfill(2), format='%Y%m%d %H:%M'), axis=1)

test['Quarter'] = test['Date'].apply(lambda x: pd.to_datetime(x).quarter)
test['Week'] = test['Date'].apply(lambda x: pd.to_datetime(x).week)
test['DayofWeek'] = test['Date'].apply(lambda x: pd.to_datetime(x).dayofweek)
test['isWeekend'] = np.where(test['DayofWeek'].isin([5,6]),1,0)
test['season'] = test['Month'].apply(lambda x: 0 if x in [2,3] else 1 if x in [4,5,6] else 2 if x in [7,8] else 3 if x in [9,10,11] else 4)

test['Time Elapsed'] = test['Date'].apply(lambda x: (pd.to_datetime(x, format='%Y-%m-%d %H:%M') - \
                                                     pd.to_datetime(pd.to_datetime(x).strftime("%Y-%m-01 00:00"), 
                                                                    format='%Y-%m-%d %H:%M')).seconds / 60.0)

test.drop(['Date'], axis=1, inplace=True)
test.head()

Unnamed: 0,Year,Month,Day,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,...,Fill Flag,Clearsky DHI,Clearsky DNI,Clearsky GHI,Quarter,Week,DayofWeek,isWeekend,season,Time Elapsed
0,2019,1,1,0,0,7,18.4,18.8,1008,97.7,...,0,,,,1,1,1,0,4,0.0
1,2019,1,1,0,30,3,18.4,18.6,1008,98.92,...,0,,,,1,1,1,0,4,30.0
2,2019,1,1,1,0,3,18.2,18.5,1008,98.35,...,0,,,,1,1,1,0,4,60.0
3,2019,1,1,1,30,3,18.2,18.3,1008,99.58,...,0,,,,1,1,1,0,4,90.0
4,2019,1,1,2,0,0,18.0,18.0,1008,99.71,...,0,,,,1,1,1,0,4,120.0


In [7]:
train['Dew Point / Temperature'] = train.apply(lambda row: 0 if row['Temperature']==0 else row['Dew Point']/row['Temperature'], axis=1)
train['Dew Point / Pressure'] = train['Dew Point']/train['Pressure']
train['Precipitable Water / Dew Point'] = train.apply(lambda row: 0 if row['Dew Point']==0 else row['Precipitable Water']/row['Dew Point'], axis=1)
train['Temperature / Pressure'] = train['Temperature']/train['Pressure']
train['Humidity * Water'] = train['Relative Humidity'] * train['Precipitable Water']
train['Humidity * Wind'] = train['Relative Humidity'] * train['Wind Speed']
train['Humidity * Temp Diff'] = train['Relative Humidity'] * (train['Temperature'] - train['Dew Point'])

train = encode(train, 'Month', train['Month'].max())
train = encode(train, 'Quarter', train['Quarter'].max())
train = encode(train, 'season', train['season'].max())
train = encode(train, 'Week', train['Week'].max())
train = encode(train, 'DayofWeek', train['DayofWeek'].max())
train = encode(train, 'Day', train['Day'].max())
train = encode(train, 'Hour', train['Hour'].max())
train = encode(train, 'Solar Zenith Angle', 90)
train = encode(train, 'Wind Direction', 180)
train = encode(train, 'Time Elapsed', train['Time Elapsed'].max())

train.drop(['Month','Quarter','season','Day', 'Time Elapsed',
            'Week','DayofWeek','Wind Direction'], axis=1, inplace=True)
train.head()

Unnamed: 0,Year,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,...,Day_sin,Day_cos,Hour_sin,Hour_cos,Solar Zenith Angle_sin,Solar Zenith Angle_cos,Wind Direction_sin,Wind Direction_cos,Time Elapsed_sin,Time Elapsed_cos
0,2009,0,0,0,0,0,0,0.0,5.0,1010,...,0.201299,0.97953,0.0,1.0,0.903335,0.428935,-0.466387,0.884581,0.0,1.0
1,2009,0,30,0,0,0,0,1.0,5.0,1010,...,0.201299,0.97953,0.0,1.0,0.999882,0.015358,-0.466387,0.884581,0.133287,0.991077
2,2009,1,0,0,0,0,4,0.0,5.0,1010,...,0.201299,0.97953,0.269797,0.962917,0.913545,-0.406737,-0.409923,0.91212,0.264195,0.964469
3,2009,1,30,0,0,0,4,0.0,4.0,1010,...,0.201299,0.97953,0.269797,0.962917,0.654477,-0.756082,-0.409923,0.91212,0.390389,0.92065
4,2009,2,0,0,0,0,4,0.0,4.0,1010,...,0.201299,0.97953,0.519584,0.854419,0.267575,-0.963537,-0.34202,0.939693,0.509617,0.860402


In [8]:
test['Dew Point / Temperature'] = test.apply(lambda row: 0 if row['Temperature']==0 else row['Dew Point']/row['Temperature'], axis=1)
test['Dew Point / Pressure'] = test['Dew Point']/test['Pressure']
test['Precipitable Water / Dew Point'] = test.apply(lambda row: 0 if row['Dew Point']==0 else row['Precipitable Water']/row['Dew Point'], axis=1)
test['Temperature / Pressure'] = test['Temperature']/test['Pressure']
test['Humidity * Water'] = test['Relative Humidity'] * test['Precipitable Water']
test['Humidity * Wind'] = test['Relative Humidity'] * test['Wind Speed']
test['Humidity * Temp Diff'] = test['Relative Humidity'] * (test['Temperature'] - test['Dew Point'])

test = encode(test, 'Month', test['Month'].max())
test = encode(test, 'Quarter', test['Quarter'].max())
test = encode(test, 'season', test['season'].max())
test = encode(test, 'Week', test['Week'].max())
test = encode(test, 'DayofWeek', test['DayofWeek'].max())
test = encode(test, 'Day', test['Day'].max())
test = encode(test, 'Hour', test['Hour'].max())
test = encode(test, 'Solar Zenith Angle', 90)
test = encode(test, 'Wind Direction', 180)
test = encode(test, 'Time Elapsed', test['Time Elapsed'].max())

test.drop(['Month','Quarter','season','Day', 'Time Elapsed',
           'Week','DayofWeek','Wind Direction'], axis=1, inplace=True)
test.head()

Unnamed: 0,Year,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,...,Day_sin,Day_cos,Hour_sin,Hour_cos,Solar Zenith Angle_sin,Solar Zenith Angle_cos,Wind Direction_sin,Wind Direction_cos,Time Elapsed_sin,Time Elapsed_cos
0,2019,0,0,7,18.4,18.8,1008,97.7,106.23,3.5,...,0.201299,0.97953,0.0,1.0,0.905717,0.423883,0.34202,0.939693,0.0,1.0
1,2019,0,30,3,18.4,18.6,1008,98.92,112.36,3.5,...,0.201299,0.97953,0.0,1.0,0.999952,0.009774,0.241922,0.970296,0.133287,0.991077
2,2019,1,0,3,18.2,18.5,1008,98.35,118.58,3.5,...,0.201299,0.97953,0.269797,0.962917,0.91126,-0.411832,0.139173,0.990268,0.264195,0.964469
3,2019,1,30,3,18.2,18.3,1008,99.58,124.86,3.5,...,0.201299,0.97953,0.269797,0.962917,0.650244,-0.759725,0.173648,0.984808,0.390389,0.92065
4,2019,2,0,0,18.0,18.0,1008,99.71,131.2,3.6,...,0.201299,0.97953,0.519584,0.854419,0.262189,-0.965016,0.207912,0.978148,0.509617,0.860402


In [9]:
def shift_features(df, features, shifts):
    for feature in features:
        for shift in shifts:
            df[feature+"-"+str(shift)+"abs_shift"] = df[feature] - df[feature].shift(periods=shift, fill_value=0)
            df[feature+"+"+str(shift)+"abs_shift"] = df[feature] - df[feature].shift(periods=-shift, fill_value=0)

    df.replace(to_replace=np.inf, value=0, inplace=True)
    return df

In [10]:
features = ['Dew Point', 'Temperature', 'Pressure', 'Relative Humidity', 'Precipitable Water', 
            'Wind Speed', 'Solar Zenith Angle_sin', 'Solar Zenith Angle_cos', 'Wind Direction_sin',
            'Wind Direction_cos', 'Dew Point / Temperature', 'Dew Point / Pressure', 
            'Temperature / Pressure', 'Precipitable Water / Dew Point', 'Humidity * Water', 
            'Humidity * Wind', 'Humidity * Temp Diff']

shifts = [1, 2, 3, 6, 9, 12, 18, 24, 36, 48, 72]

train = shift_features(train, features, shifts)
train.head()

  after removing the cwd from sys.path.
  """


Unnamed: 0,Year,Hour,Minute,Clearsky DHI,Clearsky DNI,Clearsky GHI,Cloud Type,Dew Point,Temperature,Pressure,...,Humidity * Temp Diff-18abs_shift,Humidity * Temp Diff+18abs_shift,Humidity * Temp Diff-24abs_shift,Humidity * Temp Diff+24abs_shift,Humidity * Temp Diff-36abs_shift,Humidity * Temp Diff+36abs_shift,Humidity * Temp Diff-48abs_shift,Humidity * Temp Diff+48abs_shift,Humidity * Temp Diff-72abs_shift,Humidity * Temp Diff+72abs_shift
0,2009,0,0,0,0,0,0,0.0,5.0,1010,...,376.7,288.38,376.7,206.78,376.7,-102.79,376.7,55.38,376.7,276.7
1,2009,0,30,0,0,0,0,1.0,5.0,1010,...,323.24,234.91,323.24,86.21,323.24,-156.03,323.24,64.73,323.24,223.24
2,2009,1,0,0,0,0,4,0.0,5.0,1010,...,391.35,220.65,391.35,156.21,391.35,-73.35,391.35,215.23,391.35,391.35
3,2009,1,30,0,0,0,4,0.0,4.0,1010,...,313.08,142.32,313.08,-0.48,313.08,-151.62,313.08,137.02,313.08,218.81
4,2009,2,0,0,0,0,4,0.0,4.0,1010,...,305.8,217.4,305.8,-96.62,305.8,-146.9,305.8,126.24,305.8,210.31


In [11]:
test = shift_features(test, features, shifts)
test.head()

  after removing the cwd from sys.path.
  """


Unnamed: 0,Year,Hour,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle,Precipitable Water,...,Humidity * Temp Diff-18abs_shift,Humidity * Temp Diff+18abs_shift,Humidity * Temp Diff-24abs_shift,Humidity * Temp Diff+24abs_shift,Humidity * Temp Diff-36abs_shift,Humidity * Temp Diff+36abs_shift,Humidity * Temp Diff-48abs_shift,Humidity * Temp Diff+48abs_shift,Humidity * Temp Diff-72abs_shift,Humidity * Temp Diff+72abs_shift
0,2019,0,0,7,18.4,18.8,1008,97.7,106.23,3.5,...,39.08,29.127,39.08,29.118,39.08,-183.558,39.08,39.08,39.08,19.304
1,2019,0,30,3,18.4,18.6,1008,98.92,112.36,3.5,...,19.784,-9.7,19.784,-38.146,19.784,-202.854,19.784,19.784,19.784,-47.262
2,2019,1,0,3,18.2,18.5,1008,98.35,118.58,3.5,...,29.505,9.695,29.505,9.741,29.505,-192.119,29.505,29.505,29.505,-9.559
3,2019,1,30,3,18.2,18.3,1008,99.58,124.86,3.5,...,9.958,0.043,9.958,-84.072,9.958,-197.066,9.958,9.958,9.958,-92.826
4,2019,2,0,0,18.0,18.0,1008,99.71,131.2,3.6,...,0.0,0.0,0.0,-48.415,0.0,-184.758,0.0,0.0,0.0,-66.927


In [12]:
features = test.columns.tolist()
features.remove('Clearsky DHI')
features.remove('Clearsky DNI')
features.remove('Clearsky GHI')
features.remove('Hour')
features.remove('Solar Zenith Angle')
len(features)

412

In [13]:
cat_cols = ['Year','Minute','isWeekend','Pressure','Fill Flag','Cloud Type']

train[cat_cols] = train[cat_cols].astype(int)
test[cat_cols] = test[cat_cols].astype(int)
cat_cols_indices = [train[features].columns.get_loc(col) for col in cat_cols]
print(cat_cols_indices)

[0, 1, 10, 5, 9, 2]


In [14]:
num_cols = [col for col in features if col not in cat_cols]

scaler = RobustScaler().fit(train[num_cols])
train[num_cols] = scaler.transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

In [15]:
train_org = train.copy()
train = train[train['Solar Zenith Angle']<93].reset_index(drop=True).copy()
train = train[~train['Hour'].isin([1,2,3,4,5,6,7,8,9])].reset_index(drop=True).copy()
print(f"train_org: {train_org.shape} \ntrain: {train.shape}")

train_org: (175296, 417) 
train: (91563, 417)


In [16]:
test_org = test.copy()
test = test[test['Solar Zenith Angle']<93].reset_index(drop=True).copy()
test = test[~test['Hour'].isin([1,2,3,4,5,6,7,8,9])].reset_index(drop=True).copy()
print(f"test_org: {test_org.shape} \ntest: {test.shape}")

test_org: (17520, 417) 
test: (9150, 417)


## XGBoost

In [17]:
dhi_model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    booster='gbtree',
    sample_type='weighted',
    tree_method='hist',
    grow_policy='lossguide',
    num_round=5000,
    max_depth=12, 
    max_leaves=689,
    learning_rate=0.0988,
    subsample=0.8762,
    colsample_bytree=0.8943,
    colsample_bylevel=0.427,
    min_child_weight=3,
    reg_lambda=0.008,
    verbosity=0,
    random_state=42
)

dni_model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    booster='gbtree',
    sample_type='weighted',
    tree_method='hist',
    grow_policy='lossguide',
    num_round=5000,
    max_depth=12, 
    max_leaves=391,
    learning_rate=0.0984,
    subsample=0.9774,
    colsample_bytree=0.6261,
    colsample_bylevel=0.8697,
    min_child_weight=15,
    reg_lambda=0.0003,
    verbosity=0,
    random_state=42
)

ghi_model = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    booster='gbtree',
    sample_type='weighted',
    tree_method='hist',
    grow_policy='lossguide',
    num_round=5000,
    max_depth=12, 
    max_leaves=367,
    learning_rate=0.0974,
    subsample=0.9711,
    colsample_bytree=0.6751,
    colsample_bylevel=0.99735,
    min_child_weight=11,
    reg_lambda=0.0003,
    verbosity=0,
    random_state=42
)

model_dict = {
    'Clearsky DHI': dhi_model,
    'Clearsky DNI': dni_model,
    'Clearsky GHI': ghi_model
}

In [18]:
for target in ['Clearsky DHI','Clearsky DNI','Clearsky GHI']:
    print(f"Training for target: {target}\n")
    
    splits = 10
    skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    oof_preds = np.zeros((train.shape[0],))
    model_preds = 0
    
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train[features], train['Year'])):
        X_train, X_valid = train.loc[train_idx][features], train.loc[valid_idx][features]
        y_train, y_valid = train.loc[train_idx][target], train.loc[valid_idx][target]
        
        model = model_dict.get(target)
        
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], 
                  early_stopping_rounds=100, verbose=50)
        
        model_preds += model.predict(test[features], iteration_range=(0, model.best_iteration)) / splits
        oof_preds[valid_idx] = model.predict(X_valid, iteration_range=(0, model.best_iteration))
        
        print(f"\nFold-{fold+1} | MSE: {mean_squared_error(y_valid, oof_preds[valid_idx])}\n")
    
    train[target+'_xgb'] = oof_preds
    test[target+'_xgb'] = model_preds
    
    model_mse = mean_squared_error(train[target], oof_preds)
    print(f"All Folds | MSE: {model_mse}\n")

Training for target: Clearsky DHI

[0]	validation_0-rmse:103.77632	validation_1-rmse:104.04672
[50]	validation_0-rmse:6.75139	validation_1-rmse:11.98702
[99]	validation_0-rmse:3.41574	validation_1-rmse:9.74407

Fold-1 | MSE: 95.56428426808876

[0]	validation_0-rmse:103.79492	validation_1-rmse:103.86987
[50]	validation_0-rmse:6.70651	validation_1-rmse:11.99223
[99]	validation_0-rmse:3.45726	validation_1-rmse:9.83024

Fold-2 | MSE: 96.91510282055285

[0]	validation_0-rmse:103.75532	validation_1-rmse:104.26159
[50]	validation_0-rmse:6.79169	validation_1-rmse:12.35116
[99]	validation_0-rmse:3.52429	validation_1-rmse:10.03363

Fold-3 | MSE: 101.39665080424163

[0]	validation_0-rmse:103.80440	validation_1-rmse:103.75857
[50]	validation_0-rmse:6.75351	validation_1-rmse:11.67451
[99]	validation_0-rmse:3.50521	validation_1-rmse:9.57343

Fold-4 | MSE: 91.81471030558156

[0]	validation_0-rmse:103.81124	validation_1-rmse:103.77512
[50]	validation_0-rmse:6.85081	validation_1-rmse:12.46129
[99]	vali

In [19]:
Xtrain = pd.merge(
    train_org, 
    train,
    how='left',
    on=features,
    sort=False
)

print(f"Xtrain: {Xtrain.shape}")
Xtrain.head()

Xtrain: (175296, 425)


Unnamed: 0,Year,Hour_x,Minute,Clearsky DHI_x,Clearsky DNI_x,Clearsky GHI_x,Cloud Type,Dew Point,Temperature,Pressure,...,Humidity * Temp Diff-72abs_shift,Humidity * Temp Diff+72abs_shift,Hour_y,Clearsky DHI_y,Clearsky DNI_y,Clearsky GHI_y,Solar Zenith Angle_y,Clearsky DHI_xgb,Clearsky DNI_xgb,Clearsky GHI_xgb
0,2009,0,0,0,0,0,0,-1.153846,-1.153846,1010,...,0.666735,0.48986,,,,,,,,
1,2009,0,30,0,0,0,0,-1.076923,-1.153846,1010,...,0.572114,0.395217,,,,,,,,
2,2009,1,0,0,0,0,4,-1.153846,-1.153846,1010,...,0.692664,0.692833,,,,,,,,
3,2009,1,30,0,0,0,4,-1.153846,-1.230769,1010,...,0.554131,0.387374,,,,,,,,
4,2009,2,0,0,0,0,4,-1.153846,-1.230769,1010,...,0.541246,0.372326,,,,,,,,


In [20]:
Xtest = pd.merge(
    test_org, 
    test,
    how='left',
    on=features,
    sort=False
)

print(f"Xtest: {Xtest.shape}")
Xtest.head()

Xtest: (17520, 425)


Unnamed: 0,Year,Hour_x,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle_x,Precipitable Water,...,Humidity * Temp Diff-72abs_shift,Humidity * Temp Diff+72abs_shift,Hour_y,Solar Zenith Angle_y,Clearsky DHI_y,Clearsky DNI_y,Clearsky GHI_y,Clearsky DHI_xgb,Clearsky DNI_xgb,Clearsky GHI_xgb
0,2019,0,0,7,0.261538,-0.092308,1008,0.416255,106.23,0.235775,...,0.069169,0.034175,,,,,,,,
1,2019,0,30,3,0.261538,-0.107692,1008,0.453956,112.36,0.235775,...,0.035016,-0.083671,,,,,,,,
2,2019,1,0,3,0.246154,-0.115385,1008,0.436341,118.58,0.235775,...,0.052222,-0.016923,,,,,,,,
3,2019,1,30,3,0.246154,-0.130769,1008,0.474351,124.86,0.235775,...,0.017625,-0.164336,,,,,,,,
4,2019,2,0,0,0.230769,-0.153846,1008,0.478368,131.2,0.272964,...,0.0,-0.118485,,,,,,,,


In [21]:
Xtrain['Clearsky DHI_xgb'] = Xtrain.apply(lambda row: 0 if row['Solar Zenith Angle_x']>=93 or row['Hour_x'] in [1,2,3,4,5,6,7,8,9] else row['Clearsky DHI_xgb'], axis=1)
Xtrain['Clearsky DNI_xgb'] = Xtrain.apply(lambda row: 0 if row['Solar Zenith Angle_x']>=93 or row['Hour_x'] in [1,2,3,4,5,6,7,8,9] else row['Clearsky DNI_xgb'], axis=1)
Xtrain['Clearsky GHI_xgb'] = Xtrain.apply(lambda row: 0 if row['Solar Zenith Angle_x']>=93 or row['Hour_x'] in [1,2,3,4,5,6,7,8,9] else row['Clearsky GHI_xgb'], axis=1)

Xtrain['Clearsky DNI_xgb'] = Xtrain.apply(lambda row: 0 if row['Clearsky DHI_xgb']==0 else row['Clearsky DNI_xgb'], axis=1)
Xtrain['Clearsky GHI_xgb'] = Xtrain.apply(lambda row: 0 if row['Clearsky DHI_xgb']==0 else row['Clearsky GHI_xgb'], axis=1)

Xtrain.head()

Unnamed: 0,Year,Hour_x,Minute,Clearsky DHI_x,Clearsky DNI_x,Clearsky GHI_x,Cloud Type,Dew Point,Temperature,Pressure,...,Humidity * Temp Diff-72abs_shift,Humidity * Temp Diff+72abs_shift,Hour_y,Clearsky DHI_y,Clearsky DNI_y,Clearsky GHI_y,Solar Zenith Angle_y,Clearsky DHI_xgb,Clearsky DNI_xgb,Clearsky GHI_xgb
0,2009,0,0,0,0,0,0,-1.153846,-1.153846,1010,...,0.666735,0.48986,,,,,,0.0,0.0,0.0
1,2009,0,30,0,0,0,0,-1.076923,-1.153846,1010,...,0.572114,0.395217,,,,,,0.0,0.0,0.0
2,2009,1,0,0,0,0,4,-1.153846,-1.153846,1010,...,0.692664,0.692833,,,,,,0.0,0.0,0.0
3,2009,1,30,0,0,0,4,-1.153846,-1.230769,1010,...,0.554131,0.387374,,,,,,0.0,0.0,0.0
4,2009,2,0,0,0,0,4,-1.153846,-1.230769,1010,...,0.541246,0.372326,,,,,,0.0,0.0,0.0


In [22]:
print(f"""
Clearsky DHI: {mean_squared_error(Xtrain['Clearsky DHI_x'], Xtrain['Clearsky DHI_xgb'])}
Clearsky DNI: {mean_squared_error(Xtrain['Clearsky DNI_x'], Xtrain['Clearsky DNI_xgb'])}
Clearsky GHI: {mean_squared_error(Xtrain['Clearsky GHI_x'], Xtrain['Clearsky GHI_xgb'])}
"""
)


Clearsky DHI: 50.96046349953487
Clearsky DNI: 490.4230675917477
Clearsky GHI: 17.849072348869118



In [23]:
Xtest['Clearsky DHI_xgb'] = Xtest.apply(lambda row: 0 if row['Solar Zenith Angle_x']>=93 or row['Hour_x'] in [1,2,3,4,5,6,7,8,9] else row['Clearsky DHI_xgb'], axis=1)
Xtest['Clearsky DNI_xgb'] = Xtest.apply(lambda row: 0 if row['Solar Zenith Angle_x']>=93 or row['Hour_x'] in [1,2,3,4,5,6,7,8,9] else row['Clearsky DNI_xgb'], axis=1)
Xtest['Clearsky GHI_xgb'] = Xtest.apply(lambda row: 0 if row['Solar Zenith Angle_x']>=93 or row['Hour_x'] in [1,2,3,4,5,6,7,8,9] else row['Clearsky GHI_xgb'], axis=1)

Xtest['Clearsky DNI_xgb'] = Xtest.apply(lambda row: 0 if row['Clearsky DHI_xgb']==0 else row['Clearsky DNI_xgb'], axis=1)
Xtest['Clearsky GHI_xgb'] = Xtest.apply(lambda row: 0 if row['Clearsky DHI_xgb']==0 else row['Clearsky GHI_xgb'], axis=1)

Xtest.head()

Unnamed: 0,Year,Hour_x,Minute,Cloud Type,Dew Point,Temperature,Pressure,Relative Humidity,Solar Zenith Angle_x,Precipitable Water,...,Humidity * Temp Diff-72abs_shift,Humidity * Temp Diff+72abs_shift,Hour_y,Solar Zenith Angle_y,Clearsky DHI_y,Clearsky DNI_y,Clearsky GHI_y,Clearsky DHI_xgb,Clearsky DNI_xgb,Clearsky GHI_xgb
0,2019,0,0,7,0.261538,-0.092308,1008,0.416255,106.23,0.235775,...,0.069169,0.034175,,,,,,0.0,0.0,0.0
1,2019,0,30,3,0.261538,-0.107692,1008,0.453956,112.36,0.235775,...,0.035016,-0.083671,,,,,,0.0,0.0,0.0
2,2019,1,0,3,0.246154,-0.115385,1008,0.436341,118.58,0.235775,...,0.052222,-0.016923,,,,,,0.0,0.0,0.0
3,2019,1,30,3,0.246154,-0.130769,1008,0.474351,124.86,0.235775,...,0.017625,-0.164336,,,,,,0.0,0.0,0.0
4,2019,2,0,0,0.230769,-0.153846,1008,0.478368,131.2,0.272964,...,0.0,-0.118485,,,,,,0.0,0.0,0.0


In [24]:
Xtrain.to_pickle('./train_xgb.pkl', compression='gzip')
Xtest.to_pickle('./test_xgb.pkl', compression='gzip')

## Create submission file

In [25]:
submission = pd.read_csv("../input/mh-wsmlc/sample_submission.csv")
submission['Clearsky DHI'] = Xtest['Clearsky DHI_xgb']
submission['Clearsky DNI'] = Xtest['Clearsky DNI_xgb']
submission['Clearsky GHI'] = Xtest['Clearsky GHI_xgb']
submission.to_csv("./xgb_submission.csv", index=False)
submission.head()

Unnamed: 0,Clearsky DHI,Clearsky DNI,Clearsky GHI
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
