In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

### First catboost model:


In [4]:
# Load a single set for initial exploration
train_targets = pd.read_parquet('A/train_targets.parquet')
X_train_observed = pd.read_parquet('A/X_train_observed.parquet')

# Check the first few rows to understand the time intervals
print(train_targets.head())
print(X_train_observed.head())

# Check the time intervals in the targets
train_targets['time'] = pd.to_datetime(train_targets['time'])
print(train_targets['time'].diff().describe())

# Check the time intervals in the observed features
X_train_observed['date_forecast'] = pd.to_datetime(X_train_observed['date_forecast'])
print(X_train_observed['date_forecast'].diff().describe())

# If you see that the 'date_forecast' has a more frequent interval, this justifies resampling


                 time  pv_measurement
0 2019-06-02 22:00:00            0.00
1 2019-06-02 23:00:00            0.00
2 2019-06-03 00:00:00            0.00
3 2019-06-03 01:00:00            0.00
4 2019-06-03 02:00:00           19.36
        date_forecast  absolute_humidity_2m:gm3  air_density_2m:kgm3  \
0 2019-06-02 22:00:00                       7.7                1.230   
1 2019-06-02 22:15:00                       7.7                1.229   
2 2019-06-02 22:30:00                       7.7                1.228   
3 2019-06-02 22:45:00                       7.7                1.226   
4 2019-06-02 23:00:00                       7.7                1.225   

   ceiling_height_agl:m  clear_sky_energy_1h:J  clear_sky_rad:W  \
0           1744.900024                    0.0              0.0   
1           1734.000000                    0.0              0.0   
2           1723.500000                    0.0              0.0   
3           1713.400024                    0.0              0.0   
4   

In [5]:
def preprocess_data(targets, observed, estimated, test):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    - test: Dataframe with test features.
    
    Returns:
    - Preprocessed dataframes ready for training and testing.
    """
    # Resample observed, estimated, and test data to 1 hour using mean() as aggregator
    # and drop rows where all columns are NaN
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])

    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Drop non-feature columns
    merged_data = merged_data.drop(columns=['time', 'date_forecast', 'pv_measurement'])
    
    return merged_data, test_resampled

In [6]:
locations = ['A', 'B', 'C']
all_predictions = []

for loc in locations:
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

   # Preprocess data
    X_train, X_test = preprocess_data(train, X_train_observed, X_train_estimated, X_test_estimated)
    y = train['pv_measurement'].values

    # Ensure X and y have the same length
    min_length = min(len(X_train), len(y))
    X_train, y_train = X_train.iloc[:min_length], y[:min_length]

    # Initialize and Train model
    model = CatBoostRegressor(loss_function='MAE', learning_rate=0.1, verbose=200)
    model.fit(X_train, y_train)

    # Make predictions using X_test_estimated data
    predictions = model.predict(X_test)
    
    # Store the predictions in all_predictions list
    all_predictions.append(predictions)

# Concatenate all predictions
final_predictions = np.concatenate(all_predictions)

  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 594.7644197	total: 73.9ms	remaining: 1m 13s
200:	learn: 192.8596165	total: 1.62s	remaining: 6.44s
400:	learn: 180.5368032	total: 2.77s	remaining: 4.13s
600:	learn: 174.3298784	total: 3.92s	remaining: 2.6s
800:	learn: 169.6346673	total: 5.04s	remaining: 1.25s
999:	learn: 164.5558480	total: 6.1s	remaining: 0us


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 92.3626894	total: 8.13ms	remaining: 8.12s
200:	learn: 35.2293994	total: 1.24s	remaining: 4.92s
400:	learn: 33.2446950	total: 2.4s	remaining: 3.59s
600:	learn: 32.1726808	total: 3.52s	remaining: 2.34s
800:	learn: 31.2183083	total: 4.67s	remaining: 1.16s
999:	learn: 30.4627761	total: 5.89s	remaining: 0us


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 61.9661959	total: 9.8ms	remaining: 9.79s
200:	learn: 26.2793889	total: 2.13s	remaining: 8.47s
400:	learn: 24.2466777	total: 4.49s	remaining: 6.7s
600:	learn: 23.1525407	total: 6.58s	remaining: 4.37s
800:	learn: 22.3760541	total: 8.71s	remaining: 2.17s
999:	learn: 21.6791932	total: 10.8s	remaining: 0us


#### Second Catboost model

In [None]:
import pandas as pd

# Assuming 'df' is your DataFrame
df = pd.read_parquet('A/X_train_estimated.parquet')  # Replace with your actual file path

# Count NaN values for each feature
nan_counts = df.isna().sum()

# Display the count of NaNs for each column
print(nan_counts)

In [10]:
def preprocess_data(targets, observed, estimated, test):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    - test: Dataframe with test features.
    
    Returns:
    - Preprocessed dataframes ready for training and testing.
    """

    # Removing consecutive duplicate non-zero values
    targets = targets[(targets['pv_measurement'] == 0) | (targets['pv_measurement'] != targets['pv_measurement'].shift())]
    targets.dropna(inplace=True)

    # Resample observed, estimated, and test data to 1 hour using mean() as aggregator
    # and drop rows where all columns are NaN
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()

    # Binary feature indicating whether the data is from an observed dataset or an estimated dataset.
    observed_resampled['estimated'] = 0
    estimated_resampled['estimated'] = 1
    test_resampled['estimated'] = 1
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])

    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Time-Based Features (training data)
    merged_data['hour'] = merged_data['date_forecast'].dt.hour
    merged_data['sin_hour'] = np.sin(2 * np.pi * merged_data['hour'] / 23)
    merged_data['cos_hour'] = np.cos(2 * np.pi * merged_data['hour'] / 23)
    merged_data['day_of_week'] = merged_data['date_forecast'].dt.dayofweek
    merged_data['sin_day_of_week'] = np.sin(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['cos_day_of_week'] = np.cos(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['month'] = merged_data['date_forecast'].dt.month
    merged_data['sin_month'] = np.sin(2 * np.pi * merged_data['month'] / 12)
    merged_data['cos_month'] = np.cos(2 * np.pi * merged_data['month'] / 12)

    # Time-Based Features (test data)
    test_resampled['hour'] = test_resampled['date_forecast'].dt.hour
    test_resampled['sin_hour'] = np.sin(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['cos_hour'] = np.cos(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['day_of_week'] = test_resampled['date_forecast'].dt.dayofweek
    test_resampled['sin_day_of_week'] = np.sin(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['cos_day_of_week'] = np.cos(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['month'] = test_resampled['date_forecast'].dt.month
    test_resampled['sin_month'] = np.sin(2 * np.pi * test_resampled['month'] / 12)
    test_resampled['cos_month'] = np.cos(2 * np.pi * test_resampled['month'] / 12)
    
    # fixing NaN value
    merged_data['ceiling_height_agl:m'].fillna(0, inplace=True)
    test_resampled['ceiling_height_agl:m'].fillna(0, inplace=True)
    merged_data['cloud_base_agl:m'].fillna(0, inplace=True)
    test_resampled['cloud_base_agl:m'].fillna(0, inplace=True)

    merged_data = merged_data.drop(columns=['time'])
    merged_data.to_csv(f'{loc}_csv/X_train.csv')
    test_resampled.to_csv(f'{loc}_csv/X_test.csv')

    # Drop non-feature columns
    # Drop 'snow_density:kgm3' as well, since it has 15769 values that are NaN
    merged_data = merged_data.drop(columns=['date_forecast', 'pv_measurement', 'snow_density:kgm3'])
    test_resampled = test_resampled.drop(columns=['date_forecast', 'snow_density:kgm3'])
    
    return merged_data, test_resampled, targets

locations = ['A', 'B', 'C']
all_predictions = []
all_predictions_rf = []


for loc in locations:
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

   # Preprocess data
    X_train, X_test, targets = preprocess_data(train, X_train_observed, X_train_estimated, X_test_estimated)
    y = targets['pv_measurement'].values

    # Ensure X and y have the same length
    min_length = min(len(X_train), len(y))
    X_train, y_train = X_train.iloc[:min_length], y[:min_length]
    
    X_train_data, X_eval_data, y_train_data, y_eval_data = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    model = CatBoostRegressor(loss_function='MAE', learning_rate=0.1, verbose=200, cat_features=['estimated'])
    model.fit(X_train, y_train, eval_set=(X_eval_data, y_eval_data))

    # Make predictions using X_test_estimated data
    predictions = model.predict(X_test)

    all_predictions.append(predictions)

    final_predictions = np.concatenate(all_predictions)
     
    # Initialize and Train RandomForest model
    model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
    model_rf.fit(X_train, y_train)

    # Make predictions using X_test data
    predictions_rf = model_rf.predict(X_test)
    
    # Store the RandomForest predictions in all_predictions_rf list
    all_predictions_rf.append(predictions_rf) 
    
    final_predictions_rf = np.concatenate(all_predictions_rf)
    
average_predictions = (np.array(final_predictions) + np.array(final_predictions_rf)) / 2.0
final_predictions = average_predictions
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.dropna(inplace=True)
  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 584.1834784	test: 558.4020079	best: 558.4020079 (0)	total: 23.4ms	remaining: 23.3s
200:	learn: 194.8668291	test: 184.1365848	best: 184.1365848 (200)	total: 1.3s	remaining: 5.16s
400:	learn: 181.0734462	test: 169.7899197	best: 169.7899197 (400)	total: 2.53s	remaining: 3.79s
600:	learn: 175.3667284	test: 164.4141928	best: 164.4141928 (600)	total: 3.69s	remaining: 2.45s
800:	learn: 169.0684819	test: 158.4555370	best: 158.4555370 (800)	total: 4.97s	remaining: 1.24s
999:	learn: 163.0838561	test: 152.9590618	best: 152.9590618 (999)	total: 6.15s	remaining: 0us

bestTest = 152.9590618
bestIteration = 999



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.dropna(inplace=True)
  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 88.7998725	test: 85.1185787	best: 85.1185787 (0)	total: 19.5ms	remaining: 19.5s
200:	learn: 25.9153672	test: 24.9559245	best: 24.9559245 (200)	total: 1.31s	remaining: 5.23s
400:	learn: 24.2225133	test: 23.2076510	best: 23.2076510 (400)	total: 2.46s	remaining: 3.68s
600:	learn: 23.1830841	test: 22.1132326	best: 22.1132326 (600)	total: 3.54s	remaining: 2.35s
800:	learn: 22.3784801	test: 21.3468834	best: 21.3468834 (800)	total: 4.59s	remaining: 1.14s
999:	learn: 21.4917870	test: 20.4864875	best: 20.4864875 (999)	total: 5.79s	remaining: 0us

bestTest = 20.4864875
bestIteration = 999



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.dropna(inplace=True)
  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 60.9356912	test: 61.9914051	best: 61.9914051 (0)	total: 14.6ms	remaining: 14.6s
200:	learn: 24.9020730	test: 24.0972116	best: 24.0972116 (200)	total: 1.34s	remaining: 5.34s
400:	learn: 23.2667294	test: 22.3957163	best: 22.3957163 (400)	total: 2.47s	remaining: 3.69s
600:	learn: 21.9796639	test: 21.0723451	best: 21.0723451 (600)	total: 3.72s	remaining: 2.47s
800:	learn: 21.1194233	test: 20.2007036	best: 20.2007036 (800)	total: 4.84s	remaining: 1.2s
999:	learn: 20.3835140	test: 19.4442294	best: 19.4442294 (999)	total: 6.06s	remaining: 0us

bestTest = 19.44422936
bestIteration = 999



#### Third Catboost model

In [17]:
def preprocess_data(targets, observed, estimated, test):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    - test: Dataframe with test features.
    
    Returns:
    - Preprocessed dataframes ready for training and testing.
    """

    targets = targets[(targets['pv_measurement'] == 0) | (targets['pv_measurement'] != targets['pv_measurement'].shift())]
    targets.dropna(inplace=True)

    # Resample observed, estimated, and test data to 1 hour using mean() as aggregator
    # and drop rows where all columns are NaN
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()

    observed_resampled['estimated'] = 0
    estimated_resampled['estimated'] = 1
    test_resampled['estimated'] = 1
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])

    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Time-Based Features (training data)
    merged_data['hour'] = merged_data['date_forecast'].dt.hour
    merged_data['sin_hour'] = np.sin(2 * np.pi * merged_data['hour'] / 23)
    merged_data['cos_hour'] = np.cos(2 * np.pi * merged_data['hour'] / 23)
    merged_data['day_of_week'] = merged_data['date_forecast'].dt.dayofweek
    merged_data['sin_day_of_week'] = np.sin(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['cos_day_of_week'] = np.cos(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['month'] = merged_data['date_forecast'].dt.month
    merged_data['sin_month'] = np.sin(2 * np.pi * merged_data['month'] / 12)
    merged_data['cos_month'] = np.cos(2 * np.pi * merged_data['month'] / 12)

    # Time-Based Features (test data)
    test_resampled['hour'] = test_resampled['date_forecast'].dt.hour
    test_resampled['sin_hour'] = np.sin(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['cos_hour'] = np.cos(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['day_of_week'] = test_resampled['date_forecast'].dt.dayofweek
    test_resampled['sin_day_of_week'] = np.sin(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['cos_day_of_week'] = np.cos(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['month'] = test_resampled['date_forecast'].dt.month
    test_resampled['sin_month'] = np.sin(2 * np.pi * test_resampled['month'] / 12)
    test_resampled['cos_month'] = np.cos(2 * np.pi * test_resampled['month'] / 12)
    
    # fixing ceiling_height NaN value
    merged_data['ceiling_height_agl:m'].fillna(0, inplace=True)
    test_resampled['ceiling_height_agl:m'].fillna(0, inplace=True)
    merged_data['cloud_base_agl:m'].fillna(0, inplace=True)
    test_resampled['cloud_base_agl:m'].fillna(0, inplace=True)

    merged_data = merged_data.drop(columns=['time'])

    # Drop non-feature columns
    merged_data = merged_data.drop(columns=['date_forecast', 'pv_measurement', 'snow_density:kgm3'])
    test_resampled = test_resampled.drop(columns=['date_forecast', 'snow_density:kgm3'])
    
    return merged_data, test_resampled, targets

locations = ['A', 'B', 'C']
all_predictions = []

for loc in locations:
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

   # Preprocess data
    X_train, X_test, targets = preprocess_data(train, X_train_observed, X_train_estimated, X_test_estimated)
    y = targets['pv_measurement'].values

    # Ensure X and y have the same length
    min_length = min(len(X_train), len(y))
    X_train, y_train = X_train.iloc[:min_length], y[:min_length]

    X_train_data, X_eval_data, y_train_data, y_eval_data = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Create catboost Pool objects
    train_pool = Pool(data=X_train_data, label=y_train_data, cat_features=['estimated'])
    eval_pool = Pool(data=X_eval_data, label=y_eval_data, cat_features=['estimated'])

    model = CatBoostRegressor(depth=8, learning_rate=0.1, verbose=200, loss_function='MAE')
    model.fit(train_pool, use_best_model=True, eval_set=eval_pool)

    # Make predictions using X_test_estimated data
    predictions = model.predict(X_test)
    
    # Store the predictions in all_predictions list
    all_predictions.append(predictions)

# Concatenate all predictions
final_predictions = np.concatenate(all_predictions)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.dropna(inplace=True)
  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()


0:	learn: 599.5227843	test: 565.7835768	best: 565.7835768 (0)	total: 22.5ms	remaining: 22.5s
200:	learn: 181.5629328	test: 179.8626817	best: 179.8626817 (200)	total: 2.23s	remaining: 8.88s
400:	learn: 156.9241865	test: 173.1655462	best: 173.0402510 (394)	total: 4.11s	remaining: 6.13s
600:	learn: 146.1850370	test: 170.1511460	best: 170.1511460 (600)	total: 6.03s	remaining: 4s
800:	learn: 136.2069123	test: 168.3420248	best: 168.2630777 (795)	total: 8.04s	remaining: 2s
999:	learn: 128.6809042	test: 166.9384161	best: 166.9384161 (999)	total: 10.1s	remaining: 0us

bestTest = 166.9384161
bestIteration = 999



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.dropna(inplace=True)
  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()


0:	learn: 87.0722141	test: 82.8041794	best: 82.8041794 (0)	total: 23ms	remaining: 23s
200:	learn: 21.5575889	test: 24.2817003	best: 24.2817003 (200)	total: 2.25s	remaining: 8.94s
400:	learn: 19.0673337	test: 23.5327542	best: 23.5327391 (399)	total: 4.24s	remaining: 6.33s
600:	learn: 17.5752954	test: 23.1788615	best: 23.1788615 (600)	total: 6.24s	remaining: 4.14s
800:	learn: 16.5446620	test: 22.9271459	best: 22.9271459 (800)	total: 8.29s	remaining: 2.06s
999:	learn: 15.3764937	test: 22.7602760	best: 22.7408096 (979)	total: 10.4s	remaining: 0us

bestTest = 22.74080964
bestIteration = 979

Shrink model to first 980 iterations.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  targets.dropna(inplace=True)
  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()


0:	learn: 59.3450919	test: 60.4605535	best: 60.4605535 (0)	total: 21.3ms	remaining: 21.3s
200:	learn: 21.2826803	test: 23.6359121	best: 23.6359121 (200)	total: 2.31s	remaining: 9.16s
400:	learn: 18.7938631	test: 22.6785067	best: 22.6772440 (398)	total: 4.43s	remaining: 6.61s
600:	learn: 17.4772096	test: 22.1808739	best: 22.1728443 (593)	total: 6.6s	remaining: 4.38s
800:	learn: 16.2210065	test: 21.8213346	best: 21.8209141 (798)	total: 8.97s	remaining: 2.23s
999:	learn: 15.2753344	test: 21.4827465	best: 21.4827465 (999)	total: 11.1s	remaining: 0us

bestTest = 21.48274654
bestIteration = 999



#### Fourth Catboost model

In [11]:
def preprocess_data(targets, observed, estimated, test):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    - test: Dataframe with test features.
    
    Returns:
    - Preprocessed dataframes ready for training and testing.
    """
    
    targets = targets[(targets['pv_measurement'] == 0) | (targets['pv_measurement'] != targets['pv_measurement'].shift())]
    targets.dropna(inplace=True)

    # Resample observed, estimated, and test data to 1 hour using mean() as aggregator
    # and drop rows where all columns are NaN
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()

    # Binary feature indicating whether the data is from an observed dataset or an estimated dataset.
    observed_resampled['estimated'] = 0
    estimated_resampled['estimated'] = 1
    test_resampled['estimated'] = 1
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])

    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Time-Based Features (training data)
    merged_data['hour'] = merged_data['date_forecast'].dt.hour
    merged_data['sin_hour'] = np.sin(2 * np.pi * merged_data['hour'] / 23)
    merged_data['cos_hour'] = np.cos(2 * np.pi * merged_data['hour'] / 23)
    merged_data['day_of_week'] = merged_data['date_forecast'].dt.dayofweek
    merged_data['sin_day_of_week'] = np.sin(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['cos_day_of_week'] = np.cos(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['month'] = merged_data['date_forecast'].dt.month
    merged_data['sin_month'] = np.sin(2 * np.pi * merged_data['month'] / 12)
    merged_data['cos_month'] = np.cos(2 * np.pi * merged_data['month'] / 12)

    # Time-Based Features (test data)
    test_resampled['hour'] = test_resampled['date_forecast'].dt.hour
    test_resampled['sin_hour'] = np.sin(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['cos_hour'] = np.cos(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['day_of_week'] = test_resampled['date_forecast'].dt.dayofweek
    test_resampled['sin_day_of_week'] = np.sin(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['cos_day_of_week'] = np.cos(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['month'] = test_resampled['date_forecast'].dt.month
    test_resampled['sin_month'] = np.sin(2 * np.pi * test_resampled['month'] / 12)
    test_resampled['cos_month'] = np.cos(2 * np.pi * test_resampled['month'] / 12)
    
    # fixing ceiling_height NaN value
    merged_data['ceiling_height_agl:m'].fillna(0, inplace=True)
    test_resampled['ceiling_height_agl:m'].fillna(0, inplace=True)
    merged_data['cloud_base_agl:m'].fillna(0, inplace=True)
    test_resampled['cloud_base_agl:m'].fillna(0, inplace=True)

    merged_data = merged_data.drop(columns=['time'])
    merged_data.to_csv(f'{loc}_csv/X_train.csv')
    test_resampled.to_csv(f'{loc}_csv/X_test.csv')

    # Drop non-feature columns
    merged_data = merged_data.drop(columns=['date_forecast', 'pv_measurement', 'snow_density:kgm3'])
    test_resampled = test_resampled.drop(columns=['date_forecast', 'snow_density:kgm3'])
    
    return merged_data, test_resampled, targets

locations = ['A', 'B', 'C']
all_predictions = []
test_all = pd.DataFrame()

for loc in locations:
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

   # Preprocess data
    X_train, X_test, targets = preprocess_data(train, X_train_observed, X_train_estimated, X_test_estimated)
    y = targets['pv_measurement'].values
    test_all = pd.concat([test_all, X_test])

    # Ensure X and y have the same length
    min_length = min(len(X_train), len(y))
    X_train, y_train = X_train.iloc[:min_length], y[:min_length]
    
    X_train_data, X_eval_data, y_train_data, y_eval_data = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Define a parameter grid to search over
    param_grid = {
        'depth': [6, 8, 10],  # Example: Try depths of 6, 8, and 10
        'learning_rate': [0.03, 0.1],  # Example: learning rates to try
        # Add other parameters here
    }

    # Create a CatBoostRegressor
    model = CatBoostRegressor(loss_function='MAE', learning_rate=0.1, verbose=200, cat_features=['estimated'])

    # Instantiate GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error')

    # Fit the GridSearchCV object with your training data Pool
    grid_search.fit(X_train, y_train, eval_set=(X_eval_data, y_eval_data))  # You may need to convert your Pool to a dataframe if GridSearchCV doesn't accept Pool objects directly

    # Get the best parameters
    best_parameters = grid_search.best_params_
    print(f"Best parameters: {best_parameters}")

    # Get the best estimator (model with best parameters)
    best_model = grid_search.best_estimator_

    # Use the best model to predict on test data
    best_predictions = best_model.predict(X_test) 
    
    # Store the predictions in all_predictions list
    all_predictions.append(best_predictions)

# Concatenate all predictions
final_predictions = np.concatenate(all_predictions)

  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 566.0842919	test: 590.0878286	best: 590.0878286 (0)	total: 16.5ms	remaining: 16.5s
200:	learn: 195.3779004	test: 200.0654730	best: 200.0654730 (200)	total: 991ms	remaining: 3.94s
400:	learn: 186.5513045	test: 193.2308822	best: 193.2308822 (400)	total: 2.05s	remaining: 3.06s
600:	learn: 172.9431695	test: 182.0260045	best: 182.0260045 (600)	total: 3.16s	remaining: 2.1s
800:	learn: 167.3795232	test: 178.1135361	best: 178.1135361 (800)	total: 4.12s	remaining: 1.02s
999:	learn: 161.9761238	test: 174.6762992	best: 174.6762992 (999)	total: 5.08s	remaining: 0us

bestTest = 174.6762992
bestIteration = 999

0:	learn: 651.1799322	test: 588.9295180	best: 588.9295180 (0)	total: 7.41ms	remaining: 7.4s
200:	learn: 225.2812816	test: 199.8763099	best: 199.8763099 (200)	total: 1.02s	remaining: 4.05s
400:	learn: 215.3181886	test: 190.9953812	best: 190.9953812 (400)	total: 1.97s	remaining: 2.95s
600:	learn: 205.3997625	test: 182.9967116	best: 182.9967116 (600)	total: 2.96s	remaining: 1.96s
800:	

  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 85.4853552	test: 87.7916819	best: 87.7916819 (0)	total: 7.67ms	remaining: 7.66s
200:	learn: 26.8233922	test: 27.5605856	best: 27.5605856 (200)	total: 1.27s	remaining: 5.05s
400:	learn: 25.2391603	test: 26.3058739	best: 26.3058739 (400)	total: 2.71s	remaining: 4.05s
600:	learn: 24.4657685	test: 25.8144145	best: 25.8144145 (600)	total: 3.88s	remaining: 2.58s
800:	learn: 23.8848334	test: 25.4461074	best: 25.4461074 (800)	total: 5.2s	remaining: 1.29s
999:	learn: 23.2356823	test: 25.0395219	best: 25.0363345 (997)	total: 6.45s	remaining: 0us

bestTest = 25.0363345
bestIteration = 997

Shrink model to first 998 iterations.
0:	learn: 81.0901322	test: 88.1177859	best: 88.1177859 (0)	total: 10.1ms	remaining: 10.1s
200:	learn: 22.8811775	test: 27.3028093	best: 27.3028093 (200)	total: 1.56s	remaining: 6.22s
400:	learn: 20.9963603	test: 25.8535488	best: 25.8535488 (400)	total: 3.16s	remaining: 4.73s
600:	learn: 19.7775142	test: 25.1224506	best: 25.1224506 (600)	total: 4.74s	remaining: 3.1

  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 86.0083955	test: 62.6300018	best: 62.6300018 (0)	total: 8.13ms	remaining: 8.12s
200:	learn: 23.8495016	test: 40.7770775	best: 40.7528634 (193)	total: 1.51s	remaining: 6.03s
400:	learn: 22.7124974	test: 40.2668499	best: 40.2668499 (400)	total: 2.86s	remaining: 4.27s
600:	learn: 22.3343937	test: 40.0417648	best: 40.0417648 (600)	total: 4.33s	remaining: 2.87s
800:	learn: 21.6026564	test: 39.6054018	best: 39.6054018 (800)	total: 5.67s	remaining: 1.41s
999:	learn: 21.0345944	test: 39.2905406	best: 39.2905406 (999)	total: 7.01s	remaining: 0us

bestTest = 39.29054064
bestIteration = 999

0:	learn: 37.7514919	test: 62.9268272	best: 62.9268272 (0)	total: 12.5ms	remaining: 12.5s
200:	learn: 21.3904434	test: 35.9018115	best: 35.9018115 (200)	total: 1.53s	remaining: 6.07s
400:	learn: 18.7525387	test: 31.3161158	best: 31.3161158 (400)	total: 3.03s	remaining: 4.53s
600:	learn: 17.3637629	test: 29.4249902	best: 29.4249902 (600)	total: 4.56s	remaining: 3.03s
800:	learn: 16.8989750	test: 28.8

#### Fifth Catboost model

In [18]:
# Get feature importances
feature_importances = model.get_feature_importance()

# getting features from the preprocessed training data
feature_names = X_train.columns

# Create a DataFrame to display feature importance
df_feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(df_feature_importances)

                           Feature  Importance
2             ceiling_height_agl:m   27.590289
5                 cloud_base_agl:m   11.723520
3            clear_sky_energy_1h:J   10.992426
11                 direct_rad_1h:J    7.018726
28     relative_humidity_1000hPa:p    3.091342
51                           month    2.856558
10                    direct_rad:W    2.665039
37                     t_1000hPa:K    2.567534
7                   dew_point_2m:K    2.445892
0         absolute_humidity_2m:gm3    2.307893
4                  clear_sky_rad:W    2.254132
42             wind_speed_v_10m:ms    1.990925
53                       cos_month    1.691592
9                 diffuse_rad_1h:J    1.511760
8                    diffuse_rad:W    1.303005
1              air_density_2m:kgm3    1.281461
41             wind_speed_u_10m:ms    1.280894
35                 sun_elevation:d    1.258653
34                   sun_azimuth:d    1.192054
39                    visibility:m    1.183256
45           

In [None]:
def preprocess_data(targets, observed, estimated, test):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    - test: Dataframe with test features.
    
    Returns:
    - Preprocessed dataframes ready for training and testing.
    """
    
    targets = targets[(targets['pv_measurement'] == 0) | (targets['pv_measurement'] != targets['pv_measurement'].shift())]
    targets.dropna(inplace=True)

    # Resample observed, estimated, and test data to 1 hour using mean() as aggregator
    # and drop rows where all columns are NaN
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()

    # Binary feature indicating whether the data is from an observed dataset or an estimated dataset.
    observed_resampled['estimated'] = 0
    estimated_resampled['estimated'] = 1
    test_resampled['estimated'] = 1
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])

    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Time-Based Features (training data)
    merged_data['hour'] = merged_data['date_forecast'].dt.hour
    merged_data['sin_hour'] = np.sin(2 * np.pi * merged_data['hour'] / 23)
    merged_data['cos_hour'] = np.cos(2 * np.pi * merged_data['hour'] / 23)
    merged_data['day_of_week'] = merged_data['date_forecast'].dt.dayofweek
    merged_data['sin_day_of_week'] = np.sin(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['cos_day_of_week'] = np.cos(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['month'] = merged_data['date_forecast'].dt.month
    merged_data['sin_month'] = np.sin(2 * np.pi * merged_data['month'] / 12)
    merged_data['cos_month'] = np.cos(2 * np.pi * merged_data['month'] / 12)

    # Time-Based Features (test data)
    test_resampled['hour'] = test_resampled['date_forecast'].dt.hour
    test_resampled['sin_hour'] = np.sin(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['cos_hour'] = np.cos(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['day_of_week'] = test_resampled['date_forecast'].dt.dayofweek
    test_resampled['sin_day_of_week'] = np.sin(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['cos_day_of_week'] = np.cos(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['month'] = test_resampled['date_forecast'].dt.month
    test_resampled['sin_month'] = np.sin(2 * np.pi * test_resampled['month'] / 12)
    test_resampled['cos_month'] = np.cos(2 * np.pi * test_resampled['month'] / 12)
    
    # fixing ceiling_height NaN value
    merged_data['ceiling_height_agl:m'].fillna(0, inplace=True)
    test_resampled['ceiling_height_agl:m'].fillna(0, inplace=True)
    merged_data['cloud_base_agl:m'].fillna(0, inplace=True)
    test_resampled['cloud_base_agl:m'].fillna(0, inplace=True)

    merged_data = merged_data.drop(columns=['time'])
    merged_data.to_csv(f'{loc}_csv/X_train.csv')
    test_resampled.to_csv(f'{loc}_csv/X_test.csv')

    # Drop non-feature columns
    merged_data = merged_data.drop(columns=['date_forecast', 'pv_measurement', 'snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'dew_or_rime:idx', 'prob_rime:p', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm'])
    test_resampled = test_resampled.drop(columns=['date_forecast', 'snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'dew_or_rime:idx', 'prob_rime:p', 'fresh_snow_1h:cm', 'fresh_snow_3h:cm', 'fresh_snow_6h:cm'])
    
    return merged_data, test_resampled, targets

locations = ['A', 'B', 'C']
all_predictions = []
test_all = pd.DataFrame()

for loc in locations:
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

   # Preprocess data
    X_train, X_test, targets = preprocess_data(train, X_train_observed, X_train_estimated, X_test_estimated)
    y = targets['pv_measurement'].values
    test_all = pd.concat([test_all, X_test])

    # Ensure X and y have the same length
    min_length = min(len(X_train), len(y))
    X_train, y_train = X_train.iloc[:min_length], y[:min_length]
    
    X_train_data, X_eval_data, y_train_data, y_eval_data = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Define a parameter grid to search over
    param_grid = {
        'depth': [6, 8, 10],  # Example: Try depths of 6, 8, and 10
        'learning_rate': [0.03, 0.1],  # Example: learning rates to try
        # Add other parameters here
    }

    # Create a CatBoostRegressor
    model = CatBoostRegressor(loss_function='MAE', learning_rate=0.1, verbose=200, cat_features=['estimated'])

    # Instantiate GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error')

    # Fit the GridSearchCV object with your training data Pool
    grid_search.fit(X_train, y_train, eval_set=(X_eval_data, y_eval_data))  # You may need to convert your Pool to a dataframe if GridSearchCV doesn't accept Pool objects directly

    # Get the best parameters
    best_parameters = grid_search.best_params_
    print(f"Best parameters: {best_parameters}")

    # Get the best estimator (model with best parameters)
    best_model = grid_search.best_estimator_

    # Use the best model to predict on test data
    best_predictions = best_model.predict(X_test) 
    
    # Store the predictions in all_predictions list
    all_predictions.append(best_predictions)

# Concatenate all predictions
final_predictions = np.concatenate(all_predictions)

In [None]:
# Postprocessing

# Convert 'final_predictions' to a pandas DataFrame
preds = pd.DataFrame(final_predictions, columns=['prediction'])

test_all.reset_index(drop=True, inplace=True)
preds.reset_index(drop=True, inplace=True)

preds.index = test_all.index

# Setting all night-time predictions to zero
preds.loc[test_all['is_day:idx'] == 0, 'prediction'] = 0

# Setting all negative values to 0
df = pd.DataFrame(preds, columns=['prediction'])
df['prediction'] = df['prediction'].apply(lambda x: max(0, x))