In [14]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


def preprocess_data(targets, observed, estimated, test):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    - test: Dataframe with test features.
    
    Returns:
    - Preprocessed dataframes ready for training and testing.
    """



    # Ensure data is sorted by date_forecast
    targets = targets.sort_values(by='time')
    observed = observed.sort_values(by='date_forecast')
    estimated = estimated.sort_values(by='date_forecast')
    test = test.sort_values(by='date_forecast')

    targets = targets[(targets['pv_measurement'] == 0) | (targets['pv_measurement'] != targets['pv_measurement'].shift())]
    targets.dropna(inplace=True)


    """   # Identify boolean columns
    # Forward fill NaNs for boolean columns
    for df in [observed, estimated, test]:
        df[boolean_features] = df[boolean_features].fillna(method='ffill')

    # Forward fill for time-series data (for non-boolean columns)
    for df in [observed, estimated, test]:
        df[df.columns.difference(boolean_features)] = df[df.columns.difference(boolean_features)].fillna(method='ffill') """


    # Resample observed, estimated, and test data to 1 hour using mean() as aggregator
    # and drop rows where all columns are NaN
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()

    """ # Round boolean columns after resampling
    for df in [observed_resampled, estimated_resampled, test_resampled]:
        df[boolean_features] = df[boolean_features].round(0) """

    observed_resampled['estimated'] = 0
    estimated_resampled['estimated'] = 1
    test_resampled['estimated'] = 1
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])

    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Time-Based Features (training data)
    merged_data['hour'] = merged_data['date_forecast'].dt.hour
    merged_data['sin_hour'] = np.sin(2 * np.pi * merged_data['hour'] / 23)
    merged_data['cos_hour'] = np.cos(2 * np.pi * merged_data['hour'] / 23)
    merged_data['day_of_week'] = merged_data['date_forecast'].dt.dayofweek
    merged_data['sin_day_of_week'] = np.sin(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['cos_day_of_week'] = np.cos(2 * np.pi * merged_data['day_of_week'] / 7)
    merged_data['month'] = merged_data['date_forecast'].dt.month
    merged_data['sin_month'] = np.sin(2 * np.pi * merged_data['month'] / 12)
    merged_data['cos_month'] = np.cos(2 * np.pi * merged_data['month'] / 12)

    # Time-Based Features (test data)
    test_resampled['hour'] = test_resampled['date_forecast'].dt.hour
    test_resampled['sin_hour'] = np.sin(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['cos_hour'] = np.cos(2 * np.pi * test_resampled['hour'] / 23)
    test_resampled['day_of_week'] = test_resampled['date_forecast'].dt.dayofweek
    test_resampled['sin_day_of_week'] = np.sin(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['cos_day_of_week'] = np.cos(2 * np.pi * test_resampled['day_of_week'] / 7)
    test_resampled['month'] = test_resampled['date_forecast'].dt.month
    test_resampled['sin_month'] = np.sin(2 * np.pi * test_resampled['month'] / 12)
    test_resampled['cos_month'] = np.cos(2 * np.pi * test_resampled['month'] / 12)
    
    # fixing ceiling_height NaN value
    merged_data['ceiling_height_agl:m'].fillna(0, inplace=True)
    test_resampled['ceiling_height_agl:m'].fillna(0, inplace=True)
    merged_data['cloud_base_agl:m'].fillna(0, inplace=True)
    test_resampled['cloud_base_agl:m'].fillna(0, inplace=True)

    merged_data = merged_data.drop(columns=['time'])

    # Drop non-feature columns
    merged_data = merged_data.drop(columns=['date_forecast', 'pv_measurement', 'snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'snow_melt_10min:mm'])
    test_resampled = test_resampled.drop(columns=['date_forecast', 'snow_density:kgm3', 'elevation:m', 'snow_drift:idx', 'snow_melt_10min:mm'])
    
    return merged_data, test_resampled, targets

locations = ['A', 'B', 'C']
all_predictions = []
all_predictions_rf = []
test_all = pd.DataFrame()

for loc in locations:
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

   # Preprocess data
    X_train, X_test, targets = preprocess_data(train, X_train_observed, X_train_estimated, X_test_estimated)
    targets.to_csv(f'{loc}_csv/processed_targets.csv')
    y = targets['pv_measurement'].values
    test_all = pd.concat([test_all, X_test])


    # Ensure X and y have the same length
    min_length = min(len(X_train), len(y))
    X_train, y_train = X_train.iloc[:min_length], y[:min_length]

    
    X_train_data, X_eval_data, y_train_data, y_eval_data = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    model = CatBoostRegressor(loss_function='MAE', learning_rate=0.03, depth=8, cat_features=['estimated'])
    model.fit(X_train, y_train, eval_set=(X_eval_data, y_eval_data))

    # Make predictions using X_test_estimated data
    predictions = model.predict(X_test)
    
    # Store the predictions in all_predictions list
    all_predictions.append(predictions)

# postprocessing

# Concatenate all predictions
final_predictions = np.concatenate(all_predictions)

# Convert 'final_predictions' to a pandas DataFrame
preds = pd.DataFrame(final_predictions, columns=['prediction'])

test_all.reset_index(drop=True, inplace=True)
preds.reset_index(drop=True, inplace=True)

preds.index = test_all.index

# Setting all night-time predictions to zero
preds.loc[test_all['is_day:idx'] == 0, 'prediction'] = 0

# Save the final_predictions to CSV
df = pd.DataFrame(preds, columns=['prediction'])
df['id'] = df.index
df = df[['id', 'prediction']]
df['prediction'] = df['prediction'].apply(lambda x: max(0, x))
df.to_csv('final_predictions.csv', index=False)


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()


0:	learn: 618.6702711	test: 591.8065754	best: 591.8065754 (0)	total: 23.4ms	remaining: 23.4s
1:	learn: 604.5268519	test: 578.0992412	best: 578.0992412 (1)	total: 37.3ms	remaining: 18.6s
2:	learn: 592.6115666	test: 566.4269827	best: 566.4269827 (2)	total: 54.4ms	remaining: 18.1s
3:	learn: 578.3869474	test: 552.8352063	best: 552.8352063 (3)	total: 70.6ms	remaining: 17.6s
4:	learn: 565.0048426	test: 539.8357344	best: 539.8357344 (4)	total: 85.1ms	remaining: 16.9s
5:	learn: 551.4752275	test: 526.7303768	best: 526.7303768 (5)	total: 98ms	remaining: 16.2s
6:	learn: 540.3747328	test: 515.8563824	best: 515.8563824 (6)	total: 113ms	remaining: 16s
7:	learn: 527.6091331	test: 503.3751743	best: 503.3751743 (7)	total: 127ms	remaining: 15.8s
8:	learn: 516.0192414	test: 492.0315523	best: 492.0315523 (8)	total: 139ms	remaining: 15.3s
9:	learn: 504.6555114	test: 481.0595365	best: 481.0595365 (9)	total: 151ms	remaining: 14.9s
10:	learn: 492.7178608	test: 469.7040688	best: 469.7040688 (10)	total: 162ms	r

  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(thresh=30).reset_index()


0:	learn: 91.8847744	test: 88.1877100	best: 88.1877100 (0)	total: 18.2ms	remaining: 18.1s
1:	learn: 89.8124658	test: 86.1622648	best: 86.1622648 (1)	total: 29.3ms	remaining: 14.6s
2:	learn: 87.9176643	test: 84.2784250	best: 84.2784250 (2)	total: 40.3ms	remaining: 13.4s
3:	learn: 86.2474283	test: 82.6439643	best: 82.6439643 (3)	total: 50.9ms	remaining: 12.7s
4:	learn: 84.5676511	test: 80.9761405	best: 80.9761405 (4)	total: 61.4ms	remaining: 12.2s
5:	learn: 83.1137732	test: 79.5766480	best: 79.5766480 (5)	total: 72ms	remaining: 11.9s
6:	learn: 81.4357047	test: 77.9556924	best: 77.9556924 (6)	total: 82.5ms	remaining: 11.7s
7:	learn: 79.6977840	test: 76.2451887	best: 76.2451887 (7)	total: 92.6ms	remaining: 11.5s
8:	learn: 77.9148066	test: 74.4928010	best: 74.4928010 (8)	total: 103ms	remaining: 11.3s
9:	learn: 76.2223316	test: 72.8743662	best: 72.8743662 (9)	total: 112ms	remaining: 11.1s
10:	learn: 74.9673930	test: 71.6340957	best: 71.6340957 (10)	total: 123ms	remaining: 11s
11:	learn: 73.4

KeyboardInterrupt: 

In [2]:
# Get feature importances
feature_importances = model.get_feature_importance()

# Assuming X_train contains the feature names
feature_names = X_train.columns

# Create a DataFrame to display feature importance
df_feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(df_feature_importances)

                           Feature  Importance
2             ceiling_height_agl:m    6.910809
5                 cloud_base_agl:m    6.880218
52                       sin_month    6.792723
40               wind_speed_10m:ms    4.520384
42             wind_speed_v_10m:ms    4.285063
37                     t_1000hPa:K    4.268148
41             wind_speed_u_10m:ms    4.019408
49                 sin_day_of_week    3.662801
0         absolute_humidity_2m:gm3    3.411633
24               pressure_100m:hPa    3.193223
28     relative_humidity_1000hPa:p    3.088730
48                     day_of_week    3.070762
39                    visibility:m    2.950405
7                   dew_point_2m:K    2.865123
50                 cos_day_of_week    2.823794
1              air_density_2m:kgm3    2.665451
53                       cos_month    2.571613
21                msl_pressure:hPa    2.514945
29                sfc_pressure:hPa    2.402558
51                           month    2.358040
34           