In [4]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from catboost import CatBoostRegressor

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


def preprocess_data(targets, observed, estimated, test):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    - test: Dataframe with test features.
    
    Returns:
    - Preprocessed dataframes ready for training and testing.
    """

    # Ensure the datetime columns are in datetime format
    targets['time'] = pd.to_datetime(targets['time'])
    observed['date_forecast'] = pd.to_datetime(observed['date_forecast'])
    estimated['date_forecast'] = pd.to_datetime(estimated['date_forecast'])
    test['date_forecast'] = pd.to_datetime(test['date_forecast'])

    # Resample observed, estimated, and test data to 1 hour using mean() as aggregator
    # and drop rows where all columns are NaN
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])

    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Drop non-feature columns
    merged_data = merged_data.drop(columns=['time', 'date_forecast', 'pv_measurement'])
    
    return merged_data, test_resampled

locations = ['A', 'B', 'C']
all_predictions = []

for loc in locations:
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

   # Preprocess data
    X_train, X_test = preprocess_data(train, X_train_observed, X_train_estimated, X_test_estimated)
    y = train['pv_measurement'].values

    # Ensure X and y have the same length
    min_length = min(len(X_train), len(y))
    X_train, y_train = X_train.iloc[:min_length], y[:min_length]
    
    #X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and Train model
    model = CatBoostRegressor(loss_function='MAE', learning_rate=0.1, verbose=200)
    model.fit(X_train, y_train)

    # Make predictions using X_test_estimated data
    # Ensure that X_test_estimated is processed similarly to how training data was processed before predictions
    # X_test_processed = preprocess_test_data(X_test_estimated)
    predictions = model.predict(X_test)  # Ensure preprocessing of X_test_estimated before using it.
    
    # Store the predictions in all_predictions list
    all_predictions.append(predictions)

# Concatenate all predictions
final_predictions = np.concatenate(all_predictions)

# Save the final_predictions to CSV
df = pd.DataFrame(final_predictions, columns=['prediction'])
df['id'] = df.index
df = df[['id', 'prediction']]
df.to_csv('final_predictions.csv', index=False)


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 594.7644197	total: 15.4ms	remaining: 15.4s
200:	learn: 192.8596165	total: 1.38s	remaining: 5.47s
400:	learn: 180.5368032	total: 2.57s	remaining: 3.84s
600:	learn: 174.3298784	total: 3.68s	remaining: 2.44s
800:	learn: 169.6346673	total: 4.77s	remaining: 1.18s
999:	learn: 164.5558480	total: 5.82s	remaining: 0us


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 92.3626894	total: 8.43ms	remaining: 8.42s
200:	learn: 35.2293994	total: 1.19s	remaining: 4.73s
400:	learn: 33.2446950	total: 2.26s	remaining: 3.38s
600:	learn: 32.1726808	total: 3.34s	remaining: 2.21s
800:	learn: 31.2183083	total: 4.42s	remaining: 1.1s
999:	learn: 30.4627761	total: 5.53s	remaining: 0us


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 61.9661959	total: 8.94ms	remaining: 8.94s
200:	learn: 26.2793889	total: 1.23s	remaining: 4.88s
400:	learn: 24.2466777	total: 2.53s	remaining: 3.78s
600:	learn: 23.1525407	total: 3.75s	remaining: 2.49s
800:	learn: 22.3760541	total: 4.93s	remaining: 1.22s
999:	learn: 21.6791932	total: 6.07s	remaining: 0us
