In [116]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


def preprocess_data(targets, observed, estimated, test):
    """
    Preprocess the data by resampling, merging with targets, and dropping unnecessary columns.
    
    Parameters:
    - targets: Target dataframe with 'time' and target values.
    - observed: Dataframe with observed features.
    - estimated: Dataframe with estimated features.
    - test: Dataframe with test features.
    
    Returns:
    - Preprocessed dataframes ready for training and testing.
    """

    # Ensure the datetime columns are in datetime format
    targets['time'] = pd.to_datetime(targets['time'])
    observed['date_forecast'] = pd.to_datetime(observed['date_forecast'])
    estimated['date_forecast'] = pd.to_datetime(estimated['date_forecast'])
    test['date_forecast'] = pd.to_datetime(test['date_forecast'])

    # Resample observed, estimated, and test data to 1 hour using mean() as aggregator
    # and drop rows where all columns are NaN
    observed_resampled = observed.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
    
    # Merge the observed and estimated data
    weather_data = pd.concat([observed_resampled, estimated_resampled])

    # Merge with target values
    merged_data = pd.merge(targets, weather_data, how='inner', left_on='time', right_on='date_forecast')

    # Drop non-feature columns
    merged_data = merged_data.drop(columns=['time', 'date_forecast', 'pv_measurement'])
    
    return merged_data, test_resampled

locations = ['A', 'B', 'C']
all_predictions = []

for loc in locations:
    # Load your data
    train = pd.read_parquet(f'{loc}/train_targets.parquet').fillna(0)
    X_train_estimated = pd.read_parquet(f'{loc}/X_train_estimated.parquet')
    X_train_observed = pd.read_parquet(f'{loc}/X_train_observed.parquet')
    X_test_estimated = pd.read_parquet(f'{loc}/X_test_estimated.parquet')

   # Preprocess data
    X_train, X_test = preprocess_data(train, X_train_observed, X_train_estimated, X_test_estimated)
    y = train['pv_measurement'].values

    # Ensure X and y have the same length
    min_length = min(len(X_train), len(y))
    X, y = X_train.iloc[:min_length], y[:min_length]
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and Train model
    model = CatBoostRegressor(loss_function='MAE', learning_rate=0.1, verbose=200)
    model.fit(X_train, y_train)

    # Make predictions using X_test_estimated data
    # Ensure that X_test_estimated is processed similarly to how training data was processed before predictions
    # X_test_processed = preprocess_test_data(X_test_estimated)
    predictions = model.predict(X_test)  # Ensure preprocessing of X_test_estimated before using it.
    
    # Store the predictions in all_predictions list
    all_predictions.append(predictions)

# Concatenate all predictions
final_predictions = np.concatenate(all_predictions)

# Save the final_predictions to CSV
df = pd.DataFrame(final_predictions, columns=['prediction'])
df['id'] = df.index
df = df[['id', 'prediction']]
df.to_csv('final_predictions.csv', index=False)


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 599.3576502	total: 7.06ms	remaining: 7.05s
200:	learn: 195.5278459	total: 1.11s	remaining: 4.41s
400:	learn: 180.7882932	total: 2.43s	remaining: 3.63s
600:	learn: 174.8383528	total: 3.47s	remaining: 2.3s
800:	learn: 168.9150358	total: 4.44s	remaining: 1.1s
999:	learn: 163.3484242	total: 5.39s	remaining: 0us


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 92.6790690	total: 7.85ms	remaining: 7.84s
200:	learn: 35.3088242	total: 1.07s	remaining: 4.26s
400:	learn: 33.3564343	total: 2.08s	remaining: 3.11s
600:	learn: 31.7515480	total: 3.16s	remaining: 2.1s
800:	learn: 30.6651960	total: 4.24s	remaining: 1.05s
999:	learn: 29.7470238	total: 5.31s	remaining: 0us


  estimated_resampled = estimated.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()
  test_resampled = test.set_index('date_forecast').resample('1H').mean().dropna(how='all').reset_index()


0:	learn: 62.2962634	total: 6.74ms	remaining: 6.73s
200:	learn: 26.1341178	total: 1.04s	remaining: 4.15s
400:	learn: 24.2091846	total: 2.11s	remaining: 3.15s
600:	learn: 23.0368980	total: 3.29s	remaining: 2.18s
800:	learn: 22.3315588	total: 4.38s	remaining: 1.09s
999:	learn: 21.6950144	total: 5.5s	remaining: 0us
