In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import utils
import numpy as np
import random
import string
from autogluon.common import space


def do_prediction(location, limit, name, percentage, trials):
    x_train, tuning_data, x_test = utils.preprocess_category_estimated_observed(location)
    x_train.drop(["time", 'date_forecast'], axis=1, inplace=True)
    tuning_data.drop(["time", 'date_forecast'], axis=1, inplace=True)
    x_test_date_forecast = x_test['date_forecast']
    x_test.drop(['date_forecast'], axis=1, inplace=True)
    
    x_test.fillna(0, inplace=True)

    label = 'pv_measurement'
    train_data = TabularDataset(x_train)
    
    precentage_tuning = percentage/100
    
    tuning_data = TabularDataset(tuning_data)
    tuning_data = tuning_data.sample(frac=0.3, random_state=79)

    test_data = TabularDataset(x_test)

    predictor = TabularPredictor(label=label,
                                 path="AutoGluonTesting",
                                 eval_metric='mean_absolute_error')
    
    num_trials = trials 
    search_strategy = 'auto'

    hyperparameter_tune_kwargs = {  # HPO is not performed unless hyperparameter_tune_kwargs is specified
        'num_trials': num_trials,
        'scheduler': 'local',
        'searcher': search_strategy,
    }

    predictor.fit(train_data,
                  time_limit=limit,
                  tuning_data=tuning_data,
                  use_bag_holdout=True,
                  presets=['best_quality'], )

    y_pred = predictor.predict(test_data)

    print(y_pred)
    preds = pd.DataFrame()
    preds['date_forecast'] = x_test_date_forecast
    preds['predicted'] = np.asarray(y_pred)
    preds.to_csv(name +  '_' + location + '.csv')
    print('Saved this file: ' + name +'_'+ str(percentage) + '_' + location + '.csv')

    
time_limit = 60 * 60 * 5.5
percentage = 30
trials = 20 + 20
name= "best_quality_random_seed"
print('Starting run with percentage tuning= ' + str(percentage))
do_prediction('A', time_limit, name, percentage, trials)
do_prediction('B', time_limit, name, percentage, trials)
do_prediction('C', time_limit, name, percentage, trials)
print('Done with run with percentage tuning= ' + str(percentage))


Starting run with percentage tuning= 30


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 19800.0s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315221.20 GB / 618408.77 GB (51.0%)
Train Data Rows:    29667
Train Data Columns: 43
Tuning Data Rows:    1325
Tuning Data Columns: 43
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 674.14552, 1195.53172)
	If 'regression' is not the correct problem_type, please manually specify the problem_type pa

Total data points: 34085
Data points to be removed: 0
1536


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 42 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', ['bool']) :  1 | ['snow_density:kgm3']
	0.1s = Fit runtime
	43 features in original data used to generate 43 features in processed data.
	Train Data (Processed) Memory Usage:

KeyboardInterrupt: 

In [1]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import utils
import numpy as np
import random
import string
from autogluon.common import space


file_name = 'best_quality_random_seed_'

pred_a = pd.read_csv(file_name + 'A.csv')
pred_a['date'] = pd.to_datetime(pred_a['date_forecast'])
pred_b = pd.read_csv(file_name + 'B.csv')
pred_b['date'] = pd.to_datetime(pred_b['date_forecast'])
pred_c = pd.read_csv(file_name + 'C.csv')
pred_c['date'] = pd.to_datetime(pred_c['date_forecast'])

test = pd.read_csv('data/test.csv')
test['time'] = pd.to_datetime(test['time'])

submission = pd.DataFrame(columns=['prediction'])

for val in pred_a['date']:
    if test['time'].eq(val).any():
        row = pred_a.loc[pred_a['date'] == val]
        submission = submission._append({'prediction': float(row['predicted'])}, ignore_index=True)

for val in pred_b['date']:
    if test['time'].eq(val).any():
        row = pred_b.loc[pred_a['date'] == val]
        submission = submission._append({'prediction': float(row['predicted'])}, ignore_index=True)

for val in pred_c['date']:
    if test['time'].eq(val).any():
        row = pred_c.loc[pred_a['date'] == val]
        submission = submission._append({'prediction': float(row['predicted'])}, ignore_index=True)

submission['prediction'] = submission['prediction'].where(submission['prediction'] >= 0, 0)
submission.index.name = "id"

submission.to_csv('submission_' + file_name + 'attempt.csv')


In [2]:

from catboost import CatBoostRegressor

def catboost_pred(x_train, y_train, x_val, y_val, x_test, iterations, depth, seed):
  
    # Initialize CatBoostRegressor
    model = CatBoostRegressor(
        iterations=iterations,
        depth=depth,
        loss_function='MAE',
        verbose=100,
        random_seed=seed
    )

    print(x_train.columns)
    print(x_val.columns)
    
    # Train the model
    model.fit(
        x_train, y_train,
        eval_set=(x_val, y_val),
    )

    # Make predictions
    y_pred = model.predict(x_test)

    return y_pred

In [3]:
def filter_weather_data(csv_file, df_a, df_b, df_c):
    # Read the CSV file into a dataframe
    df_a = df_a.copy()
    df_b = df_b.copy()
    df_c = df_c.copy()
    
    data = pd.read_csv(csv_file)
    
    # Create a dictionary of dataframes for each location
    dfs = {'A': df_a, 'B': df_b, 'C': df_c}
    
    # Filter each location's dataframe
    filtered_dfs = {}
    for location, df in dfs.items():
        # Get the 'time' values from the CSV that correspond to the current location
        times = data[data['location'] == location]['time']
        
        # Filter the dataframe for this location based on the times
        # Make sure both 'date_forecast' in df and 'time' in data are of the same datetime format
        df['date_forecast'] = pd.to_datetime(df['date_forecast'])
        times = pd.to_datetime(times)
        
        # Now, filter the dataframe based on the times
        filtered_dfs[location] = df[df['date_forecast'].isin(times)]
    
    # Return the filtered dataframes
    return filtered_dfs['A'], filtered_dfs['B'], filtered_dfs['C']

In [4]:
def replace_missing_with_zero(df):
    """
    Replaces missing values (NaN, None, etc.) in a DataFrame with 0.
    
    Parameters:
    - df: The DataFrame to be processed.
    
    Returns:
    - df_cleaned: The DataFrame with missing values replaced by 0.
    """
    df = df.copy()
    total_replaced = 0
    
    for column in df.columns:
        missing_count = df[column].isnull().sum()
        if missing_count > 0:
            total_replaced += missing_count
            df[column].fillna(0, inplace=True)
    
    print(f"Total values replaced: {total_replaced}")
    
    return df


In [5]:
def prepare_test():
    X_test_estimated_A = pd.read_parquet('data/A/X_test_estimated.parquet')
    X_test_estimated_B = pd.read_parquet('data/B/X_test_estimated.parquet')
    X_test_estimated_C = pd.read_parquet('data/C/X_test_estimated.parquet')
    
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c = filter_weather_data("data/test.csv", X_test_estimated_A, X_test_estimated_B, X_test_estimated_C)
    
    X_test_a = replace_missing_with_zero(X_test_estimated_a)
    X_test_b = replace_missing_with_zero(X_test_estimated_b)
    X_test_c = replace_missing_with_zero(X_test_estimated_c)
    
    X_test_a = encode_datetime_with_season_and_cyclical_time(X_test_a, "date_forecast")
    X_test_b = encode_datetime_with_season_and_cyclical_time(X_test_b, "date_forecast")
    X_test_c = encode_datetime_with_season_and_cyclical_time(X_test_c, "date_forecast")
    
    X_test_a['direct_rad_lag1'] = X_test_a['direct_rad:W'].shift(1)
    X_test_a['diffuse_rad_lag1'] = X_test_a['diffuse_rad:W'].shift(1)
    X_test_a['direct_rad_rolling_mean'] = X_test_a['direct_rad:W'].rolling(window=3).mean()
    X_test_a['sun_elev_direct_rad_interaction'] = X_test_a['sun_elevation:d'] * X_test_a['direct_rad:W']
    
    X_test_b['direct_rad_lag1'] = X_test_b['direct_rad:W'].shift(1)
    X_test_b['diffuse_rad_lag1'] = X_test_b['diffuse_rad:W'].shift(1)
    X_test_b['direct_rad_rolling_mean'] = X_test_b['direct_rad:W'].rolling(window=3).mean()
    X_test_b['sun_elev_direct_rad_interaction'] = X_test_b['sun_elevation:d'] * X_test_b['direct_rad:W']
    
    X_test_c['direct_rad_lag1'] = X_test_c['direct_rad:W'].shift(1)
    X_test_c['diffuse_rad_lag1'] = X_test_c['diffuse_rad:W'].shift(1)
    X_test_c['direct_rad_rolling_mean'] = X_test_c['direct_rad:W'].rolling(window=3).mean()
    X_test_c['sun_elev_direct_rad_interaction'] = X_test_c['sun_elevation:d'] * X_test_c['direct_rad:W']

    
    return X_test_a.reindex(sorted(X_test_a.columns), axis=1), X_test_b.reindex(sorted(X_test_b.columns), axis=1), X_test_c.reindex(sorted(X_test_c.columns), axis=1)



In [6]:
def encode_datetime_with_season_and_cyclical_time(df, datetime_col):
    """
    Encodes the datetime column by adding season and transforming hour into cyclical features (sine and cosine),
    ensuring all possible categories are present and in a consistent order.
    
    Parameters:
    - df: DataFrame containing the data.
    - datetime_col: The name of the datetime column in the DataFrame.
    
    Returns:
    - df_encoded: DataFrame with the datetime column encoded with seasons and cyclical hours.
    """
    df = df.copy()
    # Extract month and hour from the datetime column
    df['month'] = df[datetime_col].dt.month
    df['hour'] = df[datetime_col].dt.hour
    
    # Function to map month to season
    def map_month_to_season(month):
        if month in [3, 4, 5]:
            return 'spring'
        elif month in [6, 7, 8]:
            return 'summer'
        elif month in [9, 10, 11]:
            return 'autumn'
        else:
            return 'winter'

    # Apply function to create season column
    df['season'] = df['month'].apply(map_month_to_season)

    # One-hot encode the season
    df_encoded = pd.get_dummies(df, columns=['season'])

    # Ensure all seasons are present
    for season in ['spring', 'summer', 'autumn', 'winter']:
        if f'season_{season}' not in df_encoded.columns:
            df_encoded[f'season_{season}'] = 0

    # Cyclical transformation of hour
    df_encoded['hour_sin'] = np.sin(df_encoded['hour'] * (2 * np.pi / 24))
    df_encoded['hour_cos'] = np.cos(df_encoded['hour'] * (2 * np.pi / 24))

    # Drop the original month and hour columns
    df_encoded = df_encoded.drop(['month', 'hour'], axis=1)

    # Define the expected column order and reorder the dataframe columns
    expected_columns = ['season_spring', 'season_summer', 'season_autumn', 'season_winter', 
                        'hour_sin', 'hour_cos'] + [col for col in df_encoded if col not in ['season_spring', 'season_summer', 'season_autumn', 'season_winter', 'hour_sin', 'hour_cos']]
    df_encoded = df_encoded[expected_columns]

    return df_encoded

In [7]:
from sklearn.preprocessing import QuantileTransformer

def apply_quantile_transformation(df1, df2, df3, feature_names):
    """
    Applies quantile transformation to specified features across three dataframes and replaces the original features.

    Parameters:
    - df1, df2, df3: DataFrames to transform.
    - feature_names: List of the names of the features to transform.

    Returns:
    - Three DataFrames with the transformed features replacing the original features.
    """
    for feature_name in feature_names:
        # Combine the feature data from all dataframes for each feature
        combined_feature_data = pd.concat([df1[feature_name], df2[feature_name], df3[feature_name]], ignore_index=True)

        # Instantiate and fit the QuantileTransformer
        transformer = QuantileTransformer(output_distribution='normal', random_state=0)
        transformer.fit(combined_feature_data.to_frame())

        # Transform the feature in each dataframe and replace the original feature
        df1[feature_name] = transformer.transform(df1[[feature_name]])
        df2[feature_name] = transformer.transform(df2[[feature_name]])
        df3[feature_name] = transformer.transform(df3[[feature_name]])

    return df1, df2, df3

In [8]:
import csv
def write_predictions_to_csv(array1, array2, array3, filename="predictions.csv"):
    """
    Writes three 1D numpy arrays to a CSV file with an "id" and "prediction" column.
    
    :param array1: The first 1D numpy array containing the predictions.
    :param array2: The second 1D numpy array containing the predictions.
    :param array3: The third 1D numpy array containing the predictions.
    :param filename: The name of the CSV file to write to.
    """
    # Concatenate the arrays
    predictions = np.concatenate((array1, array2, array3))
    
    # Write to CSV
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        # Write header
        writer.writerow(["id", "prediction"])
        # Write the id and prediction
        for idx, prediction in enumerate(predictions):
            writer.writerow([idx, prediction])
            
    print(f"File '{filename}' has been written with {len(predictions)} rows.")

In [9]:
def set_negative_values_to_zero(csv_file_path, output_csv_file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

    # Set negative values to zero
    df[df < 0] = 0

    # Save the modified DataFrame back to a new CSV file
    df.to_csv(output_csv_file_path, index=False)
    print(f"Modified CSV saved to {output_csv_file_path}")

In [20]:
def cat_boost(location, iterations, depth, seed):
    x_train, x_test, x_observed = utils.preprocess_category_estimated_observed(location)
    x_train.drop(["time", "snow_density:kgm3"], axis=1, inplace=True)
    x_observed.drop(["time", "date_calc", "snow_density:kgm3"], axis=1, inplace=True)

    test_a, test_b, test_c = prepare_test()
    if location == "A":
        x_test = test_a
    if location == "B":
        x_test = test_b
    if location == "C":
        x_test = test_c
        
    x_train = encode_datetime_with_season_and_cyclical_time(x_train, "date_forecast")
    x_observed = encode_datetime_with_season_and_cyclical_time(x_observed, "date_forecast")
    x_test.drop(["date_calc", "snow_density:kgm3", "date_forecast", 'elevation:m', 'snow_drift:idx'], axis=1, inplace=True)

    x_train['direct_rad_lag1'] = x_train['direct_rad:W'].shift(1)
    x_train['diffuse_rad_lag1'] = x_train['diffuse_rad:W'].shift(1)
    x_train['direct_rad_rolling_mean'] = x_train['direct_rad:W'].rolling(window=3).mean()
    x_train['sun_elev_direct_rad_interaction'] = x_train['sun_elevation:d'] * x_train['direct_rad:W']
    
    x_observed['direct_rad_lag1'] = x_observed['direct_rad:W'].shift(1)
    x_observed['diffuse_rad_lag1'] = x_observed['diffuse_rad:W'].shift(1)
    x_observed['direct_rad_rolling_mean'] = x_observed['direct_rad:W'].rolling(window=3).mean()
    x_observed['sun_elev_direct_rad_interaction'] = x_observed['sun_elevation:d'] * x_observed['direct_rad:W']
    
    x_train, x_observed, x_test = apply_quantile_transformation(x_train, x_observed, x_test, ['direct_rad:W', 'diffuse_rad:W', 'sun_elev_direct_rad_interaction', 'clear_sky_rad:W'])

    # Filter to include only data from April to July
    april_to_july_df = x_train[x_train['date_forecast'].dt.month.isin([4, 5, 6, 7])]

    # Sample 30% of the data from this subset
    random_sample = april_to_july_df.sample(frac=0.3, random_state=seed)

    # Remove the sampled rows from the original dataframe
    x_train = x_train.drop(random_sample.index)
    
    x_train.drop(["date_forecast", 'elevation:m', 'snow_drift:idx'], axis=1, inplace=True)
    x_observed.drop(["date_forecast", 'elevation:m', 'snow_drift:idx'], axis=1, inplace=True)
    random_sample.drop(["date_forecast", 'elevation:m', 'snow_drift:idx'], axis=1, inplace=True)

    x_train['observed'] = 1
    random_sample['observed'] = 1
    x_observed['observed'] = 0
    x_test['observed'] = 0
    
    x_train = pd.concat([x_train, x_observed], ignore_index=True)
    X = x_train.drop('pv_measurement', axis=1)  # Features
    Y = x_train['pv_measurement']  # Target
    
    X_val = random_sample.drop('pv_measurement', axis=1)  # Features
    Y_val = random_sample['pv_measurement']  # Target

    
    y_pred = catboost_pred(X, Y, X_val, Y_val, x_test, iterations, depth, seed)

    return np.asarray(y_pred)


#%%

import time

def make_cat_boost_pred(iterations, depth, seed):
    start_time = time.time()
    a = cat_boost("A", iterations, depth, seed)
    b = cat_boost("B", int(iterations/2), depth, seed)
    c = cat_boost("C", int(iterations/2), depth, seed)
    
    name = "cat_boost_"+str(depth)+".csv"
    
    write_predictions_to_csv(a, b, c, filename=name)
    set_negative_values_to_zero("cat_boost2_"+str(depth)+".csv", "cat_boost2_"+str(depth)+".csv")
    print("""
          
          
          New iterations!!!!
          
          
          """)
    end_time = time.time()
    total_time = end_time - start_time
    print(f"The code took {total_time} seconds to run.")
    return name

a = make_cat_boost_pred(3000, 15, 2)# denne
b = make_cat_boost_pred(3000, 14, 12)# denne
c = make_cat_boost_pred(3000, 13, 22) 
d = make_cat_boost_pred(3000, 12, 32)
e = make_cat_boost_pred(4000, 11, 42)
f = make_cat_boost_pred(5000, 10, 52)
g = make_cat_boost_pred(7000, 9, 62)
h = make_cat_boost_pred(9000, 8, 52)
i = make_cat_boost_pred(10000, 7, 63)


csv_files = [a,b,c,d,e,f,g,h,i]  # Replace with your filenames
output_file = 'submission_average_catboost.csv'
average_predictions(csv_files, output_file)


Total data points: 34085
Data points to be removed: 0
Total values replaced: 38077
Total values replaced: 5689
Total values replaced: 1037
Total values replaced: 1017
Total values replaced: 1074
Index(['season_spring', 'season_summer', 'season_autumn', 'season_winter',
       'hour_sin', 'hour_cos', 'absolute_humidity_2m:gm3',
       'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
       'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
       'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
       'direct_rad_1h:J', 'effective_cloud_cover:p', 'fresh_snow_12h:cm',
       'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm',
       'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx',
       'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx',
       'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p',
       'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_depth:cm', 'snow_melt_10

0:	learn: 82.7341752	test: 226.0166005	best: 226.0166005 (0)	total: 553ms	remaining: 13m 49s
100:	learn: 19.0288719	test: 61.5440549	best: 61.5440549 (100)	total: 55.5s	remaining: 12m 48s
200:	learn: 11.3081247	test: 54.4936206	best: 54.4936206 (200)	total: 1m 50s	remaining: 11m 51s
300:	learn: 8.3404768	test: 53.1741391	best: 53.1741391 (300)	total: 2m 44s	remaining: 10m 56s
400:	learn: 6.4882525	test: 52.5676028	best: 52.5629153 (397)	total: 3m 39s	remaining: 10m 1s
500:	learn: 5.2443776	test: 52.2329264	best: 52.2267593 (498)	total: 4m 34s	remaining: 9m 6s
600:	learn: 4.3263660	test: 51.9863814	best: 51.9863814 (600)	total: 5m 28s	remaining: 8m 11s
700:	learn: 3.6605073	test: 51.8009563	best: 51.8009563 (700)	total: 6m 22s	remaining: 7m 15s
800:	learn: 3.1373366	test: 51.7002548	best: 51.6977671 (797)	total: 7m 15s	remaining: 6m 20s
900:	learn: 2.5764796	test: 51.5602808	best: 51.5602808 (900)	total: 8m 9s	remaining: 5m 25s
1000:	learn: 2.0702213	test: 51.4296108	best: 51.4291216 (9

0:	learn: 561.7789170	test: 1152.7140833	best: 1152.7140833 (0)	total: 273ms	remaining: 13m 37s
100:	learn: 178.7623841	test: 347.7784195	best: 347.7784195 (100)	total: 26s	remaining: 12m 27s
200:	learn: 160.0247716	test: 326.6659627	best: 326.6659627 (200)	total: 50.9s	remaining: 11m 49s
300:	learn: 150.1580078	test: 321.8042936	best: 321.8042936 (300)	total: 1m 15s	remaining: 11m 16s
400:	learn: 137.5936938	test: 316.2688974	best: 316.2688974 (400)	total: 1m 40s	remaining: 10m 52s
500:	learn: 110.5727618	test: 306.3368876	best: 306.3275369 (499)	total: 2m 6s	remaining: 10m 29s
600:	learn: 82.4137200	test: 301.7124000	best: 301.7043610 (599)	total: 2m 32s	remaining: 10m 7s
700:	learn: 68.5772491	test: 298.6338916	best: 298.6338916 (700)	total: 2m 58s	remaining: 9m 45s
800:	learn: 61.8008375	test: 297.6097442	best: 297.6052962 (799)	total: 3m 24s	remaining: 9m 21s
900:	learn: 57.2253238	test: 296.5027888	best: 296.4985465 (893)	total: 3m 50s	remaining: 8m 56s
1000:	learn: 53.9658090	te

0:	learn: 72.8173601	test: 191.4338697	best: 191.4338697 (0)	total: 280ms	remaining: 6m 59s
100:	learn: 17.9186303	test: 49.1673952	best: 49.1673952 (100)	total: 27s	remaining: 6m 13s
200:	learn: 12.6394522	test: 41.8982506	best: 41.8956162 (198)	total: 53.7s	remaining: 5m 47s
300:	learn: 10.1118306	test: 40.6976839	best: 40.6924701 (298)	total: 1m 20s	remaining: 5m 19s
400:	learn: 8.0208423	test: 39.8669764	best: 39.8658519 (399)	total: 1m 46s	remaining: 4m 52s
500:	learn: 6.6658121	test: 39.4573708	best: 39.4573708 (500)	total: 2m 13s	remaining: 4m 25s
600:	learn: 5.7261765	test: 39.2243950	best: 39.2223653 (599)	total: 2m 39s	remaining: 3m 59s
700:	learn: 5.3177266	test: 39.0863596	best: 39.0863596 (700)	total: 3m 5s	remaining: 3m 31s
800:	learn: 4.9293188	test: 39.0504716	best: 39.0375663 (748)	total: 3m 31s	remaining: 3m 4s
900:	learn: 4.2023729	test: 38.9861619	best: 38.9819885 (891)	total: 3m 59s	remaining: 2m 39s
1000:	learn: 3.4851283	test: 38.8730999	best: 38.8695411 (997)	to

100:	learn: 20.9781318	test: 61.4567060	best: 61.4567060 (100)	total: 11.8s	remaining: 2m 43s
200:	learn: 14.8095932	test: 53.4977548	best: 53.4977548 (200)	total: 23.5s	remaining: 2m 32s
300:	learn: 12.2326036	test: 52.0767553	best: 52.0767553 (300)	total: 35.3s	remaining: 2m 20s
400:	learn: 10.5970755	test: 51.4241795	best: 51.4241795 (400)	total: 47s	remaining: 2m 8s
500:	learn: 9.3617048	test: 51.0860481	best: 51.0835290 (499)	total: 58.7s	remaining: 1m 57s
600:	learn: 8.5320220	test: 50.8253697	best: 50.8253697 (600)	total: 1m 10s	remaining: 1m 45s
700:	learn: 7.7193909	test: 50.6518880	best: 50.6424376 (695)	total: 1m 22s	remaining: 1m 33s
800:	learn: 6.8303871	test: 50.4130968	best: 50.4130968 (800)	total: 1m 34s	remaining: 1m 22s
900:	learn: 6.0730245	test: 50.2700281	best: 50.2574059 (895)	total: 1m 45s	remaining: 1m 10s
1000:	learn: 5.2678144	test: 50.1216083	best: 50.1157927 (999)	total: 1m 57s	remaining: 58.6s
1100:	learn: 4.6384474	test: 50.0138745	best: 50.0091326 (1098)	

100:	learn: 187.1888472	test: 337.9292987	best: 337.9292987 (100)	total: 5.83s	remaining: 2m 47s
200:	learn: 173.3070127	test: 318.3136616	best: 318.3060210 (199)	total: 11.6s	remaining: 2m 41s
300:	learn: 165.9851414	test: 311.6927785	best: 311.6927785 (300)	total: 17.4s	remaining: 2m 36s
400:	learn: 157.9182368	test: 305.9879622	best: 305.9879622 (400)	total: 23s	remaining: 2m 29s
500:	learn: 138.4004969	test: 298.3854852	best: 298.3128755 (499)	total: 28.9s	remaining: 2m 24s
600:	learn: 121.4931779	test: 293.4816584	best: 293.4816584 (600)	total: 34.9s	remaining: 2m 19s
700:	learn: 105.7059363	test: 290.2174744	best: 290.2174744 (700)	total: 40.8s	remaining: 2m 13s
800:	learn: 98.2029081	test: 288.5968739	best: 288.5968739 (800)	total: 46.7s	remaining: 2m 8s
900:	learn: 91.4426766	test: 287.3418989	best: 287.3418989 (900)	total: 52.6s	remaining: 2m 2s
1000:	learn: 86.7507283	test: 286.7354569	best: 286.7264536 (999)	total: 58.5s	remaining: 1m 56s
1100:	learn: 83.1265129	test: 285.96

0:	learn: 73.2928201	test: 188.7315146	best: 188.7315146 (0)	total: 64.7ms	remaining: 1m 36s
100:	learn: 19.5491424	test: 52.1936812	best: 52.1936812 (100)	total: 5.76s	remaining: 1m 19s
200:	learn: 15.9036128	test: 45.5640066	best: 45.5640066 (200)	total: 11.5s	remaining: 1m 14s
300:	learn: 13.6807649	test: 44.3741027	best: 44.3741027 (300)	total: 17.2s	remaining: 1m 8s
400:	learn: 12.0282892	test: 43.6008469	best: 43.6008469 (400)	total: 23s	remaining: 1m 2s
500:	learn: 11.0139762	test: 43.1926880	best: 43.1926880 (500)	total: 28.7s	remaining: 57.2s
600:	learn: 10.0122937	test: 42.8070273	best: 42.8070273 (600)	total: 34.4s	remaining: 51.4s
700:	learn: 9.1180977	test: 42.5388845	best: 42.5368442 (698)	total: 40.4s	remaining: 46.1s
800:	learn: 8.2864298	test: 42.3351963	best: 42.3351963 (800)	total: 46.6s	remaining: 40.7s
900:	learn: 7.6467433	test: 42.1777403	best: 42.1765638 (899)	total: 52.6s	remaining: 35s
1000:	learn: 6.9409203	test: 41.9836935	best: 41.9812772 (999)	total: 58.3s

3900:	learn: 49.4074636	test: 294.9285246	best: 294.9135044 (3892)	total: 1m 53s	remaining: 2.87s
3999:	learn: 48.7032928	test: 294.6998906	best: 294.6998906 (3999)	total: 1m 56s	remaining: 0us

bestTest = 294.6998906
bestIteration = 3999

Total data points: 32844
Data points to be removed: 4248
Total values replaced: 30049
Total values replaced: 4867
Total values replaced: 1037
Total values replaced: 1017
Total values replaced: 1074
Index(['season_spring', 'season_summer', 'season_autumn', 'season_winter',
       'hour_sin', 'hour_cos', 'absolute_humidity_2m:gm3',
       'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
       'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
       'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
       'direct_rad_1h:J', 'effective_cloud_cover:p', 'fresh_snow_12h:cm',
       'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm',
       'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx',
   

700:	learn: 11.0126138	test: 43.6297537	best: 43.6267980 (697)	total: 19.3s	remaining: 35.8s
800:	learn: 10.3696587	test: 43.3524485	best: 43.3524485 (800)	total: 22.1s	remaining: 33.1s
900:	learn: 9.6909385	test: 43.1884371	best: 43.1867011 (898)	total: 24.8s	remaining: 30.3s
1000:	learn: 9.0597790	test: 42.9228678	best: 42.9228678 (1000)	total: 27.6s	remaining: 27.5s
1100:	learn: 8.5759518	test: 42.8564230	best: 42.8564230 (1100)	total: 30.3s	remaining: 24.8s
1200:	learn: 8.3213433	test: 42.7482774	best: 42.7479586 (1199)	total: 33.1s	remaining: 22s
1300:	learn: 7.9275673	test: 42.6360514	best: 42.6348938 (1297)	total: 35.8s	remaining: 19.2s
1400:	learn: 7.4735117	test: 42.4782257	best: 42.4782257 (1400)	total: 38.6s	remaining: 16.5s
1500:	learn: 7.1046733	test: 42.4194198	best: 42.4140992 (1493)	total: 41.3s	remaining: 13.7s
1600:	learn: 6.8667665	test: 42.3963124	best: 42.3952353 (1590)	total: 44.1s	remaining: 11s
1700:	learn: 6.5876178	test: 42.2872974	best: 42.2872974 (1700)	tota

4100:	learn: 70.7128222	test: 296.5226972	best: 296.5153067 (4095)	total: 1m 7s	remaining: 14.8s
4200:	learn: 70.0991065	test: 296.4445646	best: 296.4444768 (4150)	total: 1m 9s	remaining: 13.2s
4300:	learn: 68.9788204	test: 296.3293328	best: 296.2618065 (4227)	total: 1m 10s	remaining: 11.5s
4400:	learn: 67.6496144	test: 296.0781685	best: 296.0559118 (4396)	total: 1m 12s	remaining: 9.88s
4500:	learn: 66.6210670	test: 295.9868784	best: 295.9868784 (4500)	total: 1m 14s	remaining: 8.23s
4600:	learn: 66.1288136	test: 295.9604829	best: 295.9549798 (4534)	total: 1m 15s	remaining: 6.59s
4700:	learn: 65.2815023	test: 296.0253496	best: 295.9521558 (4605)	total: 1m 17s	remaining: 4.94s
4800:	learn: 64.5865306	test: 295.9856881	best: 295.9521558 (4605)	total: 1m 19s	remaining: 3.29s
4900:	learn: 64.0238580	test: 295.8777302	best: 295.8567759 (4891)	total: 1m 21s	remaining: 1.64s
4999:	learn: 62.6058842	test: 295.9206938	best: 295.8194424 (4912)	total: 1m 22s	remaining: 0us

bestTest = 295.8194424


0:	learn: 73.5232555	test: 187.8614349	best: 187.8614349 (0)	total: 19.4ms	remaining: 48.4s
100:	learn: 20.9114505	test: 53.6801382	best: 53.6801382 (100)	total: 1.62s	remaining: 38.6s
200:	learn: 17.5093678	test: 47.2337707	best: 47.2337707 (200)	total: 3.19s	remaining: 36.5s
300:	learn: 15.9852953	test: 46.3867273	best: 46.3867273 (300)	total: 4.78s	remaining: 34.9s
400:	learn: 14.9406283	test: 45.7995207	best: 45.7995207 (400)	total: 6.37s	remaining: 33.3s
500:	learn: 14.0270011	test: 45.3296246	best: 45.3296246 (500)	total: 7.92s	remaining: 31.6s
600:	learn: 13.4836599	test: 44.9689121	best: 44.9689121 (600)	total: 9.51s	remaining: 30s
700:	learn: 13.0062790	test: 44.6536695	best: 44.6536695 (700)	total: 11.1s	remaining: 28.5s
800:	learn: 12.6565306	test: 44.5324906	best: 44.5288395 (784)	total: 12.6s	remaining: 26.8s
900:	learn: 12.3091853	test: 44.3876556	best: 44.3873886 (898)	total: 14.2s	remaining: 25.2s
1000:	learn: 11.8506362	test: 44.2492179	best: 44.2492179 (1000)	total: 1

2900:	learn: 98.7185516	test: 290.5106938	best: 290.3274129 (2563)	total: 33.8s	remaining: 47.7s
3000:	learn: 97.2355123	test: 290.4457882	best: 290.3274129 (2563)	total: 34.9s	remaining: 46.6s
3100:	learn: 96.3642989	test: 290.3694969	best: 290.2625145 (3077)	total: 36.1s	remaining: 45.4s
3200:	learn: 95.1113877	test: 290.3519472	best: 290.2178309 (3181)	total: 37.3s	remaining: 44.2s
3300:	learn: 93.6395347	test: 290.2750310	best: 290.2178309 (3181)	total: 38.4s	remaining: 43.1s
3400:	learn: 92.3657491	test: 290.3108395	best: 290.2178309 (3181)	total: 39.6s	remaining: 41.9s
3500:	learn: 91.2113126	test: 290.1804844	best: 290.1629566 (3499)	total: 40.8s	remaining: 40.7s
3600:	learn: 90.1762197	test: 289.9993148	best: 289.9985237 (3596)	total: 41.9s	remaining: 39.6s
3700:	learn: 89.6250442	test: 289.8471809	best: 289.8471756 (3699)	total: 43.1s	remaining: 38.4s
3800:	learn: 89.0997938	test: 289.6950566	best: 289.6665679 (3797)	total: 44.2s	remaining: 37.2s
3900:	learn: 88.4456355	test: 

1500:	learn: 13.5802342	test: 53.2598432	best: 53.2414199 (1495)	total: 16.7s	remaining: 22.3s
1600:	learn: 13.2075755	test: 53.1464217	best: 53.1426492 (1598)	total: 17.8s	remaining: 21.2s
1700:	learn: 12.8996366	test: 53.0694434	best: 53.0627952 (1688)	total: 18.9s	remaining: 20s
1800:	learn: 12.6964573	test: 53.0291189	best: 53.0265671 (1799)	total: 20s	remaining: 18.9s
1900:	learn: 12.5292458	test: 53.0008905	best: 53.0008905 (1900)	total: 21.1s	remaining: 17.8s
2000:	learn: 12.3684406	test: 52.9658790	best: 52.9658790 (2000)	total: 22.2s	remaining: 16.6s
2100:	learn: 12.1717304	test: 52.9269686	best: 52.9099251 (2087)	total: 23.3s	remaining: 15.5s
2200:	learn: 12.0449298	test: 52.8922116	best: 52.8889475 (2188)	total: 24.4s	remaining: 14.4s
2300:	learn: 11.8107103	test: 52.8318788	best: 52.8057661 (2291)	total: 25.5s	remaining: 13.3s
2400:	learn: 11.6313329	test: 52.7371858	best: 52.7371832 (2399)	total: 26.6s	remaining: 12.2s
2500:	learn: 11.4711222	test: 52.7030057	best: 52.6875

Modified CSV saved to cat_boost2_9.csv

          
          
          New iterations!!!!
          
          
          
The code took 160.1140592098236 seconds to run.
Total data points: 34085
Data points to be removed: 0
Total values replaced: 38077
Total values replaced: 5689
Total values replaced: 1037
Total values replaced: 1017
Total values replaced: 1074
Index(['season_spring', 'season_summer', 'season_autumn', 'season_winter',
       'hour_sin', 'hour_cos', 'absolute_humidity_2m:gm3',
       'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
       'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
       'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
       'direct_rad_1h:J', 'effective_cloud_cover:p', 'fresh_snow_12h:cm',
       'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm',
       'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx',
       'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx',
     

5600:	learn: 103.5096697	test: 297.3769837	best: 297.3649947 (5588)	total: 41.4s	remaining: 25.2s
5700:	learn: 102.7457601	test: 297.3025803	best: 297.2635058 (5658)	total: 42.2s	remaining: 24.4s
5800:	learn: 102.1241764	test: 297.2855043	best: 297.2600454 (5741)	total: 42.9s	remaining: 23.7s
5900:	learn: 101.6377378	test: 297.5082198	best: 297.2600454 (5741)	total: 43.6s	remaining: 22.9s
6000:	learn: 101.1867740	test: 297.6112302	best: 297.2600454 (5741)	total: 44.4s	remaining: 22.2s
6100:	learn: 100.4526421	test: 297.4254892	best: 297.2600454 (5741)	total: 45.1s	remaining: 21.4s
6200:	learn: 99.6649738	test: 297.4815240	best: 297.2600454 (5741)	total: 45.8s	remaining: 20.7s
6300:	learn: 98.9694331	test: 297.6913485	best: 297.2600454 (5741)	total: 46.6s	remaining: 20s
6400:	learn: 98.5416138	test: 297.7356138	best: 297.2600454 (5741)	total: 47.3s	remaining: 19.2s
6500:	learn: 98.0896936	test: 297.5954304	best: 297.2600454 (5741)	total: 48s	remaining: 18.5s
6600:	learn: 97.7480953	test

2300:	learn: 14.3317243	test: 53.3714088	best: 53.3638614 (2288)	total: 16.1s	remaining: 15.4s
2400:	learn: 14.1612327	test: 53.3054385	best: 53.3027308 (2398)	total: 16.8s	remaining: 14.7s
2500:	learn: 14.0548899	test: 53.2817196	best: 53.2776477 (2468)	total: 17.5s	remaining: 14s
2600:	learn: 13.8977045	test: 53.1694519	best: 53.1694519 (2600)	total: 18.2s	remaining: 13.3s
2700:	learn: 13.7075199	test: 53.0936474	best: 53.0936474 (2700)	total: 18.8s	remaining: 12.5s
2800:	learn: 13.5165307	test: 53.0998576	best: 53.0778897 (2766)	total: 19.5s	remaining: 11.8s
2900:	learn: 13.4118662	test: 53.0841140	best: 53.0778897 (2766)	total: 20.2s	remaining: 11.1s
3000:	learn: 13.3150680	test: 53.0627295	best: 53.0627295 (3000)	total: 20.9s	remaining: 10.4s
3100:	learn: 13.2011726	test: 53.0119451	best: 53.0119451 (3100)	total: 21.6s	remaining: 9.73s
3200:	learn: 13.0614422	test: 52.9393850	best: 52.9393850 (3200)	total: 22.3s	remaining: 9.03s
3300:	learn: 12.9164112	test: 52.9113539	best: 52.90

3500:	learn: 10.6246632	test: 44.3829370	best: 44.3723719 (3494)	total: 23.2s	remaining: 6.62s
3600:	learn: 10.4874625	test: 44.3600089	best: 44.3592453 (3596)	total: 23.9s	remaining: 5.96s
3700:	learn: 10.4138689	test: 44.3468486	best: 44.3405592 (3683)	total: 24.5s	remaining: 5.29s
3800:	learn: 10.3105504	test: 44.3256926	best: 44.3177169 (3797)	total: 25.2s	remaining: 4.63s
3900:	learn: 10.2720276	test: 44.3204026	best: 44.3177169 (3797)	total: 25.8s	remaining: 3.97s
4000:	learn: 10.1885405	test: 44.2981004	best: 44.2814543 (3992)	total: 26.5s	remaining: 3.3s
4100:	learn: 10.1153320	test: 44.2686764	best: 44.2650792 (4082)	total: 27.1s	remaining: 2.64s
4200:	learn: 10.0592683	test: 44.2532397	best: 44.2392858 (4154)	total: 27.8s	remaining: 1.98s
4300:	learn: 10.0206804	test: 44.2424411	best: 44.2392858 (4154)	total: 28.4s	remaining: 1.31s
4400:	learn: 9.9433039	test: 44.2090184	best: 44.2077525 (4382)	total: 29.1s	remaining: 655ms
4499:	learn: 9.8796574	test: 44.1987238	best: 44.196

4400:	learn: 126.3853705	test: 312.9766656	best: 312.9210658 (4398)	total: 24.1s	remaining: 30.7s
4500:	learn: 125.5111882	test: 312.6186124	best: 312.6186124 (4500)	total: 24.7s	remaining: 30.1s
4600:	learn: 125.0714264	test: 312.2482709	best: 312.2482709 (4600)	total: 25.2s	remaining: 29.6s
4700:	learn: 124.5487626	test: 312.0702330	best: 312.0527462 (4697)	total: 25.7s	remaining: 29s
4800:	learn: 124.0331426	test: 311.9710069	best: 311.9710069 (4800)	total: 26.3s	remaining: 28.5s
4900:	learn: 123.6553171	test: 311.8502973	best: 311.8502973 (4900)	total: 26.8s	remaining: 27.9s
5000:	learn: 123.0479011	test: 311.9772952	best: 311.7853524 (4938)	total: 27.3s	remaining: 27.3s
5100:	learn: 122.6255810	test: 311.8610048	best: 311.7853524 (4938)	total: 27.9s	remaining: 26.8s
5200:	learn: 122.1070421	test: 311.5770149	best: 311.5770149 (5200)	total: 28.4s	remaining: 26.2s
5300:	learn: 121.4735471	test: 311.2684466	best: 311.2668181 (5298)	total: 29s	remaining: 25.7s
5400:	learn: 121.0148387

100:	learn: 25.7198003	test: 66.0180421	best: 66.0180421 (100)	total: 553ms	remaining: 26.8s
200:	learn: 23.1303051	test: 58.1153893	best: 58.1153893 (200)	total: 1.07s	remaining: 25.6s
300:	learn: 22.2524178	test: 56.8062407	best: 56.8062407 (300)	total: 1.59s	remaining: 24.8s
400:	learn: 21.3602137	test: 55.7271863	best: 55.7271863 (400)	total: 2.1s	remaining: 24.1s
500:	learn: 20.6927768	test: 55.0458253	best: 55.0458253 (500)	total: 2.62s	remaining: 23.6s
600:	learn: 20.1368183	test: 54.5770512	best: 54.5770512 (600)	total: 3.14s	remaining: 23s
700:	learn: 19.6054796	test: 54.2190879	best: 54.2190792 (699)	total: 3.66s	remaining: 22.4s
800:	learn: 19.0797666	test: 53.6979751	best: 53.6967075 (799)	total: 4.17s	remaining: 21.9s
900:	learn: 18.8865406	test: 53.5889932	best: 53.5889932 (900)	total: 4.69s	remaining: 21.4s
1000:	learn: 18.7544785	test: 53.4877641	best: 53.4877641 (1000)	total: 5.2s	remaining: 20.8s
1100:	learn: 18.6422879	test: 53.4375422	best: 53.4347190 (1096)	total: 

800:	learn: 17.2162653	test: 44.7267798	best: 44.7267797 (799)	total: 3.94s	remaining: 20.7s
900:	learn: 16.8507307	test: 44.4652905	best: 44.4652724 (899)	total: 4.42s	remaining: 20.1s
1000:	learn: 16.5904994	test: 44.3410475	best: 44.3410475 (1000)	total: 4.9s	remaining: 19.6s
1100:	learn: 16.4291541	test: 44.3004619	best: 44.2986343 (1086)	total: 5.38s	remaining: 19.1s
1200:	learn: 16.3653737	test: 44.2807042	best: 44.2807042 (1200)	total: 5.87s	remaining: 18.6s
1300:	learn: 16.1594627	test: 44.1164999	best: 44.1164999 (1300)	total: 6.34s	remaining: 18s
1400:	learn: 15.9811564	test: 44.0007044	best: 43.9991538 (1387)	total: 6.82s	remaining: 17.5s
1500:	learn: 15.7845105	test: 43.8710198	best: 43.8706697 (1499)	total: 7.3s	remaining: 17s
1600:	learn: 15.6812276	test: 43.8137278	best: 43.8110097 (1589)	total: 7.77s	remaining: 16.5s
1700:	learn: 15.5287222	test: 43.7438353	best: 43.7357110 (1678)	total: 8.24s	remaining: 16s
1800:	learn: 15.3811176	test: 43.6593458	best: 43.6593458 (180

In [15]:
def average_predictions(csv_files, output_file):
    """
    Averages the predictions from multiple CSV files and writes the result to a new CSV file.

    Parameters:
    - csv_files: List of strings, where each string is the filepath to a CSV file.
    - output_file: String, the filepath for the output CSV file containing averaged predictions.
    """

    # Initialize an empty DataFrame to store the summed predictions
    sum_predictions = None
    count = 0
    
    # Loop through each CSV file
    for file in csv_files:
        # Read the prediction column from the CSV file
        data = pd.read_csv(file, usecols=['id', 'prediction'])
        
        # If sum_predictions is None, it's the first file, so assign data to sum_predictions
        if sum_predictions is None:
            sum_predictions = data.set_index('id')
        else:
            # If sum_predictions is not None, add the prediction to the sum_predictions DataFrame
            sum_predictions = sum_predictions.add(data.set_index('id'), fill_value=0)
        
        count += 1

    # Calculate the average predictions
    avg_predictions = sum_predictions / count
    
    # Reset the index to turn 'id' back into a column
    avg_predictions.reset_index(inplace=True)
    
    # Write the average predictions to the output CSV file
    avg_predictions.to_csv(output_file, index=False)
    print(f"Averaged predictions saved to {output_file}")

Averaged predictions saved to catboostAv2.csv
