In [1]:
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import utils
import numpy as np
import random
import string
from autogluon.common import space


def do_prediction(location, limit, name, percentage, trials):
    """
    Fits an AutoGluon regressor for one location and saves its predictions on the test set.

    Parameters:
    - location: Location identifier passed to utils.preprocess_category_estimated_observed
      and used in the output file name.
    - limit: Training time limit, forwarded to TabularPredictor.fit as time_limit.
    - name: Prefix of the output CSV file.
    - percentage: Share (0-100) of the tuning data that is sampled and given to AutoGluon.
    - trials: Currently unused. No hyperparameter_tune_kwargs is passed to fit, so no
      hyperparameter search is performed; the parameter is kept for existing callers.

    Side effects:
    - Writes '<name>_<location>.csv' with the columns 'date_forecast' and 'predicted'.
    """
    x_train, tuning_data, x_test = utils.preprocess_category_estimated_observed(location)
    x_train = x_train.drop(["time", 'date_forecast'], axis=1)
    tuning_data = tuning_data.drop(["time", 'date_forecast'], axis=1)

    # Keep the timestamps aside: they are not a model feature but label the saved predictions.
    x_test_date_forecast = x_test['date_forecast']
    x_test = x_test.drop(['date_forecast'], axis=1).fillna(0)

    label = 'pv_measurement'
    train_data = TabularDataset(x_train)

    # Honour the requested tuning share instead of a hard-coded 0.3.
    tuning_fraction = percentage / 100
    tuning_data = TabularDataset(tuning_data)
    tuning_data = tuning_data.sample(frac=tuning_fraction, random_state=79)

    test_data = TabularDataset(x_test)

    predictor = TabularPredictor(label=label,
                                 path="AutoGluonTesting",
                                 eval_metric='mean_absolute_error')

    predictor.fit(train_data,
                  time_limit=limit,
                  tuning_data=tuning_data,
                  use_bag_holdout=True,
                  presets=['best_quality'], )

    y_pred = predictor.predict(test_data)

    print(y_pred)
    preds = pd.DataFrame()
    preds['date_forecast'] = x_test_date_forecast
    preds['predicted'] = np.asarray(y_pred)

    # Report exactly the path that was written so the log can be trusted.
    output_path = name + '_' + location + '.csv'
    preds.to_csv(output_path)
    print('Saved this file: ' + output_path)

    
# Run configuration shared by all three locations.
time_limit = 5.5 * 60 * 60  # forwarded to do_prediction as its `limit` argument
percentage = 30
trials = 40
name = "best_quality_random_seed"

print(f'Starting run with percentage tuning= {percentage}')
for site in ('A', 'B', 'C'):
    do_prediction(site, time_limit, name, percentage, trials)
print(f'Done with run with percentage tuning= {percentage}')


Starting run with percentage tuning= 30


  estimated_df = estimated_df.resample('H').mean()
  test_df = test_df.resample('H').mean()
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 19800.0s
AutoGluon will save models to "AutoGluonTesting/"
AutoGluon Version:  0.8.2
Python Version:     3.9.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Wed Nov 9 20:13:27 UTC 2022
Disk Space Avail:   315221.20 GB / 618408.77 GB (51.0%)
Train Data Rows:    29667
Train Data Columns: 43
Tuning Data Rows:    1325
Tuning Data Columns: 43
Label Column: pv_measurement
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (5733.42, 0.0, 674.14552, 1195.53172)
	If 'regression' is not the correct problem_type, please manually specify the problem_type pa

Total data points: 34085
Data points to be removed: 0
1536


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 43 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 42 | ['absolute_humidity_2m:gm3', 'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W', ...]
		('int', ['bool']) :  1 | ['snow_density:kgm3']
	0.1s = Fit runtime
	43 features in original data used to generate 43 features in processed data.
	Train Data (Processed) Memory Usage:

KeyboardInterrupt: 

In [1]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import utils
import numpy as np
import random
import string
from autogluon.common import space


file_name = 'best_quality_random_seed_'


def _read_location_predictions(location):
    """Load one location's prediction CSV and add a parsed 'date' column."""
    preds = pd.read_csv(file_name + location + '.csv')
    preds['date'] = pd.to_datetime(preds['date_forecast'])
    return preds


pred_a = _read_location_predictions('A')
pred_b = _read_location_predictions('B')
pred_c = _read_location_predictions('C')

test = pd.read_csv('data/test.csv')
test['time'] = pd.to_datetime(test['time'])

# Keep, in A, B, C order, the predictions whose timestamp occurs in test.csv.
# Every frame is filtered with a mask built from its OWN 'date' column; a mask
# taken from another location's frame would be aligned on the wrong rows.
kept = [
    preds.loc[preds['date'].isin(test['time']), 'predicted']
    for preds in (pred_a, pred_b, pred_c)
]
submission = pd.DataFrame({'prediction': pd.concat(kept, ignore_index=True).astype(float)})

# Anything that is not >= 0 (negative values, and NaN as well) is written as 0.
submission['prediction'] = submission['prediction'].where(submission['prediction'] >= 0, 0)
submission.index.name = "id"

submission.to_csv('submission_' + file_name + 'attempt.csv')


In [2]:

from catboost import CatBoostRegressor

def catboost_pred(x_train, y_train, x_val, y_val, x_test, iterations, depth, seed):
    """
    Trains a CatBoost regressor with MAE loss and predicts on x_test.

    Parameters:
    - x_train, y_train: Training features and target.
    - x_val, y_val: Evaluation set handed to fit as eval_set.
    - x_test: Features to predict on.
    - iterations, depth, seed: Forwarded to CatBoostRegressor as iterations, depth and random_seed.

    Returns:
    - The output of model.predict(x_test).
    """
    settings = {
        'iterations': iterations,
        'depth': depth,
        'loss_function': 'MAE',
        'verbose': 100,
        'random_seed': seed,
    }
    regressor = CatBoostRegressor(**settings)

    # Show the feature names of both sets in the cell output.
    print(x_train.columns)
    print(x_val.columns)

    regressor.fit(x_train, y_train, eval_set=(x_val, y_val))

    return regressor.predict(x_test)

In [3]:
def filter_weather_data(csv_file, df_a, df_b, df_c):
    """
    Keeps, for each location, only the rows whose 'date_forecast' appears in the CSV for that location.

    Parameters:
    - csv_file: Path to a CSV with at least the columns 'location' and 'time'.
    - df_a, df_b, df_c: DataFrames for locations 'A', 'B' and 'C', each with a 'date_forecast' column.

    Returns:
    - Three filtered DataFrames (A, B, C). The inputs are copied first and are not modified;
      'date_forecast' is converted to datetime in the returned frames.
    """
    frames = {'A': df_a.copy(), 'B': df_b.copy(), 'C': df_c.copy()}
    requested = pd.read_csv(csv_file)

    def _keep_requested(site, frame):
        # Timestamps listed in the CSV for this location.
        site_times = requested.loc[requested['location'] == site, 'time']

        # Compare both sides as datetimes so that isin matches on values, not on text.
        frame['date_forecast'] = pd.to_datetime(frame['date_forecast'])
        site_times = pd.to_datetime(site_times)

        return frame[frame['date_forecast'].isin(site_times)]

    return tuple(_keep_requested(site, frame) for site, frame in frames.items())

In [4]:
def replace_missing_with_zero(df):
    """
    Replaces missing values (NaN, None, etc.) in a DataFrame with 0.

    Prints the total number of values that were replaced.

    Parameters:
    - df: The DataFrame to be processed. It is copied and left unmodified.

    Returns:
    - df_cleaned: A copy of the DataFrame with missing values replaced by 0.
    """
    df_cleaned = df.copy()
    total_replaced = 0

    for column in df_cleaned.columns:
        missing_count = df_cleaned[column].isnull().sum()
        if missing_count > 0:
            total_replaced += missing_count
            # Assign the filled column back instead of calling fillna(inplace=True) on the
            # column selection: that chained in-place form operates on a temporary object
            # and does not update the frame when pandas Copy-on-Write is active.
            df_cleaned[column] = df_cleaned[column].fillna(0)

    print(f"Total values replaced: {total_replaced}")

    return df_cleaned


In [5]:
def prepare_test():
    """
    Loads and prepares the estimated test features for locations A, B and C.

    Steps, applied to each location: read the parquet file, keep only the timestamps listed in
    data/test.csv, replace missing values with 0, encode 'date_forecast' into season and cyclical
    hour features, and add the radiation lag / rolling-mean / interaction features.

    Returns:
    - Three DataFrames (A, B, C) with their columns sorted alphabetically.
    """
    parquet_paths = (
        'data/A/X_test_estimated.parquet',
        'data/B/X_test_estimated.parquet',
        'data/C/X_test_estimated.parquet',
    )
    raw_a, raw_b, raw_c = [pd.read_parquet(path) for path in parquet_paths]

    requested = filter_weather_data("data/test.csv", raw_a, raw_b, raw_c)

    filled = [replace_missing_with_zero(frame) for frame in requested]
    prepared = [
        encode_datetime_with_season_and_cyclical_time(frame, "date_forecast")
        for frame in filled
    ]

    for frame in prepared:
        direct = frame['direct_rad:W']
        # shift/rolling work on row position, so they look at the previous remaining rows.
        frame['direct_rad_lag1'] = direct.shift(1)
        frame['diffuse_rad_lag1'] = frame['diffuse_rad:W'].shift(1)
        frame['direct_rad_rolling_mean'] = direct.rolling(window=3).mean()
        frame['sun_elev_direct_rad_interaction'] = frame['sun_elevation:d'] * direct

    test_a, test_b, test_c = [
        frame.reindex(sorted(frame.columns), axis=1) for frame in prepared
    ]
    return test_a, test_b, test_c



In [6]:
def encode_datetime_with_season_and_cyclical_time(df, datetime_col):
    """
    Adds one-hot season columns and sine/cosine hour-of-day columns derived from a datetime column.

    Parameters:
    - df: DataFrame containing the data. It is copied and left unmodified.
    - datetime_col: The name of the datetime column in the DataFrame.

    Returns:
    - df_encoded: DataFrame whose first columns are season_spring, season_summer, season_autumn,
      season_winter, hour_sin and hour_cos, followed by all remaining columns in their existing
      order. All four season columns are always present.
    """
    season_by_month = {
        3: 'spring', 4: 'spring', 5: 'spring',
        6: 'summer', 7: 'summer', 8: 'summer',
        9: 'autumn', 10: 'autumn', 11: 'autumn',
    }
    season_columns = ['season_spring', 'season_summer', 'season_autumn', 'season_winter']
    cyclical_columns = ['hour_sin', 'hour_cos']

    work = df.copy()
    work['month'] = work[datetime_col].dt.month
    work['hour'] = work[datetime_col].dt.hour

    # Every month outside March-November falls back to winter.
    work['season'] = work['month'].apply(lambda month: season_by_month.get(month, 'winter'))

    encoded = pd.get_dummies(work, columns=['season'])

    # A season that never occurs in the data still gets its (all-zero) column.
    for column in season_columns:
        if column not in encoded.columns:
            encoded[column] = 0

    # Map the hour onto a circle so that 23:00 and 00:00 end up next to each other.
    encoded['hour_sin'] = np.sin(encoded['hour'] * (2 * np.pi / 24))
    encoded['hour_cos'] = np.cos(encoded['hour'] * (2 * np.pi / 24))

    encoded = encoded.drop(['month', 'hour'], axis=1)

    leading = season_columns + cyclical_columns
    trailing = [column for column in encoded if column not in leading]
    return encoded[leading + trailing]

In [7]:
from sklearn.preprocessing import QuantileTransformer

def apply_quantile_transformation(df1, df2, df3, feature_names):
    """
    Quantile-transforms the given features to a normal output distribution in three dataframes.

    For every feature, one transformer is fitted on the values of all three dataframes combined
    and then applied to each dataframe, overwriting the original column.

    Parameters:
    - df1, df2, df3: DataFrames to transform. They are modified in place.
    - feature_names: List of the names of the features to transform.

    Returns:
    - The same three DataFrames, with the transformed features replacing the original features.
    """
    frames = (df1, df2, df3)

    for column in feature_names:
        # Pool the column across the frames so all of them share one mapping.
        pooled = pd.concat([frame[column] for frame in frames], ignore_index=True)

        scaler = QuantileTransformer(output_distribution='normal', random_state=0)
        scaler.fit(pooled.to_frame())

        for frame in frames:
            frame[column] = scaler.transform(frame[[column]])

    return df1, df2, df3

In [8]:
import csv
def write_predictions_to_csv(array1, array2, array3, filename="predictions.csv"):
    """
    Concatenates three 1D numpy arrays and writes them to a CSV file with "id" and "prediction" columns.

    The id is the running position (starting at 0) in the concatenated array.

    :param array1: The first 1D numpy array containing the predictions.
    :param array2: The second 1D numpy array containing the predictions.
    :param array3: The third 1D numpy array containing the predictions.
    :param filename: The name of the CSV file to write to.
    """
    stacked = np.concatenate((array1, array2, array3))

    with open(filename, mode='w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(["id", "prediction"])
        # One (id, prediction) row per value.
        out.writerows(enumerate(stacked))

    print(f"File '{filename}' has been written with {len(stacked)} rows.")

In [9]:
def set_negative_values_to_zero(csv_file_path, output_csv_file_path):
    """
    Reads a CSV file, replaces every negative value with 0 and writes the result to a CSV file.

    Parameters:
    - csv_file_path: Path of the CSV file to read.
    - output_csv_file_path: Path of the CSV file to write (may be the same as the input path).
    """
    table = pd.read_csv(csv_file_path)

    # The comparison covers every column of the file, not only the predictions.
    non_negative = table.mask(table < 0, 0)

    non_negative.to_csv(output_csv_file_path, index=False)
    print(f"Modified CSV saved to {output_csv_file_path}")

In [13]:
def _add_radiation_features(frame):
    """Adds the lag, rolling-mean and interaction radiation features to frame, in place."""
    frame['direct_rad_lag1'] = frame['direct_rad:W'].shift(1)
    frame['diffuse_rad_lag1'] = frame['diffuse_rad:W'].shift(1)
    frame['direct_rad_rolling_mean'] = frame['direct_rad:W'].rolling(window=3).mean()
    frame['sun_elev_direct_rad_interaction'] = frame['sun_elevation:d'] * frame['direct_rad:W']
    return frame


def cat_boost(location, iterations, depth, seed):
    """
    Trains a CatBoost model for one location and predicts on that location's test features.

    Parameters:
    - location: "A", "B" or "C"; selects the training data and the matching test frame.
    - iterations, depth, seed: Forwarded to catboost_pred; seed also drives the validation sample.

    Returns:
    - A numpy array with the predictions for the test rows of the location.

    Raises:
    - ValueError: If the test features lack a column that the model is trained on.
    """
    # NOTE(review): do_prediction unpacks the same call as (x_train, tuning_data, x_test);
    # confirm against utils that the second/third values are used as intended here.
    x_train, x_test, x_observed = utils.preprocess_category_estimated_observed(location)
    x_train.drop(["time", "snow_density:kgm3"], axis=1, inplace=True)
    x_observed.drop(["time", "date_calc", "snow_density:kgm3"], axis=1, inplace=True)

    # The test frame comes from prepare_test; for an unknown location the frame
    # returned by utils is kept, as before.
    test_by_location = dict(zip(("A", "B", "C"), prepare_test()))
    x_test = test_by_location.get(location, x_test)

    x_train = encode_datetime_with_season_and_cyclical_time(x_train, "date_forecast")
    x_observed = encode_datetime_with_season_and_cyclical_time(x_observed, "date_forecast")
    x_test.drop(["date_calc", "snow_density:kgm3", "date_forecast", 'elevation:m', 'snow_drift:idx'], axis=1, inplace=True)

    _add_radiation_features(x_train)
    _add_radiation_features(x_observed)

    x_train, x_observed, x_test = apply_quantile_transformation(x_train, x_observed, x_test, ['direct_rad:W', 'diffuse_rad:W', 'sun_elev_direct_rad_interaction', 'clear_sky_rad:W'])

    # Filter to include only data from April to July
    april_to_july_df = x_train[x_train['date_forecast'].dt.month.isin([4, 5, 6, 7])]

    # Sample 30% of the data from this subset; it becomes the validation set
    random_sample = april_to_july_df.sample(frac=0.3, random_state=seed)

    # Remove the sampled rows from the original dataframe
    x_train = x_train.drop(random_sample.index)

    x_train.drop(["date_forecast", 'elevation:m', 'snow_drift:idx'], axis=1, inplace=True)
    x_observed.drop(["date_forecast", 'elevation:m', 'snow_drift:idx'], axis=1, inplace=True)
    random_sample.drop(["date_forecast", 'elevation:m', 'snow_drift:idx'], axis=1, inplace=True)

    x_train['observed'] = 1
    random_sample['observed'] = 1
    x_observed['observed'] = 0
    x_test['observed'] = 0

    x_train = pd.concat([x_train, x_observed], ignore_index=True)
    X = x_train.drop('pv_measurement', axis=1)  # Features
    Y = x_train['pv_measurement']  # Target

    X_val = random_sample.drop('pv_measurement', axis=1)  # Features
    Y_val = random_sample['pv_measurement']  # Target

    # prepare_test returns alphabetically sorted columns, while the training frame keeps the
    # encoder's order (season/hour columns first). Put the test features into exactly the
    # training column order so every value is fed to the feature it belongs to.
    missing = [column for column in X.columns if column not in x_test.columns]
    if missing:
        raise ValueError(f"Test features for location {location} lack training columns: {missing}")
    x_test = x_test[X.columns]

    y_pred = catboost_pred(X, Y, X_val, Y_val, x_test, iterations, depth, seed)

    return np.asarray(y_pred)


#%%

import time

def make_cat_boost_pred(iterations, depth, seed):
    """
    Runs cat_boost for locations A, B and C and writes the combined predictions to one CSV file.

    Locations B and C are trained with half the number of iterations used for A.
    Negative predictions in the written file are replaced by 0.

    Parameters:
    - iterations: Number of CatBoost iterations for location A.
    - depth: Tree depth; also part of the output file name.
    - seed: Random seed forwarded to cat_boost.

    Returns:
    - The name of the CSV file that was written ("cat_boost2_<depth>.csv").
    """
    start_time = time.time()
    # Single source of truth for the file name, so the returned name is the written one.
    output_file = "cat_boost2_" + str(depth) + ".csv"

    a = cat_boost("A", iterations, depth, seed)
    b = cat_boost("B", int(iterations/2), depth, seed)
    c = cat_boost("C", int(iterations/2), depth, seed)
    write_predictions_to_csv(a, b, c, filename=output_file)
    set_negative_values_to_zero(output_file, output_file)
    print("""
          
          
          New iterations!!!!
          
          
          """)
    end_time = time.time()
    total_time = end_time - start_time
    print(f"The code took {total_time} seconds to run.")
    return output_file

# One (iterations, depth, seed) triple per run.
run_configs = [
    (500, 15, 2),     # this one
    (1000, 14, 12),   # this one
    (2000, 13, 22),   # THIS ONE NEEDS MORE TESTING
    (3000, 12, 32),
    (4000, 11, 42),
    (5000, 10, 52),
    (7000, 9, 62),
    (9000, 8, 52),
    (10000, 7, 62),
]

# File names returned by the individual runs, in run order.
csv_files = [
    make_cat_boost_pred(iterations, depth, seed)
    for iterations, depth, seed in run_configs
]
a, b, c, d, e, f, g, h, i = csv_files

output_file = 'catboostAv2.csv'
average_predictions(csv_files, output_file)


Total data points: 34085
Data points to be removed: 0
Total values replaced: 38077
Total values replaced: 5689
Total values replaced: 1037
Total values replaced: 1017
Total values replaced: 1074
Index(['season_spring', 'season_summer', 'season_autumn', 'season_winter',
       'hour_sin', 'hour_cos', 'absolute_humidity_2m:gm3',
       'air_density_2m:kgm3', 'ceiling_height_agl:m', 'clear_sky_energy_1h:J',
       'clear_sky_rad:W', 'cloud_base_agl:m', 'dew_or_rime:idx',
       'dew_point_2m:K', 'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W',
       'direct_rad_1h:J', 'effective_cloud_cover:p', 'fresh_snow_12h:cm',
       'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm',
       'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx',
       'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx',
       'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p',
       'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_depth:cm', 'snow_melt_10

KeyboardInterrupt: 

In [16]:
# Re-run the negative-value clean-up on the depth-14 prediction file, overwriting it in place.
depth_14_csv = "cat_boost2_" + str(14) + ".csv"
set_negative_values_to_zero(depth_14_csv, depth_14_csv)

Modified CSV saved to cat_boost2_14.csv
