In [1]:
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# from Preprocessing import *

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Preprocessing functions
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
def drop_consecutive_nonzero_repeats(df, max_repeats=6):
    """Remove long runs of identical non-zero ``pv_measurement`` readings.

    A stretch of directly adjacent rows that all carry exactly the same
    non-zero value is dropped in full once it is long enough.  Zeros are
    never dropped, however long the stretch, and both zeros and missing
    values end a run.

    Args:
        df: DataFrame with a ``pv_measurement`` column.  It is not modified.
        max_repeats: How many times a value may repeat directly after its
            first occurrence and still be kept.  With the default of 6, runs
            of up to 7 equal values survive and runs of 8 or more are
            dropped.

    Returns:
        A new DataFrame without the offending rows, re-indexed 0..n-1.
    """
    values = df['pv_measurement']

    # A new run starts wherever a value differs from the one directly before
    # it.  NaN never equals itself, so every NaN forms its own run, and a
    # zero between two equal readings splits them into separate runs.
    starts_new_run = (values != values.shift()).to_numpy()
    run_id = starts_new_run.cumsum()
    run_length = values.groupby(run_id).transform('size')

    stuck = (values != 0) & (run_length > max_repeats + 1)

    # Filter by position instead of by label so the result is correct for
    # any index, not only a default RangeIndex.
    return df.loc[~stuck.to_numpy()].reset_index(drop=True)


def preprocessing(df, target, soort_data):
    """Clean one feature table and, for training data, attach the target.

    Steps, in order:
      1. drop the ``date_calc`` column when present;
      2. fill gaps: ``snow_density:kgm3`` with 0, every other column by
         linear interpolation;
      3. index by ``date_forecast`` and average into hourly bins;
      4. for training data, inner-join ``target`` on ``date_forecast``;
      5. when a ``pv_measurement`` column is present, drop long runs of
         repeated non-zero readings and rows without a measurement;
      6. drop a remaining ``date_forecast`` column and rows that are
         entirely empty.

    Neither ``df`` nor ``target`` is modified, so the same ``target`` can be
    passed to several calls.

    Args:
        df: Feature table with a ``date_forecast`` column.
        target: Target table; a column named ``time`` is treated as
            ``date_forecast`` for the join.
        soort_data: Kind of data set.  ``'train_observed'`` and
            ``'train_estimated'`` trigger the join with ``target``; any
            other value (e.g. ``'test_estimated'``) skips it.

    Returns:
        The preprocessed DataFrame.
    """
    # Rename on a new object rather than in place: the caller reuses the
    # same target frame for several data sets.
    target = target.rename(columns={'time': 'date_forecast'})

    # Work on a private copy so the column assignments below never write
    # through to the caller's frame.
    df = df.copy()

    # 1. Drop the 'date_calc' column if it exists.
    if 'date_calc' in df.columns:
        df = df.drop('date_calc', axis=1)

    # 2. Fill missing values column by column.
    for feature in df.columns:
        if feature == 'snow_density:kgm3':
            # Plain assignment instead of a chained in-place fillna, which
            # does not reliably write back to the frame.
            df[feature] = df[feature].fillna(0)
        else:
            df[feature] = df[feature].interpolate(method='linear')

    # 3. Index by timestamp and average into hourly bins.  An offset object
    #    is used instead of the deprecated 'H' alias.
    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    df = df.set_index('date_forecast')
    df = df.resample(pd.offsets.Hour()).mean()

    # 4. Training data gets the target joined on the shared timestamp.
    if soort_data in ('train_observed', 'train_estimated'):
        df = pd.merge(df, target, on='date_forecast', how='inner')

    # 5. Clean the measurement column when there is one.
    if 'pv_measurement' in df.columns:
        df = drop_consecutive_nonzero_repeats(df)
        df = df.dropna(subset=["pv_measurement"])

    # 6. Drop the 'date_forecast' column if it exists, then drop all rows
    #    where every column is empty.
    if 'date_forecast' in df.columns:
        df = df.drop('date_forecast', axis=1)

    return df.dropna(how='all')


'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Main pipeline
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

#define function to load all the data
def load_data(location):
    """Read the four raw parquet tables of one location.

    Args:
        location: Directory name of the location; the files are read from
            its ``raw`` sub-directory.

    Returns:
        A list ``[target, train_observed, train_estimated, test_estimated]``
        of DataFrames, in that order.
    """
    return [
        pd.read_parquet(f'{location}/raw/train_targets.parquet'),
        pd.read_parquet(f'{location}/raw/X_train_observed.parquet'),
        pd.read_parquet(f'{location}/raw/X_train_estimated.parquet'),
        pd.read_parquet(f'{location}/raw/X_test_estimated.parquet'),
    ]

#load all the data and put them in a separate list for every location
# One list of raw tables per location, loaded in the order A, B, C.
data_A, data_B, data_C = (load_data(site) for site in ('A', 'B', 'C'))

#preprocess the three different datasets for all locations
def preprocess_data(data):
    """Run ``preprocessing`` on the three feature tables of one location.

    Args:
        data: Sequence whose item 0 is the target table and whose items 1-3
            are the train-observed, train-estimated and test-estimated
            feature tables.

    Returns:
        A list ``[train_observed, train_estimated, test_estimated]`` with
        the preprocessed tables.
    """
    target = data[0]
    kinds = ('train_observed', 'train_estimated', 'test_estimated')
    # Feature table i (1-based) in ``data`` belongs to the i-th kind.
    return [
        preprocessing(data[position], target, kind)
        for position, kind in enumerate(kinds, start=1)
    ]

# Preprocess each location's tables, in the order A, B, C.
preprocessed_A, preprocessed_B, preprocessed_C = map(
    preprocess_data, (data_A, data_B, data_C)
)

def save_to_file(data_to_file, location):
    """Write the three preprocessed tables of one location to CSV files.

    Args:
        data_to_file: Sequence holding the train-observed, train-estimated
            and test-estimated tables at positions 0, 1 and 2.
        location: Name of the location; used both as the output directory
            and as the suffix of each file name.

    Returns:
        None.  The row index is not written to the files.
    """
    destinations = (
        f'{location}/preproc_train_observed_{location}.csv',
        f'{location}/preproc_train_estimated_{location}.csv',
        f'{location}/preproc_test_estimated_{location}.csv',
    )
    for position, path in enumerate(destinations):
        data_to_file[position].to_csv(path, index=False)
    
#save preprocessed data
# save_to_file() returns None, so its result must not be assigned back:
# doing so would replace the preprocessed tables with None.
save_to_file(preprocessed_A, 'A')
save_to_file(preprocessed_B, 'B')
save_to_file(preprocessed_C, 'C')




  from pandas.core.computation.check import NUMEXPR_INSTALLED
