In [10]:
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# from Preprocessing import *

''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Preprocessing functions
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
def drop_consecutive_nonzero_repeats(df, min_run_length=3):
    """Remove runs of identical, non-zero ``pv_measurement`` readings.

    A run is a stretch of *directly adjacent* rows holding the same value.
    Runs of zeros are never removed here, and a zero (or any other value)
    between two equal readings ends the run, so ``3, 3, 0, 3`` is left alone.

    Args:
        df: DataFrame with a ``pv_measurement`` column. It is not modified.
        min_run_length: Smallest number of adjacent equal non-zero readings
            that is dropped. Defaults to 3, the original threshold.

    Returns:
        A new DataFrame without the offending rows, with a fresh 0..n-1 index.
    """
    values = df['pv_measurement'].tolist()
    total = len(values)

    keep_positions = []
    run_start = 0
    for pos in range(1, total + 1):
        # Still inside the current run: same value as the run's first row.
        # NaN never equals itself, so every NaN forms a run of length one.
        if pos < total and values[pos] == values[run_start]:
            continue

        run_value = values[run_start]
        is_flat_line = (
            pos - run_start >= min_run_length
            and run_value != 0
            and not pd.isna(run_value)
        )
        if not is_flat_line:
            keep_positions.extend(range(run_start, pos))
        run_start = pos

    # Select by position (iloc) rather than by label so the result is correct
    # even when the incoming index is not a default RangeIndex.
    return df.iloc[keep_positions].reset_index(drop=True)


# drop consecutive zeros
def drop_consecutive_zero_repeats(df, min_run_length=24):
    """Remove long runs of adjacent zero ``pv_measurement`` readings in place.

    Only zeros are considered; constant non-zero stretches are left for
    other filters to handle.

    Args:
        df: DataFrame with a ``pv_measurement`` column. It is modified in
            place: rows are dropped and the index is reset to 0..n-1.
            Rows are located by position and then dropped by their index
            label, so the index labels are expected to be unique.
        min_run_length: Smallest number of adjacent zeros that is dropped.
            Defaults to 24, the original threshold.

    Returns:
        The same DataFrame object that was passed in.
    """
    values = df['pv_measurement'].tolist()
    total = len(values)

    drop_positions = []
    run_start = None
    # Iterate one step past the end so a run touching the last row is closed.
    for pos in range(total + 1):
        if pos < total and values[pos] == 0:
            if run_start is None:
                run_start = pos
            continue
        if run_start is not None:
            if pos - run_start >= min_run_length:
                drop_positions.extend(range(run_start, pos))
            run_start = None

    if drop_positions:
        # Translate positions to labels so a non-default index still works.
        df.drop(df.index[drop_positions], inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

def preprocessing(df, target, soort_data):
    """Clean one raw feature frame and, for training data, attach the target.

    Steps, in order:
      1. Drop the ``date_calc`` column if present.
      2. Fill ``snow_density:kgm3`` gaps with 0 and linearly interpolate the
         other numeric columns.
      3. Parse ``date_forecast``, use it as the index and average the rows
         into hourly bins.
      4. For ``'train_observed'`` / ``'train_estimated'``, inner-merge with
         ``target`` on ``date_forecast`` (``target``'s ``time`` column is
         renamed to ``date_forecast`` for this).
      5. If ``pv_measurement`` is present: drop flat-lined non-zero runs, drop
         rows with a missing target and fill the remaining gaps with 0.
      6. Drop ``date_forecast`` and any rows that are empty in every column.

    Neither ``df`` nor ``target`` is modified; all work happens on copies.

    Args:
        df: Raw feature frame containing a ``date_forecast`` column.
        target: Frame with the ``pv_measurement`` target and a ``time`` (or
            already renamed ``date_forecast``) column.
        soort_data: Dataset kind; the merge with ``target`` only happens for
            ``'train_observed'`` and ``'train_estimated'``.

    Returns:
        The preprocessed DataFrame (without the ``date_forecast`` column).
    """
    # rename() without inplace returns a copy, so repeated calls with the same
    # target frame never alter the caller's object.
    target = target.rename(columns={'time': 'date_forecast'})

    # 1. drop() always returns a new frame, which also guarantees that the
    #    column assignments below cannot leak into the caller's frame.
    df = df.drop(columns=['date_calc'], errors='ignore')

    # 2. Fill / interpolate missing values column by column.
    for feature in df.columns:
        if feature == 'snow_density:kgm3':
            # Assign the result back: a chained ``inplace=True`` fillna acts
            # on a temporary under pandas copy-on-write and changes nothing.
            df[feature] = df[feature].fillna(0)
        elif pd.api.types.is_numeric_dtype(df[feature]):
            # Only numeric features are interpolated; the timestamp key and
            # any other non-numeric column are left as they are.
            df[feature] = df[feature].interpolate(method='linear')

    # 3. Hourly mean, keyed on the forecast timestamp. 'h' is the current
    #    spelling of the hourly alias (the upper-case 'H' is deprecated).
    df['date_forecast'] = pd.to_datetime(df['date_forecast'])
    df = df.set_index('date_forecast').resample('h').mean()

    # 4. Attach the target for the training sets only.
    if soort_data in ('train_observed', 'train_estimated'):
        df = pd.merge(df, target, on='date_forecast', how='inner')

    # Bring 'date_forecast' back as a column if it is still the index.
    if 'date_forecast' in df.index.names:
        df = df.reset_index()

    # Debug output, labelled with the dataset actually being processed.
    print(soort_data, df.head())
    print(soort_data, df.columns)

    # 5. Target-dependent cleaning (the zero-run filter is deliberately not
    #    applied here).
    if 'pv_measurement' in df.columns:
        df = drop_consecutive_nonzero_repeats(df)
        df = df.dropna(subset=['pv_measurement'], axis=0)
        df = df.fillna(0)

    # 6. The timestamp is not used as a model feature.
    if 'date_forecast' in df.columns:
        df = df.drop('date_forecast', axis=1)

    # Drop rows where every remaining column is empty.
    df = df.dropna(how='all')

    return df

'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
def check_dataframe_same_columns(df1, df2, df3):
    """Print a pairwise comparison of the column indexes of three frames.

    The pairs are reported in the order (df1, df2), (df1, df3), (df2, df3).
    For a pair whose column indexes are equal a single confirmation line is
    printed; otherwise the columns found on only one side are listed for
    each side. Nothing is returned.
    """
    frames = {'df1': df1, 'df2': df2, 'df3': df3}
    pairs = (('df1', 'df2'), ('df1', 'df3'), ('df2', 'df3'))

    for left_name, right_name in pairs:
        left_cols = frames[left_name].columns
        right_cols = frames[right_name].columns

        if left_cols.equals(right_cols):
            print(f"Both {left_name} and {right_name} data frames have the same columns.")
            continue

        only_left = set(left_cols) - set(right_cols)
        only_right = set(right_cols) - set(left_cols)
        print(f"The {left_name} and {right_name} data frames have different columns.")
        print(f"Columns unique to {left_name}:", only_left)
        print(f"Columns unique to {right_name}:", only_right)
        
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
Main pipeline
'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''

#define function to load all the data
def load_data(location):
    """Read the four raw parquet files found under ``<location>/raw``.

    Returns a list in the fixed order
    ``[target, train_observed, train_estimated, test_estimated]``.
    """
    # File stems, listed in the order the rest of the script indexes them.
    raw_stems = (
        'train_targets',
        'X_train_observed',
        'X_train_estimated',
        'X_test_estimated',
    )
    return [pd.read_parquet(f'{location}/raw/{stem}.parquet') for stem in raw_stems]

#load all the data and put them in a separate list for every location
# One [target, train_observed, train_estimated, test_estimated] list per site.
data_A, data_B, data_C = load_data('A'), load_data('B'), load_data('C')

# #check if columns are the same
# print('location A')
# check_dataframe_same_columns(data_A[1],data_A[2],data_A[3])
# print('location B')
# check_dataframe_same_columns(data_B[1],data_B[2],data_B[3])
# print('location C')
# check_dataframe_same_columns(data_C[1],data_C[2],data_C[3])

#preprocess the three different datasets for all locations
def preprocess_data(data):
    """Run ``preprocessing`` on the three feature frames of one location.

    ``data[0]`` is passed as the target frame to every call; ``data[1]``,
    ``data[2]`` and ``data[3]`` are processed as ``'train_observed'``,
    ``'train_estimated'`` and ``'test_estimated'`` respectively.

    Returns ``[train_observed, train_estimated, test_estimated]``.
    """
    dataset_kinds = ('train_observed', 'train_estimated', 'test_estimated')
    return [
        preprocessing(data[position], data[0], kind)
        for position, kind in enumerate(dataset_kinds, start=1)
    ]

# Each result is [train_observed, train_estimated, test_estimated] for a site.
preprocessed_A, preprocessed_B, preprocessed_C = (
    preprocess_data(data_A),
    preprocess_data(data_B),
    preprocess_data(data_C),
)

#checking for the columns
# print('location A')
# check_dataframe_same_columns(preprocessed_A[0],preprocessed_A[1],preprocessed_A[2])
# print('location B')
# check_dataframe_same_columns(preprocessed_B[0],preprocessed_B[1],preprocessed_B[2])
# print('location C')
# check_dataframe_same_columns(preprocessed_C[0],preprocessed_C[1],preprocessed_C[2])

# Load the submission template and key it on the parsed forecast timestamp.
test = pd.read_csv('test.csv').rename(columns={'time': 'date_forecast'})
test['date_forecast'] = pd.to_datetime(test['date_forecast'])
test = test.set_index('date_forecast')
# One slice of the template per location.
location_a, location_b, location_c = (
    test[test['location'] == 'A'],
    test[test['location'] == 'B'],
    test[test['location'] == 'C'],
)

#preprocessed_A[2] = pd.merge(location_a, preprocessed_A[2], on='date_forecast', how='inner')
#preprocessed_B[2] = pd.merge(location_b, preprocessed_B[2], on='date_forecast', how='inner')
#preprocessed_C[2] = pd.merge(location_c, preprocessed_C[2], on='date_forecast', how='inner')

# Drop specified columns
#columns_to_drop = ['id', 'prediction', 'location']
#preprocessed_A[2] = preprocessed_A[2].drop(columns=columns_to_drop)
#preprocessed_B[2] = preprocessed_B[2].drop(columns=columns_to_drop)
#preprocessed_C[2] = preprocessed_C[2].drop(columns=columns_to_drop)

def save_to_file(data_to_file, location):
    """Write the three preprocessed frames of one location to CSV files.

    ``data_to_file[0]``, ``[1]`` and ``[2]`` are written, without their
    index, as the train-observed, train-estimated and test-estimated files
    inside the ``<location>/`` directory. Nothing is returned.
    """
    dataset_names = ('train_observed', 'train_estimated', 'test_estimated')
    for position, dataset_name in enumerate(dataset_names):
        csv_path = f'{location}/preproc_{dataset_name}_{location}_with_dateforecast_zero.csv'
        data_to_file[position].to_csv(csv_path, index=False)

    #saving train estimated data to excel
    #data_to_file[0].to_excel(f'{location}/preproc_train_observed_{location}_with_dateforecast_left.xlsx', index=False)
    #data_to_file[1].to_excel(f'{location}/preproc_train_estimated_{location}_with_dateforecast_left.xlsx', index=False)
    #data_to_file[2].to_excel(f'{location}/preproc_test_estimated_{location}_with_dateforecast_left.xlsx', index=False)

# #save preprocessed data
# save_to_file() only writes the CSV files and returns None, so it is called
# for its side effect; assigning its result would replace each preprocessed
# list with None and break any later use of preprocessed_A/B/C.
save_to_file(preprocessed_A, 'A')
save_to_file(preprocessed_B, 'B')
save_to_file(preprocessed_C, 'C')
#


# def divide_dataset(train_observed):
#     x_train = train_observed.drop('pv_measurement', axis = 1)
#     y_train = train_observed['pv_measurement']
#     training_data = [x_train, y_train]
#     return training_data
# 
# #retrieve the input features and target value separately for all locations
# training_A = divide_dataset(preprocessed_A[0])
# # training_B = divide_dataset(preprocessed_B[0])
# # training_C = divide_dataset(preprocessed_C[0])
# 
# 
# #define a function that applies a pearson correlation matrix using the target variable
# def correlation_matrix(data):
#     correlation = data[0].corr()
# 
#     # plot the entire correlation matrix
#     plt.figure(figsize=(64, 56))
#     sns.heatmap(correlation, annot=True, cmap=plt.cm.Reds)
#     plt.title("Correlation all features Heatmap")
#     plt.show()
# 
#     #Correlation with output variable
#     cor_target = abs(correlation['pv_measurement'])
#     # print('cor_target',cor_target)
# 
#     #Selecting highly correlated features
#     relevant_features = cor_target[cor_target>0.5] 
#     print('relevant_features',relevant_features)
#     relevant_features_label = list(cor_target[cor_target>0.5].index)
#     #print('relevant_features_label',relevant_features_label)
#     filtered_cor = correlation.loc[relevant_features_label, relevant_features_label]
# 
#     # Using Pearson Correlation (compare with other input features)
#     plt.figure(figsize=(12, 8))
#     sns.heatmap(filtered_cor, annot=True, cmap=plt.cm.Reds)
#     plt.title("Correlation high target features Heatmap")
#     plt.show()
# 
#     # Get the upper triangle of the correlation matrix
#     upper_triangle = filtered_cor.where(np.triu(np.ones(filtered_cor.shape), k=1).astype(bool))
# 
#     # Find indices where values are greater than 0.5
#     indices_gt_0_5 = [(i, j) for i in range(filtered_cor.shape[0]-1) for j in range(i + 1, filtered_cor.shape[1]-1) if abs(upper_triangle.iloc[i, j]) > 0.5]
# 
#     # Create a separate array for values not greater than 0.5
#     values_not_gt_0_5 = [upper_triangle.iloc[i, j] for i, j in indices if abs(upper_triangle.iloc[i, j]) <= 0.5]
#     # print(indices)
#     # Display the list of indices
#     
# 
#     print("Indices where values are greater than 0.5:", indices_gt_0_5)
#     print("Values not greater than 0.5:", values_not_gt_0_5)
#     
#     indices = [list(pair) for pair in indices_gt_0_5]
#     print(indices)
# 
#     #compare input features that correlate with each other and select the one that has the highest correlation with the target variable
#     array = []
#     for i in range(len(indices)):
#         if relevant_features[indices[i][0]] > relevant_features[indices[i][1]]:
#             array.append(indices[i][0])
#         else:
#             array.append(indices[i][1])
# 
#     print('array',array)
#     array = np.unique(array)
#     print('uniqe_array',array)
#     final_features= relevant_features.index[array]
#     print('final_features',final_features)
# 
#     final_data = [df[final_features] for df in data]
#     final_data.append(final_features)
# 
#     # temp = train_data[0]
#     # train_data_final = temp[feature_names]
#     # train_data_final['pv_measurements'] = target
#     # train_data_final['date_forecast'] = temp['date_forecast']
#     # column_order = ['date_forecast'] + [col for col in train_data_final.columns if col != 'date_forecast']
#     # train_data_final = train_data_final[column_order]
#     # 
#     # test_data_final = train_data[2[feature_names]]
#     # test_data_final = test_data_final[test_data_final.iloc[:,1:].notnull().all(axis=1)]
# 
# 
#     #return the following final_data = [final_train_observed, final_train_estimated,final_test_estimated, feature_names]
#     return final_data
# 
# #Apply Pearson correlation and extract important features
# final_data_a = correlation_matrix(preprocessed_A)
# # final_data_b = correlation_matrix(preprocessed_B)
# # final_data_c = correlation_matrix(preprocessed_C)
# 
# # #add pv_measurement back to observed and estimated train data
# # final_data_a[0]['pv_measurements'] = training_A[1]
# # # final_data_a[0].to_excel('final_train_observed_A.xlsx')
# # final_data_b[0]['pv_measurements'] = training_B[1]
# # # final_data_b[0].to_excel('final_train_observed_B.xlsx')
# # final_data_c[0]['pv_measurements'] = training_C[1]
# # # final_data_b[0].to_excel('final_train_observed_C.xlsx')
# # final_data_a[1]['pv_measurements'] = training_A[1]
# # # final_data_a[0].to_excel('final_train_observed_A.xlsx')
# # final_data_b[1]['pv_measurements'] = training_B[1]
# # # final_data_b[0].to_excel('final_train_observed_B.xlsx')
# # final_data_c[1]['pv_measurements'] = training_C[1]
# # # final_data_b[0].to_excel('final_train_observed_C.xlsx')
# # 
# # # saving train observed data to csv
# # final_data_a[0].to_csv('A/final_train_observed_A.csv', index=False)
# # # final_data_b[0].to_csv('final_train_observed_B.csv', index=False)
# # # final_data_c[0].to_csv('final_train_observed_C.csv', index=False)
# # 
# # # saving train estimated data to csv
# # final_data_a[1].to_csv('A/final_train_estimated_A.csv', index=False)
# # # final_data_b[1].to_csv('final_train_estimated_B.csv', index=False)
# # # final_data_c[1].to_csv('final_train_estimated_C.csv', index=False)
# # 
# # # saving test estimated data to csv
# # final_data_a[2].to_csv('A/final_test_estimated_A.csv', index=False)
# # # final_data_b[2].to_csv('final_test_estimated_B.csv', index=False)
# # # final_data_c[2].to_csv('final_test_estimated_C.csv', index=False)



train_observed         date_forecast  absolute_humidity_2m:gm3  air_density_2m:kgm3  \
0 2019-06-02 22:00:00                     7.700              1.22825   
1 2019-06-02 23:00:00                     7.700              1.22350   
2 2019-06-03 00:00:00                     7.875              1.21975   
3 2019-06-03 01:00:00                     8.425              1.21800   
4 2019-06-03 02:00:00                     8.950              1.21800   

   ceiling_height_agl:m  clear_sky_energy_1h:J  clear_sky_rad:W  \
0           1728.949951               0.000000             0.00   
1           1689.824951               0.000000             0.00   
2           1563.224976               0.000000             0.00   
3           1283.425049             208.649994             0.75   
4           1003.500000           32468.150391            23.10   

   cloud_base_agl:m  dew_or_rime:idx  dew_point_2m:K  diffuse_rad:W  ...  \
0       1728.949951              0.0      280.299988          0.000  ... 