In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Load the raw dataset, index it by timestamp and put it in chronological order.
# `sort_index` returns a new frame, so its result has to be assigned; without
# the assignment the rows silently keep their original order. Later cells
# slice by date and build lag features with `shift()`, both of which depend
# on the rows being sorted by time.
df = (
    pd.read_excel('/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Data REAL/Complete Data w 2024.xlsx')
    .set_index("DateTime")
    .sort_index(ascending=True)
)
df

In [None]:
import pandas as pd

# Remove the per-area TempNo* / precipNo* columns from the working frame.
# NOTE(review): a later cell averages these same columns into 'Temp NO' and
# 'Precip NO'; once they are dropped here they are no longer available to
# that cell -- confirm which of the two steps is intended.
area_weather_columns = [
    'TempNo1', 'TempNo2', 'TempNo3', 'TempNo4', 'TempNo5',
    'precipNo1', 'precipNo2', 'precipNo3', 'precipNo4', 'precipNo5',
]
df = df.drop(columns=area_weather_columns)

df

In [None]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
import pandas as pd

# Fill the gaps in 'Total_Volume' by linear interpolation between
# neighbouring rows.
total_volume = df['Total_Volume']
df['Total_Volume'] = total_volume.interpolate(method='linear')


In [None]:
# Re-count the missing values per column after 'Total_Volume' was interpolated.
remaining_missing = df.isna().sum()
remaining_missing

# Feature engineering 

### Calendar features (public holidays, extended vacations like Christmas, weekend/weekday)

In [None]:
# Weekend indicator: 1 when the timestamp falls on Saturday (dayofweek 5)
# or Sunday (dayofweek 6), 0 on every other day.
is_weekend = df.index.dayofweek.isin([5, 6])
df['Weekend'] = is_weekend.astype(int)

In [None]:
# Christmas vacation indicator: 1 from 24 December 00:00:00 through
# 31 December 23:59:59 of every year present in the index, 0 otherwise.
df['Christmas vacation'] = 0

for yr in df.index.year.unique():
    vacation_start = pd.Timestamp(year=yr, month=12, day=24)
    # Last second of the same calendar year.
    vacation_end = pd.Timestamp(year=yr, month=12, day=31, hour=23, minute=59, second=59)

    # Label-based slice on the datetime index; both bounds are inclusive.
    df.loc[vacation_start:vacation_end, 'Christmas vacation'] = 1

In [None]:
# Create dummy for public holiday
from datetime import date
import holidays

# Take the years from the data itself (as the Christmas-vacation cell does)
# so the dummy stays complete if the dataset is extended.
years = sorted(int(y) for y in df.index.year.dropna().unique())

# Collect every public-holiday date of Norway, Sweden, Denmark and Finland.
# A set removes holidays shared between countries.
# NOTE(review): recent `holidays` releases prefer `country_holidays` over
# `CountryHoliday` -- consider switching once the installed version is confirmed.
holiday_dates = set()
for year in years:
    for country in ['NO', 'SE', 'DK', 'FI']:
        # The loop variable is deliberately not called `date`, which would
        # shadow the `datetime.date` import above.
        for holiday_date, _holiday_name in sorted(holidays.CountryHoliday(country, years=[year]).items()):
            holiday_dates.add(holiday_date)

holiday_list = sorted(holiday_dates)

# Initialize the public holiday dummy column
df['Public holiday'] = 0

# Convert the `datetime.date` objects to datetimes explicitly before the
# membership test, so the match does not rely on `isin` casting them.
holiday_index = pd.to_datetime(holiday_list)

# Mark every row whose calendar day is a public holiday.
df.loc[df.index.normalize().isin(holiday_index), 'Public holiday'] = 1

In [None]:
import pandas as pd


def is_winter_time(date):
    """Return 1 if ``date`` lies inside the winter period, otherwise 0.

    The winter period runs from 29 October 00:00 to 26 March 00:00 of the
    following year; both bounds are inclusive.

    Parameters
    ----------
    date : pandas.Timestamp
        Timestamp to classify. It must expose ``year`` and ``month`` and be
        comparable with ``pandas.Timestamp``.

    Returns
    -------
    int
        1 when ``date`` is within the winter period, 0 otherwise.

    Notes
    -----
    NOTE(review): the 29 Oct / 26 Mar bounds are the same for every year. If
    the flag is meant to follow the daylight-saving switch, those dates move
    from year to year -- confirm the intended definition.
    """
    year = date.year

    if date.month < 4:
        # January-March belong to the winter that began the previous October
        # and ends in March of the *current* year. Leaving the end bound in
        # the next year would flag 27-31 March as winter.
        winter_start = pd.Timestamp(year=year - 1, month=10, day=29)
        winter_end = pd.Timestamp(year=year, month=3, day=26)
    else:
        # April-December: the only winter that can contain the date starts
        # this October and ends next March.
        winter_start = pd.Timestamp(year=year, month=10, day=29)
        winter_end = pd.Timestamp(year=year + 1, month=3, day=26)

    return int(winter_start <= date <= winter_end)

# Evaluate the winter-period flag for every timestamp in the index and store
# the result as a new column.
winter_flags = df.index.map(is_winter_time)
df['Winter Time'] = winter_flags



### Summing production/consumption and averaging temperature and precipitation


In [None]:
# Collapse the per-area weather columns into one row-wise mean each
# ('Temp NO' for temperature, 'Precip NO' for precipitation) and remove the
# per-area columns afterwards.
# An earlier cell may already have dropped these inputs. In that case the
# average cannot be computed, so the group is skipped with a message instead
# of aborting the notebook with a KeyError.
weather_groups = {
    'Temp NO': ['TempNo1', 'TempNo2', 'TempNo3', 'TempNo4', 'TempNo5'],
    'Precip NO': ['precipNo1', 'precipNo2', 'precipNo3', 'precipNo4', 'precipNo5'],
}

for target_column, area_columns in weather_groups.items():
    missing_columns = [col for col in area_columns if col not in df.columns]
    if missing_columns:
        print(f"Skipping '{target_column}': missing input columns {missing_columns}")
        continue

    # Row-wise mean across the five areas (NaNs are skipped by `mean`).
    df[target_column] = df[area_columns].mean(axis=1)
    df.drop(columns=area_columns, inplace=True)


In [None]:

# Add up the four country-level series (NO, SE, DK, FI) into one row-wise
# total each for consumption and production, then discard the country-level
# columns.
consumption_columns = ['Total Consumption NO', 'Total Consumption SE',
                       'Total Consumption DK', 'Total Consumption FI']
production_columns = ['Total Production NO', 'Total Production SE',
                      'Total Production DK', 'Total Production FI']

df['Total Consumption'] = df[consumption_columns].sum(axis=1)
df['Total Production'] = df[production_columns].sum(axis=1)

df.drop(columns=consumption_columns + production_columns, inplace=True)


In [None]:

# Remove the day-ahead consumption and production prognosis columns of all
# four countries from the working frame.
prognosis_columns = [
    'Day-ahead Consumption Prognosis NO',
    'Day-ahead Production Prognosis NO',
    'Day-ahead Consumption Prognosis SE',
    'Day-ahead Production Prognosis SE',
    'Day-ahead Consumption Prognosis DK',
    'Day-ahead Production Prognosis DK',
    'Day-ahead Consumption Prognosis FI',
    'Day-ahead Production Prognosis FI',
]
df.drop(columns=prognosis_columns, inplace=True)


# Determine optimal lagged features

In [None]:
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf


# Global matplotlib defaults for readability: larger figures, larger font
# and thicker lines. These settings persist for all later plots.
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['font.size'] = 12
plt.rcParams['lines.linewidth'] = 2

# Autocorrelation of the system price at two horizons: 50 lags and 170 lags.
for n_lags in (50, 170):
    # Give plot_acf an explicit Axes. When none is passed it opens its own
    # figure, so a preceding plt.figure() is left behind as an empty plot.
    fig, ax = plt.subplots()
    plot_acf(df['System Price'], lags=n_lags, alpha=0.05, ax=ax)
    ax.set_title(f'Autocorrelation Function (ACF) for System Price - {n_lags} Lags', fontsize=14)
    ax.set_xlabel('Lags', fontsize=12)
    ax.set_ylabel('Autocorrelation', fontsize=12)
    ax.grid(True)  # Grid for easier reading of lag positions
    plt.show()


### Correlation analysis 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# Pearson correlation between the system price and each of its own lags
# (1..168 rows back). The lagged series is built on the fly, so no temporary
# columns are added to `df` and nothing has to be cleaned up afterwards.
price = df['System Price']
correlations = {
    lag: price.corr(price.shift(periods=lag))
    for lag in range(1, 169)
}

correlations_df = pd.DataFrame(list(correlations.items()), columns=['Lag', 'Correlation'])

# Rank the lags by strength of correlation irrespective of sign.
correlations_df['Abs_Correlation'] = correlations_df['Correlation'].abs()
sorted_correlations = correlations_df.sort_values(by='Abs_Correlation', ascending=False)

# Keep the headline and the number of printed rows in sync.
top_n = 20
print(f"Top {top_n} lags based on absolute correlation:")
print(sorted_correlations.head(top_n))

# Bar chart of the 50 strongest lags.
sorted_correlations.head(50).plot(x='Lag', y='Abs_Correlation', kind='bar', figsize=(16, 8))
plt.title('Top Lag Correlations with System Price')
plt.xlabel('Lag (hours)')
plt.ylabel('Absolute Correlation')
plt.tight_layout()
plt.show()


### Mutual information function 

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mutual_info_score
import matplotlib.pyplot as plt


def calculate_mutual_information(series, max_lag, bins=32):
    """Estimate the mutual information between a series and its own lags.

    ``mutual_info_score`` treats its inputs as discrete labels. Fed with raw
    continuous values, almost every observation becomes its own label and the
    estimate is dominated by that artefact. The series is therefore
    discretised into equal-frequency bins first.

    Parameters
    ----------
    series : pandas.Series
        Observations in time order.
    max_lag : int
        Largest lag (in rows) to evaluate; lags ``1..max_lag`` are computed.
    bins : int or None, default 32
        Number of quantile bins used to discretise ``series``. Pass ``None``
        to score the raw values as labels, as earlier versions of this
        function did.

    Returns
    -------
    list of float
        Mutual information per lag; element ``k`` belongs to lag ``k + 1``.
    """
    if bins is None:
        labels = series
    else:
        # Equal-frequency bin codes; duplicate bin edges (many identical
        # values) are merged rather than raising.
        labels = pd.qcut(series, q=bins, labels=False, duplicates='drop')

    mutual_info = []
    for lag in range(1, max_lag + 1):
        # Pair every observation with the one `lag` rows earlier and drop
        # the rows that have no partner (NaNs introduced by the shift).
        pairs = pd.DataFrame({'original': labels, 'lagged': labels.shift(lag)}).dropna()
        if bins is not None:
            # Shifting turns the integer bin codes into floats; restore them.
            pairs = pairs.astype(int)
        mutual_info.append(mutual_info_score(pairs['original'], pairs['lagged']))
    return mutual_info

max_lag = 40

# NOTE(review): dropping NaNs before lagging closes any gaps in the series,
# so a shift of k rows spans more than k time steps across a gap -- confirm
# 'System Price' has no interior missing values.
series = df['System Price'].dropna()
mi_values = calculate_mutual_information(series, max_lag)

plt.plot(range(1, max_lag + 1), mi_values, marker='o')
plt.title('Mutual Information of System Price Across Different Lags')
plt.xlabel('Lag')
plt.ylabel('Mutual Information')
plt.grid(True)
plt.show()

# Find the first local minimum of the mutual-information curve.
# mi_values[0] belongs to lag 1, so list position i corresponds to lag i + 1.
optimal_lag = None
for i in range(1, len(mi_values) - 1):
    if mi_values[i] < mi_values[i-1] and mi_values[i] <= mi_values[i+1]:
        optimal_lag = i + 1
        print(f"First local minimum occurs at lag {optimal_lag}")
        break
else:
    # Leave optimal_lag as None so later code can detect the missing result.
    print("No local minimum found within the examined lags")




### Suggested lags 

In [None]:
# Row offsets for which a lagged copy of the system price is added.
lags_to_create = [1, 2, 3, 24, 48, 168]

# Start from a clean slate: remove lag columns left over from an earlier run
# of this cell so that re-running it does not accumulate stale features.
stale_lag_columns = [name for name in df.columns if 'System Price Lag' in name]
df.drop(columns=stale_lag_columns, inplace=True, errors='ignore')

price = df['System Price']
for offset in lags_to_create:
    df[f'System Price Lag {offset}'] = price.shift(offset)

print(f"Number of lagged columns added: {len(lags_to_create)}")
print(f"Total number of columns in the DataFrame: {len(df.columns)}")

# Remove every row that has a missing value in any column; this includes the
# leading rows, for which the lagged prices do not exist.
df.dropna(inplace=True)

In [None]:
# Write the engineered dataset to an Excel workbook.
output_path = '/Users/evenbakke/Documents/Master Thesis/MasterThesis-/Data 2.0/Final Data with 2024.xlsx'
df.to_excel(output_path)