In [97]:
#include autoreload modules for fast testing and update
%load_ext autoreload
%autoreload 2

#import relevant modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

#import modules
from src.data.helper_closed_transactions import read_epex_file, filter_lead_time, extract_transactions
from src.data.welfare_complete import read_pw_file, pw_preparation, read_weekly_prices_file

#set maximum columns display pandas
pd.options.display.max_columns = 50

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Read new version of the excel transactions

In [2]:
tic=time.time()
path='../data/external/prepared_EPEX_2019_Nov_Dec/ID_GDM_2019_11_01_11.csv'
cols = list(pd.read_csv(path, nrows=1))
df=pd.read_csv(path,
              usecols=[i for i in cols if 'Unnamed' not in i])
toc=time.time()

In [3]:
def isTimeFormat(input):
    try:
        time.strptime(input, '%m/%d/%y %H:%M')
        return True
    except ValueError:
        return False

In [4]:
#format time presenting hour as well
df['problem_from_time']=df['FROM_TIME'].apply(lambda x: isTimeFormat(x) )
df['problem_to_time']=df['TO_TIME'].apply(lambda x: isTimeFormat(x))
df['problem_timestamp']=df['TIMESTAMP'].apply(lambda x: isTimeFormat(x))
df['FROM_TIME']=df['FROM_TIME'].apply(lambda x: x if isTimeFormat(x) else str(x)+' 0:0')
df['TO_TIME']=df['TO_TIME'].apply(lambda x: x if isTimeFormat(x) else str(x)+' 0:0')
df['TIMESTAMP']=df['TIMESTAMP'].apply(lambda x: x if isTimeFormat(x) else str(x)+' 0:0')

In [5]:
df['FROM_TIME']=pd.to_datetime(df['FROM_TIME'],format='%m/%d/%y %H:%M')
df['TO_TIME']=pd.to_datetime(df['TO_TIME'],format='%m/%d/%y %H:%M')
df['TIMESTAMP']=pd.to_datetime(df['TIMESTAMP'],format='%m/%d/%y %H:%M')

Check when the problems occur. There are now rows which present both problems

In [6]:
df[(df['problem_from_time']==False)&(df['problem_to_time']==False)]

Unnamed: 0,DLVRYDATE,MARKET_AREA_BUY,MARKET_AREA_SELL,FROM_TIME,TO_TIME,VOLUME,PRICE,TIMESTAMP,TRADE_ID,DURATION,PROD,problem_from_time,problem_to_time,problem_timestamp


In [7]:
df[df['problem_timestamp']==False]

Unnamed: 0,DLVRYDATE,MARKET_AREA_BUY,MARKET_AREA_SELL,FROM_TIME,TO_TIME,VOLUME,PRICE,TIMESTAMP,TRADE_ID,DURATION,PROD,problem_from_time,problem_to_time,problem_timestamp
75574,11/8/19,DE,DE,2019-11-08 10:00:00,2019-11-08 11:00:00,1.6,58.2,2019-11-08,1081700011,1.0,Intraday_Hour_Power,True,True,False
75575,11/8/19,DE,DE,2019-11-08 10:00:00,2019-11-08 11:00:00,7.0,58.21,2019-11-08,1081700013,1.0,Intraday_Hour_Power,True,True,False
75576,11/8/19,DE,DE,2019-11-08 10:00:00,2019-11-08 11:00:00,11.6,58.0,2019-11-08,1081700014,1.0,Intraday_Hour_Power,True,True,False
75577,11/8/19,DE,DE,2019-11-08 10:00:00,2019-11-08 11:00:00,6.0,58.0,2019-11-08,1081700015,1.0,Intraday_Hour_Power,True,True,False
75578,11/8/19,DE,DE,2019-11-08 10:00:00,2019-11-08 11:00:00,1.0,58.0,2019-11-08,1081700016,1.0,Intraday_Hour_Power,True,True,False
75579,11/8/19,DE,DE,2019-11-08 10:00:00,2019-11-08 11:00:00,3.3,57.8,2019-11-08,1081700017,1.0,Intraday_Hour_Power,True,True,False
75580,11/8/19,DE,DE,2019-11-08 10:00:00,2019-11-08 11:00:00,33.0,58.0,2019-11-08,1081700019,1.0,Intraday_Hour_Power,True,True,False
75581,11/8/19,DE,DE,2019-11-08 10:00:00,2019-11-08 11:00:00,6.6,58.9,2019-11-08,1081700021,1.0,Intraday_Hour_Power,True,True,False
186629,11/5/19,DE,DE,2019-11-05 00:15:00,2019-11-05 00:30:00,0.1,28.0,2019-11-05,1081367179,0.25,Intraday_Quarter_Hour_Power,True,True,False
186630,11/5/19,DE,DE,2019-11-05 00:15:00,2019-11-05 00:30:00,0.1,30.0,2019-11-05,1081367180,0.25,Intraday_Quarter_Hour_Power,True,True,False


Drop rows with date problem being timestamp

In [8]:
df=df[df['problem_timestamp']]

In [9]:
def correct_to_time(problem, to_time, from_time, duration):
    
    if not problem:
        return from_time + pd.Timedelta(duration, unit='hours')
    else:
        return to_time

def correct_from_time(problem, to_time, from_time, duration):
    
    if not problem:
        return to_time - pd.Timedelta(duration, unit='hours')
    else:
        return from_time
    


In [10]:
#modify dates mistakes
df['FROM_TIME']=df.apply(lambda x: correct_from_time(x['problem_from_time'],x['TO_TIME'],x['FROM_TIME'], x['DURATION']), axis=1)
df['TO_TIME']=df.apply(lambda x: correct_to_time(x['problem_to_time'],x['TO_TIME'],x['FROM_TIME'], x['DURATION']), axis=1)

In [102]:
df.head()

Unnamed: 0,DLVRYDATE,MARKET_AREA_BUY,MARKET_AREA_SELL,FROM_TIME,TO_TIME,VOLUME,PRICE,TIMESTAMP,TRADE_ID,DURATION,PROD,problem_from_time,problem_to_time,problem_timestamp
0,11/2/19,DE,DE,2019-11-02 06:30:00,2019-11-02 07:00:00,1.0,15.3,2019-11-02 05:35:00,1081040362,0.5,Intraday_Half_Hour_Power,True,True,True
1,11/2/19,DE,DE,2019-11-02 06:30:00,2019-11-02 07:00:00,1.0,13.02,2019-11-02 05:35:00,1081040372,0.5,Intraday_Half_Hour_Power,True,True,True
2,11/2/19,DE,DE,2019-11-02 09:30:00,2019-11-02 10:00:00,1.5,30.14,2019-11-02 08:38:00,1081055751,0.5,Intraday_Half_Hour_Power,True,True,True
3,11/2/19,DE,DE,2019-11-02 09:30:00,2019-11-02 10:00:00,0.4,31.49,2019-11-02 08:39:00,1081055853,0.5,Intraday_Half_Hour_Power,True,True,True
4,11/2/19,DE,DE,2019-11-02 09:30:00,2019-11-02 10:00:00,1.5,31.75,2019-11-02 08:39:00,1081055864,0.5,Intraday_Half_Hour_Power,True,True,True


Check the correction has happened

In [12]:
df[df['problem_to_time']==False]

Unnamed: 0,DLVRYDATE,MARKET_AREA_BUY,MARKET_AREA_SELL,FROM_TIME,TO_TIME,VOLUME,PRICE,TIMESTAMP,TRADE_ID,DURATION,PROD,problem_from_time,problem_to_time,problem_timestamp
9456,11/1/19,DE,DE,2019-11-01 20:00:00,2019-11-01 21:00:00,1.0,38.07,2019-11-01 19:07:00,1080995314,1.00,Intraday_Hour_Power,True,False,True
9457,11/1/19,DE,DE,2019-11-01 20:00:00,2019-11-01 21:00:00,0.7,35.11,2019-11-01 19:07:00,1080995332,1.00,Intraday_Hour_Power,True,False,True
9458,11/1/19,DE,DE,2019-11-01 20:00:00,2019-11-01 21:00:00,0.7,36.98,2019-11-01 19:07:00,1080995363,1.00,Intraday_Hour_Power,True,False,True
9459,11/1/19,DE,DE,2019-11-01 20:00:00,2019-11-01 21:00:00,0.3,36.99,2019-11-01 19:07:00,1080995364,1.00,Intraday_Hour_Power,True,False,True
9460,11/1/19,DE,DE,2019-11-01 20:00:00,2019-11-01 21:00:00,1.0,36.03,2019-11-01 19:08:00,1080995394,1.00,Intraday_Hour_Power,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
830704,11/10/19,DE,DE,2019-11-10 23:00:00,2019-11-10 23:15:00,4.0,50.19,2019-11-10 21:50:00,1081964782,0.25,XBID_Quarter_Hour_Power,True,False,True
830705,11/10/19,DE,DE,2019-11-10 23:30:00,2019-11-10 23:45:00,4.0,37.54,2019-11-10 21:50:00,1081964783,0.25,XBID_Quarter_Hour_Power,True,False,True
830706,11/10/19,DE,DE,2019-11-10 23:00:00,2019-11-10 23:15:00,3.4,50.19,2019-11-10 21:50:00,1081964784,0.25,XBID_Quarter_Hour_Power,True,False,True
830707,11/10/19,DE,DE,2019-11-10 23:00:00,2019-11-10 23:15:00,0.6,50.20,2019-11-10 21:50:00,1081964785,0.25,XBID_Quarter_Hour_Power,True,False,True


Check lead time

In [13]:
df['lead_time']=df['FROM_TIME']-df['TIMESTAMP']

In [23]:
lead_time_anal=df['lead_time'].dt.total_seconds()/60
lead_time_anal.max()

1903.0

The transactions have to be filtered

In [24]:
# define logical statement to filter rows based on lead time
logical_statement_lead_time = (df['lead_time'].dt.total_seconds(
)/60 >= 30) & (df['lead_time'].dt.total_seconds()/60 <= 60)

# filter the dataframe based on logical statement
df_filtered = df[logical_statement_lead_time]

In [25]:
lead_time_anal=df_filtered['lead_time'].dt.total_seconds()/60

In [26]:
lead_time_anal.min()

30.0

In [27]:
lead_time_anal.max()

60.0

Now change columns' names to make them match with the normal welfare matching

In [33]:
df_filtered['DURATION'].value_counts()

0.25    110769
1.00     81820
0.50       128
Name: DURATION, dtype: int64

In [30]:
def map_duration(duration):
    
    d={0.25:'Quarter Hour', 0.5:'Half Hour', 1:'Hour'}
    
    return d[duration]

In [34]:
df_filtered['End Validity Date']=df_filtered['TIMESTAMP']
df_filtered['Execution Price']=df_filtered['PRICE']
df_filtered['Executed Volume']=df_filtered['VOLUME']
df_filtered['Delivery Start']=df_filtered['FROM_TIME']
df_filtered['Instrument Type']=df_filtered['DURATION'].apply(lambda x: map_duration(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['End Validity Date']=df_filtered['TIMESTAMP']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Execution Price']=df_filtered['PRICE']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Executed Volume']=df_filtered['VOLUME']
A value is trying to be set on a copy of a

In [35]:
df_filtered.head()

Unnamed: 0,DLVRYDATE,MARKET_AREA_BUY,MARKET_AREA_SELL,FROM_TIME,TO_TIME,VOLUME,PRICE,TIMESTAMP,TRADE_ID,DURATION,PROD,problem_from_time,problem_to_time,problem_timestamp,lead_time,End Validity Date,Execution Price,Executed Volume,Delivery Start,Instrument Type
0,11/2/19,DE,DE,2019-11-02 06:30:00,2019-11-02 07:00:00,1.0,15.3,2019-11-02 05:35:00,1081040362,0.5,Intraday_Half_Hour_Power,True,True,True,0 days 00:55:00,2019-11-02 05:35:00,15.3,1.0,2019-11-02 06:30:00,Half Hour
1,11/2/19,DE,DE,2019-11-02 06:30:00,2019-11-02 07:00:00,1.0,13.02,2019-11-02 05:35:00,1081040372,0.5,Intraday_Half_Hour_Power,True,True,True,0 days 00:55:00,2019-11-02 05:35:00,13.02,1.0,2019-11-02 06:30:00,Half Hour
2,11/2/19,DE,DE,2019-11-02 09:30:00,2019-11-02 10:00:00,1.5,30.14,2019-11-02 08:38:00,1081055751,0.5,Intraday_Half_Hour_Power,True,True,True,0 days 00:52:00,2019-11-02 08:38:00,30.14,1.5,2019-11-02 09:30:00,Half Hour
3,11/2/19,DE,DE,2019-11-02 09:30:00,2019-11-02 10:00:00,0.4,31.49,2019-11-02 08:39:00,1081055853,0.5,Intraday_Half_Hour_Power,True,True,True,0 days 00:51:00,2019-11-02 08:39:00,31.49,0.4,2019-11-02 09:30:00,Half Hour
4,11/2/19,DE,DE,2019-11-02 09:30:00,2019-11-02 10:00:00,1.5,31.75,2019-11-02 08:39:00,1081055864,0.5,Intraday_Half_Hour_Power,True,True,True,0 days 00:51:00,2019-11-02 08:39:00,31.75,1.5,2019-11-02 09:30:00,Half Hour


Split the document in different days and apply to the the second part of the pipeline

In [38]:
df_filtered['FROM_TIME'].dt.day.min()

1

In [39]:
df_filtered['FROM_TIME'].dt.day.max()

10

In [67]:
def split_daily_transactions(df):
    
    min_date_day=df['FROM_TIME'].dt.day.min()
    max_date_day=df['FROM_TIME'].dt.day.max()
    month=str(df['FROM_TIME'].dt.month.iloc[0])
    
    df_list=[]
    file_names=[]
    for day in range(min_date_day, max_date_day+1):
        
        df_day=df[df['FROM_TIME'].dt.day==day]
        df_list.append(df_day)
        
        day_string=str(day) if len(str(day))==2 else '0'+str(day)
        name='2019'+month+day_string
        file_names.append(name)
    
    return df_list, file_names

In [68]:
df_list, file_names=split_daily_transactions(df_filtered)

In [69]:
file_names

['20191101',
 '20191102',
 '20191103',
 '20191104',
 '20191105',
 '20191106',
 '20191107',
 '20191108',
 '20191109',
 '20191110']

Put together the preparation function

In [47]:
def prepare_new_transactions(path):
    
    cols = list(pd.read_csv(path, nrows=1))
    df=pd.read_csv(path,
                  usecols=[i for i in cols if 'Unnamed' not in i])
    
    #format time presenting hour as well
    df['problem_from_time']=df['FROM_TIME'].apply(lambda x: isTimeFormat(x) )
    df['problem_to_time']=df['TO_TIME'].apply(lambda x: isTimeFormat(x))
    df['problem_timestamp']=df['TIMESTAMP'].apply(lambda x: isTimeFormat(x))
    df['FROM_TIME']=df['FROM_TIME'].apply(lambda x: x if isTimeFormat(x) else str(x)+' 0:0')
    df['TO_TIME']=df['TO_TIME'].apply(lambda x: x if isTimeFormat(x) else str(x)+' 0:0')
    df['TIMESTAMP']=df['TIMESTAMP'].apply(lambda x: x if isTimeFormat(x) else str(x)+' 0:0')
    
    #convert to datetime
    df['FROM_TIME']=pd.to_datetime(df['FROM_TIME'],utc=True, format='%m/%d/%y %H:%M')
    df['TO_TIME']=pd.to_datetime(df['TO_TIME'],utc=True, format='%m/%d/%y %H:%M')
    df['TIMESTAMP']=pd.to_datetime(df['TIMESTAMP'], utc=True, format='%m/%d/%y %H:%M')
    
    #remove unusable columns
    df=df[df['problem_timestamp']]
    
    #modify dates mistakes
    df['FROM_TIME']=df.apply(lambda x: correct_from_time(x['problem_from_time'],x['TO_TIME'],x['FROM_TIME'], x['DURATION']), axis=1)
    df['TO_TIME']=df.apply(lambda x: correct_to_time(x['problem_to_time'],x['TO_TIME'],x['FROM_TIME'], x['DURATION']), axis=1)
    
    #define lead time
    df['lead_time']=df['FROM_TIME']-df['TIMESTAMP']
    
    # define logical statement to filter rows based on lead time
    logical_statement_lead_time = (df['lead_time'].dt.total_seconds(
    )/60 >= 30) & (df['lead_time'].dt.total_seconds()/60 <= 60)

    # filter the dataframe based on logical statement
    df_filtered = df[logical_statement_lead_time]
    
    #add columns for pipeline compatibility
    df_filtered['End Validity Date']=df_filtered['TIMESTAMP']
    df_filtered['Execution Price']=df_filtered['PRICE']
    df_filtered['Executed Volume']=df_filtered['VOLUME']
    df_filtered['Delivery Start']=df_filtered['FROM_TIME']
    df_filtered['Instrument Type']=df_filtered['DURATION'].apply(lambda x: map_duration(x))
    
    df_list, file_names=split_daily_transactions(df_filtered)
    
    return df_list
    

In [49]:
tic=time.time()
df_list_f=prepare_new_transactions('../data/external/prepared_EPEX_2019_Nov_Dec/ID_GDM_20191101.csv')
toc=time.time()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['End Validity Date']=df_filtered['TIMESTAMP']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Execution Price']=df_filtered['PRICE']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Executed Volume']=df_filtered['VOLUME']
A value is trying to be set on a copy of a

In [51]:
toc-tic

127.00357103347778

In [55]:
df_list_f[0]

Unnamed: 0,DLVRYDATE,MARKET_AREA_BUY,MARKET_AREA_SELL,FROM_TIME,TO_TIME,VOLUME,PRICE,TIMESTAMP,TRADE_ID,DURATION,PROD,problem_from_time,problem_to_time,problem_timestamp,lead_time,End Validity Date,Execution Price,Executed Volume,Delivery Start,Instrument Type
199,11/1/19,DE,DE,2019-11-01 00:00:00+00:00,2019-11-01 01:00:00+00:00,3.1,31.00,2019-10-31 23:00:00+00:00,1080898285,1.00,Intraday_Hour_Power,True,True,True,0 days 01:00:00,2019-10-31 23:00:00+00:00,31.00,3.1,2019-11-01 00:00:00+00:00,Hour
200,11/1/19,DE,DE,2019-11-01 00:00:00+00:00,2019-11-01 01:00:00+00:00,0.4,30.01,2019-10-31 23:02:00+00:00,1080898397,1.00,Intraday_Hour_Power,True,True,True,0 days 00:58:00,2019-10-31 23:02:00+00:00,30.01,0.4,2019-11-01 00:00:00+00:00,Hour
201,11/1/19,DE,DE,2019-11-01 00:00:00+00:00,2019-11-01 01:00:00+00:00,0.7,30.02,2019-10-31 23:02:00+00:00,1080898396,1.00,Intraday_Hour_Power,True,True,True,0 days 00:58:00,2019-10-31 23:02:00+00:00,30.02,0.7,2019-11-01 00:00:00+00:00,Hour
202,11/1/19,DE,DE,2019-11-01 00:00:00+00:00,2019-11-01 01:00:00+00:00,0.2,31.17,2019-10-31 23:04:00+00:00,1080898472,1.00,Intraday_Hour_Power,True,True,True,0 days 00:56:00,2019-10-31 23:04:00+00:00,31.17,0.2,2019-11-01 00:00:00+00:00,Hour
203,11/1/19,DE,DE,2019-11-01 00:00:00+00:00,2019-11-01 01:00:00+00:00,0.4,31.23,2019-10-31 23:04:00+00:00,1080898473,1.00,Intraday_Hour_Power,True,True,True,0 days 00:56:00,2019-10-31 23:04:00+00:00,31.23,0.4,2019-11-01 00:00:00+00:00,Hour
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122758,11/1/19,DE,DE,2019-11-01 23:45:00+00:00,2019-11-02 00:00:00+00:00,9.8,28.60,2019-11-01 23:14:00+00:00,1081014573,0.25,Intraday_Quarter_Hour_Power,True,True,True,0 days 00:31:00,2019-11-01 23:14:00+00:00,28.60,9.8,2019-11-01 23:45:00+00:00,Quarter Hour
122759,11/1/19,DE,DE,2019-11-01 23:45:00+00:00,2019-11-02 00:00:00+00:00,1.0,26.90,2019-11-01 23:14:00+00:00,1081014572,0.25,Intraday_Quarter_Hour_Power,True,True,True,0 days 00:31:00,2019-11-01 23:14:00+00:00,26.90,1.0,2019-11-01 23:45:00+00:00,Quarter Hour
122760,11/1/19,DE,DE,2019-11-01 23:45:00+00:00,2019-11-02 00:00:00+00:00,1.0,26.90,2019-11-01 23:15:00+00:00,1081014666,0.25,Intraday_Quarter_Hour_Power,True,True,True,0 days 00:30:00,2019-11-01 23:15:00+00:00,26.90,1.0,2019-11-01 23:45:00+00:00,Quarter Hour
305422,11/1/19,DE,DE,2019-11-01 14:00:00+00:00,2019-11-01 15:00:00+00:00,1.0,34.20,2019-11-01 13:16:00+00:00,1080965900,1.00,XBID_Hour_Power,True,True,True,0 days 00:44:00,2019-11-01 13:16:00+00:00,34.20,1.0,2019-11-01 14:00:00+00:00,Hour


In [98]:
wp = read_weekly_prices_file(
        "../data/external/Hydro Generation and Price_CH_2019.csv")

In [99]:
wp.head()

Unnamed: 0,End Date,Unnamed: 1,Summe von Generation [MWh]3,Summe von Revenue [Û],Max. von Generation [MWh],Min. von Generation [MWh]2,Average Weekly Price [Euro/MWh],Max Weekly Pumping Price [Euro/MWh],start_date
0,2019-01-06 00:00:00+00:00,1,166734,10740712.41,4707,302,64.418249,51.5346,2018-12-30 00:00:00+00:00
1,2019-01-13 00:00:00+00:00,2,414486,28105388.19,6826,391,67.807811,54.246248,2019-01-06 00:00:00+00:00
2,2019-01-20 00:00:00+00:00,3,376949,24978865.93,5758,448,66.265903,53.012723,2019-01-13 00:00:00+00:00
3,2019-01-27 00:00:00+00:00,4,575958,44816423.67,7158,416,77.811965,62.249572,2019-01-20 00:00:00+00:00
4,2019-02-03 00:00:00+00:00,5,415389,26862029.56,6484,538,64.667166,51.733733,2019-01-27 00:00:00+00:00


In [100]:
wp.tail()

Unnamed: 0,End Date,Unnamed: 1,Summe von Generation [MWh]3,Summe von Revenue [Û],Max. von Generation [MWh],Min. von Generation [MWh]2,Average Weekly Price [Euro/MWh],Max Weekly Pumping Price [Euro/MWh],start_date
48,2019-12-08 00:00:00+00:00,49,432270,24068432.79,6345,409,55.679165,44.543332,2019-12-01 00:00:00+00:00
49,2019-12-15 00:00:00+00:00,50,410218,20140890.04,6595,312,49.098016,39.278413,2019-12-08 00:00:00+00:00
50,2019-12-22 00:00:00+00:00,51,286045,12831597.03,6309,259,44.858666,35.886933,2019-12-15 00:00:00+00:00
51,2019-12-29 00:00:00+00:00,52,194045,6281356.04,4637,616,32.370615,25.896492,2019-12-22 00:00:00+00:00
52,2020-01-01 00:00:00+00:00,53,99512,3703083.54,4714,587,37.212432,29.769946,2019-12-29 00:00:00+00:00


In [89]:
power_lim = read_pw_file(
    "../data/external/Hydro Generation up- downscale Potential_CH_2019.csv")
power_lim = pw_preparation(power_lim)

In [90]:
power_lim.head()

Unnamed: 0,VALUE_TIME,start_time,end_time,Selling Actual value update [MW],Pumping Actual value update [MW],Upscale Potential [MW],Donwnscale Potential [MW]
0,2019-01-01 00:00:00+00:00,2019-01-01 00:00:00+00:00,2019-01-01 00:15:00+00:00,4145,260,4145,260
1,2019-01-01 00:15:00+00:00,2019-01-01 00:15:00+00:00,2019-01-01 00:30:00+00:00,4145,260,4145,260
2,2019-01-01 00:30:00+00:00,2019-01-01 00:30:00+00:00,2019-01-01 00:45:00+00:00,4145,260,4145,260
3,2019-01-01 00:45:00+00:00,2019-01-01 00:45:00+00:00,2019-01-01 01:00:00+00:00,4145,260,4145,260
4,2019-01-01 01:00:00+00:00,2019-01-01 01:00:00+00:00,2019-01-01 01:15:00+00:00,4238,167,4238,167


In [74]:
power_lim.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35041 entries, 0 to 35040
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype              
---  ------                            --------------  -----              
 0   VALUE_TIME                        35041 non-null  datetime64[ns, UTC]
 1   start_time                        35041 non-null  datetime64[ns, UTC]
 2   end_time                          35041 non-null  datetime64[ns, UTC]
 3   Selling Actual value update [MW]  35041 non-null  object             
 4   Pumping Actual value update [MW]  35041 non-null  object             
 5   Upscale Potential [MW]            35041 non-null  object             
 6   Donwnscale Potential [MW]         35041 non-null  object             
dtypes: datetime64[ns, UTC](3), object(4)
memory usage: 1.9+ MB


In [75]:
power_lim_selection = power_lim[(power_lim['start_time'].dt.day == 1) & (
            power_lim['start_time'].dt.month == 3)]

In [78]:
power_lim_selection.head()

Unnamed: 0,VALUE_TIME,start_time,end_time,Selling Actual value update [MW],Pumping Actual value update [MW],Upscale Potential [MW],Donwnscale Potential [MW]
5664,2019-03-01 00:00:00+00:00,2019-03-01 00:00:00+00:00,2019-03-01 00:15:00+00:00,5167,36,5167,36
5665,2019-03-01 00:15:00+00:00,2019-03-01 00:15:00+00:00,2019-03-01 00:30:00+00:00,5167,36,5167,36
5666,2019-03-01 00:30:00+00:00,2019-03-01 00:30:00+00:00,2019-03-01 00:45:00+00:00,5167,36,5167,36
5667,2019-03-01 00:45:00+00:00,2019-03-01 00:45:00+00:00,2019-03-01 01:00:00+00:00,5167,36,5167,36
5668,2019-03-01 01:00:00+00:00,2019-03-01 01:00:00+00:00,2019-03-01 01:15:00+00:00,5146,57,5146,57


In [84]:
power_lim_selection[power_lim_selection['Upscale Potential [MW]']==5167]

Unnamed: 0,VALUE_TIME,start_time,end_time,Selling Actual value update [MW],Pumping Actual value update [MW],Upscale Potential [MW],Donwnscale Potential [MW]
5664,2019-03-01 00:00:00+00:00,2019-03-01 00:00:00+00:00,2019-03-01 00:15:00+00:00,5167,36,5167,36
5665,2019-03-01 00:15:00+00:00,2019-03-01 00:15:00+00:00,2019-03-01 00:30:00+00:00,5167,36,5167,36
5666,2019-03-01 00:30:00+00:00,2019-03-01 00:30:00+00:00,2019-03-01 00:45:00+00:00,5167,36,5167,36
5667,2019-03-01 00:45:00+00:00,2019-03-01 00:45:00+00:00,2019-03-01 01:00:00+00:00,5167,36,5167,36


In [86]:
power_lim_selection['Upscale Potential [MW]']=power_lim_selection['Upscale Potential [MW]'].astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  power_lim_selection['Upscale Potential [MW]']=power_lim_selection['Upscale Potential [MW]'].astype('float')


In [87]:
power_lim_selection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96 entries, 5664 to 5759
Data columns (total 7 columns):
 #   Column                            Non-Null Count  Dtype              
---  ------                            --------------  -----              
 0   VALUE_TIME                        96 non-null     datetime64[ns, UTC]
 1   start_time                        96 non-null     datetime64[ns, UTC]
 2   end_time                          96 non-null     datetime64[ns, UTC]
 3   Selling Actual value update [MW]  96 non-null     object             
 4   Pumping Actual value update [MW]  96 non-null     object             
 5   Upscale Potential [MW]            96 non-null     float64            
 6   Donwnscale Potential [MW]         96 non-null     object             
dtypes: datetime64[ns, UTC](3), float64(1), object(3)
memory usage: 8.1+ KB


In [88]:
power_lim_selection.groupby(power_lim_selection['start_time'].dt.hour)['Upscale Potential [MW]'].mean()

start_time
0     5167.0
1     5146.0
2     5123.0
3     5179.0
4     5103.0
5     4954.0
6     4077.0
7     2884.0
8     2262.0
9     2209.0
10    2788.0
11    2410.0
12    2768.0
13    3822.0
14    4160.0
15    3947.0
16    3282.0
17    1904.0
18     584.0
19    1021.0
20    3571.0
21    4340.0
22    4432.0
23    4897.0
Name: Upscale Potential [MW], dtype: float64

In [79]:
power_lim_selection['Donwnscale Potential [MW]']

5664     0
5665     0
5666     0
5667     0
5668     1
        ..
5755    22
5756    23
5757    23
5758    23
5759    23
Name: start_time, Length: 96, dtype: int64