# Cleaning SMARD production and consumption datasets

In [1]:
import functions
import pandas as pd
import yaml

## Generation

### Reading data

In [2]:
config = functions.read_yaml("./../config.yaml")

In [4]:
prod = pd.read_csv(config["data"]['smard_generation_month'], delimiter=';')
prod

Unnamed: 0,Date,Start,End,Biomass [MWh] Calculated resolutions,Hydropower [MWh] Calculated resolutions,Wind offshore [MWh] Calculated resolutions,Wind onshore [MWh] Calculated resolutions,Photovoltaics [MWh] Calculated resolutions,Other renewable [MWh] Calculated resolutions,Nuclear [MWh] Calculated resolutions,Lignite [MWh] Calculated resolutions,Hard coal [MWh] Calculated resolutions,Fossil gas [MWh] Calculated resolutions,Hydro pumped storage [MWh] Calculated resolutions,Other conventional [MWh] Calculated resolutions
0,"Jan 1, 2017",12:00 AM,12:00 AM,3538847.75,857085.75,1230811.25,6537027.5,798422.25,143007.75,5724510.75,11680618.25,9528848.25,3726353.75,943923.5,5849799.75
1,"Feb 1, 2017",12:00 AM,12:00 AM,3309601,893765.25,1899418.75,7974597.25,1484969,169841,4543553.5,10507497.25,7034393.75,2522128,728857.75,4897134.75
2,"Mar 1, 2017",12:00 AM,12:00 AM,3570500.5,1237295.25,1487019.75,7844275.25,3230598.5,168431.5,4850962.25,11530155.25,5885619.25,2191162.75,850219.75,4542777
3,"Apr 1, 2017",12:00 AM,12:00 AM,3422009,1155622.5,1395689.25,6688391.75,3856853.75,158481.25,4764289,10910745.75,4581378.5,1787657.5,800464,3344311
4,"May 1, 2017",12:00 AM,12:00 AM,3433313.25,1510106,1207361,4434810.75,5120312.25,148759.25,5649988.5,10874074,5433327.25,1773670.5,796230.5,3483117.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,"Aug 1, 2022",12:00 AM,12:00 AM,3192319.5,1093975,1018192.75,3497707.25,7286873.75,79183.75,2940986.5,8915118.5,5637572.75,2250216.75,836072.75,884578.75
68,"Sep 1, 2022",12:00 AM,12:00 AM,3181223.75,1070724,1910452.25,6151548.5,4800868.75,82417.25,2892888.5,8737581.5,6126504.25,1934657.25,861912,885732.5
69,"Oct 1, 2022",12:00 AM,12:00 AM,3284823.5,1162092.25,2466036.25,8541308.5,3560053.25,86545,2669763.25,8701680.5,3734102.5,2152051.5,1003282.5,903465
70,"Nov 1, 2022",12:00 AM,12:00 AM,3321422.5,947984.25,2818895.75,10025509,1703989.25,105893,2836794.75,9264454,6014713.25,2318648.5,1048726.25,928239.25


In [5]:
prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 15 columns):
 #   Column                                             Non-Null Count  Dtype 
---  ------                                             --------------  ----- 
 0   Date                                               72 non-null     object
 1   Start                                              72 non-null     object
 2   End                                                72 non-null     object
 3   Biomass [MWh] Calculated resolutions               72 non-null     object
 4   Hydropower [MWh] Calculated resolutions            72 non-null     object
 5   Wind offshore [MWh] Calculated resolutions         72 non-null     object
 6   Wind onshore [MWh] Calculated resolutions          72 non-null     object
 7   Photovoltaics [MWh] Calculated resolutions         72 non-null     object
 8   Other renewable [MWh] Calculated resolutions       72 non-null     object
 9   Nuclear [MWh] Calculate

### Changing dtype of values

In [6]:
def fix_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function drops columns that are not needed and sets the correct dtypes to columns

    'Date' is parsed to datetime. Every remaining text column is converted to
    float after removing the thousands separator (','). The input dataframe is
    left unchanged.

    Input:
    df: pandas dataframe with a 'Date' column; 'Start' and 'End' are dropped
        when present

    Output
    pandas dataframe with a datetime 'Date' column and float value columns

    Raises
    KeyError if 'Date' is missing
    ValueError if a text value is not numeric once the commas are removed

    """

    df_temp = df.copy()

    # drop columns that are not needed; errors='ignore' keeps the function usable
    # on a frame from which these columns have already been removed
    df_temp = df_temp.drop(columns=['Start', 'End'], errors='ignore')

    # change date dtype
    df_temp['Date'] = pd.to_datetime(df_temp['Date'])

    # change value to float
    for col in df_temp.columns:
        values = df_temp[col]
        # text can arrive as legacy 'object' or as pandas' dedicated string dtype;
        # comparing against 'object' alone would silently skip the latter
        is_text = (pd.api.types.is_object_dtype(values)
                   or pd.api.types.is_string_dtype(values))
        if is_text:
            # regex=False: the comma is a literal separator, not a pattern
            df_temp[col] = values.str.replace(',', '', regex=False).astype('float')

    return df_temp
    

In [7]:
prod = fix_dtypes(prod)
prod

Unnamed: 0,Date,Biomass [MWh] Calculated resolutions,Hydropower [MWh] Calculated resolutions,Wind offshore [MWh] Calculated resolutions,Wind onshore [MWh] Calculated resolutions,Photovoltaics [MWh] Calculated resolutions,Other renewable [MWh] Calculated resolutions,Nuclear [MWh] Calculated resolutions,Lignite [MWh] Calculated resolutions,Hard coal [MWh] Calculated resolutions,Fossil gas [MWh] Calculated resolutions,Hydro pumped storage [MWh] Calculated resolutions,Other conventional [MWh] Calculated resolutions
0,2017-01-01,3538847.75,857085.75,1230811.25,6537027.50,798422.25,143007.75,5724510.75,11680618.25,9528848.25,3726353.75,943923.50,5849799.75
1,2017-02-01,3309601.00,893765.25,1899418.75,7974597.25,1484969.00,169841.00,4543553.50,10507497.25,7034393.75,2522128.00,728857.75,4897134.75
2,2017-03-01,3570500.50,1237295.25,1487019.75,7844275.25,3230598.50,168431.50,4850962.25,11530155.25,5885619.25,2191162.75,850219.75,4542777.00
3,2017-04-01,3422009.00,1155622.50,1395689.25,6688391.75,3856853.75,158481.25,4764289.00,10910745.75,4581378.50,1787657.50,800464.00,3344311.00
4,2017-05-01,3433313.25,1510106.00,1207361.00,4434810.75,5120312.25,148759.25,5649988.50,10874074.00,5433327.25,1773670.50,796230.50,3483117.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,2022-08-01,3192319.50,1093975.00,1018192.75,3497707.25,7286873.75,79183.75,2940986.50,8915118.50,5637572.75,2250216.75,836072.75,884578.75
68,2022-09-01,3181223.75,1070724.00,1910452.25,6151548.50,4800868.75,82417.25,2892888.50,8737581.50,6126504.25,1934657.25,861912.00,885732.50
69,2022-10-01,3284823.50,1162092.25,2466036.25,8541308.50,3560053.25,86545.00,2669763.25,8701680.50,3734102.50,2152051.50,1003282.50,903465.00
70,2022-11-01,3321422.50,947984.25,2818895.75,10025509.00,1703989.25,105893.00,2836794.75,9264454.00,6014713.25,2318648.50,1048726.25,928239.25


### Quick inspection of totals

In [8]:
prod.set_index('Date').sum(axis=1)

Date
2017-01-01    50559256.50
2017-02-01    45965757.25
2017-03-01    47389017.00
2017-04-01    42865893.25
2017-05-01    43865070.50
                 ...     
2022-08-01    37632798.00
2022-09-01    38636510.50
2022-10-01    38265204.00
2022-11-01    41335269.75
2022-12-01    41985497.50
Length: 72, dtype: float64

These totals align with other data sources.

### Clean column names

In [9]:
def clean_col_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Function cleans the column names and formats them in snake_case (Pep8 style)

    The ' [MWh] Calculated resolutions' suffix is removed, the names are
    lower-cased and spaces are replaced by underscores. Other characters
    (e.g. parentheses) are kept. The input dataframe is left unchanged.

    Input:
    df: pandas dataframe

    Output
    pandas dataframe with the cleaned column names

    """

    df_temp = df.copy()

    # regex=False is required here: as a regular expression '[MWh]' is a
    # character class, so the suffix would not match where regex matching is
    # the default (pandas < 2.0)
    df_temp.columns = (df_temp.columns
                       .str.replace(' [MWh] Calculated resolutions', '', regex=False)
                       .str.lower()
                       .str.replace(' ', '_', regex=False)
                      )
    return df_temp

In [10]:
prod = clean_col_names(prod)
prod.columns

Index(['date', 'biomass', 'hydropower', 'wind_offshore', 'wind_onshore',
       'photovoltaics', 'other_renewable', 'nuclear', 'lignite', 'hard_coal',
       'fossil_gas', 'hydro_pumped_storage', 'other_conventional'],
      dtype='object')

### Reshaping data

In [11]:
prod.head()

Unnamed: 0,date,biomass,hydropower,wind_offshore,wind_onshore,photovoltaics,other_renewable,nuclear,lignite,hard_coal,fossil_gas,hydro_pumped_storage,other_conventional
0,2017-01-01,3538847.75,857085.75,1230811.25,6537027.5,798422.25,143007.75,5724510.75,11680618.25,9528848.25,3726353.75,943923.5,5849799.75
1,2017-02-01,3309601.0,893765.25,1899418.75,7974597.25,1484969.0,169841.0,4543553.5,10507497.25,7034393.75,2522128.0,728857.75,4897134.75
2,2017-03-01,3570500.5,1237295.25,1487019.75,7844275.25,3230598.5,168431.5,4850962.25,11530155.25,5885619.25,2191162.75,850219.75,4542777.0
3,2017-04-01,3422009.0,1155622.5,1395689.25,6688391.75,3856853.75,158481.25,4764289.0,10910745.75,4581378.5,1787657.5,800464.0,3344311.0
4,2017-05-01,3433313.25,1510106.0,1207361.0,4434810.75,5120312.25,148759.25,5649988.5,10874074.0,5433327.25,1773670.5,796230.5,3483117.25


In [12]:
def reshape(df:pd.DataFrame)->pd.DataFrame:
    """
    Function unpivots the dataframe from wide to long format

    Every column except 'date' becomes a row label, giving this layout

    date | source | quantity_mwh

    Input:
    df: pandas dataframe with a 'date' column and one column per source

    Output
    pandas dataframe in long format

    """

    # everything that is not the date identifier is a value column to unpivot
    source_cols = list(df.drop(columns='date').columns)

    # work on a copy so the caller's dataframe is never touched
    long_df = df.copy().melt(
        id_vars=['date'],
        value_vars=source_cols,
        var_name='source',
        value_name='quantity_mwh',
    )

    return long_df

In [13]:
prod = reshape(prod)
prod.head()

Unnamed: 0,date,source,quantity_mwh
0,2017-01-01,biomass,3538847.75
1,2017-02-01,biomass,3309601.0
2,2017-03-01,biomass,3570500.5
3,2017-04-01,biomass,3422009.0
4,2017-05-01,biomass,3433313.25


### Changing units from MWh to GWh

In [14]:
def fix_units(df: pd.DataFrame) -> pd.DataFrame:
    """
    This function converts the quantity column from megawatt-hours to gigawatt-hours

    'quantity_mwh' is divided by 1000 and renamed to 'quantity_gwh'; its position
    in the dataframe is kept. The input dataframe is left unchanged.

    Input:
    df: pandas dataframe with a numeric 'quantity_mwh' column

    Output
    pandas dataframe with 'quantity_gwh' in place of 'quantity_mwh'

    Raises
    KeyError if 'quantity_mwh' is missing

    """

    mwh_per_gwh = 1000

    df_temp = df.copy()

    # vectorised division on the whole column instead of a per-element apply
    df_temp['quantity_mwh'] = df_temp['quantity_mwh'] / mwh_per_gwh

    df_temp = df_temp.rename(columns={'quantity_mwh': 'quantity_gwh'})

    return df_temp

In [15]:
prod = fix_units(prod)

In [16]:
prod

Unnamed: 0,date,source,quantity_gwh
0,2017-01-01,biomass,3538.84775
1,2017-02-01,biomass,3309.60100
2,2017-03-01,biomass,3570.50050
3,2017-04-01,biomass,3422.00900
4,2017-05-01,biomass,3433.31325
...,...,...,...
859,2022-08-01,other_conventional,884.57875
860,2022-09-01,other_conventional,885.73250
861,2022-10-01,other_conventional,903.46500
862,2022-11-01,other_conventional,928.23925


In [17]:
#uncomment to save prod separately
#prod.to_csv('../data/cleaned/smard_prod_clean.csv', index=False)

## Consumption

### Reading data

In [19]:
consump = pd.read_csv(config["data"]['smard_consumption_month'], delimiter=';')
consump

Unnamed: 0,Date,Start,End,Total (grid load) [MWh] Calculated resolutions,Residual load [MWh] Calculated resolutions,Hydro pumped storage [MWh] Calculated resolutions
0,"Jan 1, 2017",12:00 AM,12:00 AM,47301200.25,38734939.25,778529.5
1,"Feb 1, 2017",12:00 AM,12:00 AM,42100201,30741216,744737.25
2,"Mar 1, 2017",12:00 AM,12:00 AM,44319006,31757112.5,811232.5
3,"Apr 1, 2017",12:00 AM,12:00 AM,39933859.5,27992924.75,748242
4,"May 1, 2017",12:00 AM,12:00 AM,41034028.75,30271544.75,703380.75
...,...,...,...,...,...,...
67,"Aug 1, 2022",12:00 AM,12:00 AM,37453414,25650640.25,1127873.25
68,"Sep 1, 2022",12:00 AM,12:00 AM,37355684.25,24492814.75,1123106.5
69,"Oct 1, 2022",12:00 AM,12:00 AM,38543441,23976043,1372024
70,"Nov 1, 2022",12:00 AM,12:00 AM,39764029,25215635,1409620.75


The residual load is the actual consumption or grid load minus the generation from photovoltaic installations and wind power stations. If the residual load is zero or negative, then renewable energy from wind or solar was able to meet the demand for energy. [SMARD.de](https://www.smard.de/en/204142-204142#:~:text=The%20forecasted%20residual%20load%20is,stations%20(onshore%20and%20offshore).)

### Changing dtypes of values

In [20]:
consump = fix_dtypes(consump)

### Quick inspection of totals

In [21]:
consump.set_index('Date').sum(axis=1)

Date
2017-01-01    86814669.00
2017-02-01    73586154.25
2017-03-01    76887351.00
2017-04-01    68675026.25
2017-05-01    72008954.25
                 ...     
2022-08-01    64231927.50
2022-09-01    62971605.50
2022-10-01    63891508.00
2022-11-01    66389284.75
2022-12-01    71371643.25
Length: 72, dtype: float64

### Clean column names

In [22]:
consump = clean_col_names(consump)

### Reshaping data

In [23]:
consump = reshape(consump)

### Changing units from MWh to GWh

In [24]:
consump = fix_units(consump)

In [25]:
consump

Unnamed: 0,date,source,quantity_gwh
0,2017-01-01,total_(grid_load),47301.20025
1,2017-02-01,total_(grid_load),42100.20100
2,2017-03-01,total_(grid_load),44319.00600
3,2017-04-01,total_(grid_load),39933.85950
4,2017-05-01,total_(grid_load),41034.02875
...,...,...,...
211,2022-08-01,hydro_pumped_storage,1127.87325
212,2022-09-01,hydro_pumped_storage,1123.10650
213,2022-10-01,hydro_pumped_storage,1372.02400
214,2022-11-01,hydro_pumped_storage,1409.62075


In [26]:
#uncomment to save consumption
#consump.to_csv('../data/cleaned/smard_consumption_clean.csv', index=False)

## Merging generation and consumption datasets

In [27]:
# Label each dataframe with its origin so the rows stay distinguishable once the
# two are stacked. DataFrame.insert raises ValueError if the column already
# exists, so the guards keep this cell safe to re-run.
if 'type' not in prod.columns:
    prod.insert(0, 'type', 'generation')
if 'type' not in consump.columns:
    consump.insert(0, 'type', 'consumption')

In [28]:
prod

Unnamed: 0,type,date,source,quantity_gwh
0,generation,2017-01-01,biomass,3538.84775
1,generation,2017-02-01,biomass,3309.60100
2,generation,2017-03-01,biomass,3570.50050
3,generation,2017-04-01,biomass,3422.00900
4,generation,2017-05-01,biomass,3433.31325
...,...,...,...,...
859,generation,2022-08-01,other_conventional,884.57875
860,generation,2022-09-01,other_conventional,885.73250
861,generation,2022-10-01,other_conventional,903.46500
862,generation,2022-11-01,other_conventional,928.23925


In [29]:
consump

Unnamed: 0,type,date,source,quantity_gwh
0,consumption,2017-01-01,total_(grid_load),47301.20025
1,consumption,2017-02-01,total_(grid_load),42100.20100
2,consumption,2017-03-01,total_(grid_load),44319.00600
3,consumption,2017-04-01,total_(grid_load),39933.85950
4,consumption,2017-05-01,total_(grid_load),41034.02875
...,...,...,...,...
211,consumption,2022-08-01,hydro_pumped_storage,1127.87325
212,consumption,2022-09-01,hydro_pumped_storage,1123.10650
213,consumption,2022-10-01,hydro_pumped_storage,1372.02400
214,consumption,2022-11-01,hydro_pumped_storage,1409.62075


In [30]:
smard_clean = pd.concat([prod,consump], axis=0, ignore_index=True)

In [31]:
smard_clean.shape

(1080, 4)

In [32]:
smard_clean.head()

Unnamed: 0,type,date,source,quantity_gwh
0,generation,2017-01-01,biomass,3538.84775
1,generation,2017-02-01,biomass,3309.601
2,generation,2017-03-01,biomass,3570.5005
3,generation,2017-04-01,biomass,3422.009
4,generation,2017-05-01,biomass,3433.31325


In [33]:
smard_clean.tail()

Unnamed: 0,type,date,source,quantity_gwh
1075,consumption,2022-08-01,hydro_pumped_storage,1127.87325
1076,consumption,2022-09-01,hydro_pumped_storage,1123.1065
1077,consumption,2022-10-01,hydro_pumped_storage,1372.024
1078,consumption,2022-11-01,hydro_pumped_storage,1409.62075
1079,consumption,2022-12-01,hydro_pumped_storage,1417.14


## Saving clean dataset

In [34]:
#uncomment to save
#smard_clean.to_csv('../data/cleaned/smard_clean.csv', index=False)