# Statistical tests on the change in the yearly production and consumption
- H-null: the difference in values are not statistically significant
- H-alt: the difference in values are statistically significant

** these values are not random, so this doesn't fit conceptually with statistical test

In [1]:
import functions
import pandas as pd
import yaml

## Importing data

In [2]:
config = functions.read_yaml("./../config.yaml")

In [3]:
data = pd.read_csv(config['data']['smard_clean'])

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1080 entries, 0 to 1079
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   type          1080 non-null   object 
 1   date          1080 non-null   object 
 2   source        1080 non-null   object 
 3   quantity_gwh  1080 non-null   float64
dtypes: float64(1), object(3)
memory usage: 33.9+ KB


In [5]:
data.head(3)

Unnamed: 0,type,date,source,quantity_gwh
0,generation,2017-01-01,biomass,3538.84775
1,generation,2017-02-01,biomass,3309.601
2,generation,2017-03-01,biomass,3570.5005


### Changing date to datetime object

In [6]:
data.date = pd.to_datetime(data['date'])

## Splitting dataset into consumption and production

In [7]:
prod = data[data['type']=='generation'].copy()

In [8]:
prod.type.unique()

array(['generation'], dtype=object)

In [9]:
prod.shape

(864, 4)

In [10]:
consump = data[data['type']=='consumption'].copy()

In [11]:
consump.type.unique()

array(['consumption'], dtype=object)

In [12]:
consump.shape

(216, 4)

## Production
In the production dataset, there are 12 different categories:
- biomass
- hydropower
- wind offshore
- wind onshore
- photovoltaics
- other renewable
- nuclear
- lignite
- hard coal
- fossil gas
- hyro pumped storage
- other conventional

In this dataset there is data in the amount of electricity produced by these sources in gigawatt-hours (GWh) from 2017 to 2022. The goal is to see if the changes in the total amount produced from one year to the next is statistically significant.

In [13]:
prod.head()

Unnamed: 0,type,date,source,quantity_gwh
0,generation,2017-01-01,biomass,3538.84775
1,generation,2017-02-01,biomass,3309.601
2,generation,2017-03-01,biomass,3570.5005
3,generation,2017-04-01,biomass,3422.009
4,generation,2017-05-01,biomass,3433.31325


In [14]:
prod.source.unique()

array(['biomass', 'hydropower', 'wind_offshore', 'wind_onshore',
       'photovoltaics', 'other_renewable', 'nuclear', 'lignite',
       'hard_coal', 'fossil_gas', 'hydro_pumped_storage',
       'other_conventional'], dtype=object)

In [15]:
prod.source.nunique()

12

### Grouping sources into larger groups

In [16]:
source_groups = {'wind_offshore': 'wind',
                 'wind_onshore': 'wind',
                 'lignite': 'combustibles',
                 'hard_coal': 'combustibles',
                 'fossil_gas': 'combustibles',
                 'other_conventional': 'combustibles'
                }

In [17]:
source_groups.keys()

dict_keys(['wind_offshore', 'wind_onshore', 'lignite', 'hard_coal', 'fossil_gas', 'other_conventional'])

In [18]:
prod['source_groups']=prod['source'].apply(lambda x: source_groups[x] if x in source_groups.keys() else x)

In [19]:
prod.source_groups.unique()

array(['biomass', 'hydropower', 'wind', 'photovoltaics',
       'other_renewable', 'nuclear', 'combustibles',
       'hydro_pumped_storage'], dtype=object)

### Statistical test - looking at year over year changes by production source

In [20]:
from scipy.stats import ttest_ind

In [21]:
prod.head()

Unnamed: 0,type,date,source,quantity_gwh,source_groups
0,generation,2017-01-01,biomass,3538.84775,biomass
1,generation,2017-02-01,biomass,3309.601,biomass
2,generation,2017-03-01,biomass,3570.5005,biomass
3,generation,2017-04-01,biomass,3422.009,biomass
4,generation,2017-05-01,biomass,3433.31325,biomass


In [22]:
prod.info()

<class 'pandas.core.frame.DataFrame'>
Index: 864 entries, 0 to 863
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   type           864 non-null    object        
 1   date           864 non-null    datetime64[ns]
 2   source         864 non-null    object        
 3   quantity_gwh   864 non-null    float64       
 4   source_groups  864 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 40.5+ KB


In [23]:
def year_source_testing(df):

    df_temp = df.copy()

    results_dict = {}

    #get number of years in the dataset
    years = (df_temp['date']
             .dt.year
             .unique()
             .tolist()
            )

    #get sources in the dataset
    sources = df_temp['source_groups'].unique().tolist()

    for source in sources:
        for i, year in enumerate(years):
            try:
                group1 = df_temp[(df_temp['source_groups'] == source) & (df_temp['date'].dt.year == years[i])]
                group2 = df_temp[(df_temp['source_groups'] == source) & (df_temp['date'].dt.year == years[i+1])]

            except:
                pass
            
            ttest_statistic, pvalue = ttest_ind(group1['quantity_gwh'],group2['quantity_gwh'])

            source_key = source+'_'+str(i)

            results_dict[source_key]=[group1['date'].dt.year.unique()[0], group2['date'].dt.year.unique()[0], source, pvalue, pvalue < 0.05]

    df_results = pd.DataFrame.from_dict(results_dict, orient='index', columns = ['year1', 'year2', 'source', 'pvalue', 'pvalue_less_than_0.05'])
    
    return df_results


In [24]:
prod_results = year_source_testing(prod)
prod_results

Unnamed: 0,year1,year2,source,pvalue,pvalue_less_than_0.05
biomass_0,2017,2018,biomass,0.7683522,False
biomass_1,2018,2019,biomass,0.7474973,False
biomass_2,2019,2020,biomass,0.6281097,False
biomass_3,2020,2021,biomass,0.07597893,False
biomass_4,2021,2022,biomass,0.9891887,False
biomass_5,2022,2022,biomass,1.0,False
hydropower_0,2017,2018,hydropower,0.7755413,False
hydropower_1,2018,2019,hydropower,0.6002565,False
hydropower_2,2019,2020,hydropower,0.796708,False
hydropower_3,2020,2021,hydropower,0.3968616,False


In [25]:
prod_results[prod_results['pvalue_less_than_0.05'] == True]

Unnamed: 0,year1,year2,source,pvalue,pvalue_less_than_0.05
other_renewable_4,2021,2022,other_renewable,0.010134,True
nuclear_2,2019,2020,nuclear,0.006035455,True
nuclear_3,2020,2021,nuclear,0.04527939,True
nuclear_4,2021,2022,nuclear,1.548787e-17,True
hydro_pumped_storage_2,2019,2020,hydro_pumped_storage,2.21145e-06,True
hydro_pumped_storage_3,2020,2021,hydro_pumped_storage,5.115668e-07,True
hydro_pumped_storage_4,2021,2022,hydro_pumped_storage,0.001263703,True


### Statistical test - looking at total production year over year

In [26]:
def year_testing(df):

    df_temp = df.copy()

    results_dict = {}

    #get number of years in the dataset
    years = (df_temp['date']
             .dt.year
             .unique()
             .tolist()
            )

    for i, year in enumerate(years):
        try:
            group1 = df_temp[(df_temp['date'].dt.year == years[i])]
            group2 = df_temp[(df_temp['date'].dt.year == years[i+1])]

        except:
            pass
        
        ttest_statistic, pvalue = ttest_ind(group1['quantity_gwh'],group2['quantity_gwh'])

        dict_key = i

        results_dict[dict_key]=[group1['date'].dt.year.unique()[0], group2['date'].dt.year.unique()[0], pvalue, pvalue < 0.05]

    df_results = pd.DataFrame.from_dict(results_dict, orient='index', columns = ['year1', 'year2', 'pvalue', 'pvalue_less_than_0.05'])
    
    return df_results


In [27]:
year_testing(prod)

Unnamed: 0,year1,year2,pvalue,pvalue_less_than_0.05
0,2017,2018,0.938258,False
1,2018,2019,0.688641,False
2,2019,2020,0.607914,False
3,2020,2021,0.85614,False
4,2021,2022,0.811763,False
5,2022,2022,1.0,False
