# Load libraries

In [1]:
import src.utils.seasonalityMetrics as sm # calls helper file

import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL

# full report 

In [4]:

df_ratings = pd.read_csv(r"src\data\beerAdvocateReviews.csv") # df_ratings needs to contain 'rating', 'month', and 'year' columns.



In [5]:
sm.full_seasonality_report(df_ratings)

# note that since the FFT is discrete, we dont have a value for frequency 0.083, but only 0.085. This for a year-periodic signal it will give 11.75 months as main period, instead of 12.

Average Yearly Mean Amplitude of STL seasonality: 0.0276


The frequency with the maximum magnitude is 0.085106 cycles per month with a magnitude of 0.008027.
The magnitude of the frequency closest to a 12-month period (0.083) is: 0.008027.

This means the most significant period is: 11.750000 months.


(Figure({
     'data': [{'name': 'Original',
               'type': 'scatter',
               'x': array([datetime.datetime(2002, 1, 1, 0, 0),
                           datetime.datetime(2002, 2, 1, 0, 0),
                           datetime.datetime(2002, 3, 1, 0, 0),
                           datetime.datetime(2002, 4, 1, 0, 0),
                           datetime.datetime(2002, 5, 1, 0, 0),
                           datetime.datetime(2002, 6, 1, 0, 0),
                           datetime.datetime(2002, 7, 1, 0, 0),
                           datetime.datetime(2002, 8, 1, 0, 0),
                           datetime.datetime(2002, 9, 1, 0, 0),
                           datetime.datetime(2002, 10, 1, 0, 0),
                           datetime.datetime(2002, 11, 1, 0, 0),
                           datetime.datetime(2002, 12, 1, 0, 0),
                           datetime.datetime(2003, 1, 1, 0, 0),
                           datetime.datetime(2003, 2, 1, 0, 0),
                      

## individual function calls

In [6]:
# preprocessing for STL
stl_data = sm.STL_data_preprocessing(df_ratings)

# plotting the STL decomposition
sm.plot_STL(stl_data)

# Perform STL decomposition
stl = STL(stl_data)
result = stl.fit()
seasonal = result.seasonal

max_freq, max_magnitude, annual_freq_magnitude =  sm.report_fourier_analysis(seasonal, cutoff_freq=0.15)
sm.plot_frequency_spectrum(seasonal, cutoff_freq=0.5)

The frequency with the maximum magnitude is 0.085106 cycles per month with a magnitude of 0.008027.
The magnitude of the frequency closest to a 12-month period (0.083) is: 0.008027.

This means the most significant period is: 11.750000 months.


## test on filtered dataset

In [7]:
df_ratings.columns

df_american_pale_ales = df_ratings[
    (df_ratings['style'] == 'English Pale Ale')
]

df_american_pale_ales.head(100)
sm.full_seasonality_report(df_american_pale_ales)

Average Yearly Mean Amplitude of STL seasonality: 0.0648


The frequency with the maximum magnitude is 0.080214 cycles per month with a magnitude of 0.012365.
The magnitude of the frequency closest to a 12-month period (0.083) is: 0.004351.

This means the most significant period is: 12.466667 months.


(Figure({
     'data': [{'name': 'Original',
               'type': 'scatter',
               'x': array([datetime.datetime(2002, 1, 1, 0, 0),
                           datetime.datetime(2002, 2, 1, 0, 0),
                           datetime.datetime(2002, 3, 1, 0, 0),
                           datetime.datetime(2002, 4, 1, 0, 0),
                           datetime.datetime(2002, 5, 1, 0, 0),
                           datetime.datetime(2002, 6, 1, 0, 0),
                           datetime.datetime(2002, 7, 1, 0, 0),
                           datetime.datetime(2002, 8, 1, 0, 0),
                           datetime.datetime(2002, 9, 1, 0, 0),
                           datetime.datetime(2002, 10, 1, 0, 0),
                           datetime.datetime(2002, 11, 1, 0, 0),
                           datetime.datetime(2002, 12, 1, 0, 0),
                           datetime.datetime(2003, 1, 1, 0, 0),
                           datetime.datetime(2003, 2, 1, 0, 0),
                      

## Metric Function


input timeseries with dates (either ratings or numbers of ratings)
gives seasonality scores

- fourier transform 12 month peak ratio to second highest peak
- seasonality amplitude above a certain threshold


In [2]:

df_ratings = pd.read_csv(r"src\data\beerAdvocateReviews.csv") # df_ratings needs to contain 'rating', 'month', and 'year' columns.

timeseries_data  = sm.STL_data_preprocessing(df_ratings)
timeseries_data.head()

Unnamed: 0_level_0,mean_rating
date,Unnamed: 1_level_1
2002-01-01,3.804881
2002-02-01,3.742803
2002-03-01,3.773943
2002-04-01,3.752254
2002-05-01,3.693669


In [3]:
peak_ratio, avg_amplitude = sm.timeseries_seasonality_metric(timeseries_data)

print(f"peak_ratio: {peak_ratio}. avg_amplitude: {avg_amplitude}")

peak_ratio: 3.7551. avg_amplitude: 0.0276


In [7]:
print(df_ratings['style'])

0                  Euro Pale Lager
1                 English Pale Ale
2                 English Pale Ale
3                 English Pale Ale
4                 English Pale Ale
                    ...           
2240425    American Pale Ale (APA)
2240426                     Kölsch
2240427                     Kölsch
2240428                     Kölsch
2240429          English Brown Ale
Name: style, Length: 2240430, dtype: object


In [None]:
# on filtered dataset

df_kölsch = df_ratings[
    (df_ratings['style'] == 'Kölsch')
]

timeseries_data_kölsch  = sm.STL_data_preprocessing(df_kölsch)

peak_ratio, avg_amplitude = sm.timeseries_seasonality_metric(timeseries_data_kölsch)

print(f"peak_ratio: {peak_ratio}. avg_amplitude: {avg_amplitude}")

peak_ratio: 1.1932. avg_amplitude: 0.0838


NameError: name 'seasonal' is not defined