In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline

# Location of the daily OPSD Germany CSV hosted on GitHub.
url = 'https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv'

# Read the comma-separated file straight from the URL into a DataFrame.
data = pd.read_csv(url, sep=',')

# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,Date,Consumption,Wind,Solar,Wind+Solar
0,2006-01-01,1069.184,,,
1,2006-01-02,1380.521,,,
2,2006-01-03,1442.533,,,
3,2006-01-04,1457.217,,,
4,2006-01-05,1477.131,,,


In [2]:
# Show the dtype pandas inferred for every column of the freshly loaded frame.
data.dtypes

Date            object
Consumption    float64
Wind           float64
Solar          float64
Wind+Solar     float64
dtype: object

In [3]:
# Parse the 'Date' column into pandas datetimes, keeping every other column as-is.
data = data.assign(Date=pd.to_datetime(data['Date']))

# Display the dtypes again to confirm the conversion.
data.dtypes

Date           datetime64[ns]
Consumption           float64
Wind                  float64
Solar                 float64
Wind+Solar            float64
dtype: object

In [4]:
# Promote 'Date' from a column to the index so the time-based operations used
# later (asfreq, resample) can work on it.
# The guard makes the cell safe to re-run: once 'Date' has been moved out of
# the columns, a second set_index('Date') would raise KeyError.
if 'Date' in data.columns:
    data = data.set_index('Date')

# Display the resulting index.
data.index

DatetimeIndex(['2006-01-01', '2006-01-02', '2006-01-03', '2006-01-04',
               '2006-01-05', '2006-01-06', '2006-01-07', '2006-01-08',
               '2006-01-09', '2006-01-10',
               ...
               '2017-12-22', '2017-12-23', '2017-12-24', '2017-12-25',
               '2017-12-26', '2017-12-27', '2017-12-28', '2017-12-29',
               '2017-12-30', '2017-12-31'],
              dtype='datetime64[ns]', name='Date', length=4383, freq=None)

In [6]:
# Reindex the frame onto an explicit daily ('D') frequency.
data_freq = data.asfreq(freq='D')

# Display the index; its repr includes the freq attribute.
data_freq.index

DatetimeIndex(['2006-01-01', '2006-01-02', '2006-01-03', '2006-01-04',
               '2006-01-05', '2006-01-06', '2006-01-07', '2006-01-08',
               '2006-01-09', '2006-01-10',
               ...
               '2017-12-22', '2017-12-23', '2017-12-24', '2017-12-25',
               '2017-12-26', '2017-12-27', '2017-12-28', '2017-12-29',
               '2017-12-30', '2017-12-31'],
              dtype='datetime64[ns]', name='Date', length=4383, freq='D')

In [7]:
# Count the missing values in each column of the daily-frequency frame.
data_freq.isna().sum()

Consumption       0
Wind           1463
Solar          2195
Wind+Solar     2196
dtype: int64

In [8]:
# Same daily reindex, but forward-filling any rows that asfreq has to insert.
# Per the pandas documentation, `method` only fills holes created by the
# reindexing itself; NaNs that were already present in the data are kept.
# The result is bound to `data_freq` so that the null count below actually
# inspects this forward-filled frame.
data_freq = data.asfreq('D', method='ffill')

# Count the missing values per column after the forward-filled reindex.
data_freq.isnull().sum()

Consumption       0
Wind           1463
Solar          2195
Wind+Solar     2196
dtype: int64

#### Resampling
* 시간 단위를 다르게 샘플링

In [10]:
# Weekly buckets; each bucket is summarised by the mean of its values.
data_columns = ['Consumption', 'Wind', 'Solar', 'Wind+Solar']

# Group the selected columns into weekly ('W') bins, then average each bin.
weekly_bins = data[data_columns].resample('W')
data_weekly_mean = weekly_bins.mean()

data_weekly_mean.head()

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-01-01,1069.184,,,
2006-01-08,1381.300143,,,
2006-01-15,1486.730286,,,
2006-01-22,1490.031143,,,
2006-01-29,1514.176857,,,


In [11]:
# Monthly buckets; each bucket is summarised by the maximum of its values.
data_columns = ['Consumption', 'Wind', 'Solar', 'Wind+Solar']

# Group the selected columns into monthly ('M') bins, then take each bin's max.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME';
# confirm the target pandas version before switching.
monthly_bins = data[data_columns].resample('M')
data_monthly_max = monthly_bins.max()

data_monthly_max.head()

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-01-31,1613.312,,,
2006-02-28,1606.667,,,
2006-03-31,1545.834,,,
2006-04-30,1447.798,,,
2006-05-31,1380.175,,,


#### Rolling windows
weekly resampling : Jan1 - Jan7; Jan8 - Jan14; Jan15 - Jan21  
weekly rolling : Jan1 - Jan7; Jan2 - Jan8; Jan3 - Jan9

In [12]:
data_columns = ['Consumption', 'Wind', 'Solar', 'Wind+Solar']

# 7-row window centred on each row (center=True), so the first and last
# three rows lack a full window and come out as NaN.
rolling_week = data[data_columns].rolling(window=7, center=True)
data_7d_rol = rolling_week.mean()

data_7d_rol.head()

Unnamed: 0_level_0,Consumption,Wind,Solar,Wind+Solar
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-01-01,,,,
2006-01-02,,,,
2006-01-03,,,,
2006-01-04,1361.471429,,,
2006-01-05,1381.300143,,,


In [16]:
# 365-row window centred on each row; relies on `data_columns` defined in an
# earlier cell.
rolling_year = data[data_columns].rolling(window=365, center=True)
data_365d_rol = rolling_year.mean()

data_365d_rol.head()

Unnamed: 0,Consumption,Wind,Solar,Wind+Solar
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,


In [None]:
# One wide axes that holds the raw series and both smoothed series.
fig, ax = plt.subplots(figsize=(11, 4))

# Raw daily observations drawn as small point markers.
sns.scatterplot(x='Date', y='Consumption', data=data, marker='.', label='Daily', ax=ax)

# Overlay the rolling means computed in earlier cells, shorter window first.
for smoothed, series_label in (
    (data_7d_rol, '7-d Rolling Mean'),
    (data_365d_rol, '365-d Rolling Mean'),
):
    sns.lineplot(x='Date', y='Consumption', data=smoothed, label=series_label, ax=ax)

# Legend, axis labels and title so the figure stands on its own.
ax.legend()
ax.set_xlabel('Year')
ax.set_ylabel('Consumption (GWh)')
ax.set_title('Trends in Electricity Consumption')