Source: https://www.kaggle.com/tartakovsky/pytorch-lightning-lstm-timeseries-clean-code

Data: https://www.kaggle.com/uciml/electric-power-consumption-data-set

# Setup

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read Data

In [2]:
filepath = 'data/household_power_consumption.txt'
df_powcon = pd.read_csv(filepath, sep=';',
                        parse_dates={'date':['Date','Time']},
                        infer_datetime_format=True,
                        index_col='date')
print(df_powcon.dtypes)
df_powcon.head()

  interactivity=interactivity, compiler=compiler, result=result)


Global_active_power       object
Global_reactive_power     object
Voltage                   object
Global_intensity          object
Sub_metering_1            object
Sub_metering_2            object
Sub_metering_3           float64
dtype: object


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [3]:
# change types to float (and all no number values to nan)
for i in range(len(df_powcon.columns)):
    df_powcon.iloc[:,i] = pd.to_numeric(df_powcon.iloc[:,i], errors='coerce')
print(df_powcon.dtypes)

Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dtype: object


In [4]:
df_powcon.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2075259 entries, 2006-12-16 17:24:00 to 2010-11-26 21:02:00
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Global_active_power    float64
 1   Global_reactive_power  float64
 2   Voltage                float64
 3   Global_intensity       float64
 4   Sub_metering_1         float64
 5   Sub_metering_2         float64
 6   Sub_metering_3         float64
dtypes: float64(7)
memory usage: 126.7 MB


In [5]:
# check for missing values
df_powcon.isnull().sum()

Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64

In [6]:
# drop missing values
for j in range(len(df_powcon.columns)):
    df_powcon.iloc[:,j]=df_powcon.iloc[:,j].fillna('NaN')
    df_powcon.iloc[:,j].dropna(inplace=True)
    df_powcon.iloc[:,j] = df_powcon.iloc[:,j].astype(float)
df_powcon.isnull().sum()

Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64

In [7]:
df_powcon.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2075259 entries, 2006-12-16 17:24:00 to 2010-11-26 21:02:00
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Global_active_power    float64
 1   Global_reactive_power  float64
 2   Voltage                float64
 3   Global_intensity       float64
 4   Sub_metering_1         float64
 5   Sub_metering_2         float64
 6   Sub_metering_3         float64
dtypes: float64(7)
memory usage: 126.7 MB


In [8]:
df_powcon.resample('h').mean()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:00:00,4.222889,0.229000,234.643889,18.100000,0.0,0.527778,16.861111
2006-12-16 18:00:00,3.632200,0.080033,234.580167,15.600000,0.0,6.716667,16.866667
2006-12-16 19:00:00,3.400233,0.085233,233.232500,14.503333,0.0,1.433333,16.683333
2006-12-16 20:00:00,3.268567,0.075100,234.071500,13.916667,0.0,0.000000,16.783333
2006-12-16 21:00:00,3.056467,0.076667,237.158667,13.046667,0.0,0.416667,17.216667
...,...,...,...,...,...,...,...
2010-11-26 17:00:00,1.725900,0.061400,237.069667,7.216667,0.0,0.000000,12.866667
2010-11-26 18:00:00,1.573467,0.053700,237.531833,6.620000,0.0,0.000000,0.000000
2010-11-26 19:00:00,1.659333,0.060033,236.741000,7.056667,0.0,0.066667,0.000000
2010-11-26 20:00:00,1.163700,0.061167,239.396000,4.913333,0.0,1.066667,0.000000


In [19]:
# resamble to hourly means
df_powcon = df_powcon.resample('h').mean()

df_powcon.isnull().sum()

Global_active_power      421
Global_reactive_power    421
Voltage                  421
Global_intensity         421
Sub_metering_1           421
Sub_metering_2           421
Sub_metering_3           421
dtype: int64

In [21]:
for j in range(len(df_powcon.columns)):
    df_powcon.iloc[:,j]=df_powcon.iloc[:,j].fillna('NaN')
    df_powcon.iloc[:,j].dropna(inplace=True)
df_powcon.isnull().sum()

Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64

In [9]:
# define features (X) and labels (y)
y = df_powcon['Global_active_power'].values

columns = ['Global_reactive_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']
X = np.zeros((len(columns), len(y)))
X = df_powcon[columns].values
X.shape

(34589, 6)

In [10]:
X

array([[2.29000000e-01, 2.34643889e+02, 1.81000000e+01, 0.00000000e+00,
        5.27777778e-01, 1.68611111e+01],
       [8.00333333e-02, 2.34580167e+02, 1.56000000e+01, 0.00000000e+00,
        6.71666667e+00, 1.68666667e+01],
       [8.52333333e-02, 2.33232500e+02, 1.45033333e+01, 0.00000000e+00,
        1.43333333e+00, 1.66833333e+01],
       ...,
       [6.00333333e-02, 2.36741000e+02, 7.05666667e+00, 0.00000000e+00,
        6.66666667e-02, 0.00000000e+00],
       [6.11666667e-02, 2.39396000e+02, 4.91333333e+00, 0.00000000e+00,
        1.06666667e+00, 0.00000000e+00],
       [0.00000000e+00, 2.39690000e+02, 3.80000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00]])

# Normalization

In [17]:
# train - valid - test splits
X_tmp, X_test, y_tmp, y_test = train_test_split(X, y, shuffle=False, test_size=.2)
X_train, X_val, y_train, y_val = train_test_split(X_tmp, y_tmp, shuffle=False, test_size=.25)
    
print(X_train.shape, X_val.shape, X_test.shape)
    
# normalize each column
scaler = StandardScaler()
scaler.fit(X_train)
print(scaler.mean_)
X_train = scaler.transform(X_train)
#for split in [X_train, X_val, X_test]:
#    split = scaler.transform(split)
    #split = split.reshape((split.shape[0], 1, split.shape[1]))
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

(20753, 6) (6918, 6) (6918, 6)
[1.15914657e-01 2.40382181e+02 4.80091461e+00 1.20325332e+00
 1.43981164e+00 6.13056661e+00]


In [18]:
X_train

array([[[ 1.75197843e+00, -1.81974918e+00,  3.30019559e+00,
         -3.23802614e-01, -1.97904756e-01,  1.46390511e+00]],

       [[-5.55892605e-01, -1.83995702e+00,  2.67981541e+00,
         -3.23802614e-01,  1.14503940e+00,  1.46466302e+00]],

       [[-4.75331432e-01, -2.26733427e+00,  2.40767530e+00,
         -3.23802614e-01, -1.40574600e-03,  1.43965193e+00]],

       ...,

       [[-6.89644809e-01, -9.58286771e-02,  9.84109588e-01,
         -3.23802614e-01, -3.08812173e-01,  1.78980718e+00]],

       [[-1.92334491e-01,  3.65586530e-01,  3.35605508e-01,
         -3.23802614e-01, -1.96699241e-01,  1.68976282e+00]],

       [[-9.11188034e-01,  4.64899151e-01,  1.70170794e-01,
         -3.23802614e-01, -3.12428719e-01,  1.40099843e+00]]])