# Timeseries data transformations

#### Description:

This codebook covers how to perform common time series data transformations.

#### Skill level:

- Intermediate

### Import the required libraries
-------------------------

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns

### Create a set of timestamps
-------------------------

In [2]:
# Sample timestamps as plain strings; each one carries its own UTC offset.
timestamps = [
    '2015-03-08 10:30:00.360000+00:00',
    '2017-07-13 15:45:05.755000-07:00',
    '2012-01-20 22:30:00.254000+05:30',
    '2016-12-25 00:30:00.000000+10:00',
]

# Single-column frame holding the unparsed strings.
df_raw = pd.DataFrame({'time': timestamps})

In [3]:
# Display the raw frame; the `time` column still holds plain strings here.
df_raw

Unnamed: 0,time
0,2015-03-08 10:30:00.360000+00:00
1,2017-07-13 15:45:05.755000-07:00
2,2012-01-20 22:30:00.254000+05:30
3,2016-12-25 00:30:00.000000+10:00


### Create a copy of the dataframe and convert the times to timestamp format
-------------------------

In [4]:
# Work on an independent copy so that df_raw keeps the original strings.
df = df_raw.copy(deep=True)

In [5]:
# Parse every string into its own pd.Timestamp so that each row keeps the
# UTC offset it was written with.
parsed_times = [pd.Timestamp(raw_value) for raw_value in df['time'].tolist()]
df['time'] = np.array(parsed_times)

### Extract time elements from timestamps
-------------------------

In [6]:
# Build a new frame with one column per clock component of each timestamp.
# Components are read element by element from the individual Timestamp
# objects stored in the `time` column.
time_column = df['time']

df_time = df.assign(
    hour=time_column.map(lambda ts: ts.hour),
    minute=time_column.map(lambda ts: ts.minute),
    second=time_column.map(lambda ts: ts.second),
    mu_second=time_column.map(lambda ts: ts.microsecond),
)

In [7]:
# Display the timestamps next to their hour, minute, second and microsecond columns.
df_time

Unnamed: 0,time,hour,minute,second,mu_second
0,2015-03-08 10:30:00.360000+00:00,10,30,0,360000
1,2017-07-13 15:45:05.755000-07:00,15,45,5,755000
2,2012-01-20 22:30:00.254000+05:30,22,30,0,254000
3,2016-12-25 00:30:00+10:00,0,30,0,0


### Bin timestamps according to the hour of the day
-------------------------

In [8]:
df_bins = df.copy()

# pd.cut builds right-closed intervals from these edges:
# (-1, 5], (5, 11], (11, 16], (16, 21], (21, 23].
# The -1 lower edge is what places hour 0 inside the first bin.
hour_bins = [-1, 5, 11, 16, 21, 23]
bin_names = ['late_night', 'morning', 'afternoon', 'evening', 'night']

# Derive the hour from this frame's own timestamps instead of reading it from
# df_time, so the cell only depends on `df` and cannot pick up misaligned rows.
hour_of_day = df_bins['time'].apply(lambda d: d.hour)

df_bins['time_of_day'] = pd.cut(hour_of_day, bins=hour_bins, labels=bin_names)

In [9]:
# Display each timestamp with its time_of_day label.
df_bins

Unnamed: 0,time,time_of_day
0,2015-03-08 10:30:00.360000+00:00,morning
1,2017-07-13 15:45:05.755000-07:00,afternoon
2,2012-01-20 22:30:00.254000+05:30,night
3,2016-12-25 00:30:00+10:00,late_night


### Extract date elements from timestamps
-------------------------

In [10]:
# Output column name -> Timestamp attribute that supplies its value.
date_parts = {
    'year': 'year',
    'month': 'month',
    'day': 'day',
    'day_of_week': 'dayofweek',
    'day_of_year': 'dayofyear',
    'week_of_year': 'weekofyear',
    'quarter': 'quarter',
}

df_date = df.copy()

# Add one column per calendar component, in the order listed above.
for column_name, attribute in date_parts.items():
    df_date[column_name] = df_date['time'].apply(
        lambda ts, attribute=attribute: getattr(ts, attribute)
    )

In [11]:
# Display the timestamps next to their extracted calendar components.
df_date

Unnamed: 0,time,year,month,day,day_of_week,day_of_year,week_of_year,quarter
0,2015-03-08 10:30:00.360000+00:00,2015,3,8,6,67,10,1
1,2017-07-13 15:45:05.755000-07:00,2017,7,13,3,194,28,3
2,2012-01-20 22:30:00.254000+05:30,2012,1,20,4,20,3,1
3,2016-12-25 00:30:00+10:00,2016,12,25,6,360,51,4


### Bin timestamps according to the month of the year
-------------------------

In [12]:
# Note: this rebinds df_bins, replacing the hour-of-day frame built earlier.
df_bins = df.copy()

# pd.cut builds right-closed intervals from these edges:
# (-1, 3], (3, 6], (6, 9], (9, 12], i.e. months 1-3, 4-6, 7-9 and 10-12.
month_bins = [-1, 3, 6, 9, 12]
bin_names = ['Q1', 'Q2', 'Q3', 'Q4']

# Derive the month from this frame's own timestamps instead of reading it from
# df_date, so the cell only depends on `df` and cannot pick up misaligned rows.
month_of_year = df_bins['time'].apply(lambda d: d.month)

df_bins['quarter_of_year'] = pd.cut(month_of_year, bins=month_bins, labels=bin_names)

In [13]:
# Display each timestamp with its quarter_of_year label.
df_bins

Unnamed: 0,time,quarter_of_year
0,2015-03-08 10:30:00.360000+00:00,Q1
1,2017-07-13 15:45:05.755000-07:00,Q3
2,2012-01-20 22:30:00.254000+05:30,Q1
3,2016-12-25 00:30:00+10:00,Q4


### Sort values by time and add a set of values
-------------------------

In [14]:
# df_raw['time'] holds strings, so sorting that column directly is
# lexicographic and ignores the UTC offsets. Order the rows by the actual
# instant instead: normalise every timestamp to UTC, sort those, and reuse the
# resulting row order. A stable sort keeps the original order for equal instants.
chronological_order = (
    pd.to_datetime(df_raw['time'], utc=True)
    .sort_values(kind='mergesort')
    .index
)

# Note: this rebinds `df`; its `time` column keeps the original strings.
df = df_raw.loc[chronological_order].reset_index(drop=True)

# Running counter 1..n, sized from the frame rather than hard-coded.
df['values'] = list(range(1, len(df) + 1))

In [15]:
# Display the sorted frame together with the new `values` column.
df

Unnamed: 0,time,values
0,2012-01-20 22:30:00.254000+05:30,1
1,2015-03-08 10:30:00.360000+00:00,2
2,2016-12-25 00:30:00.000000+10:00,3
3,2017-07-13 15:45:05.755000-07:00,4


### Lag the values variable by one period
-------------------------

In [16]:
# Shift `values` down by one row; the first row has no predecessor, so its
# lagged value is missing.
lagged_values = df['values'].shift(periods=1)

df_lag = df.assign(values_lag_1=lagged_values)

In [17]:
# Show the first rows with `values` next to its one-period lag.
df_lag.head()

Unnamed: 0,time,values,values_lag_1
0,2012-01-20 22:30:00.254000+05:30,1,
1,2015-03-08 10:30:00.360000+00:00,2,1.0
2,2016-12-25 00:30:00.000000+10:00,3,2.0
3,2017-07-13 15:45:05.755000-07:00,4,3.0


### Calculate the two-period rolling mean of the values variable
-------------------------

In [18]:
# Mean over the current row and the one before it; the first row has no full
# window, so its result is missing.
window_size = 2

df_rolling = df.copy()
rolling_mean = df_rolling['values'].rolling(window=window_size).mean()
df_rolling['values_rolling_mean_2'] = rolling_mean

In [19]:
# Show the first rows with `values` next to its two-period rolling mean.
df_rolling.head()

Unnamed: 0,time,values,values_rolling_mean_2
0,2012-01-20 22:30:00.254000+05:30,1,
1,2015-03-08 10:30:00.360000+00:00,2,1.5
2,2016-12-25 00:30:00.000000+10:00,3,2.5
3,2017-07-13 15:45:05.755000-07:00,4,3.5
