# Demo notebook for datetime series

In [1]:
import pandas as pd
import numpy as np

Pandas has a datetime64[ns] datatype. There are several relevant methods and attributes:
- `pd.to_datetime()` #change datatype to datetime
- `pd.date_range()` #create a time range
- `df.resample()` #resample to another frequency (when datetime is index)
- `pd.Timestamp.now()` #fetch current time
- `df['column'].dt.dayofyear` #get the day of the year from a date
- `df.index.month` #get the month of the index (which is a datetime)
- `df.groupby([df.index.month])` #group by month

In [2]:
# Build a DatetimeIndex of five timestamps spaced one nanosecond apart,
# starting at midnight on 1 January 2019.
nano_range = pd.date_range(start='1/1/2019', periods=5, freq='ns')
nano_range

DatetimeIndex([          '2019-01-01 00:00:00',
               '2019-01-01 00:00:00.000000001',
               '2019-01-01 00:00:00.000000002',
               '2019-01-01 00:00:00.000000003',
               '2019-01-01 00:00:00.000000004'],
              dtype='datetime64[ns]', freq='N')

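There are many frequency aliases. Note: pandas 2.2 and later deprecate several of the aliases below in favour of new spellings (for example `ME` instead of `M`, `QE` instead of `Q`, `YE` instead of `A`/`Y`, `h` instead of `H`, `min` instead of `T`, `s` instead of `S`).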

Alias    Description
- B        business day frequency
- C        custom business day frequency
- D        calendar day frequency
- W        weekly frequency
- M        month end frequency
- SM       semi-month end frequency (15th and end of month)
- BM       business month end frequency
- CBM      custom business month end frequency
- MS       month start frequency
- SMS      semi-month start frequency (1st and 15th)
- BMS      business month start frequency
- CBMS     custom business month start frequency
- Q        quarter end frequency
- BQ       business quarter end frequency
- QS       quarter start frequency
- BQS      business quarter start frequency
- A, Y     year end frequency
- BA, BY   business year end frequency
- AS, YS   year start frequency
- BAS, BYS business year start frequency
- BH       business hour frequency
- H        hourly frequency
- T, min   minutely frequency
- S        secondly frequency
- L, ms    milliseconds
- U, us    microseconds
- N        nanoseconds

In [3]:
# Build a demo DataFrame of daily sales per location with a DatetimeIndex.
N = 250  # there are about 250 business days in a year
locations = ['Leeuwarden', 'Groningen', 'Assen']

# Seeded generator so that "Restart & Run All" reproduces the same numbers.
rng = np.random.default_rng(42)

# NOTE: despite the "20"/"21" suffixes, these frames hold the 2019 and 2020
# data respectively (the names are kept so other cells keep working).
# freq='B' yields business days only, so weekends are absent from the index.
dates20 = pd.date_range('1/1/2019', periods=N, freq='B')
sales20 = pd.DataFrame(
    rng.integers(7, 33, size=(N, len(locations))),  # values in [7, 33)
    index=dates20,
    columns=locations,
)

dates21 = pd.date_range('1/1/2020', periods=N, freq='B')
sales21 = pd.DataFrame(
    rng.integers(10, 40, size=(N, len(locations))),  # values in [10, 40)
    index=dates21,
    columns=locations,
)

# Stack both years into a single frame
sales = pd.concat([sales20, sales21])
sales.head()

Unnamed: 0,Leeuwarden,Groningen,Assen
2019-01-01,21,31,23
2019-01-02,22,28,13
2019-01-03,17,9,32
2019-01-04,28,14,17
2019-01-07,19,28,24


In [4]:
# Summarize the index type, the column dtypes and the non-null counts
sales.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 500 entries, 2019-01-01 to 2020-12-15
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Leeuwarden  500 non-null    int64
 1   Groningen   500 non-null    int64
 2   Assen       500 non-null    int64
dtypes: int64(3)
memory usage: 15.6 KB


In [5]:
# Resample the business-day data to monthly bins ('M' = month end)
# and keep the minimum of each bin.
monthly_min = sales.resample('M').min()
monthly_min.head(7)

Unnamed: 0,Leeuwarden,Groningen,Assen
2019-01-31,8,9,7
2019-02-28,11,7,7
2019-03-31,7,8,7
2019-04-30,8,7,7
2019-05-31,8,8,8
2019-06-30,7,7,8
2019-07-31,7,7,7


In [6]:
# Move the datetime index into an ordinary column and call that column 'Datum'
sales = sales.reset_index().rename(columns={'index': 'Datum'})

In [7]:
# Check the result: 'Datum' should now be listed as a datetime64[ns] column
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Datum       500 non-null    datetime64[ns]
 1   Leeuwarden  500 non-null    int64         
 2   Groningen   500 non-null    int64         
 3   Assen       500 non-null    int64         
dtypes: datetime64[ns](1), int64(3)
memory usage: 15.8 KB


In [8]:
#from a datetime datatype I can extract attributes

In [9]:
# The .dt accessor exposes datetime components; here the day of the month
sales['Day'] = sales['Datum'].dt.day

In [10]:
# Month number of each date
sales['Month'] = sales['Datum'].dt.month

In [11]:
# Calendar year of each date
sales['Year'] = sales['Datum'].dt.year

In [12]:
# Display the frame with the added Day, Month and Year columns
sales

Unnamed: 0,Datum,Leeuwarden,Groningen,Assen,Day,Month,Year
0,2019-01-01,21,31,23,1,1,2019
1,2019-01-02,22,28,13,2,1,2019
2,2019-01-03,17,9,32,3,1,2019
3,2019-01-04,28,14,17,4,1,2019
4,2019-01-07,19,28,24,7,1,2019
...,...,...,...,...,...,...,...
495,2020-12-09,28,16,14,9,12,2020
496,2020-12-10,25,20,31,10,12,2020
497,2020-12-11,18,32,39,11,12,2020
498,2020-12-14,12,26,29,14,12,2020


In [13]:
# Quarter of the year for each date
sales['Quarter'] = sales['Datum'].dt.quarter

In [14]:
# Display the frame again, now including the Quarter column
sales

Unnamed: 0,Datum,Leeuwarden,Groningen,Assen,Day,Month,Year,Quarter
0,2019-01-01,21,31,23,1,1,2019,1
1,2019-01-02,22,28,13,2,1,2019,1
2,2019-01-03,17,9,32,3,1,2019,1
3,2019-01-04,28,14,17,4,1,2019,1
4,2019-01-07,19,28,24,7,1,2019,1
...,...,...,...,...,...,...,...,...
495,2020-12-09,28,16,14,9,12,2020,4
496,2020-12-10,25,20,31,10,12,2020,4
497,2020-12-11,18,32,39,11,12,2020,4
498,2020-12-14,12,26,29,14,12,2020,4


In [15]:
# Make 'Datum' the index again; resample() below works on a datetime index
sales = sales.set_index('Datum')

In [16]:
# the sample frequency was business days. 
# Let us resample to generate some missing values

In [17]:
# Upsample from business days to calendar days. Weekend days have no source
# rows, so their mean is NaN in every column (including Day/Month/Year/Quarter),
# and the integer columns are shown as floats afterwards.
sales = sales.resample('D').mean()

In [18]:
sales.head(10)

Unnamed: 0_level_0,Leeuwarden,Groningen,Assen,Day,Month,Year,Quarter
Datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01,21.0,31.0,23.0,1.0,1.0,2019.0,1.0
2019-01-02,22.0,28.0,13.0,2.0,1.0,2019.0,1.0
2019-01-03,17.0,9.0,32.0,3.0,1.0,2019.0,1.0
2019-01-04,28.0,14.0,17.0,4.0,1.0,2019.0,1.0
2019-01-05,,,,,,,
2019-01-06,,,,,,,
2019-01-07,19.0,28.0,24.0,7.0,1.0,2019.0,1.0
2019-01-08,18.0,16.0,12.0,8.0,1.0,2019.0,1.0
2019-01-09,17.0,32.0,7.0,9.0,1.0,2019.0,1.0
2019-01-10,22.0,20.0,30.0,10.0,1.0,2019.0,1.0


In [19]:
# Missing values can be filled from neighbouring rows. Because the data is
# time-ordered, carrying the next observation backwards (backward fill) or the
# previous observation forwards (forward fill) across the gap makes sense.
# bfill()/ffill() are called directly because fillna(method=...) is deprecated
# in recent pandas versions; the result is the same.
sales.bfill().head(10)  # or sales.ffill()

Unnamed: 0_level_0,Leeuwarden,Groningen,Assen,Day,Month,Year,Quarter
Datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01,21.0,31.0,23.0,1.0,1.0,2019.0,1.0
2019-01-02,22.0,28.0,13.0,2.0,1.0,2019.0,1.0
2019-01-03,17.0,9.0,32.0,3.0,1.0,2019.0,1.0
2019-01-04,28.0,14.0,17.0,4.0,1.0,2019.0,1.0
2019-01-05,19.0,28.0,24.0,7.0,1.0,2019.0,1.0
2019-01-06,19.0,28.0,24.0,7.0,1.0,2019.0,1.0
2019-01-07,19.0,28.0,24.0,7.0,1.0,2019.0,1.0
2019-01-08,18.0,16.0,12.0,8.0,1.0,2019.0,1.0
2019-01-09,17.0,32.0,7.0,9.0,1.0,2019.0,1.0
2019-01-10,22.0,20.0,30.0,10.0,1.0,2019.0,1.0


In [21]:
# I also can interpolate: with the default settings the gaps are filled with
# evenly spaced values between the surrounding observations
sales.interpolate().head(10)

Unnamed: 0_level_0,Leeuwarden,Groningen,Assen,Day,Month,Year,Quarter
Datum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01,21.0,31.0,23.0,1.0,1.0,2019.0,1.0
2019-01-02,22.0,28.0,13.0,2.0,1.0,2019.0,1.0
2019-01-03,17.0,9.0,32.0,3.0,1.0,2019.0,1.0
2019-01-04,28.0,14.0,17.0,4.0,1.0,2019.0,1.0
2019-01-05,25.0,18.666667,19.333333,5.0,1.0,2019.0,1.0
2019-01-06,22.0,23.333333,21.666667,6.0,1.0,2019.0,1.0
2019-01-07,19.0,28.0,24.0,7.0,1.0,2019.0,1.0
2019-01-08,18.0,16.0,12.0,8.0,1.0,2019.0,1.0
2019-01-09,17.0,32.0,7.0,9.0,1.0,2019.0,1.0
2019-01-10,22.0,20.0,30.0,10.0,1.0,2019.0,1.0


In [22]:
# Subtracting two dates yields a time difference; dividing it by a unit
# timedelta expresses that difference as a plain number in that unit
# (here days; other units such as 'h' or 'W' work the same way).
sales = sales.reset_index()
# Latest minus earliest date, so the span comes out positive.
(sales['Datum'].max() - sales['Datum'].min()) / np.timedelta64(1, 'D')