In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# datetime

In [2]:
import datetime as dt
import time

Basic python module datetime provides date, time and datetime objects. They can be converted to/from strings, can contain timezone information. Can get date and time components from datetime object.

In [3]:
d = dt.datetime(year=2020, month=1, day=31, hour=13, minute=14, second=31)  # usually omit keywords
d
dt.date(2020, 1, 3)
dt.time(12, 32)
d.date(), d.time()

datetime.datetime(2020, 1, 31, 13, 14, 31)

datetime.date(2020, 1, 3)

datetime.time(12, 32)

(datetime.date(2020, 1, 31), datetime.time(13, 14, 31))

In [4]:
d = dt.datetime.strptime('2020-10-20', r'%Y-%m-%d')
d

d.strftime('%m/%d/%Y')

datetime.datetime(2020, 10, 20, 0, 0)

'10/20/2020'

description of all format codes (like %m, %Y) can be found here
https://docs.python.org/3/library/datetime.html#datetime.timezone (section: strftime() and strptime() Format Codes)

In [5]:
date = dt.date(2020, 1, 20)
time = dt.time(11, 23)
dt.datetime.combine(date, time)

datetime.datetime(2020, 1, 20, 11, 23)

In [6]:
# get today or now
dt.date.today()
dt.datetime.now()

datetime.date(2020, 9, 16)

datetime.datetime(2020, 9, 16, 15, 37, 38, 183496)

###  timedelta objects

In [7]:
# time delta

td = dt.datetime.now() - dt.datetime(2020, 2, 1)
td
type(td)
td.days
td.seconds  # residual seconds to days

td.total_seconds()  # today seconds (float -> thus includes microseconds)

datetime.timedelta(days=228, seconds=56258, microseconds=245035)

datetime.timedelta

228

56258

19755458.245035

### computer system time representation

* all computers count time from Unit epoch instant : 1970-01-01:00-00-00 UTC

* UTC is never adjust to daylight saving, etc - the only stable timezone

* On 32-bit systems the time will overflow on 19 Jan 2038 - has to be solved in advance (similar to 2K problem)

* It's preferred to use datetime rather time module
* There may be some minor simplifications when dealing with time due to leap seconds, etc

In [8]:
import time
time.time()  # number of seconds since Unit epoch

1600267058.3006

### naive and aware datetime objects
* naive objects don't contain any timezone information
* aware objects contain timezone

* naive instances do not represent well defined moment in time
* difference is determined by **tzinfo** attribute
* python 3.9 will have new implementation zoneinfo for tzinfo, dateutils also has tzinfo implementations

* working with datetimes in different timezones is complicated for multiple reasons
* one is irregularity of daylight saving time periods - that's why time itself cannot be converted between different timezones without a date (!!!)

In [9]:
import dateutil
import pytz

In [10]:
lon_tz = dateutil.tz.gettz('Europe/London')
lon_tz

# datetime with timezone
dtz = dt.datetime(2010, 12, 21, tzinfo=lon_tz)
dtz

dtz.tzname()
dtz.utcoffset()  # expressed as timedelta

tzfile('/usr/share/zoneinfo/Europe/London')

datetime.datetime(2010, 12, 21, 0, 0, tzinfo=tzfile('/usr/share/zoneinfo/Europe/London'))

'GMT'

datetime.timedelta(0)

### timezone localization and convertion using pytz

In [11]:
pytz.utc  # UTC timezone
eastern = pytz.timezone('US/Eastern')  # timezone() returns tzinfo instance for tz aware datetime
eastern.zone  # return zone string
amsterdam = pytz.timezone('Europe/Amsterdam')

<UTC>

'US/Eastern'

In [12]:
dtm = dt.datetime(2010, 12, 21, 10)
dtz = amsterdam.localize(dtm)  #localize naive datetime

dtm
dtz
dtz.astimezone(eastern)
dtz.astimezone(eastern) == dtz  # comparison between differently localized dt works 

datetime.datetime(2010, 12, 21, 10, 0)

datetime.datetime(2010, 12, 21, 10, 0, tzinfo=<DstTzInfo 'Europe/Amsterdam' CET+1:00:00 STD>)

datetime.datetime(2010, 12, 21, 4, 0, tzinfo=<DstTzInfo 'US/Eastern' EST-1 day, 19:00:00 STD>)

True

In [13]:
pytz.all_timezones[:10]  # list of all timezones (15 shown here) - quite useful to have

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara',
 'Africa/Asmera',
 'Africa/Bamako',
 'Africa/Bangui',
 'Africa/Banjul',
 'Africa/Bissau']

### relative delta functionality (equivalents in pandas ??!)

In [14]:
# relativedelta to use months, years etc
from dateutil.relativedelta import relativedelta
next_month = relativedelta(months=+1)
dt.date.today() + next_month  

relativedelta(dt.date.today(), dt.date.today() + dt.timedelta(days = 1000))  # days to (years, months, days)

dt.date.today() + relativedelta(years=+1, months=-1)  # next year, one month before

datetime.date(2020, 10, 16)

relativedelta(years=-2, months=-8, days=-27)

datetime.date(2021, 8, 16)

In [15]:
# relativedelta has many other uses
import calendar
from dateutil import rrule

today = dt.date.today()

today + relativedelta(weekday=rrule.FR)  # nearest friday
today + relativedelta(day=31, weekday=rrule.FR(-1))  # nearest friday

datetime.date(2020, 9, 18)

datetime.date(2020, 9, 25)

In [16]:
# dateutil.rrule
list(rrule.rrule(freq=rrule.MONTHLY, count=4, dtstart=today))  # returns an iterator

[datetime.datetime(2020, 9, 16, 0, 0),
 datetime.datetime(2020, 10, 16, 0, 0),
 datetime.datetime(2020, 11, 16, 0, 0),
 datetime.datetime(2020, 12, 16, 0, 0)]

In [17]:
# every January select days by a list of weekdays
list(rrule.rrule(rrule.YEARLY, bymonth=1, byweekday=[1, 3, 5], dtstart=today,
             until=today+relativedelta(years=2)))

[datetime.datetime(2021, 1, 2, 0, 0),
 datetime.datetime(2021, 1, 5, 0, 0),
 datetime.datetime(2021, 1, 7, 0, 0),
 datetime.datetime(2021, 1, 9, 0, 0),
 datetime.datetime(2021, 1, 12, 0, 0),
 datetime.datetime(2021, 1, 14, 0, 0),
 datetime.datetime(2021, 1, 16, 0, 0),
 datetime.datetime(2021, 1, 19, 0, 0),
 datetime.datetime(2021, 1, 21, 0, 0),
 datetime.datetime(2021, 1, 23, 0, 0),
 datetime.datetime(2021, 1, 26, 0, 0),
 datetime.datetime(2021, 1, 28, 0, 0),
 datetime.datetime(2021, 1, 30, 0, 0),
 datetime.datetime(2022, 1, 1, 0, 0),
 datetime.datetime(2022, 1, 4, 0, 0),
 datetime.datetime(2022, 1, 6, 0, 0),
 datetime.datetime(2022, 1, 8, 0, 0),
 datetime.datetime(2022, 1, 11, 0, 0),
 datetime.datetime(2022, 1, 13, 0, 0),
 datetime.datetime(2022, 1, 15, 0, 0),
 datetime.datetime(2022, 1, 18, 0, 0),
 datetime.datetime(2022, 1, 20, 0, 0),
 datetime.datetime(2022, 1, 22, 0, 0),
 datetime.datetime(2022, 1, 25, 0, 0),
 datetime.datetime(2022, 1, 27, 0, 0),
 datetime.datetime(2022, 1, 29, 0

In [18]:
# monthly on the second Friday, 10 occurencies
list(rrule.rrule(rrule.MONTHLY, count=10, byweekday=rrule.FR(2), dtstart=today))

[datetime.datetime(2020, 10, 9, 0, 0),
 datetime.datetime(2020, 11, 13, 0, 0),
 datetime.datetime(2020, 12, 11, 0, 0),
 datetime.datetime(2021, 1, 8, 0, 0),
 datetime.datetime(2021, 2, 12, 0, 0),
 datetime.datetime(2021, 3, 12, 0, 0),
 datetime.datetime(2021, 4, 9, 0, 0),
 datetime.datetime(2021, 5, 14, 0, 0),
 datetime.datetime(2021, 6, 11, 0, 0),
 datetime.datetime(2021, 7, 9, 0, 0)]

lots of tricks for individual dates can be done with dateutils, see https://dateutil.readthedocs.io/en/stable/index.html

# numpy datetime64

* python native datetime is a bit too simplistic and also does provide methods to work with sequences of dates (can only have an ndarray of datetimes or dates)

* type is datetime64 to differentiate it from the native datetime format

* numpy implements np.datetime64 type which also serves as dtype for corresponding np.ndarray

* np.datetime64[ns] is a general type (machine independent), on a particular machine it may be represented by '<M8[ns]' '>M8[ns]' (little/big endian etc)
(more generally, see [here](https://stackoverflow.com/questions/29206612/difference-between-data-type-datetime64ns-and-m8ns#:~:text=2%20Answers&text=datetime64%5Bns%5D%20is%20a%20general,of%20NumPy%20to%20the%20next.&text=However%2C%20on%20a%20big%20endian%20machine%2C%20np.), same as int64 and '<i8', '>i8')

In [19]:
import numpy as np

In [21]:
# create single np.datetime64 object
date_iso = '2020-03-04'
np.datetime64(date_iso)  # from ISO string
np.datetime64(dt.date(2020, 1, 1))  # from datetime objects
np.datetime64(dt.datetime(2020, 1, 1, 10, 30))

numpy.datetime64('2020-03-04')

numpy.datetime64('2020-01-01')

numpy.datetime64('2020-01-01T10:30:00.000000')

In [22]:
dtz
np.datetime64(dtz)  # timezones are deprecated - NOT TO USE

datetime.datetime(2010, 12, 21, 10, 0, tzinfo=<DstTzInfo 'Europe/Amsterdam' CET+1:00:00 STD>)

  np.datetime64(dtz)  # timezones are deprecated - NOT TO USE


numpy.datetime64('2010-12-21T09:00:00.000000')

In [23]:
# datetime64 has unit parameter
np.datetime64('2020', 'W')
np.datetime64('2020', 'M')
np.datetime64('2020', 'Y')

numpy.datetime64('2019-12-26')

numpy.datetime64('2020-01')

numpy.datetime64('2020')

In [24]:
x = np.int64(4)
x
type(x)
np.dtype(x)

4

numpy.int64

dtype('int64')

In [25]:
d = np.datetime64('2020-08-13')
type(d)
np.dtype(d)  # dtype also shows unit

d = np.datetime64('2020-08')
np.dtype(d)  # unit is parsed automatically

d = np.datetime64('2020-08', 'D')  # but can map it explicitly
d
np.dtype(d) 

numpy.datetime64

dtype('<M8[D]')

dtype('<M8[M]')

numpy.datetime64('2020-08-01')

dtype('<M8[D]')

In [26]:
np.datetime64('nat')

numpy.datetime64('NaT')

#### np.timedelta64

In [27]:
np.timedelta64(1, 'D')  # has number and unit
np.timedelta64(1, 'D') + np.datetime64('2020-01-01')
np.timedelta64(3, 'h') + np.datetime64('2020-01-01')  # casting to smaller unit

numpy.timedelta64(1,'D')

numpy.datetime64('2020-01-02')

numpy.datetime64('2020-01-01T03','h')

In [28]:
np.datetime64('2020-02-01') - np.datetime64('2020-01-01')
np.datetime64('2020-02-01') - np.datetime64('2020-01-01', 'h')  # cast to hours

numpy.timedelta64(31,'D')

numpy.timedelta64(744,'h')

In [29]:
# timedelta arythmetics
np.timedelta64(1,'W') / np.timedelta64(1,'D')
np.timedelta64(1,'W') % np.timedelta64(10,'D')

7.0

numpy.timedelta64(7,'D')

#### units
full list can be found [here](https://numpy.org/doc/stable/reference/arrays.datetime.html) (datetime unit section)

#### working with business days

In [30]:
np.busday_offset('2011-06-23', 2)  # applied to unit 'D'

np.busday_offset('2011-03-20', 0, roll='forward')
np.busday_offset('2011-03-20', 0, roll='backward')  
# roll allows to specify if choose bday before or after

numpy.datetime64('2011-06-27')

numpy.datetime64('2011-03-21')

numpy.datetime64('2011-03-18')

In [31]:
d1 = np.datetime64('2011-07-11')
d2 = np.datetime64('2011-07-18')
np.busday_count(d1, d2)  # distance in bdays

np.busday_count([d1, d1], [d2, d2])  # can be apply to sequencies of start/end dates (!)

5

array([5, 5])

#### ranges

In [32]:
# for numpy version unit parameter may be useful in many ways, e.g.
np.arange('2005-02', '2005-03', dtype='datetime64[D]')
np.arange('2005-02', '2005-03', dtype='datetime64[W]')  # weekly frequency
# see more in the numpy guide (or put datetime related part here)

array(['2005-02-01', '2005-02-02', '2005-02-03', '2005-02-04',
       '2005-02-05', '2005-02-06', '2005-02-07', '2005-02-08',
       '2005-02-09', '2005-02-10', '2005-02-11', '2005-02-12',
       '2005-02-13', '2005-02-14', '2005-02-15', '2005-02-16',
       '2005-02-17', '2005-02-18', '2005-02-19', '2005-02-20',
       '2005-02-21', '2005-02-22', '2005-02-23', '2005-02-24',
       '2005-02-25', '2005-02-26', '2005-02-27', '2005-02-28'],
      dtype='datetime64[D]')

array(['2005-01-27', '2005-02-03', '2005-02-10', '2005-02-17'],
      dtype='datetime64[W]')

In [33]:
# create array
dtarr = np.array([dt.date(2020, 1, 1), dt.date(2020, 1, 3), dt.date(2020, 2, 1)]).astype(np.datetime64)
dtarr  # unit is D (days)

array(['2020-01-01', '2020-01-03', '2020-02-01'], dtype='datetime64[D]')

In [34]:
dtarr.astype('datetime64[ns]')  # convert unit

array(['2020-01-01T00:00:00.000000000', '2020-01-03T00:00:00.000000000',
       '2020-02-01T00:00:00.000000000'], dtype='datetime64[ns]')

* **pandas only implements an equivalent of datetime64[ns]  (see on pandas objects later)**
* it has other ways to deal with ranges and frequencies

# pandas datetime functionality
* interaction with xarray

In [35]:
import pandas as pd
import xarray as xr

In [36]:
# pandas basic individual datetime type - Timestamp
# use to_datetime to convert to Timestamp
pd.to_datetime(dt.date(2020, 1, 2))

Timestamp('2020-01-02 00:00:00')

In [37]:
date_iso = '2020-08-14'

# to_datetime can take lots of different things as argument
pd.to_datetime('08/14/2020')
pd.to_datetime(np.datetime64(date_iso))
# always better to have ISO string date as argument and specifically parse it

tst = pd.to_datetime(date_iso, format='%Y-%m-%d')
tst
# pd.to_datetime('09/14/2020', format='%Y-%m-%d')  # other formats won't work in this case

Timestamp('2020-08-14 00:00:00')

Timestamp('2020-08-14 00:00:00')

Timestamp('2020-08-14 00:00:00')

In [38]:
# convert to other formats
tst.to_datetime64()
tst.strftime('%m/%d/%Y')

# to native datetime objects
tst.to_pydatetime()
tst.date()
tst.time()

numpy.datetime64('2020-08-14T00:00:00.000000000')

'08/14/2020'

datetime.datetime(2020, 8, 14, 0, 0)

datetime.date(2020, 8, 14)

datetime.time(0, 0)

In [39]:
# Timestamp has various fields and methods
tst.year, tst.month, tst.day, tst.hour, tst.minute
tst.weekday(), tst.day_name()

(2020, 8, 14, 0, 0)

(4, 'Friday')

In [40]:
pd.NaT, pd.Period(pd.NaT), pd.Timedelta(pd.NaT)  # have NaT, can convert it to other formats

(NaT, NaT, NaT)

### pandas datetime arrays (DatetimeIndex)

In [41]:
dates = pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03'])
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq=None)

In [42]:
pd.Index(['2020-01-01', '2020-01-02', '2020-01-03'])  #could create simple index from it
# to_datetime applied to array-like object create DatetimeIndex

dates.values  # its numpy datetime64 array
pd.Index(dates.values)  # creating index from np.datetime64 automatically creates DatetimeIndex

# it has dtype and freq parameters
# dtype is always datetime64[ns] 
# (only ns is implemented, it's pandas 'equivalent' to numpy.datetime64[ns])
# unlike numpy, it can be tz aware
dates.tz_localize('Europe/London')  # getting datetime64[ns, Europe/London]

Index(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='object')

array(['2020-01-01T00:00:00.000000000', '2020-01-02T00:00:00.000000000',
       '2020-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2020-01-01 00:00:00+00:00', '2020-01-02 00:00:00+00:00',
               '2020-01-03 00:00:00+00:00'],
              dtype='datetime64[ns, Europe/London]', freq=None)

In [43]:
ser = pd.Series(['2020-01-01', '2020-01-02', '2020-01-03'])
ser
pd.to_datetime(ser)  # returns series when applied to series, DatetimeIndex otherwise

0    2020-01-01
1    2020-01-02
2    2020-01-03
dtype: object

0   2020-01-01
1   2020-01-02
2   2020-01-03
dtype: datetime64[ns]

In [44]:
# convert DatetimeIndex to other formats
dates.date
dates.time
dates.strftime('%Y-%m-%d')
dates.values
dates.to_pydatetime()

array([datetime.date(2020, 1, 1), datetime.date(2020, 1, 2),
       datetime.date(2020, 1, 3)], dtype=object)

array([datetime.time(0, 0), datetime.time(0, 0), datetime.time(0, 0)],
      dtype=object)

Index(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='object')

array(['2020-01-01T00:00:00.000000000', '2020-01-02T00:00:00.000000000',
       '2020-01-03T00:00:00.000000000'], dtype='datetime64[ns]')

array([datetime.datetime(2020, 1, 1, 0, 0),
       datetime.datetime(2020, 1, 2, 0, 0),
       datetime.datetime(2020, 1, 3, 0, 0)], dtype=object)

In [45]:
dates2 = pd.to_datetime(['2022-01-01', '2022-01-02'])

# set-like DatetimeIndex operations
dates.union(dates2)
dates.intersection(dates2)
dates.difference(dates2)

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2022-01-01',
               '2022-01-02'],
              dtype='datetime64[ns]', freq=None)

DatetimeIndex([], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03'], dtype='datetime64[ns]', freq=None)

In [46]:
# treating invalid datapoints
try:
    pd.to_datetime(['2009/07/31', 'asd'], errors='raise')  # don't allow invalid values
except ValueError:
    print('ValueError was raised')
pd.to_datetime(['2009/07/31', 'asd'], errors='ignore')  # create pd.Index, leave invalid values
pd.to_datetime(['2009/07/31', 'asd'], errors='coerce')  # map invalid values to NaT

ValueError was raised


Index(['2009/07/31', 'asd'], dtype='object')

DatetimeIndex(['2009-07-31', 'NaT'], dtype='datetime64[ns]', freq=None)

#### working with epochs

In [47]:
pd.to_datetime(1349720105100)  # default unit is 'ns' 
pd.to_datetime(1349720105100, unit='ms')  # need to specify unit if different

# construct from epoch and localize
pd.Timestamp(1262347200000000000).tz_localize('US/Pacific')

# convert timestamp to epoch (with specified unit, seconds below)
tst = pd.Timestamp('2020-01-01')
epochs_sec = (tst - pd.Timestamp('1970-01-01')) // pd.Timedelta('1s')
epochs_sec
pd.Timestamp(epochs_sec, unit='s')

Timestamp('1970-01-01 00:22:29.720105100')

Timestamp('2012-10-08 18:15:05.100000')

Timestamp('2010-01-01 12:00:00-0800', tz='US/Pacific')

1577836800

Timestamp('2020-01-01 00:00:00')

In [48]:
# DatetimeIndex using origin, epochs array and unit
pd.to_datetime([1, 2, 3], unit='D')  # default is '1970-01-01:00-00'
pd.to_datetime([1, 2, 3], unit='D', origin='1960-01-01') 

DatetimeIndex(['1970-01-02', '1970-01-03', '1970-01-04'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)

### creating ranges

In [49]:
pd.date_range('2020-01-01', '2020-01-05')
pd.date_range('2020-01-01', periods=5, freq='W')

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2020-01-05', '2020-01-12', '2020-01-19', '2020-01-26',
               '2020-02-02'],
              dtype='datetime64[ns]', freq='W-SUN')

In [50]:
dates = pd.bdate_range('2020-01-01', '2020-01-05') # has frequency
dates.union(pd.to_datetime(['2020-02-02']))  # freq is None

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-02-02'], dtype='datetime64[ns]', freq=None)

#### advanced range schemes

In [51]:
# using weekmask and holidays
weekmask = 'Mon Wed Fri'
holidays = [dt.datetime(2011, 1, 5), dt.datetime(2011, 3, 14)]
pd.bdate_range('2011-01-01', '2012-01-01', freq='C', weekmask=weekmask, holidays=holidays)
# freq 'C' means custom business days

DatetimeIndex(['2011-01-03', '2011-01-07', '2011-01-10', '2011-01-12',
               '2011-01-14', '2011-01-17', '2011-01-19', '2011-01-21',
               '2011-01-24', '2011-01-26',
               ...
               '2011-12-09', '2011-12-12', '2011-12-14', '2011-12-16',
               '2011-12-19', '2011-12-21', '2011-12-23', '2011-12-26',
               '2011-12-28', '2011-12-30'],
              dtype='datetime64[ns]', length=154, freq='C')

In [52]:
# frequency string identifiers correspond to DateOffset objects (later on this)
# can use DateOffset instead of strings

pd.date_range('2011-01-01', '2011-02-01', freq='W')
pd.date_range('2011-01-01', '2011-02-01', freq=pd.offsets.Week(1))

DatetimeIndex(['2011-01-02', '2011-01-09', '2011-01-16', '2011-01-23',
               '2011-01-30'],
              dtype='datetime64[ns]', freq='W-SUN')

DatetimeIndex(['2011-01-01', '2011-01-08', '2011-01-15', '2011-01-22',
               '2011-01-29'],
              dtype='datetime64[ns]', freq='W')

### Indexing and slicing

In [53]:
ser = pd.Series(range(92), pd.date_range('2020-01-01', periods=92))

ser.loc['2020-01-05']  # individual exact
ser.loc['2020-01'].head()  # individual slicer

ser.loc['2020-01-05': '2020-01-08']  # slice (can use partial strings as well)
# slice in pandas include both ends

4

2020-01-01    0
2020-01-02    1
2020-01-03    2
2020-01-04    3
2020-01-05    4
Freq: D, dtype: int64

2020-01-05    4
2020-01-06    5
2020-01-07    6
2020-01-08    7
Freq: D, dtype: int64

In [54]:
# exact match vs slice depends on resolution of the index
series_minute = pd.Series([1, 2, 3], pd.DatetimeIndex(['2011-12-31 23:59:00',
                                                        '2012-01-01 00:00:00',
                                                        '2012-01-01 00:02:00']))
series_minute.index.resolution

series_minute.loc['2011-12-31 23']   # hours -> returning slice
series_minute['2011-12-31 23:59']  # minute -> exact, bc equals resolution

'minute'

2011-12-31 23:59:00    1
dtype: int64

1

#### boolean indexing and date comparisons

In [55]:
try:
    ser.loc[ser.index < dt.date(2020, 2, 10)]  # cannot compare pandas datetime64[ns] and date
except TypeError:
    print('got TypeError')
        
ser.loc[ser.index < dt.datetime(2020, 2, 10)].tail()  # ok for dt.datetime
ser.loc[ser.index < '2020-02-10'].tail()  # ok for string (accepted by pd.to_datetime)
ser.loc[ser.index > np.datetime64('2020-02-10', 'h')].tail()  # ok for numpy datetime64
ser.loc[ser.index <= pd.to_datetime('2020-02-10')].tail()

got TypeError


2020-02-05    35
2020-02-06    36
2020-02-07    37
2020-02-08    38
2020-02-09    39
Freq: D, dtype: int64

2020-02-05    35
2020-02-06    36
2020-02-07    37
2020-02-08    38
2020-02-09    39
Freq: D, dtype: int64

2020-03-28    87
2020-03-29    88
2020-03-30    89
2020-03-31    90
2020-04-01    91
Freq: D, dtype: int64

2020-02-06    36
2020-02-07    37
2020-02-08    38
2020-02-09    39
2020-02-10    40
Freq: D, dtype: int64

### DateOffset objects
 * similar to a Timedelta that represents a duration of time 
 * but follows specific calendar duration rules. 
 * For example, a Timedelta day will always increment datetimes by 24 hours, while a DateOffset day will increment datetimes to the same time the next day whether a day represents 23, 24 or 25 hours due to daylight savings time. 
 * all DateOffset subclasses that are an hour or smaller (Hour, Minute, Second, Milli, Micro, Nano) behave like Timedelta and respect absolute time

In [56]:
# e.g. of difference with timedeltas

ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki')

# Respects absolute time
ts + pd.Timedelta(days=1)

# Respects calendar time
ts + pd.DateOffset(days=1)

Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki')

Timestamp('2016-10-31 00:00:00+0200', tz='Europe/Helsinki')

In [57]:
pd.DateOffset(days=2)  # instead of generic dateOffset (defaults to 1 day, can specify params)
pd.offsets.Day(2)  # can use specific classes for different offsets

<DateOffset: days=2>

<2 * Days>

correspondance of DateOffset classes and frequencies can be found [here](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html) (section DateOffset objects)

#### calendar delta offsets (like Hour, Day, Week)

In [58]:
tst = pd.to_datetime('2020-02-15')

tst + pd.offsets.Day(3)
tst + pd.offsets.BDay()
pd.offsets.BDay().rollforward(tst)  # rollforward outside grid is the same as adding 1 offset
pd.offsets.BDay().rollback(tst)

# for grid value this will be different
pd.to_datetime('2020-02-17') + pd.offsets.BDay()
pd.offsets.BDay().rollforward(pd.to_datetime('2020-02-17'))

# some offsets have anchoring parameters
pd.offsets.Week(weekday=4).rollforward('2020-09-16')

Timestamp('2020-02-18 00:00:00')

Timestamp('2020-02-17 00:00:00')

Timestamp('2020-02-17 00:00:00')

Timestamp('2020-02-14 00:00:00')

Timestamp('2020-02-18 00:00:00')

Timestamp('2020-02-17 00:00:00')

Timestamp('2020-09-18 00:00:00')

#### period start/end offsets

In [59]:
offset = pd.offsets.BQuarterEnd() 
offset.rollforward('2020-02-13')  # rollforward works as adding one offset
pd.to_datetime('2020-02-13') + pd.offsets.BQuarterEnd()  

pd.to_datetime('2020-02-13') + pd.offsets.BMonthEnd()

Timestamp('2020-03-31 00:00:00')

Timestamp('2020-03-31 00:00:00')

Timestamp('2020-02-28 00:00:00')

#### Business hour offsets
* default to 9h00-17h00

In [60]:
# date offset has frequency and offset
offset = pd.offsets.BusinessHour(start='09:00', end='18:00')  # freq='B', offset (start) at 9h00

# rollforward, rollback methods
tst = pd.Timestamp('2018-01-06 00:00:00')
offset.rollforward(tst)  # same as adding 0 offsets (but remap to the offset grid)
tst + offset  # adding one offset is NOT equal to rollforward

offset.rollback(tst)
tst - offset  # rollback AND subtract one hour (different behaviour from other offsets)

Timestamp('2018-01-08 09:00:00')

Timestamp('2018-01-08 10:00:00')

Timestamp('2018-01-05 18:00:00')

Timestamp('2018-01-05 17:00:00')

### timezones, convertions with xarray

In [61]:
tst = pd.to_datetime('2020-01-01')
idx = pd.date_range('2020-01-01', periods=10)

tst = tst.tz_localize('US/Pacific')  # localize (treat naive as this timezone time)
idx = idx.tz_localize('US/Pacific')
# don't confuse localization with convertion from UTC
tst
idx

# here naive treated as UTC and converted to local time
pd.to_datetime('2020-01-01').tz_localize('UTC').tz_convert('US/Pacific')

tst.tz_convert('US/Eastern')
idx.tz_convert('US/Eastern')

Timestamp('2020-01-01 00:00:00-0800', tz='US/Pacific')

DatetimeIndex(['2020-01-01 00:00:00-08:00', '2020-01-02 00:00:00-08:00',
               '2020-01-03 00:00:00-08:00', '2020-01-04 00:00:00-08:00',
               '2020-01-05 00:00:00-08:00', '2020-01-06 00:00:00-08:00',
               '2020-01-07 00:00:00-08:00', '2020-01-08 00:00:00-08:00',
               '2020-01-09 00:00:00-08:00', '2020-01-10 00:00:00-08:00'],
              dtype='datetime64[ns, US/Pacific]', freq=None)

Timestamp('2019-12-31 16:00:00-0800', tz='US/Pacific')

Timestamp('2020-01-01 03:00:00-0500', tz='US/Eastern')

DatetimeIndex(['2020-01-01 03:00:00-05:00', '2020-01-02 03:00:00-05:00',
               '2020-01-03 03:00:00-05:00', '2020-01-04 03:00:00-05:00',
               '2020-01-05 03:00:00-05:00', '2020-01-06 03:00:00-05:00',
               '2020-01-07 03:00:00-05:00', '2020-01-08 03:00:00-05:00',
               '2020-01-09 03:00:00-05:00', '2020-01-10 03:00:00-05:00'],
              dtype='datetime64[ns, US/Eastern]', freq=None)

In [62]:
idx
idx.tz  # None for naive
idx.tz.zone  # timezone name string

DatetimeIndex(['2020-01-01 00:00:00-08:00', '2020-01-02 00:00:00-08:00',
               '2020-01-03 00:00:00-08:00', '2020-01-04 00:00:00-08:00',
               '2020-01-05 00:00:00-08:00', '2020-01-06 00:00:00-08:00',
               '2020-01-07 00:00:00-08:00', '2020-01-08 00:00:00-08:00',
               '2020-01-09 00:00:00-08:00', '2020-01-10 00:00:00-08:00'],
              dtype='datetime64[ns, US/Pacific]', freq=None)

<DstTzInfo 'US/Pacific' LMT-1 day, 16:07:00 STD>

'US/Pacific'

#### dt accessor

In [63]:
# DatetimeIndex allows to access various fields
idx.day
idx.month

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64')

Int64Index([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype='int64')

In [64]:
# for series having timestampls as values can use dt accessor (similar to .str accessor)
pd.Series(idx).dt.hour

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

#### pandas/xarray convertions and datetimes

In [65]:
# timezone aware index and xarray
idx = pd.date_range('2020-01-01', periods=10)
df = pd.DataFrame(0, index=idx, columns=list('ABC'))
df.index.name = 'date'
df.columns.name = 'field'

xarr = xr.DataArray(df)  # get xarray
xarr  # date filed created from DatetimeIndex is of numpy.datetime64[ns] type
# xarray only supports [ns] unit similar to pandas

In [66]:
xarr['date'].dt.month  # xr.DataArray has .dt accessor similar to pandas series

In [67]:
# datetime slicing similar to pandas works via .sel(), but has to use slice(..,..) instead of :
xarr.sel(date=slice('2020-01-01', '2020-01-05'))
xarr.sel(date='2020-01')

In [68]:
# when df is converted to xarray all tz-aware indexes convert to ns epoch objects 
# (unique representation)
df1 = df.copy()
df1.index = df1.index.tz_localize('US/Eastern')
xarr = xr.DataArray(df1)
xarr

# the same hold simply when DatetimeIndex is assign to an xarray coordinate
xarr['date'] = df1.index
xarr

In [69]:
# when nc epochs convert back to pandas we get tz index
df2 = xarr.to_pandas()
df2
df2.index
# somehow tzinfo is saved?!

field,A,B,C
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-01 00:00:00-05:00,0,0,0
2020-01-02 00:00:00-05:00,0,0,0
2020-01-03 00:00:00-05:00,0,0,0
2020-01-04 00:00:00-05:00,0,0,0
2020-01-05 00:00:00-05:00,0,0,0
2020-01-06 00:00:00-05:00,0,0,0
2020-01-07 00:00:00-05:00,0,0,0
2020-01-08 00:00:00-05:00,0,0,0
2020-01-09 00:00:00-05:00,0,0,0
2020-01-10 00:00:00-05:00,0,0,0


DatetimeIndex(['2020-01-01 00:00:00-05:00', '2020-01-02 00:00:00-05:00',
               '2020-01-03 00:00:00-05:00', '2020-01-04 00:00:00-05:00',
               '2020-01-05 00:00:00-05:00', '2020-01-06 00:00:00-05:00',
               '2020-01-07 00:00:00-05:00', '2020-01-08 00:00:00-05:00',
               '2020-01-09 00:00:00-05:00', '2020-01-10 00:00:00-05:00'],
              dtype='datetime64[ns, US/Eastern]', name='date', freq=None)

In [70]:
# most clear was is to save to UTC as naive and convert
df3 = df1.copy()
df3.index = df3.index.tz_convert('UTC').tz_localize(None)
xarr = xr.DataArray(df3)
xarr
xarr['date'].values

array(['2020-01-01T05:00:00.000000000', '2020-01-02T05:00:00.000000000',
       '2020-01-03T05:00:00.000000000', '2020-01-04T05:00:00.000000000',
       '2020-01-05T05:00:00.000000000', '2020-01-06T05:00:00.000000000',
       '2020-01-07T05:00:00.000000000', '2020-01-08T05:00:00.000000000',
       '2020-01-09T05:00:00.000000000', '2020-01-10T05:00:00.000000000'],
      dtype='datetime64[ns]')

### shift, lag, frequency convertion, resampling, aggregation

In [71]:
idx = pd.date_range('2020-01-01', periods=100, freq='D')
ser = pd.Series(range(100), idx)
ser

2020-01-01     0
2020-01-02     1
2020-01-03     2
2020-01-04     3
2020-01-05     4
              ..
2020-04-05    95
2020-04-06    96
2020-04-07    97
2020-04-08    98
2020-04-09    99
Freq: D, Length: 100, dtype: int64

In [72]:
ser.shift(3)  # usual shift with number, changes the date/value alignment

2020-01-01     NaN
2020-01-02     NaN
2020-01-03     NaN
2020-01-04     0.0
2020-01-05     1.0
              ... 
2020-04-05    92.0
2020-04-06    93.0
2020-04-07    94.0
2020-04-08    95.0
2020-04-09    96.0
Freq: D, Length: 100, dtype: float64

In [73]:
ser.shift(3, freq='D')  # if freq is given, this shifts all dates
ser.shift(1, freq='W')

ser.shift(1, freq=pd.offsets.BDay())  # can pass date offsets instead of freq

2020-01-04     0
2020-01-05     1
2020-01-06     2
2020-01-07     3
2020-01-08     4
              ..
2020-04-08    95
2020-04-09    96
2020-04-10    97
2020-04-11    98
2020-04-12    99
Freq: D, Length: 100, dtype: int64

2020-01-05     0
2020-01-05     1
2020-01-05     2
2020-01-05     3
2020-01-12     4
              ..
2020-04-12    95
2020-04-12    96
2020-04-12    97
2020-04-12    98
2020-04-12    99
Length: 100, dtype: int64

2020-01-02     0
2020-01-03     1
2020-01-06     2
2020-01-06     3
2020-01-06     4
              ..
2020-04-06    95
2020-04-07    96
2020-04-08    97
2020-04-09    98
2020-04-10    99
Length: 100, dtype: int64

In [74]:
# frequency conversion

ser.asfreq('W')  # larger frequency - select points for the corresponding offset dates

# more advance usage of offsets
ser.asfreq(pd.offsets.Week(weekday=3))
ser.asfreq(pd.offsets.MonthEnd())

2020-01-05     4
2020-01-12    11
2020-01-19    18
2020-01-26    25
2020-02-02    32
2020-02-09    39
2020-02-16    46
2020-02-23    53
2020-03-01    60
2020-03-08    67
2020-03-15    74
2020-03-22    81
2020-03-29    88
2020-04-05    95
Freq: W-SUN, dtype: int64

2020-01-02     1
2020-01-09     8
2020-01-16    15
2020-01-23    22
2020-01-30    29
2020-02-06    36
2020-02-13    43
2020-02-20    50
2020-02-27    57
2020-03-05    64
2020-03-12    71
2020-03-19    78
2020-03-26    85
2020-04-02    92
2020-04-09    99
Freq: W-THU, dtype: int64

2020-01-31    30
2020-02-29    59
2020-03-31    90
Freq: M, dtype: int64

In [75]:
ser.asfreq('10h')  # smaller frequency - NaN in the extra datetime points
ser.asfreq('5h', method='pad')  # can ffill etc

2020-01-01 00:00:00    0.0
2020-01-01 10:00:00    NaN
2020-01-01 20:00:00    NaN
2020-01-02 06:00:00    NaN
2020-01-02 16:00:00    NaN
                      ... 
2020-04-07 02:00:00    NaN
2020-04-07 12:00:00    NaN
2020-04-07 22:00:00    NaN
2020-04-08 08:00:00    NaN
2020-04-08 18:00:00    NaN
Freq: 10H, Length: 238, dtype: float64

2020-01-01 00:00:00     0
2020-01-01 05:00:00     0
2020-01-01 10:00:00     0
2020-01-01 15:00:00     0
2020-01-01 20:00:00     0
                       ..
2020-04-08 03:00:00    98
2020-04-08 08:00:00    98
2020-04-08 13:00:00    98
2020-04-08 18:00:00    98
2020-04-08 23:00:00    98
Freq: 5H, Length: 476, dtype: int64

In [76]:
# resampling: .resample is similar to groupBy requires a method application

ser.resample('W').mean()
ser.resample(pd.offsets.BDay(2)).max()  # every second bday

ser.resample(pd.offsets.BMonthEnd()).sum()  # business month end

2020-01-05     2.0
2020-01-12     8.0
2020-01-19    15.0
2020-01-26    22.0
2020-02-02    29.0
2020-02-09    36.0
2020-02-16    43.0
2020-02-23    50.0
2020-03-01    57.0
2020-03-08    64.0
2020-03-15    71.0
2020-03-22    78.0
2020-03-29    85.0
2020-04-05    92.0
2020-04-12    97.5
Freq: W-SUN, dtype: float64

2020-01-01     1
2020-01-03     5
2020-01-07     7
2020-01-09    11
2020-01-13    13
2020-01-15    15
2020-01-17    19
2020-01-21    21
2020-01-23    25
2020-01-27    27
2020-01-29    29
2020-01-31    33
2020-02-04    35
2020-02-06    39
2020-02-10    41
2020-02-12    43
2020-02-14    47
2020-02-18    49
2020-02-20    53
2020-02-24    55
2020-02-26    57
2020-02-28    61
2020-03-03    63
2020-03-05    67
2020-03-09    69
2020-03-11    71
2020-03-13    75
2020-03-17    77
2020-03-19    81
2020-03-23    83
2020-03-25    85
2020-03-27    89
2020-03-31    91
2020-04-02    95
2020-04-06    97
2020-04-08    99
Freq: 2B, dtype: int64

2020-01-31     465
2020-02-28    1246
2020-03-31    2384
2020-04-30     855
Freq: BM, dtype: int64

In [77]:
# upsampling

ser.resample('6h').asfreq()  # same as freq
ser.resample('6h').ffill()  # etc

2020-01-01 00:00:00     0.0
2020-01-01 06:00:00     NaN
2020-01-01 12:00:00     NaN
2020-01-01 18:00:00     NaN
2020-01-02 00:00:00     1.0
                       ... 
2020-04-08 00:00:00    98.0
2020-04-08 06:00:00     NaN
2020-04-08 12:00:00     NaN
2020-04-08 18:00:00     NaN
2020-04-09 00:00:00    99.0
Freq: 6H, Length: 397, dtype: float64

2020-01-01 00:00:00     0
2020-01-01 06:00:00     0
2020-01-01 12:00:00     0
2020-01-01 18:00:00     0
2020-01-02 00:00:00     1
                       ..
2020-04-08 00:00:00    98
2020-04-08 06:00:00    98
2020-04-08 12:00:00    98
2020-04-08 18:00:00    98
2020-04-09 00:00:00    99
Freq: 6H, Length: 397, dtype: int64

In [78]:
ser.resample('W').agg([np.sum, np.mean, np.min])  # agg method for multiple aggregators

Unnamed: 0,sum,mean,amin
2020-01-05,10,2.0,0
2020-01-12,56,8.0,5
2020-01-19,105,15.0,12
2020-01-26,154,22.0,19
2020-02-02,203,29.0,26
2020-02-09,252,36.0,33
2020-02-16,301,43.0,40
2020-02-23,350,50.0,47
2020-03-01,399,57.0,54
2020-03-08,448,64.0,61


### periods

In [79]:
pd.Period('2011-1')  # auto
pd.Period('2011-1', freq='5D')  # specify length

Period('2011-01', 'M')

Period('2011-01-01', '5D')

In [80]:
 pd.Period('2012', freq='A-DEC')

Period('2012', 'A-DEC')

In [81]:
p = pd.Period('2012', freq='A-DEC')
p - 1  # operations with integers
p + 2

Period('2011', 'A-DEC')

Period('2014', 'A-DEC')

In [82]:
pd.Period('2020-01-01', freq='D') + pd.offsets.Hour(24)  # can add offsets if the offset freq
# is smaller and is integer divisible (24 hrs = 1 day)

Period('2020-01-02', 'D')

In [83]:
# period range
pd.period_range('1/1/2011', '1/1/2012', freq='M')

# period index
pd.PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')

PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05', '2011-06',
             '2011-07', '2011-08', '2011-09', '2011-10', '2011-11', '2011-12',
             '2012-01'],
            dtype='period[M]', freq='M')

PeriodIndex(['2011-01', '2011-02', '2011-03'], dtype='period[M]', freq='M')

In [84]:
# period frequency resampling
p = pd.Period('2011', freq='A-DEC')

p.asfreq('M', how='start')
p.asfreq('M', how='end')
# in particular that's the way to get period start end (in a given frequency)

Period('2011-01', 'M')

Period('2011-12', 'M')

In [85]:
prng = pd.period_range('1/1/2011', '1/1/2012', freq='D')

trng = prng.to_timestamp()
trng

trng.to_period()

DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04',
               '2011-01-05', '2011-01-06', '2011-01-07', '2011-01-08',
               '2011-01-09', '2011-01-10',
               ...
               '2011-12-23', '2011-12-24', '2011-12-25', '2011-12-26',
               '2011-12-27', '2011-12-28', '2011-12-29', '2011-12-30',
               '2011-12-31', '2012-01-01'],
              dtype='datetime64[ns]', length=366, freq='D')

PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04',
             '2011-01-05', '2011-01-06', '2011-01-07', '2011-01-08',
             '2011-01-09', '2011-01-10',
             ...
             '2011-12-23', '2011-12-24', '2011-12-25', '2011-12-26',
             '2011-12-27', '2011-12-28', '2011-12-29', '2011-12-30',
             '2011-12-31', '2012-01-01'],
            dtype='period[D]', length=366, freq='D')