In [2]:
import pandas as pd

## Working with dates

Need to master the followings :

- work with different formats and timezones (know how to translate from a format to another and from a tz to another) 
- filter dates on specific year / hours etc...
- operations with the dates (& with business operations)

source = https://pandas.pydata.org/docs/user_guide/timeseries.html

In [9]:
#create a df for the year 2021 with the month in index
pd.Series(range(12), index = pd.date_range('2021', periods=12, freq='M'))

2021-01-31     0
2021-02-28     1
2021-03-31     2
2021-04-30     3
2021-05-31     4
2021-06-30     5
2021-07-31     6
2021-08-31     7
2021-09-30     8
2021-10-31     9
2021-11-30    10
2021-12-31    11
Freq: M, dtype: int64

In [10]:
#Note that you can specify different frequencies e.g
pd.date_range('2021', periods=12, freq='3M')

DatetimeIndex(['2021-01-31', '2021-04-30', '2021-07-31', '2021-10-31',
               '2022-01-31', '2022-04-30', '2022-07-31', '2022-10-31',
               '2023-01-31', '2023-04-30', '2023-07-31', '2023-10-31'],
              dtype='datetime64[ns]', freq='3M')

In [11]:
d = pd.to_datetime('2021-03-31')
d.day_name()

'Wednesday'

In [12]:
d

Timestamp('2021-03-31 00:00:00')

In [13]:
#Creating a Timestamp
ts = pd.Timestamp('2010-12-05')

#or
import datetime
ts = datetime.datetime(2010,12,5)
ts

datetime.datetime(2010, 12, 5, 0, 0)

In [14]:
#Note that you can specify the timezone
pd.Timestamp('2018-01-18', tz= 'US/Pacific')

Timestamp('2018-01-18 00:00:00-0800', tz='US/Pacific')

In [15]:
#you can access and get ride of it
ts = pd.Timestamp('2018-01-18', tz= 'US/Pacific')
print(ts.tz)
ts_bis = ts.tz_localize(None)
print(ts_bis.tz)

US/Pacific
None


In [16]:
#creating a DatetimeIndex (a list of timestamps)
di = [pd.Timestamp('2010-12-05'),
pd.Timestamp('2010-12-06'),
pd.Timestamp('2010-12-07')]
type(di)

list

In [17]:
#creating an array of datetime e.g. a Datetimeindex
pd.to_datetime(['01/12/2015', '28/11/2012'], dayfirst=True)

DatetimeIndex(['2015-12-01', '2012-11-28'], dtype='datetime64[ns]', freq=None)

In [18]:
pd.Series(di)

0   2010-12-05
1   2010-12-06
2   2010-12-07
dtype: datetime64[ns]

In [19]:
#Quick exercice
s = pd.Series(pd.date_range('2021', periods=12, freq='M'))
df = pd.DataFrame({'dates': s, 'day_name' : s.dt.day_name()})

pandas.Series.dt => Accessor object for datetimelike properties of the Series values.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.html

Attributes: year, month, tzinfo, etc...

In [20]:
tmp = pd.DataFrame({'year': s.dt.year, 'day': s.dt.day})
df = pd.concat([df, tmp], axis=1)

In [21]:
df

Unnamed: 0,dates,day_name,year,day
0,2021-01-31,Sunday,2021,31
1,2021-02-28,Sunday,2021,28
2,2021-03-31,Wednesday,2021,31
3,2021-04-30,Friday,2021,30
4,2021-05-31,Monday,2021,31
5,2021-06-30,Wednesday,2021,30
6,2021-07-31,Saturday,2021,31
7,2021-08-31,Tuesday,2021,31
8,2021-09-30,Thursday,2021,30
9,2021-10-31,Sunday,2021,31


In [22]:
#get the decade 
def decade(y):
    return str(y)[2] + '0'

df['decade'] = df['year'].apply(decade)

In [23]:
df.head(n=3)

Unnamed: 0,dates,day_name,year,day,decade
0,2021-01-31,Sunday,2021,31,20
1,2021-02-28,Sunday,2021,28,20
2,2021-03-31,Wednesday,2021,31,20


In [24]:
#periods
#trimestre = quarters
df['quarters'] = df['dates'].dt.to_period(freq='Q')

Exo w3resource : https://www.w3resource.com/python-exercises/pandas/time-series/index.php

1) 

- a) Datetime object for Jan 15 2012.
- b) Specific date and time of 9:20 pm.
- c) Local date and time.
- d) A date without time.
- e) Current date.
- f) Time from a datetime.
- g) Current local time.

In [25]:
print(pd.Timestamp('2012-01-15'))
print(pd.Timestamp('2012-01-15 21:20:00'))
print(pd.to_datetime('today')) 
#Note a question but can be useful 
print(pd.to_datetime('today').normalize())
print(pd.Timestamp('2012-01-15 21:20:00').date()) #no need for the .dt, already a datetime / timestamp
print(pd.to_datetime('today').date())
print(pd.Timestamp('2012-01-15 21:20:00').time())

2012-01-15 00:00:00
2012-01-15 21:20:00
2021-03-11 17:41:09.135514
2021-03-11 00:00:00
2012-01-15
2021-03-11
21:20:00


In [26]:
#Note that you can do the same with datetime
import datetime
datetime.datetime.now()

datetime.datetime(2021, 3, 11, 17, 41, 9, 149426)

2)  Write a Pandas program to print the day after and before a specified date. Also print the days between two given dates

In [27]:
today = pd.to_datetime('today')
print(f'today {today.date()}')
tomorrow = today + pd.Timedelta(days=1)
print(f'tomorrow {tomorrow.date()}')
yesterday = today - pd.Timedelta(days=1)
print(f'yesterday {yesterday.date()}')

#daterange
date1 = pd.Timestamp('2012-01-15')
date2 = pd.Timestamp('2012-01-25')
pd.Series(pd.date_range(date1, date2, freq='D'))

today 2021-03-11
tomorrow 2021-03-12
yesterday 2021-03-10


0    2012-01-15
1    2012-01-16
2    2012-01-17
3    2012-01-18
4    2012-01-19
5    2012-01-20
6    2012-01-21
7    2012-01-22
8    2012-01-23
9    2012-01-24
10   2012-01-25
dtype: datetime64[ns]

Write a Pandas program to create a time-series with two index labels and random values. Also print the type of the index

In [28]:
idx = pd.date_range('2012-01-01', periods=20, freq='D')
df = pd.DataFrame({'x':np.random.randint(0,10,20), 'type': idx.dtype}, index=idx)
df.head()

Unnamed: 0,x,type
2012-01-01,0,datetime64[ns]
2012-01-02,9,datetime64[ns]
2012-01-03,0,datetime64[ns]
2012-01-04,2,datetime64[ns]
2012-01-05,4,datetime64[ns]


Write a Pandas program to check if a day is a business day (weekday) or not.

In [29]:
#First look at pd.bdate_range() does : it gives you a DatatimeIndex of all the business days bewteen 2 dates! 
# e.g.
#Note how the two weekend days are skipped in the result.
pd.bdate_range(start='1/1/2018', end='1/08/2018') #ALSO be careful at the english notation %m/%d/%y

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-08'],
              dtype='datetime64[ns]', freq='B')

In [30]:
#let's get back to our problem
def isBusinessDay(date):
    return bool(len(pd.bdate_range(date,date)))

In [31]:
#check
print(isBusinessDay('2018-01-01'))
print(isBusinessDay('2018-01-06'))

True
False


Note : bool() python function

The bool() function returns the boolean value of a specified object.

The object will always return True, unless:

- The object is empty, like [], (), {}
- The object is False
- The object is 0
- The object is None

In [32]:
x = pd.Series(pd.date_range('2018-01-01', '2018-01-08'))
y = x.apply(isBusinessDay)
pd.DataFrame({'y':y, 'x':x})

Unnamed: 0,y,x
0,True,2018-01-01
1,True,2018-01-02
2,True,2018-01-03
3,True,2018-01-04
4,True,2018-01-05
5,False,2018-01-06
6,False,2018-01-07
7,True,2018-01-08


Write a Pandas program to convert unix/epoch time to a regular time stamp in UTC. Also convert the said timestamp in to a given time zone

Note : 
**UTC** = Coordinated Universal Time or UTC is the primary time standard by which the world regulates clocks and time. It is within about 1 second of mean solar time at 0° longitude, and is not adjusted for daylight saving time. It is effectively a successor to Greenwich Mean Time

In [33]:
#unix -> datetime
unix_times = ['1095379198.75', '1095379199.00', '1095379199.25']
pd.to_datetime(unix_times, unit='s')

DatetimeIndex(['2004-09-16 23:59:58.750000',        '2004-09-16 23:59:59',
               '2004-09-16 23:59:59.250000'],
              dtype='datetime64[ns]', freq=None)

In [34]:
# By default, time series objects of pandas do not have an assigned time zone.

time_stamp = pd.to_datetime('today')
print('Today', time_stamp)
print("\nConvert the said timestamp in to US/Pacific:")
print(time_stamp.tz_localize('UTC').tz_convert('US/Pacific'))
print("\nConvert the said timestamp in to Europe/Berlin:")
print(time_stamp.tz_localize('UTC').tz_convert('Europe/Berlin'))

Today 2021-03-11 17:41:09.332497

Convert the said timestamp in to US/Pacific:
2021-03-11 09:41:09.332497-08:00

Convert the said timestamp in to Europe/Berlin:
2021-03-11 18:41:09.332497+01:00


 Write a Pandas program to subtract two timestamps of same time zone or different time zone

In [35]:
ts1 = pd.Timestamp('2012-01-10 12:00:22', tz= 'Europe/Berlin')
ts2 = pd.Timestamp('2012-01-10 01:00:22', tz= 'Europe/Berlin')
print(ts1 - ts2)

0 days 11:00:00


In [36]:
#different timezone
ts1 = pd.Timestamp('2012-01-10 12:00:22', tz= 'Europe/Berlin')
ts2 = pd.Timestamp('2012-01-10 01:00:22', tz= 'US/Pacific')

#let's remove the timezones
print(ts1.tz_localize(None) - ts2.tz_localize(None))

0 days 11:00:00


Write a Pandas program to calculate all Thursdays between two given days.

In [37]:
import numpy as np 

def ThursdayCount(start, end):
    #Note that Timedelta objects have read-only instance attributes .days, .seconds, and .microseconds.
    period = (end - start).days
    dr = pd.Series(pd.date_range(start, periods=30, freq='D'))
    print(dr.dt.day_name())
    print('Total count of Thursdays', np.sum(dr.dt.day_name() == 'Thursday'))

In [38]:
day1 = pd.Timestamp('2021-01-01')
day2 = pd.Timestamp('2021-01-31')

In [39]:
(day1 - day2).days

-30

### Filter dates !!

In [3]:
s = pd.Series(pd.date_range('2021', periods=12, freq='M'))

df = pd.concat([s,
           s.dt.month_name(),
           s.dt.year,
           s.dt.to_period(freq='Q'),
           s.apply(lambda s: str(s)[:2] + 's')
          ], 
          axis=1, keys=['date', 'month', 'year', 'Q', 'decade'])

In [4]:
df

Unnamed: 0,date,month,year,Q,decade
0,2021-01-31,January,2021,2021Q1,20s
1,2021-02-28,February,2021,2021Q1,20s
2,2021-03-31,March,2021,2021Q1,20s
3,2021-04-30,April,2021,2021Q2,20s
4,2021-05-31,May,2021,2021Q2,20s
5,2021-06-30,June,2021,2021Q2,20s
6,2021-07-31,July,2021,2021Q3,20s
7,2021-08-31,August,2021,2021Q3,20s
8,2021-09-30,September,2021,2021Q3,20s
9,2021-10-31,October,2021,2021Q4,20s


In [8]:
#1ST WAY: with the indexes

tmp = df.set_index(df['date'])
tmp = tmp.drop('date', axis=1)

In [6]:
tmp.loc['2021-01-31': '2021-03-31']

Unnamed: 0_level_0,month,year,Q,decade
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-31,January,2021,2021Q1,20s
2021-02-28,February,2021,2021Q1,20s
2021-03-31,March,2021,2021Q1,20s


In [9]:
#2nd WAY: without the indexes

df.loc[(df.date >= '2021-01-31') & (df.date <= '2021-07-31')]

Unnamed: 0,date,month,year,Q,decade
0,2021-01-31,January,2021,2021Q1,20s
1,2021-02-28,February,2021,2021Q1,20s
2,2021-03-31,March,2021,2021Q1,20s
3,2021-04-30,April,2021,2021Q2,20s
4,2021-05-31,May,2021,2021Q2,20s
5,2021-06-30,June,2021,2021Q2,20s
6,2021-07-31,July,2021,2021Q3,20s
