## Introduction to Pandas

In [1]:
import pandas as pd
import numpy as np

### Time Series

DataFrame - a 2D array; Series - a 1D labeled array (the time series below are Series indexed by dates)

In [2]:
# generate 6 consecutive daily dates starting from Oct 20, 2020
# (no freq is passed; the output below reports freq='D')
dates = pd.date_range('20201020', periods=6)
dates

DatetimeIndex(['2020-10-20', '2020-10-21', '2020-10-22', '2020-10-23',
               '2020-10-24', '2020-10-25'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# hourly frequency: 10 consecutive timestamps, one per hour, starting at midnight
# ('h' is the hourly alias; the uppercase 'H' spelling is deprecated since pandas 2.2)
hours = pd.date_range('10/20/2020', periods=10, freq='h')
hours

DatetimeIndex(['2020-10-20 00:00:00', '2020-10-20 01:00:00',
               '2020-10-20 02:00:00', '2020-10-20 03:00:00',
               '2020-10-20 04:00:00', '2020-10-20 05:00:00',
               '2020-10-20 06:00:00', '2020-10-20 07:00:00',
               '2020-10-20 08:00:00', '2020-10-20 09:00:00'],
              dtype='datetime64[ns]', freq='H')

In [4]:
# slicing - first 5 entries (positions 0-4); the slice is itself a DatetimeIndex
hours[:5]

DatetimeIndex(['2020-10-20 00:00:00', '2020-10-20 01:00:00',
               '2020-10-20 02:00:00', '2020-10-20 03:00:00',
               '2020-10-20 04:00:00'],
              dtype='datetime64[ns]', freq='H')

In [5]:
# slicing - everything from position 5 onward (here the remaining 5 entries)
hours[5:]

DatetimeIndex(['2020-10-20 05:00:00', '2020-10-20 06:00:00',
               '2020-10-20 07:00:00', '2020-10-20 08:00:00',
               '2020-10-20 09:00:00'],
              dtype='datetime64[ns]', freq='H')

In [6]:
# pd.Series(data, index=index): one random value per hourly timestamp.
# No seed is set in this cell, so the values differ from run to run.
ts = pd.Series(data=np.random.randn(hours.size), index=hours)
ts

2020-10-20 00:00:00    0.902435
2020-10-20 01:00:00    0.699790
2020-10-20 02:00:00   -0.149557
2020-10-20 03:00:00    1.328731
2020-10-20 04:00:00   -0.820532
2020-10-20 05:00:00    1.210367
2020-10-20 06:00:00    0.158224
2020-10-20 07:00:00   -0.271066
2020-10-20 08:00:00   -0.531143
2020-10-20 09:00:00   -0.940547
Freq: H, dtype: float64

In [7]:
# 1-D Series: one value for each of the 10 hourly timestamps
ts.shape

(10,)

In [8]:
# change to 45 minute frequency and forward fill
# method='pad' -> propagate the last valid observation forward, so a new
# timestamp that falls between two hours takes the value of the earlier hour
# (asfreq returns a new Series; ts itself is not modified)
converted = ts.asfreq('45Min', method='pad')
converted

2020-10-20 00:00:00    0.902435
2020-10-20 00:45:00    0.902435
2020-10-20 01:30:00    0.699790
2020-10-20 02:15:00   -0.149557
2020-10-20 03:00:00    1.328731
2020-10-20 03:45:00    1.328731
2020-10-20 04:30:00   -0.820532
2020-10-20 05:15:00    1.210367
2020-10-20 06:00:00    0.158224
2020-10-20 06:45:00    0.158224
2020-10-20 07:30:00   -0.271066
2020-10-20 08:15:00   -0.531143
2020-10-20 09:00:00   -0.940547
Freq: 45T, dtype: float64

In [9]:
# 00:00 through 09:00 in 45-minute steps -> 13 timestamps
converted.shape

(13,)

In [11]:
# first 5 rows of the original hourly series (unchanged by the asfreq call above)
ts.head()

2020-10-20 00:00:00    0.902435
2020-10-20 01:00:00    0.699790
2020-10-20 02:00:00   -0.149557
2020-10-20 03:00:00    1.328731
2020-10-20 04:00:00   -0.820532
Freq: H, dtype: float64

In [12]:
# daily means - every observation falls on 2020-10-20, so there is a single bucket
ts.resample('D').mean()

2020-10-20    0.15867
Freq: D, dtype: float64

In [14]:
# 9 consecutive one-minute timestamps labelled 0..8
# ('min' is the minute alias; the single-letter 'T' spelling is deprecated since pandas 2.2)
index = pd.date_range('1/1/2000', periods=9, freq='min')
series = pd.Series(range(9), index=index)
series

2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
2000-01-01 00:05:00    5
2000-01-01 00:06:00    6
2000-01-01 00:07:00    7
2000-01-01 00:08:00    8
Freq: T, dtype: int64

In [19]:
# the hourly series again, for comparison with the minute-level upsampling in the next cell
ts

2020-10-20 00:00:00    0.902435
2020-10-20 01:00:00    0.699790
2020-10-20 02:00:00   -0.149557
2020-10-20 03:00:00    1.328731
2020-10-20 04:00:00   -0.820532
2020-10-20 05:00:00    1.210367
2020-10-20 06:00:00    0.158224
2020-10-20 07:00:00   -0.271066
2020-10-20 08:00:00   -0.531143
2020-10-20 09:00:00   -0.940547
Freq: H, dtype: float64

In [21]:
# upsample to 1-minute intervals and back-fill: each new minute takes the value
# of the NEXT hourly observation (compare 00:00 with 00:01 in the output)
# 'min' stands for minute intervals, '3min' for 3 minute intervals
# ('T' is the deprecated spelling of the same alias)
ts.resample('min').bfill()[0:5]

2020-10-20 00:00:00    0.902435
2020-10-20 00:01:00    0.699790
2020-10-20 00:02:00    0.699790
2020-10-20 00:03:00    0.699790
2020-10-20 00:04:00    0.699790
Freq: T, dtype: float64

In [22]:
# a time series is an ordinary pandas Series; the time information lives in its index
type(ts)

pandas.core.series.Series

### Time Stamps vs. Time Spans

In [23]:
# NOTE(review): consider moving this import into the import cell at the top of the notebook
from datetime import datetime
# time stamp - a single point in time, here built from a standard-library datetime
pd.Timestamp(datetime(2016, 8, 2))

Timestamp('2016-08-02 00:00:00')

In [24]:
# or directly from a date string
pd.Timestamp('2012-08-02')

Timestamp('2012-08-02 00:00:00')

In [25]:
# time span - a period of time; the monthly frequency ('M') is inferred from the string
pd.Period('2016-08')

Period('2016-08', 'M')

In [26]:
# with an explicit daily frequency the same string resolves to the first day of that month
pd.Period('2016-08', freq='D')

Period('2016-08-01', 'D')

In [27]:
# a plain Python list of time stamps, one per consecutive day
dates = [pd.Timestamp(day) for day in ('2016-08-02', '2016-08-03', '2016-08-04')]
dates

[Timestamp('2016-08-02 00:00:00'),
 Timestamp('2016-08-03 00:00:00'),
 Timestamp('2016-08-04 00:00:00')]

In [28]:
# a Series whose index is the list of time stamps (values are random; no seed is set)
ts = pd.Series(data=np.random.randn(3), index=dates)
ts

2016-08-02   -1.563285
2016-08-03    0.271306
2016-08-04   -0.717749
dtype: float64

In [29]:
# a Series labelled with Timestamps gets a DatetimeIndex
type(ts.index)

pandas.core.indexes.datetimes.DatetimeIndex

In [30]:
# a plain Python list of three consecutive monthly periods
periods = [pd.Period(month) for month in ('2016-08', '2016-09', '2016-10')]
periods

[Period('2016-08', 'M'), Period('2016-09', 'M'), Period('2016-10', 'M')]

In [31]:
# the same construction with Period labels instead of time stamps
# (values are random; no seed is set)
ts = pd.Series(data=np.random.randn(3), index=periods)
ts

2016-08    1.091859
2016-09   -1.085834
2016-10   -0.991741
Freq: M, dtype: float64

In [42]:
# a Series labelled with Periods gets a PeriodIndex
# NOTE(review): the recorded output below has an out-of-sequence execution count and a
# pandas.tseries.* module path, unlike the pandas.core.indexes.* paths shown elsewhere -
# it looks like it came from an older session; re-run the notebook to refresh it
type(ts.index)

pandas.tseries.period.PeriodIndex

Converting to Timestamps

In [32]:
# convert a Series of date strings to datetime; a missing value (None) becomes NaT.
# All strings share one format: pandas >= 2.0 infers the format from the first
# element and applies it to every element, so mixing e.g. 'Jul 31, 2016' with
# '2010-08-01' raises there unless format='mixed' is passed.
ts = pd.to_datetime(pd.Series(['Jul 31, 2016', 'Aug 01, 2010', None]))
ts

0   2016-07-31
1   2010-08-01
2          NaT
dtype: datetime64[ns]

In [33]:
# to_datetime converted the values, not the labels: the index is still the default RangeIndex
type(ts.index)

pandas.core.indexes.range.RangeIndex

In [34]:
# a list of date strings is converted to a DatetimeIndex.
# Both strings use the same separator: pandas >= 2.0 infers a single format from
# the first element, so mixing '2005/11/23' with '2010.12.31' raises there
# unless format='mixed' is passed.
pd.to_datetime(['2005/11/23', '2010/12/31'])

DatetimeIndex(['2005-11-23', '2010-12-31'], dtype='datetime64[ns]', freq=None)

In [35]:
# European style dates (day before month) - pass dayfirst=True; returns a DatetimeIndex
# here '02-08-2016' is read as 2 August 2016, as the output shows
pd.to_datetime(['02-08-2016 10:00'], dayfirst=True)

DatetimeIndex(['2016-08-02 10:00:00'], dtype='datetime64[ns]', freq=None)

Invalid Data

In [37]:
# errors='coerce' turns every string that cannot be parsed into NaT (Not a Time)
pd.to_datetime(['2009/07/31', 'asd'], errors='coerce')

DatetimeIndex(['2009-07-31', 'NaT'], dtype='datetime64[ns]', freq=None)

Range of Time Stamps

In [39]:
# 10 month-end timestamps starting from January 2000.
# An explicit MonthEnd offset replaces the string alias 'M', which is deprecated
# since pandas 2.2 (renamed 'ME'); the offset object works on old and new versions.
index = pd.date_range('2000-1-1', periods=10, freq=pd.offsets.MonthEnd())

In [40]:
# each stamp is the last calendar day of its month (note 2000-02-29 in the output)
index

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-30',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-30', '2000-10-31'],
              dtype='datetime64[ns]', freq='M')

Limitations

In [86]:
# earliest representable Timestamp; the nanosecond-based representation
# (the datetime64[ns] dtype seen in earlier outputs) is what bounds the range
pd.Timestamp.min

Timestamp('1677-09-22 00:12:43.145225')

In [87]:
# latest representable Timestamp under the same nanosecond-based representation
pd.Timestamp.max

Timestamp('2262-04-11 23:47:16.854775807')