In [1]:
import numpy as np
import pandas as pd

# 1.日期和时间数据类型及工具

In [2]:
from datetime import datetime

In [2]:
now = datetime.now()
now

datetime.datetime(2019, 7, 16, 15, 51, 57, 525251)

In [3]:
now.year,now.month,now.day

(2019, 7, 16)

datetime以毫秒形式存储日期和时间。timedelta表示两个datetime对象之间的时间差：

In [5]:
delta = datetime(2011,1,7)-datetime(2008,6,24,8,15)
delta

datetime.timedelta(days=926, seconds=56700)

可以给datetime对象加上（或减去）一个或多个timedelta，就会产生一个新对象：

In [6]:
from datetime import timedelta
start = datetime(2011,1,7)
start + timedelta(12)     # 12表示days

datetime.datetime(2011, 1, 19, 0, 0)

In [7]:
start-2*timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

# 字符串和datetime的相互转换

利用str或strftime方法（传入一个格式化字符串），datetime对象和pandas的Timestamp
对象可以被格式化为字符串：

In [8]:
stamp = datetime(2011,1,3)
stamp

datetime.datetime(2011, 1, 3, 0, 0)

In [9]:
str(stamp)

'2011-01-03 00:00:00'

In [13]:
stamp.strftime('%Y-%m-%d')    # %Y 4位数的年  与  %y 2位数的年 

'2011-01-03'

datetime.strptime可以用格式化编码将字符串转为日期：

In [15]:
value  = '2011-01-03'
datetime.strptime(value,'%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [17]:
datestrs = ['7/6/2011','8/6/2011']
[datetime.strptime(x,'%m/%d/%Y') for x in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

datetime.strptime是用过已知格式进行日期解析的最佳方式。但是每次都要编写格式定义
是很麻烦的事。对于常见的日期格式，可以使用dateutil中的parser.parse方法：

In [19]:
from dateutil.parser import parse
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

dateutil可以解析几乎所有人类能够理解的日期表示形式：

In [26]:
parse('Jan 31, 1997  10:45 PM')  # 注意逗号后面需要空格

datetime.datetime(1997, 1, 31, 22, 45)

日出现在月的前面很普遍，传入dayfirst=True即可解决这个问题：

In [27]:
parse('6/12/2011')

datetime.datetime(2011, 6, 12, 0, 0)

In [28]:
parse('6/12/2011',dayfirst = True)

datetime.datetime(2011, 12, 6, 0, 0)

pandas的 to_datetime可以解析多种不同的日起形式（对标准的很快）。 解析是将
字符串格式转换为日期形式（datetime）

In [32]:
datestrs = ['2011-07-06 12:00:00','2011-08-06 00:00:00']
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

还可以处理缺失值（None、空字符串等）

In [33]:
idx = pd.to_datetime(datestrs+[None])
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [37]:
idx[2]    # NaT (Not a Time)是pandas中时间戳数据的null值

NaT

In [35]:
pd.isnull(idx)

array([False, False,  True])

# 2.时间序列基础

pandas最基础的时间序列类型就是以时间戳（通常以Python字符串或datetime对象表示）
为索引的Series：

In [41]:
from datetime import datetime
dates = [datetime(2011,1,2),datetime(2011,1,5),
        datetime(2011,1,7),datetime(2011,1,8),
        datetime(2011,1,10),datetime(2011,1,12)]
ts = pd.Series(np.random.randn(6),index = dates)
ts

2011-01-02    0.380906
2011-01-05   -0.276791
2011-01-07   -1.516513
2011-01-08    1.779521
2011-01-10    0.796415
2011-01-12   -0.419013
dtype: float64

这些datetime对象实际被放在一个DatetimeIndex中：

In [44]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

跟其他Series一样，不同索引的时间序列之间的算术运算会自动按日期对齐：

In [42]:
ts[::2]

2011-01-02    0.380906
2011-01-07   -1.516513
2011-01-10    0.796415
dtype: float64

In [43]:
ts+ts[::2]  # 取开头，每个两个取一个

2011-01-02    0.761812
2011-01-05         NaN
2011-01-07   -3.033025
2011-01-08         NaN
2011-01-10    1.592830
2011-01-12         NaN
dtype: float64

DatetimeIndex中的各个标量值是pandas的Timestamp对象：

In [46]:
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

只要有需要，TimeStamp可以随时自动转换为datetime对象。此外，它还可以存储频率信息，
且知道如何执行时区转换及其其他操作。

# 索引、选取、子集构造

当根据标签索引选取数据时，时间序列和其它的pandas.Series很想：

In [47]:
stamp = ts.index[2]
stamp

Timestamp('2011-01-07 00:00:00')

In [48]:
ts[2]

-1.5165125232920778

In [49]:
ts[stamp]

-1.5165125232920778

还有一种更为方便的方法：传入一个可以被解释为日期的字符串！！

In [50]:
ts['1/10/2011']

0.7964147767542104

In [51]:
ts['20110110']

0.7964147767542104

对于较长的时间序列，只需传入‘年’或‘年月’即可轻松选取数据的切片：

In [54]:
longer_ts = pd.Series(np.random.randn(1000),index = pd.date_range('1/1/2000',periods=1000))
longer_ts

2000-01-01    0.742272
2000-01-02    0.102663
2000-01-03    1.725411
2000-01-04    1.410775
2000-01-05   -0.240479
2000-01-06   -0.311919
2000-01-07    0.875844
2000-01-08   -0.865756
2000-01-09   -0.612966
2000-01-10   -0.536282
2000-01-11    0.207134
2000-01-12   -0.014450
2000-01-13    0.390230
2000-01-14   -0.202745
2000-01-15    0.981771
2000-01-16   -0.100048
2000-01-17    0.812346
2000-01-18    1.369438
2000-01-19   -0.820046
2000-01-20    0.376921
2000-01-21   -1.551911
2000-01-22   -0.746659
2000-01-23   -1.546927
2000-01-24   -0.366547
2000-01-25    0.208120
2000-01-26    1.008324
2000-01-27   -1.528306
2000-01-28    1.532856
2000-01-29   -1.270758
2000-01-30    0.482593
                ...   
2002-08-28    0.802535
2002-08-29   -0.090735
2002-08-30    0.691798
2002-08-31    1.155396
2002-09-01    0.502199
2002-09-02    0.808253
2002-09-03    0.801747
2002-09-04    1.935028
2002-09-05    0.832459
2002-09-06    0.833510
2002-09-07    0.477503
2002-09-08    1.921062
2002-09-09 

In [55]:
longer_ts['2001']

2001-01-01   -0.089766
2001-01-02   -1.092192
2001-01-03    0.081580
2001-01-04    0.058668
2001-01-05   -2.185395
2001-01-06    1.522199
2001-01-07    1.339346
2001-01-08   -0.325341
2001-01-09   -0.392393
2001-01-10   -1.213513
2001-01-11    1.829604
2001-01-12    0.216604
2001-01-13    1.696069
2001-01-14    0.195496
2001-01-15    1.103463
2001-01-16   -0.692224
2001-01-17   -0.624620
2001-01-18   -1.006607
2001-01-19    1.494443
2001-01-20   -0.662170
2001-01-21    2.031171
2001-01-22    0.260063
2001-01-23   -0.668932
2001-01-24    1.275560
2001-01-25    1.015974
2001-01-26   -0.276546
2001-01-27   -0.306013
2001-01-28   -1.645083
2001-01-29    0.259583
2001-01-30   -0.861081
                ...   
2001-12-02    0.283440
2001-12-03   -1.326613
2001-12-04   -0.367854
2001-12-05   -0.473702
2001-12-06   -0.534433
2001-12-07   -0.026919
2001-12-08   -0.769785
2001-12-09    3.187234
2001-12-10    0.488647
2001-12-11   -1.496580
2001-12-12   -0.618909
2001-12-13    0.709746
2001-12-14 

In [56]:
longer_ts['2001-05']  #  年月

2001-05-01   -1.121568
2001-05-02    0.347226
2001-05-03    0.009897
2001-05-04   -0.396891
2001-05-05    0.969269
2001-05-06    0.179470
2001-05-07   -0.362265
2001-05-08    0.929777
2001-05-09   -1.548803
2001-05-10   -0.170962
2001-05-11   -0.819363
2001-05-12   -0.441534
2001-05-13    2.314359
2001-05-14    0.118261
2001-05-15    0.042587
2001-05-16   -0.136550
2001-05-17   -0.090642
2001-05-18   -1.138472
2001-05-19   -0.976478
2001-05-20   -1.175659
2001-05-21    0.153772
2001-05-22   -0.363741
2001-05-23   -1.402980
2001-05-24    1.303842
2001-05-25   -2.449559
2001-05-26   -0.174047
2001-05-27   -0.456726
2001-05-28   -0.274737
2001-05-29    0.088139
2001-05-30   -0.523603
2001-05-31   -1.643417
Freq: D, dtype: float64

datetime对象也可以进行切片：

In [59]:
ts

2011-01-02    0.380906
2011-01-05   -0.276791
2011-01-07   -1.516513
2011-01-08    1.779521
2011-01-10    0.796415
2011-01-12   -0.419013
dtype: float64

In [60]:
ts[datetime(2011,1,7):]

2011-01-07   -1.516513
2011-01-08    1.779521
2011-01-10    0.796415
2011-01-12   -0.419013
dtype: float64

由于大部分时间序列数据都是按照时间先后排序的，因此可以用不存在于该时间序列中的
时间戳对其进行切片（即范围查询）：

In [61]:
ts

2011-01-02    0.380906
2011-01-05   -0.276791
2011-01-07   -1.516513
2011-01-08    1.779521
2011-01-10    0.796415
2011-01-12   -0.419013
dtype: float64

In [63]:
ts['2011-01-06':'2011-01-11']   # 等价于ts['1/6/2011':'1/11/2011']

2011-01-07   -1.516513
2011-01-08    1.779521
2011-01-10    0.796415
dtype: float64

还有一个等价的实例方法也可以截取两个日期之间的TimeSeries：

In [64]:
ts.truncate(after = '1/9/2011')

2011-01-02    0.380906
2011-01-05   -0.276791
2011-01-07   -1.516513
2011-01-08    1.779521
dtype: float64

这些操作对DataFrame也有效，如对DataFrame的行进行索引：

In [65]:
dates = pd.date_range('1/1/2000',periods = 100,freq = 'W-WED')
long_df = pd.DataFrame(np.random.randn(100,4),index = dates,
                      columns = ['Colorado','texas','NY','Ohip'])
long_df

Unnamed: 0,Colorado,texas,NY,Ohip
2000-01-05,-0.805182,-1.542770,0.382162,-0.395096
2000-01-12,1.134610,-0.466800,1.212198,0.889099
2000-01-19,-0.288055,-0.758024,-0.849322,-1.027380
2000-01-26,0.881601,0.141130,0.142010,-1.615215
2000-02-02,-2.252704,-1.004911,-0.538450,-0.194867
2000-02-09,0.611405,-0.278191,-0.394394,-0.203447
2000-02-16,-0.665616,1.347689,0.973286,1.706687
2000-02-23,-0.245537,1.429769,-0.268758,0.189155
2000-03-01,-1.913623,-0.411990,-0.368302,0.373306
2000-03-08,0.265479,0.941715,-0.443081,-1.454288


In [67]:
long_df.loc['5-2001']

Unnamed: 0,Colorado,texas,NY,Ohip
2001-05-02,-1.781847,0.026045,0.810871,-0.003942
2001-05-09,0.110485,-1.103557,0.529668,-0.451986
2001-05-16,-0.490583,0.83912,-0.079675,-0.599365
2001-05-23,0.395833,-0.561109,0.548563,-2.068128
2001-05-30,-1.064498,-0.787561,0.883396,-0.599602


# 带有重复索引的时间序列

在某些应用场景中，可能会存在多个观测数据落在同一个时间点上的情况：

In [70]:
dates = pd.DatetimeIndex(['1/1/2000','1/2/2000','1/2/2000','1/2/2000','1/3/2000'])
dup_ts = pd.Series(np.arange(5),index = dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

通过检查索引的is_unique属性，我们就可以知道它是不是唯一的：

In [72]:
dup_ts.index.is_unique

False

对这个时间序列进行索引，要么产生标量值，要么产生切片，具体要看所选的时间点是否重复：

In [74]:
dup_ts['1/3/2000']  # 无重复

4

In [75]:
dup_ts['1/2/2000']   # 有重复

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

假设想要具有非唯一时间戳的数据进行聚合。一个办法是使用groupby，并传入level=0：

In [78]:
grouped = dup_ts.groupby(level=0)   # level 分组级别  level = 0 表示按照第一列分组
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [79]:
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

# 日期的范围、频率以及移动

In [80]:
ts

2011-01-02    0.380906
2011-01-05   -0.276791
2011-01-07   -1.516513
2011-01-08    1.779521
2011-01-10    0.796415
2011-01-12   -0.419013
dtype: float64

In [81]:
resampler = ts.resample('D')
resampler

DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]

# 生成日期范围

pandas.date_range可用于根据指定的频率生成指定长度的DatetimeIndex：

In [82]:
index = pd.date_range('4/1/2012','6/1/2012')
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

默认情况下，date_range会产生按天计算的时间点。如果只传入起始或结束日期，那
就还得传入一个表示一段时间的数字：

In [83]:
pd.date_range(start = '2012-01-02',periods=20)   # periods 表示生成多少个日期

DatetimeIndex(['2012-01-02', '2012-01-03', '2012-01-04', '2012-01-05',
               '2012-01-06', '2012-01-07', '2012-01-08', '2012-01-09',
               '2012-01-10', '2012-01-11', '2012-01-12', '2012-01-13',
               '2012-01-14', '2012-01-15', '2012-01-16', '2012-01-17',
               '2012-01-18', '2012-01-19', '2012-01-20', '2012-01-21'],
              dtype='datetime64[ns]', freq='D')

In [85]:
pd.date_range(end='2012-06-01',periods = 20)   # start   end  periods 至少选取其二

DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',
               '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
               '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
               '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',
               '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')

起始和结束日期定义了日期索引的严格边界。例如，如果你想要生成一个由每月最后一个工作日
组成的日期索引，可以传入‘BM’频率（表示business  end  of  mouth），这样就会
只包含时间间隔内（或刚好在边界上的）符合频率要求的日期：

In [3]:
pd.date_range('2000-01-01','2000-12-01',freq = 'BM')

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

date_range默认会保留起始和结束时间戳的时间信息（如果有的话）：

In [4]:
pd.date_range('2012-05-02 12:56:31',periods=5)

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

若希望产生一组被规范化（normalize）到午夜的时间戳，normalize可以实现该功能：

In [5]:
pd.date_range('2012-05-02 12:56:31',periods=5,normalize=True)


DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

# 频率和日期偏移量

在基础频率前面放上一个整数即可创建倍数：

In [7]:
pd.date_range('2000-01-01','2000-01-03 23:59',freq = '4h')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

# WOM日期

WOM(Week Of Month)，以WOM开头能获得诸如‘每月 第3个星期五’之类的日期：
（即每月第几个星期几）

In [8]:
rng = pd.date_range('2012-01-01','2012-09-01',freq = 'WOM-3FRI')
rng

DatetimeIndex(['2012-01-20', '2012-02-17', '2012-03-16', '2012-04-20',
               '2012-05-18', '2012-06-15', '2012-07-20', '2012-08-17'],
              dtype='datetime64[ns]', freq='WOM-3FRI')

In [9]:
list(rng)

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

# 移动（超前或滞后）数据

移动（shifting）指的是沿着时间轴将数据前移或后移。（数据移动）
Series和DataFrame都有一个shift方法用于执行单纯的前移或后移操作，保持索引不变！

In [11]:
ts = pd.Series(np.random.randn(4),index = pd.date_range('1/1/2000',periods=4,freq='M'))
ts        #    freq = 'M'  指每月最后一个日历日

2000-01-31    0.410883
2000-02-29    0.062279
2000-03-31    1.123713
2000-04-30   -0.008231
Freq: M, dtype: float64

In [12]:
ts.shift(2)     # shift（a） 整数表示数据往前移动a位，索引不变，产生缺失数据为NaN

2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.410883
2000-04-30    0.062279
Freq: M, dtype: float64

In [13]:
ts.shift(-2)

2000-01-31    1.123713
2000-02-29   -0.008231
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

shift通常用于计算一个时间序列或多个时间序列（如DataFrame的列）中的百分比变化。
可以这样表达：

In [14]:
ts/ts.shift(1)-1

2000-01-31          NaN
2000-02-29    -0.848427
2000-03-31    17.043231
2000-04-30    -1.007325
Freq: M, dtype: float64

由于单纯的移位操作不会修改索引，所以部分数据会丢弃。因此，如果频率已知，则可将其传给
shift以便时间对时间戳进行位移而不是对数据进行简单位移：

In [15]:
ts.shift(2,freq = 'M')  # 时间戳索引上移（数据下移），保持数据无丢弃（无NaN）

2000-03-31    0.410883
2000-04-30    0.062279
2000-05-31    1.123713
2000-06-30   -0.008231
Freq: M, dtype: float64

In [16]:
ts.shift(3,freq = 'D')   #  1/31 + 3 D = 02-03

2000-02-03    0.410883
2000-03-03    0.062279
2000-04-03    1.123713
2000-05-03   -0.008231
dtype: float64

In [17]:
ts.shift(1,freq = '90T')  # 90T 90分钟

2000-01-31 01:30:00    0.410883
2000-02-29 01:30:00    0.062279
2000-03-31 01:30:00    1.123713
2000-04-30 01:30:00   -0.008231
Freq: M, dtype: float64

# 通过偏移量对日期进行位移

pandas的日期偏移量还可以用在datetime或Timestamp对象上：

In [20]:
from pandas.tseries.offsets import Day,MonthEnd
now = datetime(2011,11,17)
now + 3*Day()

Timestamp('2011-11-20 00:00:00')

如果加的是锚点偏移量（比如MonthEnd），第一次增量会将原日期向前滚动到符合
频率规则的下一个日期：

In [None]:
now+