In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# First attempt: read the fixed-width file with read_fwf's default settings
data = pd.read_fwf(
    "http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii"
)

In [3]:
# Peek at the first five rows of the default load
data.head(n=5)

Unnamed: 0,1950,1,-0.60310E-01
0,1950,2,0.62681
1,1950,3,-0.008128
2,1950,4,0.5551
3,1950,5,0.071577
4,1950,6,0.53857


In [4]:
# Reload without a header row, merging columns 0 and 1 into a single parsed
# datetime column that is then used as the index.
# NOTE(review): nested-list parse_dates and infer_datetime_format are deprecated
# in recent pandas releases — confirm the pandas version this notebook targets.
data = pd.read_fwf(
    "http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii",
    header=None,
    parse_dates=[[0, 1]],
    infer_datetime_format=True,
    index_col=0,
)

In [5]:
# First five rows after the date columns were parsed into the index
data.head(n=5)

Unnamed: 0_level_0,2
0_1,Unnamed: 1_level_1
1950-01-01,-0.06031
1950-02-01,0.62681
1950-03-01,-0.008128
1950-04-01,0.5551
1950-05-01,0.071577


In [6]:
# Inspect the index produced by parsing columns 0 and 1 as dates
data.index

DatetimeIndex(['1950-01-01', '1950-02-01', '1950-03-01', '1950-04-01',
               '1950-05-01', '1950-06-01', '1950-07-01', '1950-08-01',
               '1950-09-01', '1950-10-01',
               ...
               '2016-12-01', '2017-01-01', '2017-02-01', '2017-03-01',
               '2017-04-01', '2017-05-01', '2017-06-01', '2017-07-01',
               '2017-08-01', '2017-09-01'],
              dtype='datetime64[ns]', name='0_1', length=813, freq=None)

In [7]:
# Give the single data column and the date index readable labels.
# NOTE: 'values' is also a DataFrame attribute, so use data['values']
# (not data.values) to select this column.
data.columns = ['values']
data.index.name = 'month'

In [8]:
# First five rows with the new index and column labels
data.head(n=5)

Unnamed: 0_level_0,values
month,Unnamed: 1_level_1
1950-01-01,-0.06031
1950-02-01,0.62681
1950-03-01,-0.008128
1950-04-01,0.5551
1950-05-01,0.071577


In [9]:
# Label-based row slice using partial (year-only) date strings
data.loc['1950':'1952']  # What do you notice about the range of dates?

Unnamed: 0_level_0,values
month,Unnamed: 1_level_1
1950-01-01,-0.06031
1950-02-01,0.62681
1950-03-01,-0.008128
1950-04-01,0.5551
1950-05-01,0.071577
1950-06-01,0.53857
1950-07-01,-0.80248
1950-08-01,-0.85101
1950-09-01,0.35797
1950-10-01,-0.3789


In [11]:
# What is the empirical range of dates?
# Use the index's own vectorized reductions rather than the Python builtins,
# which iterate over the index one element at a time.
data.index.min()
data.index.max()

Timestamp('1950-01-01 00:00:00')

Timestamp('2017-09-01 00:00:00')

In [13]:
# Convert the timestamp index to monthly periods.
# The guard keeps this cell re-runnable: DataFrame.to_period() only accepts a
# DatetimeIndex, so calling it again on the already-converted frame would raise.
if isinstance(data.index, pd.DatetimeIndex):
    # The observations are monthly, so state the frequency instead of relying on inference
    data = data.to_period(freq='M')
# Show a bounded preview rather than the whole frame
data.head(10)

Unnamed: 0_level_0,values
month,Unnamed: 1_level_1
1950-01,-0.060310
1950-02,0.626810
1950-03,-0.008128
1950-04,0.555100
1950-05,0.071577
1950-06,0.538570
1950-07,-0.802480
1950-08,-0.851010
1950-09,0.357970
1950-10,-0.378900


In [None]:
# How to visualize?

In [18]:
# What kind of index do we have after the to_period() conversion?
type(data.index)

pandas.core.indexes.period.PeriodIndex

In [23]:
# Day-level bounds that both fall inside one month of the period index
data.loc['1951-11-11':'1951-11-12']

Unnamed: 0_level_0,values
month,Unnamed: 1_level_1
1951-11,-0.068519


In [24]:
# Day-level bounds that span several months of the period index
data.loc['1951-11-11':'1952-01-12']

Unnamed: 0_level_0,values
month,Unnamed: 1_level_1
1951-11,-0.068519
1951-12,1.9872
1952-01,0.36825


In [None]:
# Which index type is more appropriate for this monthly data: a DatetimeIndex or a PeriodIndex?

In [17]:
# How do various data loads perform?
import timeit

site = 'http://www.cpc.ncep.noaa.gov/products/precip/CWlink/daily_ao_index/monthly.ao.index.b50.current.ascii'

print("infer_datetime_format = True, no date parser")
%timeit pd.read_fwf(site, parse_dates = [[0, 1]], infer_datetime_format = True, header = None,)

print("infer_datetime_format = False, no date parser")
%timeit pd.read_fwf(site, parse_dates = [[0, 1]], infer_datetime_format = False, header = None,)

print("infer_datetime_format = True, date parser provided")
dateparse = lambda x, y: pd.datetime.strptime('%s-%s'%(x,y), '%Y-%m')
%timeit pd.read_fwf(site, parse_dates = [[0, 1]], infer_datetime_format = True, date_parser = dateparse,  header = None,)

print("infer_datetime_format = False, date parser provided")
dateparse = lambda x, y: pd.datetime.strptime('%s-%s'%(x,y), '%Y-%m')
%timeit pd.read_fwf(site, parse_dates = [[0, 1]], infer_datetime_format = False, date_parser = dateparse,  header = None,)

infer_datetime_format = True, no date parser
367 ms ± 109 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
infer_datetime_format = False, no date parser
377 ms ± 72.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
infer_datetime_format = True, date parser provided
444 ms ± 68.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
infer_datetime_format = False, date parser provided
373 ms ± 31.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
# Starting from an existing frame whose columns hold the date components
df = pd.DataFrame(dict(year=[2015, 2016], month=[2, 3], day=[4, 5], hour=[2, 3]))
df

Unnamed: 0,day,hour,month,year
0,4,2,2,2015
1,5,3,3,2016


In [35]:
# Assemble datetimes from the year/month/day/hour columns of df
pd.to_datetime(df)

0   2015-02-04 02:00:00
1   2016-03-05 03:00:00
dtype: datetime64[ns]

In [36]:
# Use only the date columns, so the result carries no time-of-day component
pd.to_datetime(df.loc[:, ['year', 'month', 'day']])

0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]

In [39]:
# Does it work with other column names? --> NO
# Same component values as before, but under misspelled column names
df1 = pd.DataFrame(dict(yeer=[2015, 2016], moonth=[2, 3], daye=[4, 5], our=[2, 3]))
df1

Unnamed: 0,daye,moonth,our,yeer
0,4,2,2,2015
1,5,3,3,2016


In [40]:
# pd.to_datetime cannot assemble datetimes from these misspelled column names.
# The error is the point of the demonstration, so catch and print it instead of
# letting the exception stop a Restart & Run All.
try:
    pd.to_datetime(df1)
except ValueError as err:
    print("ValueError:", err)

ValueError: to assemble mappings requires at least that [year, month, day] be specified: [day,month,year] is missing

In [None]:
# Go get your own time series data, load it in, and see what you can see
# Hint: http://pandas.pydata.org/pandas-docs/stable/remote_data.html

In [None]:
# 1. plot 2. get range of dates 3. convert between time and period index

In [28]:
# Let's experiment with the truncate convenience function
# A MonthEnd offset object is passed instead of the 'M' string alias: the alias
# is deprecated for offsets in recent pandas releases, while the offset object
# means month-end on old and new versions alike.
ts = pd.Series(range(10), index = pd.date_range('7/31/2015', freq = pd.offsets.MonthEnd(), periods = 10))
# Keep only the observations between the two bounds (both inclusive)
ts.truncate(before='10/31/2015', after='12/31/2015')

2015-10-31    3
2015-11-30    4
2015-12-31    5
Freq: M, dtype: int32

In [41]:
# You can select in a way that breaks the frequency: irregularly spaced
# positions leave the resulting index without a freq.
# .iloc makes the positional lookup explicit; integer keys passed to [] on a
# datetime-indexed Series are deprecated as positions in recent pandas releases.
ts.iloc[[0, 1, 6]].index

DatetimeIndex(['2015-07-31', '2015-08-31', '2016-01-31'], dtype='datetime64[ns]', freq=None)

In [31]:
# A regularly spaced positional slice (every second element) still yields an
# index that carries a frequency
ts.iloc[slice(0, 10, 2)].index

DatetimeIndex(['2015-07-31', '2015-09-30', '2015-11-30', '2016-01-31',
               '2016-03-31'],
              dtype='datetime64[ns]', freq='2M')