# Week 14
# Time Series Data

Time series data is a data set where instances are indexed by time. It is an important form of structured data in many fields such as finance, economics, ecology, neuroscience, and physics. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Date and Time Data Types and Tools

In Python, the `datetime.datetime` class is widely used to represent date and time data.

In [2]:
from datetime import datetime

# Current local date and time as a datetime object
datetime.now()

datetime.datetime(2020, 11, 30, 22, 31, 22, 680026)

In [3]:
# Each component of a datetime is available as an attribute; here, the year
datetime.now().year

2020

In [4]:
# Day of the month
datetime.now().day

30

In [5]:
# Month as a number
datetime.now().month

11

We can use `datetime.timedelta` to represent the temporal difference between two `datetime` objects.

In [8]:
from datetime import timedelta

# A timedelta is a duration; the first positional argument is a number of
# days, so it is spelled out here as a keyword.
delta = timedelta(days=5)

# Adding a duration to a datetime gives another datetime, five days later.
delta + datetime.now()

datetime.datetime(2020, 12, 5, 22, 39, 26, 651062)

In [9]:
# Subtracting two datetimes yields a timedelta. The result is negative here
# because date1 lies before the current moment.
date1 = datetime(year=2019, month=12, day=12)
date2 = datetime.now()
date1 - date2

datetime.timedelta(days=-355, seconds=4806, microseconds=763516)

**Convert between string and datetime**

In [10]:
# datetime to string: str() renders "YYYY-MM-DD HH:MM:SS"
date = datetime(year=2011, month=1, day=3, hour=23, minute=30, second=45)
str(date)

'2011-01-03 23:30:45'

In [11]:
# Convert to a custom format: "YYYY/MM/DD HH:MM, <weekday name>"
# (%A is the full weekday name, e.g. Monday)
date.strftime("%Y/%m/%d %H:%M, %A")

'2011/01/03 23:30, Monday'

Datetime formats:
- %Y: Four-digit year
- %y: Two-digit year
- %m: Two-digit month
- %d: Two-digit day
- %H: Hour 0 - 23
- %I: Hour 1 - 12
- %M: Two-digit minute
- %S: Second
- %w: Weekday as a number (0 = Sunday, ..., 6 = Saturday)
- %A: Full weekday name (e.g. Monday)

[More on this](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes)

In [14]:
# Exercise: convert date to "01/03/2011" (month/day/year)
date.strftime("%m/%d/%Y")

'01/03/2011'

In [23]:
# Exercise: convert date to the layout "01-03-2011 00:00" (MM-DD-YYYY HH:MM)
# %H is the 24-hour hour and %M is the minute (lowercase %m is the month).
date.strftime("%m-%d-%Y %H:%M")

'01-03-2011 23:30'

**Parse a datetime string**

In [24]:
# String to datetime
from dateutil.parser import parse
# parse() infers the format; a date-only string gets a midnight time component
parse("2011-01-03")

datetime.datetime(2011, 1, 3, 0, 0)

In [25]:
# Month names and 12-hour AM/PM times are understood as well
parse("Jan 31, 1997 10:45 PM")

datetime.datetime(1997, 1, 31, 22, 45)

In [26]:
# Many countries write dates as "DD/MM/YYYY". Passing dayfirst=True makes the
# parser read the first number as the day, so this is 6 December 2011.
parse("06/12/2011", dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

In [27]:
# Without dayfirst, the same string is read month-first: 12 June 2011
parse("06/12/2011")

datetime.datetime(2011, 6, 12, 0, 0)

## 2. Time Series Basics

In [28]:
# Build a Series of six random values indexed by datetime objects
dates = [
    datetime(year, month, day)
    for year, month, day in [
        (2011, 1, 2), (2011, 1, 5),
        (2011, 2, 7), (2011, 2, 8),
        (2011, 3, 10), (2011, 3, 12),
    ]
]
ts = pd.Series(data=np.random.randn(len(dates)), index=dates)
ts

2011-01-02   -0.326016
2011-01-05   -1.402467
2011-02-07   -1.832132
2011-02-08    1.518269
2011-03-10    0.125453
2011-03-12    0.997269
dtype: float64

In [29]:
# Select 01/05 by its label; a date string is matched against the index
ts.loc['2011-01-05']

-1.402467004644445

In [30]:
# Select the second observation by position. .iloc is the explicit positional
# accessor; plain ts[1] on a date-indexed Series is deprecated in recent
# pandas because an integer key is ambiguous between label and position.
ts.iloc[1]

-1.402467004644445

In [31]:
# The same day written as MM/DD/YYYY is recognised too
ts.loc['01/05/2011']

-1.402467004644445

In [32]:
# ...and so is the compact YYYYMMDD spelling
ts.loc['20110105']

-1.402467004644445

In [33]:
# Select a range of dates: a year-month string picks every row in that month
ts.loc['2011-02']

2011-02-07   -1.832132
2011-02-08    1.518269
dtype: float64

In [34]:
# Label slices are closed on both sides: the end datetime is also included
ts.loc['2011-02-01':'2011-02-8']

2011-02-07   -1.832132
2011-02-08    1.518269
dtype: float64

In [35]:
# Open-ended slice: everything from 1 February 2011 onwards
ts.loc['2011-02-01':]

2011-02-07   -1.832132
2011-02-08    1.518269
2011-03-10    0.125453
2011-03-12    0.997269
dtype: float64

In [36]:
# Open-ended slice: everything up to and including 10 March 2011
ts.loc[:"2011-03-10"]

2011-01-02   -0.326016
2011-01-05   -1.402467
2011-02-07   -1.832132
2011-02-08    1.518269
2011-03-10    0.125453
dtype: float64

## 3. Date Ranges

In [37]:
# Manually populate a list of dates; note that 2011-04-01 is not in ts
dates = [datetime(2011, 1, 2), datetime(2011, 3, 10), datetime(2011, 4, 1)]
# ts[dates] is not used: pandas no longer supports labels missing from the
# index, so keep only the rows whose date appears in the list.
ts.loc[ts.index.isin(dates)]

2011-01-02   -0.326016
2011-03-10    0.125453
dtype: float64

In [38]:
# Create a range of dates: eight consecutive days (daily is the default
# frequency, written out here for clarity)
daterange = pd.date_range(start='2011-01-01', periods=8, freq='D')
print(daterange)

DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', '2011-01-04',
               '2011-01-05', '2011-01-06', '2011-01-07', '2011-01-08'],
              dtype='datetime64[ns]', freq='D')


In [39]:
# Five timestamps spaced two days apart
daterange = pd.date_range(start='2011-01-01', freq='2D', periods=5)
print(daterange)

DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', '2011-01-07',
               '2011-01-09'],
              dtype='datetime64[ns]', freq='2D')


In [40]:
# Five timestamps spaced ten hours apart. The hourly alias is written in
# lowercase ("h"): the uppercase "H" spelling is deprecated in recent pandas,
# while the lowercase form is accepted by older and newer versions alike.
daterange = pd.date_range("2011-01-01", periods=5, freq="10h")
print(daterange)

DatetimeIndex(['2011-01-01 00:00:00', '2011-01-01 10:00:00',
               '2011-01-01 20:00:00', '2011-01-02 06:00:00',
               '2011-01-02 16:00:00'],
              dtype='datetime64[ns]', freq='10H')


In [41]:
# Sample business days only: weekends are skipped, so the range begins on
# the first weekday on or after the start date
daterange = pd.date_range(start="2011-01-01", freq="B", periods=10)
print(daterange)

DatetimeIndex(['2011-01-03', '2011-01-04', '2011-01-05', '2011-01-06',
               '2011-01-07', '2011-01-10', '2011-01-11', '2011-01-12',
               '2011-01-13', '2011-01-14'],
              dtype='datetime64[ns]', freq='B')


In [42]:
# Look up every business day of the range in ts. Most of these dates are not
# in the index, and label lookup with missing labels raises a KeyError in
# current pandas, so reindex() is used: it returns one row per requested date
# and fills the absent ones with NaN.
ts.reindex(daterange)

2011-01-03         NaN
2011-01-04         NaN
2011-01-05   -1.402467
2011-01-06         NaN
2011-01-07         NaN
2011-01-10         NaN
2011-01-11         NaN
2011-01-12         NaN
2011-01-13         NaN
2011-01-14         NaN
Freq: B, dtype: float64

In [43]:
# Keep only the observations whose date falls on one of the business days
ts.loc[ts.index.isin(daterange)]

2011-01-05   -1.402467
dtype: float64

## 4. Shifting Data


In [44]:
# Four consecutive daily prices, each a random amount in [100, 101)
prices = pd.DataFrame(
    {'Price': 100 + np.random.rand(4)},
    index=pd.date_range(start='2019-11-01', periods=4),
)
prices

Unnamed: 0,Price
2019-11-01,100.083897
2019-11-02,100.541786
2019-11-03,100.654319
2019-11-04,100.018832


In [45]:
# Arithmetic applies element-wise: subtract 100 from every price
prices.sub(100)

Unnamed: 0,Price
2019-11-01,0.083897
2019-11-02,0.541786
2019-11-03,0.654319
2019-11-04,0.018832


In [46]:
# How to create a column storing yesterday's price?
# Naive approach: for each row, look up the previous calendar day and copy
# its price when that day exists in the index (the first row stays empty).
# The loop variable is called `day` rather than `date` so that it does not
# overwrite the `date` object the earlier strftime cells operate on.
for day in prices.index:
    yesterday = day - timedelta(days=1)
    if yesterday in prices.index:
        prices.loc[day, "Yesterday's Price"] = prices.loc[yesterday, "Price"]
prices

Unnamed: 0,Price,Yesterday's Price
2019-11-01,100.083897,
2019-11-02,100.541786,100.083897
2019-11-03,100.654319,100.541786
2019-11-04,100.018832,100.654319


In [47]:
# Start again from a fresh single-column price table
prices = pd.DataFrame(
    data=100 + np.random.rand(4),
    index=pd.date_range(start='2019-11-01', periods=4, freq='D'),
    columns=['Price'],
)

prices


Unnamed: 0,Price
2019-11-01,100.750601
2019-11-02,100.556073
2019-11-03,100.482643
2019-11-04,100.759614


In [48]:
# shift() moves the values one row down while the index stays in place, so
# each date now carries the previous row's price (the first row has none)
prices_yesterday = prices.shift(periods=1)
prices_yesterday

Unnamed: 0,Price
2019-11-01,
2019-11-02,100.750601
2019-11-03,100.556073
2019-11-04,100.482643


In [49]:
# Put today's and yesterday's prices side by side, matched on the date index.
# Both frames have a "Price" column, so the suffixes tell the two apart.
prices = prices.merge(
    prices_yesterday,
    left_index=True,
    right_index=True,
    suffixes=("Today", "Yesterday"),
)
prices

Unnamed: 0,PriceToday,PriceYesterday
2019-11-01,100.750601,
2019-11-02,100.556073,100.750601
2019-11-03,100.482643,100.556073
2019-11-04,100.759614,100.482643


In [50]:
# Exercise: compute the relative change between yesterday's and today's price
# Formula: change = (today's price - yesterday's price) / yesterday's price
# The first row has no yesterday, so its result is missing.

prices['Percentage'] = (
    prices['PriceToday']
    .sub(prices['PriceYesterday'])
    .div(prices['PriceYesterday'])
)
prices

Unnamed: 0,PriceToday,PriceYesterday,Percentage
2019-11-01,100.750601,,
2019-11-02,100.556073,100.750601,-0.001931
2019-11-03,100.482643,100.556073,-0.00073
2019-11-04,100.759614,100.482643,0.002756
