# Chapter 2: Time series graphics

Load common libraries and settings:

In [1]:
# Notebook-wide setup: warning filters, RNG seeds, display options and plot styling.
# Statement order matters in this cell (environment variable before the Nixtla
# import, style calls before the rcParams overrides), so keep it as is.
import warnings
# Ignore only the UserWarning whose message matches the FigureCanvasAgg
# "non-interactive" pattern; all other warnings are still shown.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message=".*FigureCanvasAgg is non-interactive.*"
)
import os
# Set ahead of the utilsforecast import below. Presumably read by the Nixtla
# libraries to return the series id as a column rather than as the index
# -- TODO confirm against the Nixtla documentation.
os.environ["NIXTLA_ID_AS_COL"] = "true"
import numpy as np
np.set_printoptions(suppress=True)  # print small floats without scientific notation
np.random.seed(1)  # seed NumPy's legacy global RNG
import random
random.seed(1)  # seed the stdlib RNG as well
import pandas as pd
pd.set_option("max_colwidth", 100)
pd.set_option("display.precision", 3)
# Aliased so it does not clash with fpppy's `plot_series` imported at the end of the cell.
from utilsforecast.plotting import plot_series as plot_series_utils
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
# NOTE(review): the "ggplot" style is applied after seaborn's "whitegrid" and may
# override some of its settings -- confirm which look is intended.
plt.style.use("ggplot")
# Explicit rcParams applied last, on top of both styles above.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "figure.dpi": 100,
    "savefig.dpi": 300,
    "figure.constrained_layout.use": True,
    "axes.titlesize": 12,
    "axes.labelsize": 10,
    "xtick.labelsize": 9,
    "ytick.labelsize": 9,
    "legend.fontsize": 9,
    "legend.title_fontsize": 10,
})
import matplotlib as mpl
from cycler import cycler
# Both entries of the colour cycle are black, so lines drawn without an explicit
# colour all default to black.
mpl.rcParams['axes.prop_cycle'] = cycler(color=["#000000", "#000000"])
from fpppy.utils import plot_series

Load additional libraries:

In [2]:
from scipy.stats import pearsonr
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf

## 2.1 `DataFrame` Objects

### The index variables

A time series is a list of numbers indexed by a time step. 

In [3]:
# Toy annual series: five observations, one per year from 2015 to 2019.
x = [123, 39, 78, 52, 110]
yr = list(range(2015, 2020))
# Build the frame directly on a named "Year" index rather than creating a
# "Year" column first and promoting it to the index afterwards.
df = pd.DataFrame(
    {"Observation": x},
    index=pd.Index(yr, name="Year"),
)
df

Unnamed: 0_level_0,Observation
Year,Unnamed: 1_level_1
2015,123
2016,39
2017,78
2018,52
2019,110


In [6]:
# Column labels only -- "Year" is the index of `df`, so it is not listed here.
df.columns

Index(['Observation'], dtype='str')

### The key variables

In [8]:
# Load the Olympic running data from a CSV relative to the notebook's working
# directory and preview the first 10 rows (the preview includes a missing
# `Time` value, for 1916).
olympic_running = pd.read_csv("./data/olympic_running_unparsed.csv")
olympic_running.head(10)

Unnamed: 0,Year,Length,Sex,Time
0,1896,100,men,12.0
1,1900,100,men,11.0
2,1904,100,men,11.0
3,1908,100,men,10.8
4,1912,100,men,10.8
5,1916,100,men,
6,1920,100,men,10.8
7,1924,100,men,10.6
8,1928,100,men,10.8
9,1932,100,men,10.3


In [9]:
# Work on a copy so the frame loaded from the CSV is left untouched.
olrun = olympic_running.copy()
olrun.shape  # (number of rows, number of columns)

(312, 4)

In [10]:
# Column labels of the copied frame.
olrun.columns

Index(['Year', 'Length', 'Sex', 'Time'], dtype='str')

In [12]:
# Distinct values taken by the `Sex` column.
olrun['Sex'].unique()

<StringArray>
['men', 'women']
Length: 2, dtype: str

In [13]:
# Distinct values taken by the `Length` column.
olrun['Length'].unique()

array([  100,   200,   400,   800,  1500,  5000, 10000])

### Working with timeseries dataframes

In [None]:
# Sales data on pharma products in Australia. `Month` is converted to datetime
# values in the same chained expression that loads the file, so `pbs` is never
# visible in a half-parsed state.
pbs = (
    pd.read_csv("./data/PBS_unparsed.csv")
    .assign(Month=lambda frame: pd.to_datetime(frame["Month"]))
)
pbs.head()

Unnamed: 0,Month,Concession,Type,ATC1,ATC1_desc,ATC2,ATC2_desc,Scripts,Cost
0,1991-07-01,Concessional,Co-payments,A,Alimentary tract and metabolism,A01,STOMATOLOGICAL PREPARATIONS,18228,67877.0
1,1991-08-01,Concessional,Co-payments,A,Alimentary tract and metabolism,A01,STOMATOLOGICAL PREPARATIONS,15327,57011.0
2,1991-09-01,Concessional,Co-payments,A,Alimentary tract and metabolism,A01,STOMATOLOGICAL PREPARATIONS,14775,55020.0
3,1991-10-01,Concessional,Co-payments,A,Alimentary tract and metabolism,A01,STOMATOLOGICAL PREPARATIONS,15380,57222.0
4,1991-11-01,Concessional,Co-payments,A,Alimentary tract and metabolism,A01,STOMATOLOGICAL PREPARATIONS,14371,52120.0


In [15]:
# (number of rows, number of columns) of the full PBS table.
pbs.shape

(67596, 9)

In [None]:
# Narrow the PBS table in one chained expression, so `a10` is bound only once.
a10 = (
    pbs
    .query('ATC2 == "A10"')                                  # rows: diabetes drugs (ATC code A10)
    .filter(items=['Month', 'Concession', 'Type', 'Cost'])   # columns: only those used in the analysis
)
a10.head()

Unnamed: 0,Month,Concession,Type,Cost
1524,1991-07-01,Concessional,Co-payments,2093000.0
1525,1991-08-01,Concessional,Co-payments,1796000.0
1526,1991-09-01,Concessional,Co-payments,1777000.0
1527,1991-10-01,Concessional,Co-payments,1849000.0
1528,1991-11-01,Concessional,Co-payments,1686000.0


The `df.filter()` method differs from the `df.loc[]` indexer in that the former subsets rows or columns by matching criteria on their *labels* (via the `items`, `like`, or `regex` arguments, e.g. partial string matching), whereas the latter selects data by explicit row/column labels or boolean conditions. `filter()` never looks at the contents of the DataFrame, only at the index or column labels; `loc[]` can select based on the content (values) of the DataFrame, e.g. `df.loc[df['col'] > 5]`.

The `agg()` method allows us to aggregate data across keys, e.g. in computing the total cost per month regardless of `Concession` and `Type`.

In [17]:
# Total cost per month, summed over every `Concession` / `Type` combination.
# Named aggregation gives the output column its final name directly, so no
# separate rename step is needed.
total_cost_df = (
    a10
    .groupby('Month', as_index=False)
    .agg(TotalC=('Cost', 'sum'))
)
total_cost_df.head()

Unnamed: 0,Month,TotalC
0,1991-07-01,3527000.0
1,1991-08-01,3181000.0
2,1991-09-01,3252000.0
3,1991-10-01,3611000.0
4,1991-11-01,3566000.0
