In [1]:
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Use seaborn's 'talk' plotting context for the figures drawn in this notebook.
sns.set_context('talk')

# Data Preparation

In [2]:
# Load the raw transaction records and report (rows, columns).
RAW_DATA_FILE = "online_retail_II.csv"

df_raw = pd.read_csv(RAW_DATA_FILE)
df_raw.shape

(1067371, 8)

In [3]:
# Show the first record to see the available columns and example values.
df_raw.iloc[0]

Invoice                                     489434
StockCode                                    85048
Description    15CM CHRISTMAS GLASS BALL 20 LIGHTS
Quantity                                        12
InvoiceDate                    2009-12-01 07:45:00
Price                                         6.95
Customer ID                                13085.0
Country                             United Kingdom
Name: 0, dtype: object

In [4]:
# Customer ID is read as a float (e.g. 13085.0): keep only the text before the
# decimal point so it becomes a plain string identifier.
# NOTE(review): missing IDs end up as the literal string 'nan' -- confirm that
# later steps account for this.
customer_id_text = df_raw['Customer ID'].astype(str)
df_raw['Customer ID'] = customer_id_text.str.split('.').str[0]

# Parse the invoice timestamp so the .dt accessor can be used later.
df_raw['InvoiceDate'] = pd.to_datetime(df_raw['InvoiceDate'])

# Monetary value of each record: units sold times unit price.
df_raw['subtotal'] = df_raw['Quantity'].mul(df_raw['Price'])

In [None]:
## Week-start (Monday) date of each invoice, with the time of day removed.
## Subtracting `dayofweek` days (Monday == 0) moves every timestamp back to the
## Monday of its week; `.dt.normalize()` then resets the time to midnight.
## The result stays datetime64, so it can be used with pd.Grouper() directly,
## without a slow round trip through Python `date` objects.
invoice_ts = pd.to_datetime(df_raw['InvoiceDate'])

df_raw['invoice_date_week'] = (
    invoice_ts - pd.to_timedelta(invoice_ts.dt.dayofweek, unit='D')
).dt.normalize()

In [None]:
from pandas.tseries.offsets import MonthEnd

## Last calendar day of the month each invoice belongs to, at midnight.
## MonthEnd(0) rolls a date forward to the end of its month but leaves dates
## that already sit on a month end untouched. MonthEnd(1) would instead push
## invoices dated on the last day of a month to the end of the *next* month.
## normalize() drops the time of day so every invoice in a month maps to the
## same value.
df_raw['month_end_date'] = (
    pd.to_datetime(df_raw['InvoiceDate']).dt.normalize() + MonthEnd(0)
)

In [None]:
# Report how many distinct StockCode values appear in the raw data.
print(f"Number of unique item codes: {df_raw['StockCode'].nunique():,}")

In [None]:
# Per item: first week, last week, and number of distinct weeks with records.
# Named aggregation yields the flat `<column>_<statistic>` names directly.
agg_date = (
    df_raw
    .groupby('StockCode')
    .agg(
        invoice_date_week_min=('invoice_date_week', 'min'),
        invoice_date_week_max=('invoice_date_week', 'max'),
        invoice_date_week_nunique=('invoice_date_week', 'nunique'),
    )
    .reset_index()
)

In [None]:
f, ax = plt.subplots()

# Pass the series by keyword: the meaning of a positional argument to
# violinplot differs between seaborn releases, so `x=` pins the orientation.
sns.violinplot(x=agg_date['invoice_date_week_nunique'], ax=ax)

ax.set_xlabel('number of distinct weeks with sales records')
ax.set_title('Each item has a different number of dates in the sales data')
sns.despine();

In [None]:
# Summary statistics of the per-record Quantity column.
df_raw['Quantity'].describe()

In [None]:
# Share of records whose Quantity is below zero.
is_negative = df_raw['Quantity'].lt(0)
negative_quantity = is_negative.sum() / len(df_raw)

print(f"{negative_quantity:.2%} of the records have negative sales quantity")

Interesting -- sales quantities can be extremely negative. To simplify, we'll keep only the items whose all-time total sales quantity is positive.

In [None]:
# All-time total quantity per item, sorted ascending (most negative first).
# Select 'Quantity' *before* summing: summing the whole frame would also try to
# add up the datetime and text columns, which raises a TypeError on
# pandas >= 2.0 and is wasted work on older versions.
agg_quantity = (
    df_raw
    .groupby('StockCode')['Quantity']
    .sum()
    .sort_values()
    .reset_index()
)
agg_quantity.head()

In [None]:
# Items whose all-time total quantity is strictly positive ...
has_positive_total = agg_quantity['Quantity'] > 0
positive_quantity_item_list = agg_quantity.loc[has_positive_total, 'StockCode'].tolist()

# ... and the raw records that belong to those items.
df_selected = df_raw[df_raw['StockCode'].isin(positive_quantity_item_list)]

# Observe sales history

In [None]:
# Show the first record of the filtered data as a quick sanity check.
df_selected.iloc[0]

In [None]:
# Weekly sales quantity per item.
# 'Quantity' is selected *before* summing so that only that column is
# aggregated; summing the whole frame would also try to add up datetime and
# text columns, which raises a TypeError on pandas >= 2.0.
# NOTE: freq='W' is pandas' Sunday-anchored weekly frequency, so the
# invoice_date_week values in the result are the Sunday that closes each week,
# not the Monday stored in df_selected.
df_agg = (
    df_selected
    .groupby([
        pd.Grouper(key='invoice_date_week', freq='W'),
        'StockCode',
    ])['Quantity']
    .sum()
    .reset_index()
    .rename(
        columns={
            'StockCode': 'stock_code',
            'Quantity': 'quantity',
        }
    )
)

df_agg.head()

In [None]:
f, ax = plt.subplots(figsize=(16, 4))

# Total quantity sold per week across all items. Only 'quantity' is summed;
# summing the whole frame would also concatenate the stock_code strings.
weekly_total = (
    df_agg
    .groupby('invoice_date_week')['quantity']
    .sum()
    .reset_index()
)

sns.lineplot(
    data=weekly_total,
    x='invoice_date_week',
    y='quantity',
    ax=ax
)

ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y %b'))
ax.set_xlabel('')
ax.set_ylabel('sales quantity')
ax.set_title("Changepoint in the end of 2010?")
sns.despine();

## Generate complete time series

In [None]:
from preprocess_time_series import generate_complete_records

# Set to True to force the series to be rebuilt even if a cached file exists.
GENERATE_SERIES = False
SERIES_FILE_NAME = 'weekly-item-full-series.parquet'

# Build the series when asked to, or when no cached file exists yet, so the
# notebook also runs on a fresh checkout; otherwise reuse the cached copy.
if GENERATE_SERIES or not Path(SERIES_FILE_NAME).exists():
    df_agg_full = generate_complete_records(df=df_agg, date_freq='W')
    df_agg_full.to_parquet(SERIES_FILE_NAME)
else:
    df_agg_full = pd.read_parquet(SERIES_FILE_NAME)

# The total quantity must match df_agg; this also catches a cached file that
# no longer matches the current df_agg.
assert df_agg_full['quantity'].sum() == df_agg['quantity'].sum(), "Different values after generating complete records!"

In [None]:
# Display every row of the complete series for one stock code.
# NOTE(review): 'ABCIndex' looks like a placeholder -- confirm it is a real stock code.
mask = df_agg_full['stock_code'].eq('ABCIndex')
df_agg_full.loc[mask]

## Observe new time series data

In [None]:
# agg_quantity is sorted ascending by Quantity, so tail() lists the items with the largest all-time totals.
agg_quantity.tail()

In [None]:
# Item whose weekly sales history is plotted below.
SELECTED_ITEM = '85123A'

f, ax = plt.subplots(figsize=(12,4))

# NOTE(review): this plots df_agg (only weeks that have records); if the
# completed series is what should be inspected here, df_agg_full may have been
# intended -- confirm.
df_agg.query('stock_code == @SELECTED_ITEM').plot.line(
    x='invoice_date_week',
    y='quantity',
    ax=ax
)

# Label the figure so it can be read on its own.
ax.set_xlabel('')
ax.set_ylabel('sales quantity')
ax.set_title(f'Weekly sales quantity of item {SELECTED_ITEM}')
sns.despine();

# Clustering

## Approach 1: use summary statistics, then run common clustering algorithms

In [None]:
## TO DO

## Approach 2: compute similarity measures of multiple time series

### 2a. Use Euclidean distance

In [None]:
import tslearn

### 2b. Use Dynamic Time Warping