# KPI Validation

### Download the dataset from [Kaggle](https://www.kaggle.com/carrie1/ecommerce-data) and update the CSV path in the load cell (`./data/ecom_data.csv`) accordingly.

## Imports

In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.core.display import display

## Load/Preprocess Ecommerce data

In [35]:
ecom_df = pd.read_csv('./data/ecom_data.csv')

# A few rows have negative UnitPrice or Quantity; keep only rows where both are
# non-negative. The explicit .copy() detaches the filtered frame from the raw
# one so the column assignments below do not raise SettingWithCopyWarning.
ecom_df = ecom_df[
    (ecom_df['UnitPrice'] >= 0) & (ecom_df['Quantity'] >= 0)
].copy()

# Store CustomerID as strings so it is treated as an identifier, not a number.
# NOTE: any missing CustomerID would become the literal string 'nan'.
ecom_df['CustomerID'] = ecom_df['CustomerID'].astype(str)

# Parse InvoiceDate into a new datetime column named 'date'; the raw
# InvoiceDate column is kept alongside it.
ecom_df['date'] = pd.to_datetime(ecom_df['InvoiceDate'])

# Feature extraction for PeriodOfDay
def get_day_period(hour: int) -> str:
    """Map an hour of the day to a coarse period label.

    Hours in [6, 12) are "morning", [12, 17) "afternoon" and [17, 22)
    "evening". Anything outside those half-open ranges is "night".

    :param hour: Hour of the day.
    :type hour: int
    :return: One of "night", "morning", "afternoon" or "evening".
    :rtype: str
    """
    # (inclusive start, exclusive end, label) for every non-night period.
    day_bands = (
        (6, 12, "morning"),
        (12, 17, "afternoon"),
        (17, 22, "evening"),
    )
    for start, end, label in day_bands:
        if start <= hour < end:
            return label
    # Covers both the early hours (< 6) and the late hours (>= 22).
    return "night"
# PeriodOfDay: bucket the hour of each parsed timestamp via get_day_period
ecom_df['PeriodOfDay'] = ecom_df['date'].map(lambda ts: get_day_period(ts.hour))

# DayOfWeek: full weekday name of each timestamp
ecom_df['DayOfWeek'] = ecom_df['date'].map(lambda ts: ts.strftime('%A'))

# Description is not needed; StockCode stays because it differentiates the products
ecom_df = ecom_df.drop(columns=['Description'])

# ItemTotalPrice: price of the whole line item (unit price times quantity)
ecom_df['ItemTotalPrice'] = ecom_df['UnitPrice'].mul(ecom_df['Quantity'])

In [36]:
ecom_df

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,date,PeriodOfDay,DayOfWeek,ItemTotalPrice
0,536365,85123A,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,2010-12-01 08:26:00,morning,Wednesday,15.30
1,536365,71053,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,2010-12-01 08:26:00,morning,Wednesday,20.34
2,536365,84406B,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,2010-12-01 08:26:00,morning,Wednesday,22.00
3,536365,84029G,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,2010-12-01 08:26:00,morning,Wednesday,20.34
4,536365,84029E,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,2010-12-01 08:26:00,morning,Wednesday,20.34
...,...,...,...,...,...,...,...,...,...,...,...
541904,581587,22613,12,12/9/2011 12:50,0.85,12680.0,France,2011-12-09 12:50:00,afternoon,Friday,10.20
541905,581587,22899,6,12/9/2011 12:50,2.10,12680.0,France,2011-12-09 12:50:00,afternoon,Friday,12.60
541906,581587,23254,4,12/9/2011 12:50,4.15,12680.0,France,2011-12-09 12:50:00,afternoon,Friday,16.60
541907,581587,23255,4,12/9/2011 12:50,4.15,12680.0,France,2011-12-09 12:50:00,afternoon,Friday,16.60


## Create KPI validation functions

### Are the Agg types going to be restricted to mean, sum, count?

**^ Needs to be discussed in the sync-up.**

In [37]:
# ecom_df.groupby([pd.Grouper(key="date", freq="D")]).agg({'Quantity': 'mean'}).reset_index()
# ecom_df.set_index('date').resample('D', level='date').agg({'PeriodOfDay': pd.Series.mode}).reset_index()

### Check #1 in documentation

In [38]:
def validate_column_fits_agg_type(
    df: pd.core.frame.DataFrame,
    column_name: str,
    agg_type,
    date_column_name: str
) -> bool:
    """Validate that agg_type is valid for the specified column.

    Text-like columns (object, string or categorical dtype) can only be
    aggregated with the string ``'count'``. Columns of any other dtype are
    accepted with any aggregation.

    :param df: pandas DataFrame with input data
    :type df: pandas.core.frame.DataFrame
    :param column_name: Name of the column to validate
    :type column_name: str
    :param agg_type: Aggregation function
    :type agg_type: str or pandas aggregation function
    :param date_column_name: Name of the date column. Not used by this check;
        kept so the signature stays stable for existing callers.
    :type date_column_name: str
    :raises KeyError: column_name is not a column of df (raised by the pandas
        column lookup).
    :raises TypeError: the column is text-like/categorical and agg_type is
        anything other than ``'count'``.
    :return: True when the aggregation is allowed for the column.
    :rtype: bool
    """
    column_dtype = df[column_name].dtype

    # Comparing str(dtype) with 'object' misses categorical columns and the
    # dedicated pandas string dtypes, so check the dtype families explicitly.
    is_text_like = (
        pd.api.types.is_object_dtype(column_dtype)
        or pd.api.types.is_string_dtype(column_dtype)
        or isinstance(column_dtype, pd.CategoricalDtype)
    )

    if is_text_like and agg_type != 'count':
        raise TypeError(f'{column_name} must be numeric to use a {agg_type} aggregation.')
    return True

### Check #3 in documentation

In [39]:
from pandas.api.types import is_datetime64_any_dtype as is_datetime

def validate_date_column_is_parseable(
    df: pd.core.frame.DataFrame,
    date_column_name: str,
    date_format: str = None,
    unix_unit: str = None
) -> bool:
    """Validates whether the date_column is parseable

    A column that already has a datetime dtype is accepted as-is. Otherwise
    the column is passed through ``pd.to_datetime``; a failed parse is
    reported by letting that call's exception propagate, so this function
    never returns False.

    :param df: pandas DataFrame with input data.
    :type df: pd.core.frame.DataFrame
    :param date_column_name: Name of the date column.
    :type date_column_name: str
    :param date_format: Provided datetime format (https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior), defaults to None
    :type date_format: str, optional. When neither date_format nor unix_unit
        is given, pandas tries to infer the format.
    :param unix_unit: Units if the specified date column is in unix time, defaults to None
    :type unix_unit: str, optional. Takes precedence over date_format when both are given.
    :raises KeyError: date_column_name is not a column of df.
    :raises Exception: whatever ``pd.to_datetime`` raises when the column
        cannot be parsed.
    :return: True when the date column is parseable.
    :rtype: bool
    """

    # Exit if column doesn't exist
    if date_column_name not in df.columns:
        raise KeyError(f"{date_column_name} does not exist in df")

    date_values = df[date_column_name]

    if is_datetime(date_values):
        return True

    if unix_unit:
        # User specified a unix time unit ('s', 'ms', 'ns', etc.)
        pd.to_datetime(date_values, unit=unix_unit)
    else:
        # With format=None pandas infers the format on its own, so the
        # deprecated infer_datetime_format flag is not passed.
        pd.to_datetime(date_values, format=date_format)
    return True

### Check #2 in documentation

In [45]:
def validate_kpi_not_datetime(
    df: pd.core.frame.DataFrame,
    kpi_column_name: str,
    date_column_name: str,
) -> bool:
    """Ensure the KPI column and the date column are different columns.

    Only the two column names are compared; ``df`` itself is not inspected.

    :param df: pandas DataFrame with input data
    :type df: pd.core.frame.DataFrame
    :param kpi_column_name: Name of the column used for KPI
    :type kpi_column_name: str
    :param date_column_name: Name of the date column
    :type date_column_name: str
    :raises ValueError: the KPI column name equals the date column name.
    :return: True when the kpi column is not the same as the date column.
    :rtype: bool
    """
    names_differ = kpi_column_name != date_column_name
    if names_differ:
        return True
    raise ValueError("kpi column cannot be the same as the date column")

### Wrapper function

In [46]:
def validate_kpi(
    df: pd.core.frame.DataFrame,
    kpi_column_name: str,
    agg_type: str,
    date_column_name: str,
    date_format: str = None,
    unix_unit: str = None
) -> bool:
    """Run every KPI validation check in turn and print each result.

    The checks run in this order: aggregation type fits the KPI column, date
    column is parseable, KPI column differs from the date column. Each check
    signals failure by raising, so the first failing check aborts the run.

    :param df: pandas DataFrame with input data
    :type df: pd.core.frame.DataFrame
    :param kpi_column_name: Name of the column used for KPI
    :type kpi_column_name: str
    :param agg_type: Aggregation applied to the KPI column
    :type agg_type: str
    :param date_column_name: Name of the date column
    :type date_column_name: str
    :param date_format: Datetime format of the date column, defaults to None
    :type date_format: str, optional
    :param unix_unit: Units if the date column is in unix time, defaults to None
    :type unix_unit: str, optional
    :return: True when all checks pass (previously the function returned None
        despite its ``-> bool`` annotation).
    :rtype: bool
    """
    print("Validate column fits agg type:", validate_column_fits_agg_type(df, column_name=kpi_column_name, agg_type=agg_type, date_column_name=date_column_name))

    print("Validate date column is parseable:", validate_date_column_is_parseable(df, date_column_name=date_column_name, date_format=date_format, unix_unit=unix_unit))

    print("Validate kpi not datetime:", validate_kpi_not_datetime(df, kpi_column_name=kpi_column_name, date_column_name=date_column_name))

    # Reaching this point means no check raised.
    return True

In [49]:
# Run all KPI checks for a 'sum' aggregation of ItemTotalPrice, using the parsed 'date' column
validate_kpi(ecom_df, kpi_column_name='ItemTotalPrice', agg_type='sum', date_column_name='date')

Validate column fits agg type: True
Validate date column is parseable: True
Validate kpi not datetime: True
