# Experimentation and Uplift Testing

Trial stores were performed in stores 77, 86 and 88

In [1]:
import re
import xlrd
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib.patches as mpatches
from datetime import datetime, timedelta
from scipy.stats import ttest_ind, linregress

pd.options.mode.chained_assignment = None  

In [2]:
# customer_data = pd.read_csv('QVI_purchase_behaviour.csv')
# transaction_data = pd.read_excel('QVI_transaction_data.xlsx')

#cleaned transaction_data merged with customer_data
data = pd.read_csv('QVI_data.csv', parse_dates=['DATE']) 

## I. Data Cleaning

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264834 entries, 0 to 264833
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   LYLTY_CARD_NBR    264834 non-null  int64         
 1   DATE              264834 non-null  datetime64[ns]
 2   STORE_NBR         264834 non-null  int64         
 3   TXN_ID            264834 non-null  int64         
 4   PROD_NBR          264834 non-null  int64         
 5   PROD_NAME         264834 non-null  object        
 6   PROD_QTY          264834 non-null  int64         
 7   TOT_SALES         264834 non-null  float64       
 8   PACK_SIZE         264834 non-null  int64         
 9   BRAND             264834 non-null  object        
 10  LIFESTAGE         264834 non-null  object        
 11  PREMIUM_CUSTOMER  264834 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(6), object(4)
memory usage: 24.2+ MB


In [4]:
data.head()

Unnamed: 0,LYLTY_CARD_NBR,DATE,STORE_NBR,TXN_ID,PROD_NBR,PROD_NAME,PROD_QTY,TOT_SALES,PACK_SIZE,BRAND,LIFESTAGE,PREMIUM_CUSTOMER,YEARMONTH
0,1000,2018-10-17,1,1,5,Natural Chip Compny SeaSalt175g,2,6.0,175,NATURAL,YOUNG SINGLES/COUPLES,Premium,201810
1,1002,2018-09-16,1,2,58,Red Rock Deli Chikn&Garlic Aioli 150g,1,2.7,150,RRD,YOUNG SINGLES/COUPLES,Mainstream,201809
2,1003,2019-03-07,1,3,52,Grain Waves Sour Cream&Chives 210G,1,3.6,210,GRNWVES,YOUNG FAMILIES,Budget,201903
3,1003,2019-03-08,1,4,106,Natural ChipCo Hony Soy Chckn175g,1,3.0,175,NATURAL,YOUNG FAMILIES,Budget,201903
4,1004,2018-11-02,1,5,96,WW Original Stacked Chips 160g,1,1.9,160,WOOLWORTHS,OLDER SINGLES/COUPLES,Mainstream,201811


We need to define the measures for our stores. We calculate these measures for the following:
- total sales of chips
- number of customers
- average number of transactions per customer
- avergae number of chips bought per customer
- average price per unit of chips

In [32]:
def calculate_measures(data) -> pd.DataFrame:
    """
    Function to calculate for measures of each store per month.
    This will return a DataFrame with the needed measurements.
    """
    
    data['YEARMONTH'] = data.DATE.dt.strftime("%Y%m").astype(int)
    
    measures = data.groupby(['STORE_NBR','YEARMONTH'])\
                   .agg({"TXN_ID":"count",
                         "TOT_SALES":"sum",
                         "PROD_QTY":"sum",
                         "LYLTY_CARD_NBR":"nunique"})\
                   .reset_index()\
                   .rename(columns={"TXN_ID":"TXN_COUNT",
                                    "LYLTY_CARD_NBR":"CUST_COUNT"})
                                                                        
    measures['TXN_PER_CUST'] = measures.TXN_COUNT/measures.CUST_COUNT
    measures['CHPS_PER_CUST'] = measures.PROD_QTY/measures.CUST_COUNT
    measures['PRICE_PER_UNIT'] = measures.TOT_SALES/measures.PROD_QTY
    
    return measures

There are 3 trials which began in Feb 2019. In order to compare with a control group, we would need to have a control group with the following criteria:
- stores with a complete set of months
- stores with pre-trial measures highly correlated to our trial stores

In [121]:
def correlation(t_store_nbr, c_store_nbr, columns, measures) -> pd.DataFrame:
    """
    Function to calculate for the correlation of each control store
    to the given trial store.
    """
    
    corr_dict = {"YEARMONTH": [],
                 "TRIAL_STORE_NBR": [],
                 "CONTROL_STORE_NBR": [],
                 "CORR_SCORE": []}

    for i in c_store_lst:

        trial_store = measures[measures.STORE_NBR==t_store_nbr]
        control_store = measures[measures.STORE_NBR==i]

        corr_dict["YEARMONTH"].extend(trial_store['YEARMONTH'].to_list())
        corr_dict['TRIAL_STORE_NBR'].extend(trial_store['STORE_NBR'].to_list())
        corr_dict['CONTROL_STORE_NBR'].extend(control_store['STORE_NBR'].to_list())
        corr_dict["CORR_SCORE"].extend(trial_store[columns].reset_index()\
                               .corrwith(control_store[columns].reset_index(),
                                axis=1,method='pearson',drop=True).to_list())
        
    return pd.DataFrame(corr_dict)

We can also calculate the magnitude distance for each measure

In [180]:
def magnitude_distance(t_store_nbr, c_store_nbr, columns, measures) -> pd.DataFrame:
    """
    Function to calculate for the magnitude distance. This
    will return standardized distances for each column.
    """    
    
    df1 = measures[measures.STORE_NBR==t_store_nbr][columns].reset_index(drop=True)
    
    distance = []
    for i in c_store_nbr:
        
        control = measures[measures.STORE_NBR==i]
        df2 = control[columns].reset_index(drop=True)
        diff = abs(df1.subtract(df2))

        diff['CONTROL_STORE_NBR'] = i
        diff['YEARMONTH'] = control['YEARMONTH'].to_list()

        distance.append(diff)
        
    final = pd.concat(distance)
    final['TRIAL_STORE_NBR'] = t_store_nbr

    for col in columns:
        final[col] = 1 - ((final[col]-final[col].min())/(final[col].max()-final[col].min()))
    
    #we average the magnitude distance for each store and month
    final['MAG_SCORE'] = final[columns].mean(axis=1)

    return final[['YEARMONTH','TRIAL_STORE_NBR','CONTROL_STORE_NBR','MAG_SCORE']]

To rank the stores, we can take the average of our correlation and distance measurements for each store.

In [191]:
def combined_measures(t_store_nbr, c_store_nbr, columns, measures):
    """
    This function will call both correlation() and magnitude_distance()
    to average their measures for a final control score
    """
    
    indices = ['YEARMONTH','TRIAL_STORE_NBR','CONTROL_STORE_NBR']
    
    corr_measure = correlation(t_store_nbr, c_store_nbr, columns, measures)
    mag_measure = magnitude_distance(t_store_nbr, c_store_nbr, columns, measures)
    
    combined = corr_measure.merge(mag_measure, on=indices)
    combined['CONTROL_SCORE'] = sum([abs(combined.CORR_SCORE),combined.MAG_SCORE])*0.5
    
    return combined

In [192]:
columns = ['TOT_SALES', 'CUST_COUNT', 'TXN_PER_CUST', 
               'CHPS_PER_CUST', 'PRICE_PER_UNIT']

measures = calculate_measures(data)
pre_trial_measures = measures[measures.YEARMONTH < 201902]

c_store_lst = measures.groupby('STORE_NBR')['YEARMONTH'].count().reset_index()
c_store_lst = c_store_lst[c_store_lst.YEARMONTH == 12]['STORE_NBR'].to_list()

In [193]:
combined = combined_measures(77, c_store_lst, columns, pre_trial_measures)
combined[~combined.CONTROL_STORE_NBR.isin([77,86,88])].sort_values('CONTROL_SCORE', ascending=False)

Unnamed: 0,YEARMONTH,TRIAL_STORE_NBR,CONTROL_STORE_NBR,CORR_SCORE,MAG_SCORE,CONTROL_SCORE
344,201808,77,53,0.991356,0.980338,0.985847
1552,201812,77,233,0.980377,0.989279,0.984828
1549,201809,77,233,0.985600,0.982129,0.983864
296,201809,77,46,0.981897,0.979844,0.980870
1696,201809,77,255,0.984318,0.973026,0.978672
...,...,...,...,...,...,...
20,201901,77,3,0.027737,0.496185,0.261961
23,201809,77,4,0.055155,0.434642,0.244898
22,201808,77,4,0.088102,0.396272,0.242187
24,201810,77,4,0.030369,0.355894,0.193132


In [194]:
combined = combined_measures(86, c_store_lst, columns, pre_trial_measures)
combined[~combined.CONTROL_STORE_NBR.isin([77,86,88])].sort_values('CONTROL_SCORE', ascending=False)

Unnamed: 0,YEARMONTH,TRIAL_STORE_NBR,CONTROL_STORE_NBR,CORR_SCORE,MAG_SCORE,CONTROL_SCORE
658,201807,86,101,0.994090,0.987400,0.990745
614,201812,86,94,0.999747,0.978261,0.989004
698,201812,86,106,0.990203,0.985313,0.987758
471,201809,86,71,0.996266,0.974832,0.985549
547,201808,86,83,0.999037,0.971673,0.985355
...,...,...,...,...,...,...
650,201901,86,99,0.710989,0.184792,0.447890
1063,201901,86,159,0.704341,0.169735,0.437038
6,201901,86,1,0.532853,0.334865,0.433859
1718,201810,86,258,0.653186,0.210655,0.431921


In [195]:
combined = combined_measures(88, c_store_lst, columns, pre_trial_measures)
combined[~combined.CONTROL_STORE_NBR.isin([77,86,88])].sort_values('CONTROL_SCORE', ascending=False)

Unnamed: 0,YEARMONTH,TRIAL_STORE_NBR,CONTROL_STORE_NBR,CORR_SCORE,MAG_SCORE,CONTROL_SCORE
432,201812,88,65,0.989624,0.980697,0.985161
477,201808,88,72,0.997380,0.955501,0.976440
617,201808,88,95,0.997915,0.944327,0.971121
537,201812,88,81,0.998222,0.943219,0.970721
503,201901,88,75,0.999291,0.938267,0.968779
...,...,...,...,...,...,...
647,201810,88,99,0.488026,0.121567,0.304796
1325,201809,88,198,0.458653,0.149192,0.303923
275,201809,88,42,0.486355,0.116759,0.301557
646,201809,88,99,0.467890,0.108392,0.288141
