In [1]:
# Colab-only setup cell. The recorded output below shows this run resolved to
# mlxtend 0.21.0 (replacing the preinstalled 0.14.0) and XlsxWriter 3.0.3.
# NOTE(review): neither install is version-pinned, so a fresh run may pull
# different releases — consider pinning to the versions above.
# NOTE(review): no visible cell imports xlsxwriter; confirm it is still needed.
!pip install mlxtend --upgrade
!pip install xlsxwriter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mlxtend
  Downloading mlxtend-0.21.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 5.3 MB/s 
Installing collected packages: mlxtend
  Attempting uninstall: mlxtend
    Found existing installation: mlxtend 0.14.0
    Uninstalling mlxtend-0.14.0:
      Successfully uninstalled mlxtend-0.14.0
Successfully installed mlxtend-0.21.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlsxwriter
  Downloading XlsxWriter-3.0.3-py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 5.0 MB/s 
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.0.3


In [2]:
import numpy as np
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from scipy import stats

In [3]:
def chi2_pvalue(x):
    '''
    Return the upper-tail p-value of a chi-square statistic with 1 degree of freedom.

    x is the chi-square statistic (anything scipy.stats.chi2.sf accepts).
    The survival function is used instead of ``1 - cdf`` because the
    subtraction cancels to exactly 0.0 once the cdf rounds to 1.0, which
    loses every p-value for large statistics.
    '''
    return stats.chi2.sf(x, 1)

def chi_square(df:pd.DataFrame, transaction_num:int):
    '''
    Add a chi-square independence test and the leverage to an association rules DataFrame.

    Input the association rules DataFrame (it must contain the columns
    "antecedent support", "consequent support" and "support") and the number
    of total transactions.
    The DataFrame is modified in place and also returned, with three new columns:
      chi_square : Pearson chi-square statistic of the 2x2 table
                   antecedent / not antecedent x consequent / not consequent,
                   rounded to 1 decimal.
      p_value    : upper-tail p-value (dof=1) of the unrounded statistic,
                   rounded to 3 decimals.
      leverage   : support - antecedent support * consequent support.
    Rows whose antecedent or consequent support is exactly 0 or 1 have a zero
    expected cell and therefore produce NaN/inf instead of a statistic.
    '''
    p_a = df["antecedent support"]
    p_b = df["consequent support"]
    p_ab = df["support"]

    # (observed, expected under independence) proportions of the four cells.
    cells = (
        (p_ab, p_a * p_b),
        (p_a - p_ab, p_a * (1 - p_b)),
        (p_b - p_ab, p_b * (1 - p_a)),
        (1 - p_a - p_b + p_ab, (1 - p_a) * (1 - p_b)),
    )
    statistic = transaction_num * sum((observed - expected) ** 2 / expected for observed, expected in cells)
    # An empty rules frame may carry object-dtype columns; force a numeric dtype.
    statistic = statistic.astype(float)

    df['chi_square'] = np.round(statistic, decimals=1)
    # Use the unrounded statistic: rounding to one decimal first can push a
    # p-value across a threshold such as 0.05. sf() avoids the cancellation
    # of 1 - cdf() for large statistics.
    df['p_value'] = np.round(stats.chi2.sf(statistic, 1), decimals=3)
    df['leverage'] = p_ab - p_b * p_a
    return df

  
def confidences(df:pd.DataFrame):
    '''
    Add confidence-based interestingness measures to an association rules DataFrame.

    Input the association rules DataFrame (it must contain the columns
    "antecedent support", "consequent support" and "support").
    The DataFrame is modified in place and also returned, with new columns:
      confidence         : support / antecedent support
      confidence_inverse : support / consequent support
      lift               : support / (antecedent support * consequent support)
      max_confidence     : support / min(antecedent support, consequent support)
      all_confidence     : support / max(antecedent support, consequent support)
      kulczynski         : mean of confidence and confidence_inverse
      cosine             : support / sqrt(antecedent support * consequent support)
    An existing "confidence" or "lift" column is overwritten.
    '''
    ante = df["antecedent support"]
    cons = df["consequent support"]
    joint = df["support"]

    conf_forward = joint / ante
    conf_inverse = joint / cons

    df['confidence'] = conf_forward
    df['confidence_inverse'] = conf_inverse
    df['lift'] = joint / (ante * cons)
    df['max_confidence'] = joint / np.minimum(ante, cons)
    df['all_confidence'] = joint / np.maximum(ante, cons)
    df['kulczynski'] = (conf_forward + conf_inverse) / 2
    df['cosine'] = joint / np.sqrt(ante * cons)
    return df

def conviction(df:pd.DataFrame):
    '''
    Add the conviction of each rule and of its inverse to an association rules DataFrame.

    Input the association rules DataFrame. It must already contain
    "antecedent support", "consequent support", "confidence" and
    "confidence_inverse" (the last one is written by confidences()).
    The DataFrame is modified in place and also returned, with new columns:
      conviction         : (1 - consequent support) / (1 - confidence)
      conviction_inverse : (1 - antecedent support) / (1 - confidence_inverse)
    A confidence of exactly 1 makes the denominator 0, so the result is inf
    (or NaN when the numerator is 0 as well).
    '''
    ante_support = df["antecedent support"]
    cons_support = df["consequent support"]
    conf_forward = df["confidence"]
    conf_inverse = df["confidence_inverse"]

    df['conviction'] = (1 - cons_support) / (1 - conf_forward)
    df['conviction_inverse'] = (1 - ante_support) / (1 - conf_inverse)
    return df

def imbalance_ratio(df:pd.DataFrame):
    '''
    Add the imbalance ratio of antecedent and consequent to an association rules DataFrame.

    Input the association rules DataFrame (it must contain the columns
    "antecedent support", "consequent support" and "support").
    The DataFrame is modified in place and also returned, with the new column:
      imbalance_ratio : |antecedent support - consequent support|
                        / (antecedent support + consequent support - support)
    The value is 0 when both sides are equally frequent.
    '''
    ante = df["antecedent support"]
    cons = df["consequent support"]
    joint = df["support"]
    # The denominator is the support of "antecedent or consequent".
    df['imbalance_ratio'] = np.abs(ante - cons) / (ante + cons - joint)
    return df
    
def drop_duplicate_association(df:pd.DataFrame):
    '''
    Drop rows that describe the same association in a different direction.

    Input the association rules DataFrame ("antecedents" and "consequents"
    must hold sets/frozensets of items).
    Two rows are duplicates when the union of antecedents and consequents is
    the same item set (e.g. A -> B and B -> A); only the first one is kept.
    The DataFrame is modified in place, its index is reset to 0..n-1, and it
    is also returned.
    '''
    # Direction-independent key: the sorted items of antecedents | consequents.
    df['association'] = [tuple(sorted(lhs | rhs)) for lhs, rhs in zip(df["antecedents"], df["consequents"])]
    df.drop_duplicates(subset=['association'], inplace=True)
    df.drop(columns=['association'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

In [4]:
def frequent_pattern(df, min_support:float, apriori=False):
  '''
  Input a transactions series (one iterable of items per transaction) and minimal support.
  Return a frequent pattern DataFrame, mined by the FP-growth algorithm by default.
  The Apriori algorithm is used instead if apriori=True.

  The transactions are one-hot encoded with TransactionEncoder first, and the
  itemsets are reported with the original item names (use_colnames=True).
  '''
  # The `apriori` flag shadows the module-level mlxtend function of the same
  # name inside this body (calling it used to fail with "'bool' object is not
  # callable"), so bind the algorithm under a different local name.
  from mlxtend.frequent_patterns import apriori as apriori_miner

  te = TransactionEncoder()
  te_array = te.fit(df).transform(df)
  d = pd.DataFrame(te_array, columns=te.columns_)
  miner = apriori_miner if apriori else fpgrowth
  return miner(d, min_support=min_support, use_colnames=True)

In [5]:
def association(df, min_support:float, min_confidence:float):
  '''
  Mine association rules from a transactions series.

  Input a transactions series, minimal support and minimal confidence.
  Frequent patterns are mined with frequent_pattern() (default algorithm) and
  turned into rules whose confidence is at least min_confidence. mlxtend's own
  lift and leverage columns are dropped and every measure is recomputed by the
  helper functions of this notebook.
  Return the association rules DataFrame with a fixed column order.
  '''
  column_order = [
      "antecedents", "consequents",
      "antecedent support", "consequent support", "support",
      "confidence", "confidence_inverse",
      "conviction", "conviction_inverse",
      "lift", "chi_square", "p_value", "leverage",
      "all_confidence", "max_confidence", "kulczynski", "cosine",
      "imbalance_ratio",
  ]

  total_transactions = len(df)
  itemsets = frequent_pattern(df, min_support)
  rules = association_rules(itemsets, metric='confidence', min_threshold=min_confidence)
  rules = rules.drop(columns=["lift", "leverage"])

  # Each helper adds its columns to `rules` in place. Order matters:
  # conviction() reads the confidence columns written by confidences().
  chi_square(rules, total_transactions)
  confidences(rules)
  conviction(rules)
  imbalance_ratio(rules)

  return rules.loc[:, column_order]

In [6]:
def correlation(df, min_support:float, min_confidence:float):
  '''
  Input a transactions series, minimal support and minimal confidence.
  Return a correlation DataFrame: the association rules whose all_confidence
  also reaches min_confidence, with one row per item set (a rule and its
  inverse are reported once) and a fresh 0..n-1 index.
  '''
  rules = association(df, min_support, min_confidence)
  # .copy() gives an independent frame, so the in-place edits made by
  # drop_duplicate_association() do not act on a slice of `rules`
  # (which triggers pandas' SettingWithCopyWarning).
  corr = rules.loc[rules["all_confidence"] >= min_confidence, :].copy()
  drop_duplicate_association(corr)

  return corr

In [7]:
# Colab only: mount Google Drive at /content/drive so the CSV files under
# /content/drive/MyDrive/... can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Load the raw transaction lines (the 'date' column is parsed as datetime).
transactions_csv = "/content/drive/MyDrive/Project Data Mining/Predicting Coupon Redemption/customer_transaction_data.csv"
transaction = pd.read_csv(transactions_csv, parse_dates=['date'])

# One basket per (date, customer_id): the unique item_ids bought together.
# The group keys are discarded, leaving a Series of item-id arrays.
items_per_visit = transaction.groupby(['date', 'customer_id'])['item_id']
fp_item = items_per_visit.unique().reset_index(drop=True)

In [13]:
# Preview the first baskets: each entry holds the unique item_ids of one (date, customer_id) group.
fp_item.head()

0                         [10537, 19560, 21653, 32083]
1    [4642, 5566, 9309, 9645, 12433, 12543, 13174, ...
2    [5525, 6497, 6588, 7005, 7364, 8145, 8261, 979...
3    [4408, 5454, 5797, 5865, 7200, 9198, 9532, 980...
4    [10423, 10565, 17035, 27801, 30645, 30832, 340...
Name: item_id, dtype: object

In [None]:
# First ten rows of the raw transaction table.
transaction.head(10)

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0
5,2012-01-02,1501,57397,1,71.24,-28.14,0.0
6,2012-01-02,857,12424,1,106.5,-14.25,0.0
7,2012-01-02,857,14930,1,110.07,0.0,0.0
8,2012-01-02,857,16657,1,89.05,-35.26,0.0
9,2012-01-02,67,10537,3,32.06,0.0,0.0


In [None]:
# Column dtypes, non-null counts and memory usage of the raw transaction table.
transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   date             1324566 non-null  datetime64[ns]
 1   customer_id      1324566 non-null  int64         
 2   item_id          1324566 non-null  int64         
 3   quantity         1324566 non-null  int64         
 4   selling_price    1324566 non-null  float64       
 5   other_discount   1324566 non-null  float64       
 6   coupon_discount  1324566 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(3)
memory usage: 70.7 MB


# 1. Determining minimal support (proportion) and minimal confidence.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Sweep min_support from 0.001 to 0.010 and record, for each value, how many
# frequent patterns are found in total and how many of them are single items.
n_fp = []
n_1itemset = []
for per_mille in range(1, 11):
  min_supp = per_mille/1000
  fp = frequent_pattern(fp_item, min_supp)
  is_single_item = fp["itemsets"].apply(len) == 1
  n_fp.append(len(fp))
  n_1itemset.append(len(fp[is_single_item]))

In [None]:
# Tabulate the sweep: one row per min_support value (0.001 ... 0.010).
min_support = np.arange(1, 11)/1000
df = pd.DataFrame({
    'min_support': min_support,
    "number of frequent patterns": n_fp,
    "number of frequent 1-itemset": n_1itemset,
})
# Patterns with more than one item = all patterns minus the single-item ones.
df["number of frequent k-itemset(k>1)"] = df["number of frequent patterns"] - df["number of frequent 1-itemset"]
df

|      | min_support | number of frequent patterns | number of frequent 1-itemset | number of frequent k-itemset(k>1) |
| ---: | ----------: | --------------------------: | ---------------------------: | --------------------------------- |
|    0 |       0.001 |                        1664 |                         1598 | 66                                |
|    1 |       0.002 |                         474 |                          467 | 7                                 |
|    2 |       0.003 |                         217 |                          216 | 1                                 |
|    3 |       0.004 |                         127 |                          127 | 0                                 |
|    4 |       0.005 |                          81 |                           81 | 0                                 |
|    5 |       0.006 |                          50 |                           50 | 0                                 |
|    6 |       0.007 |                          38 |                           38 | 0                                 |
|    7 |       0.008 |                          31 |                           31 | 0                                 |
|    8 |       0.009 |                          25 |                           25 | 0                                 |
|    9 |       0.010 |                          20 |                           20 | 0                                 |

In [None]:
fig, ax = plt.subplots(figsize=(12,9))

# Draw the totals first and overlay the 1-itemset counts on the same axes:
# the blue part left visible above each red bar is the number of k-itemsets (k>1).
# Passing ax=ax ties both layers to the axes created above instead of relying
# on pyplot's "current axes" state.
sns.barplot(x=df["min_support"], y=df["number of frequent patterns"], color='b', label='total', ax=ax)
sns.barplot(x=df["min_support"], y=df["number of frequent 1-itemset"], color='r', label='1-itemset', ax=ax)

ax.set(xlabel="min_support", ylabel="count", title="Number of frequent patterns per min_support")
ax.legend();

In [None]:
# Grid over min_support in {0.001, 0.002} and min_confidence in 0.20 ... 0.60
# (step 0.05): count the association rules and the correlations for each pair.
# association() and correlation() each mine the frequent patterns again, so
# every grid point runs the mining twice.
obs = []
for supp in (0.001, 0.002):
  for conf_percent in range(20, 65, 5):
    min_confidence = conf_percent/100
    n_ar = len(association(fp_item, supp, min_confidence))
    n_corr = len(correlation(fp_item, supp, min_confidence))
    row = {
        "min_support": supp,
        "min_confidence": min_confidence,
        "number of association rules": n_ar,
        "number of correlations": n_corr,
    }
    obs.append(row)

In [None]:
# One row per (min_support, min_confidence) pair collected in `obs`.
df2 = pd.DataFrame(obs)
df2

|      | min_support | min_confidence | number of association rules | number of correlations |
| ---: | ----------: | -------------: | ---------------------------: | --------------------- |
|    0 |       0.001 |           0.20 |                           52 | 21                    |
|    1 |       0.001 |           0.25 |                           43 | 16                    |
|    2 |       0.001 |           0.30 |                           32 | 8                     |
|    3 |       0.001 |           0.35 |                           20 | 5                     |
|    4 |       0.001 |           0.40 |                           10 | 1                     |
|    5 |       0.001 |           0.45 |                            3 | 1                     |
|    6 |       0.001 |           0.50 |                            0 | 0                     |
|    7 |       0.001 |           0.55 |                            0 | 0                     |
|    8 |       0.001 |           0.60 |                            0 | 0                     |
|    9 |       0.002 |           0.20 |                            6 | 2                     |
|   10 |       0.002 |           0.25 |                            4 | 1                     |
|   11 |       0.002 |           0.30 |                            2 | 0                     |
|   12 |       0.002 |           0.35 |                            1 | 0                     |
|   13 |       0.002 |           0.40 |                            1 | 0                     |
|   14 |       0.002 |           0.45 |                            0 | 0                     |
|   15 |       0.002 |           0.50 |                            0 | 0                     |
|   16 |       0.002 |           0.55 |                            0 | 0                     |
|   17 |       0.002 |           0.60 |                            0 | 0                     |

# 2. Example Analysis

In [None]:
# Thresholds used for the example analysis. In the grid recorded above this
# pair produced 43 association rules and 16 correlations.
# Note: min_support was an array in the sweep section; it is rebound to a scalar here.
min_support = 0.001
min_confidence = 0.25

In [None]:
# Item master data (item_id, brand, brand_type, category), used to describe
# the items that appear in the rules.
item = pd.read_csv("/content/drive/MyDrive/Project Data Mining/Predicting Coupon Redemption/item_data.csv")
item.head()

Unnamed: 0,item_id,brand,brand_type,category
0,1,1,Established,Grocery
1,2,1,Established,Miscellaneous
2,3,56,Local,Bakery
3,4,56,Local,Grocery
4,5,56,Local,Grocery


In [None]:
# Rules and correlations for the chosen thresholds.
# correlation() calls association() internally, so the mining runs twice here.
ar = association(fp_item, min_support, min_confidence)
corr = correlation(fp_item, min_support, min_confidence)

In [None]:
def value(x:frozenset):
  '''
  Return the single element of a one-element item set.

  Values that are not a set/frozenset/list/tuple (e.g. an item id that has
  already been unwrapped) are returned unchanged, so the cell that applies
  this function can be re-run safely.
  Raise ValueError if the collection does not hold exactly one element.
  '''
  if not isinstance(x, (frozenset, set, list, tuple)):
    return x
  if len(x) != 1:
    raise ValueError(f"expected exactly one item, got {len(x)}: {x!r}")
  y, = x
  return y

# Replace the one-element item sets of both rule tables by the bare item id,
# so the columns can be used as merge keys against item_id.
for rule_table in (ar, corr):
  rule_table["antecedents"] = rule_table["antecedents"].apply(value)
  rule_table["consequents"] = rule_table["consequents"].apply(value)

In [None]:
# Describe both sides of every rule with the item master data. The first join
# adds the antecedent's item columns; the second adds the consequent's and
# resolves the name clash with the _antecedents / _consequents suffixes.
antecedent_join = dict(left_on='antecedents', right_on='item_id', how='left')
consequent_join = dict(left_on='consequents', right_on='item_id', suffixes=("_antecedents", "_consequents"), how='left')

ar_item = ar.merge(item, **antecedent_join).merge(item, **consequent_join)
corr_item = corr.merge(item, **antecedent_join).merge(item, **consequent_join)

In [None]:
# Coupon-to-item mapping; the cells below use its item_id and coupon_id columns.
coupon_item = pd.read_csv("/content/drive/MyDrive/Project Data Mining/Predicting Coupon Redemption/coupon_item_mapping.csv")

In [None]:
# Series indexed by item_id; each value is the array of distinct coupon_ids mapped to that item.
coupon_item_ = coupon_item.groupby('item_id')['coupon_id'].unique()

In [None]:
# Display the item_id -> coupon_ids Series.
coupon_item_

item_id
1                  [22, 31]
4            [166, 327, 82]
7                  [23, 32]
10       [317, 318, 22, 31]
12                 [22, 31]
                ...        
74054      [40, 19, 22, 31]
74056              [24, 33]
74057              [24, 33]
74058              [20, 29]
74061              [20, 29]
Name: coupon_id, Length: 36289, dtype: object

In [None]:
# Same mapping as a DataFrame with item_id as a regular column (used for the lookup below).
coup_item_ = coupon_item_.reset_index()

In [None]:
# Spot check: coupons mapped to item 27801.
coup_item_[coup_item_.item_id==27801]

Unnamed: 0,item_id,coupon_id
13270,27801,[960]


In [None]:
# Attach the coupon ids mapped to the antecedent item and to the consequent
# item of every correlated pair (left joins keep pairs without any coupon).
# coupon_item_ is the Series built by groupby('item_id')['coupon_id'].unique(),
# so right_on='item_id' refers to its index name rather than to a column.
# NOTE(review): confirm the installed pandas accepts a named Series keyed by
# index name here, and that the suffixes do not collide with the
# item_id_antecedents / item_id_consequents columns already in corr_item.
# The trailing drop removes item_id_antecedents / item_id_consequents.
corr_coupon = corr_item.merge(coupon_item_, left_on='antecedents', right_on='item_id', how='left').merge(coupon_item_, left_on='consequents', right_on='item_id', suffixes=("_antecedents", "_consequents"),  how='left').drop(columns=['item_id_antecedents','item_id_consequents'])

In [None]:
# Save the correlations with their coupons to Google Drive (row index not written).
# NOTE(review): the file name is 'ccorr_coupon.csv' — confirm the double "c" is intended.
corr_coupon.to_csv('/content/drive/MyDrive/Project Data Mining/Predicting Coupon Redemption/ccorr_coupon.csv', index=False)

In [None]:
# Save the item-annotated rules and correlations (row index not written).
# These are relative paths, so the files go to the current working directory
# rather than to the Drive folder used for corr_coupon.
ar_item.to_csv('Association.csv', index=False)
corr_item.to_csv('Correlation.csv', index=False)