### Sample program for Association Analysis (Market Basket Analysis) using FP-Growth  

#### Import libraries  

In [None]:
import pandas as pd
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [None]:
# Path of the input CSV file that is passed to pd.read_csv.
csv_in = 'online_retail_small_cleaned.csv'

#### Read CSV file  

In [None]:
# Load the transactions; the first row of the file is used as the header.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)  # (number of rows, number of columns)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the summary.
df.info()
display(df.head())

#### Count occurrences of each StockCode  

In [None]:
# Number of rows per StockCode, most frequent code first.
top_sc = df['StockCode'].value_counts(sort=True, ascending=False)
# How many distinct StockCodes there are.
print(len(top_sc))
# The five most frequent StockCodes.
print(top_sc.head(5))

#### Format transaction data  
Before:  
```
invoice1, stockcode1, ...  
invoice1, stockcode2, ...  
invoice2, stockcode1, ...  
invoice3, stockcode1, ...  
invoice3, stockcode3, ...  
```

After:  
```
[  
  [stockcode1, stockcode2],    
  [stockcode1],  
  [stockcode1, stockcode3],  
  ...  
]  
```

and each stockcode should be an integer.  

#### Assign an integer (ID) to each stockcode  

In [None]:
# id2sc: sorted list of the distinct StockCodes; a code's position is its integer ID.
id2sc = sorted(set(df['StockCode']))
# sc2id: the inverse lookup, StockCode -> integer ID.
sc2id = {stock_code: idx for idx, stock_code in enumerate(id2sc)}

In [None]:
# Add a column holding the integer ID of each row's StockCode.
# sc2id.__getitem__ is used as the mapper so an unknown code raises KeyError.
df['StockCode_ID'] = df['StockCode'].map(sc2id.__getitem__)
display(df.head(5))

#### Grouping transaction data by InvoiceNo  

In [None]:
# Build one transaction per invoice: the distinct StockCode_IDs that appear on it.
# groupby yields (InvoiceNo, sub-frame) pairs; only the sub-frame is needed here.
invoices = [
    list(set(group['StockCode_ID']))
    for _, group in df.groupby('InvoiceNo')
]
print(len(invoices))  # number of invoices (transactions)

#### Market Basket Analysis by FP-Growth  

In [None]:
# Mine frequent itemsets from the invoices (timed with the IPython %time magic).
# NOTE(review): 40 is presumably the minimum support count - confirm against the pyfpgrowth docs.
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 40)

In [None]:
#print(patterns)

In [None]:
# Derive association rules from the mined patterns (timed with the IPython %time magic).
# NOTE(review): 0.8 is presumably the minimum confidence - confirm against the pyfpgrowth docs.
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.8)

In [None]:
# Dump the raw rules mapping: each key is a rule's LHS, each value holds (RHS, confidence).
print(rules)

In [None]:
# Flatten the rules mapping into one row per rule: [LHS, RHS, confidence].
results = [[lhs, outcome[0], outcome[1]] for lhs, outcome in rules.items()]
# Pass the column names to the constructor: assigning .columns afterwards raises
# ValueError when no rule was found, because an empty frame has zero columns.
df_res = pd.DataFrame(results, columns=['LHS', 'RHS', 'Conf'])

In [None]:
# Show the rules table, highest confidence first.
display(df_res.sort_values(by='Conf', ascending=False))

#### Get original StockCode  

In [None]:
# Translate a few selected integer IDs back to their original StockCodes.
for stock_code_id in (1474, 1475, 1478):
    print(id2sc[stock_code_id])

#### Calculation of Lift  

In [None]:
# Lift of a rule LHS -> RHS is confidence / support(RHS), where support(RHS)
# is the fraction of invoices that contain every item of RHS.
n_all = len(invoices)
# Convert each invoice to a set once, instead of once per (rule, invoice) pair.
invoice_sets = [set(items) for items in invoices]
# RHS itemset -> number of invoices containing it; rules sharing an RHS reuse the count.
rhs_counts = {}
lift = []
# Iterate the two columns in lockstep so the result does not depend on the index labels.
for rhs, conf in zip(df_res['RHS'], df_res['Conf']):
    rhs_key = frozenset(rhs)
    if rhs_key not in rhs_counts:
        rhs_counts[rhs_key] = sum(1 for basket in invoice_sets if rhs_key <= basket)
    n_rhs = rhs_counts[rhs_key]
    lift.append(conf / (n_rhs / n_all))

df_res['Lift'] = lift

In [None]:
# Show the rules table again, now with the Lift column, highest confidence first.
display(df_res.sort_values(by='Conf', ascending=False))