### dm-11-practice2  

#### Import libraries  

In [97]:
import pandas as pd
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [98]:
# Path of the input CSV file that the notebook loads.
csv_in = "invoices.csv"

#### Read CSV file  

In [99]:
# Load the input file. The explicit delimiter=',', skiprows=0 and header=0
# arguments only restated the read_csv defaults, so they are omitted.
df = pd.read_csv(csv_in)
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() emitted a stray "None" line after the report.
df.info()
display(df.head())

(252, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   InvoiceID  252 non-null    object
 1   ItemID     252 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 4.1+ KB
None


Unnamed: 0,InvoiceID,ItemID
0,S001,5
1,S001,3
2,S002,1
3,S002,3
4,S002,6


#### Check the number of appearances of each ItemID  

In [100]:
# Number of rows per ItemID; value_counts() orders the result from the most
# frequent item to the least frequent one.
top_sc = df.ItemID.value_counts()
print(len(top_sc))       # how many distinct ItemIDs there are
print(top_sc.iloc[:5])   # the five most frequent ItemIDs

10
3    46
1    38
2    31
4    25
6    21
Name: ItemID, dtype: int64


#### Format transaction data  
Before:  
```
invoice1, stockcode1, ...  
invoice1, stockcode2, ...  
invoice2, stockcode1, ...  
invoice3, stockcode1, ...  
invoice3, stockcode3, ...  
```

After:  
```
[  
  [stockcode1, stockcode2],    
  [stockcode1],  
  [stockcode1, stockcode3],  
  ...  
]  
```

Each stockcode should be an integer.  

#### Grouping transaction data by InvoiceID  

In [101]:
# Build one basket per InvoiceID holding the distinct ItemIDs of that invoice.
# Passing the items through set() removes repeats within a single invoice.
invoices = [
    list(set(item_ids))
    for _invoice_id, item_ids in df.groupby('InvoiceID')['ItemID']
]
print(len(invoices))
print(invoices[:3])

100
[[3, 5], [1, 2, 3, 6], [7]]


#### Market Basket Analysis by FP-Growth  

In [102]:
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 5)

Wall time: 2 ms


In [103]:
#print(patterns)

In [104]:
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.3)

Wall time: 0 ns


In [105]:
# Dump the raw rule dictionary: each key is an LHS item tuple and each value
# is a (RHS item tuple, confidence) pair.
print(rules)

{(10,): ((3,), 0.4375), (1, 9): ((3,), 0.8333333333333334), (3, 9): ((2,), 0.45454545454545453), (2, 3): ((1,), 0.4166666666666667), (2, 9): ((3,), 0.5555555555555556), (1, 3): ((6,), 0.30434782608695654), (1, 6): ((3,), 0.6363636363636364), (3, 6): ((1,), 0.7777777777777778), (1, 4): ((3,), 0.625), (3, 4): ((1,), 0.45454545454545453), (1,): ((3,), 0.6052631578947368), (1, 2): ((3,), 0.4166666666666667), (3,): ((1,), 0.5)}


In [106]:
# Flatten the rule dictionary {LHS: (RHS, confidence)} into one row per rule.
results = [[lhs, rhs, conf] for lhs, (rhs, conf) in rules.items()]
# Name the columns in the constructor: assigning three names afterwards to the
# zero-column frame produced by an empty `results` raises a length-mismatch
# ValueError, which would break this cell whenever no rule is generated.
df_res = pd.DataFrame(results, columns=['LHS', 'RHS', 'Conf'])

In [107]:
# Show the rules ranked from the highest confidence to the lowest.
display(df_res.sort_values('Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf
1,"(1, 9)","(3,)",0.833333
7,"(3, 6)","(1,)",0.777778
6,"(1, 6)","(3,)",0.636364
8,"(1, 4)","(3,)",0.625
10,"(1,)","(3,)",0.605263
4,"(2, 9)","(3,)",0.555556
12,"(3,)","(1,)",0.5
2,"(3, 9)","(2,)",0.454545
9,"(3, 4)","(1,)",0.454545
0,"(10,)","(3,)",0.4375


#### Calculation of Lift  

In [108]:
# Lift of a rule = confidence / support(RHS), where support(RHS) is the share
# of all invoices that contain every item of the RHS.
n_all = len(invoices)

# Convert each invoice to a set once instead of once per rule.
baskets = [set(items) for items in invoices]

# Number of invoices containing a given RHS, computed once per distinct RHS.
rhs_counts = {}

lift = []
# Walk the two columns positionally. Looking rows up with df_res.at[i, ...]
# for i in range(n) treats positions as index labels, which is only correct
# while df_res still carries its default 0..n-1 index.
for rhs, conf in zip(df_res['RHS'], df_res['Conf']):
    if rhs not in rhs_counts:
        rhs_items = set(rhs)
        rhs_counts[rhs] = sum(1 for basket in baskets if basket >= rhs_items)
    # NOTE(review): assumes every RHS occurs in at least one invoice, which
    # should hold when the rules were mined from these same invoices.
    lift.append(conf / (rhs_counts[rhs] / n_all))

df_res['Lift'] = lift

In [109]:
# Show the rules, now including their Lift, ranked by descending confidence.
display(df_res.sort_values('Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf,Lift
1,"(1, 9)","(3,)",0.833333,1.811594
7,"(3, 6)","(1,)",0.777778,2.046784
6,"(1, 6)","(3,)",0.636364,1.383399
8,"(1, 4)","(3,)",0.625,1.358696
10,"(1,)","(3,)",0.605263,1.315789
4,"(2, 9)","(3,)",0.555556,1.207729
12,"(3,)","(1,)",0.5,1.315789
2,"(3, 9)","(2,)",0.454545,1.466276
9,"(3, 4)","(1,)",0.454545,1.196172
0,"(10,)","(3,)",0.4375,0.951087


In [110]:
# Print the rule with the highest confidence, then the rule with the highest lift.
for metric in ('Conf', 'Lift'):
    best_label = df_res[metric].idxmax()
    print(df_res.loc[best_label])

LHS       (1, 9)
RHS         (3,)
Conf    0.833333
Lift    1.811594
Name: 1, dtype: object
LHS       (3, 6)
RHS         (1,)
Conf    0.777778
Lift    2.046784
Name: 7, dtype: object


**Ans.  
Max Conf: 0.83, LHS: 1 and 9, RHS: 3  
Max Lift: 2.05 (Conf: 0.78), LHS: 3 and 6, RHS: 1**