### dm-11-practice2  

#### Import libraries  

In [1]:
import pandas as pd
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [2]:
# Path of the input CSV file; it is loaded with pd.read_csv in the "Read CSV file" cell.
csv_in = 'dm-end1-4.csv'

#### Read CSV file  

In [3]:
# Load the transaction list; the first line of the file is used as the header row.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
df.info()
display(df.head())

(1000, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Invoice  1000 non-null   object
 1   Item     1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB
None


Unnamed: 0,Invoice,Item
0,T1018,173
1,T1098,109
2,T1033,116
3,T1064,198
4,T1058,161


#### Check the frequency of each Invoice and Item  

In [4]:
# Count how many rows belong to each Invoice value (most frequent first).
top_inv = df['Invoice'].value_counts()
print(top_inv.size)    # number of distinct invoices
print(top_inv.head())  # the five invoices with the most rows

101
T1068    17
T1093    16
T1070    16
T1017    16
T1090    16
Name: Invoice, dtype: int64


In [5]:
# Count how many rows mention each Item value (most frequent first).
top_sc = df['Item'].value_counts()
print(top_sc.size)    # number of distinct items
print(top_sc.head())  # the five items with the most rows

99
135    21
133    20
166    17
169    16
188    16
Name: Item, dtype: int64


#### Format transaction data  
Before:  
```
invoice1, stockcode1, ...  
invoice1, stockcode2, ...  
invoice2, stockcode1, ...  
invoice3, stockcode1, ...  
invoice3, stockcode3, ...  
```

After:  
```
[  
  [stockcode1, stockcode2],    
  [stockcode1],  
  [stockcode1, stockcode3],  
  ...  
]  
```

and each stockcode should be an integer.  

#### Grouping transaction data by InvoiceNo  

In [6]:
# Build one basket per invoice: groupby yields (invoice id, sub-frame) pairs,
# and set() collapses repeated Item values within the same invoice.
invoices = [list(set(group['Item'])) for _, group in df.groupby('Invoice')]
print(len(invoices))
print(invoices[:3])

101
[[162, 169, 187, 112, 116, 150, 123, 190], [196, 197, 166, 169, 170, 138, 172, 112, 145, 149, 183], [162, 197, 133, 104, 140, 142, 112, 182, 153]]


#### Market Basket Analysis by FP-Growth  

In [7]:
# Mine frequent patterns from the per-invoice baskets; the second argument (5)
# is the support threshold passed to pyfpgrowth. %time reports the run time.
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 5)

CPU times: user 5.58 ms, sys: 1.12 ms, total: 6.71 ms
Wall time: 6.61 ms


In [8]:
# Debug aid, disabled by default: uncomment to print the mined patterns.
#print(patterns)

In [9]:
# Derive association rules from the frequent patterns; the second argument (0.3)
# is the confidence threshold passed to pyfpgrowth. %time reports the run time.
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.3)

CPU times: user 133 µs, sys: 0 ns, total: 133 µs
Wall time: 138 µs


In [10]:
# rules is a dict: LHS itemset tuple -> (RHS itemset tuple, confidence).
print(rules)

{(149,): ((169,), 0.5555555555555556), (169,): ((149,), 0.35714285714285715), (142,): ((133,), 0.5555555555555556), (136,): ((135,), 0.5555555555555556), (111,): ((196,), 0.5555555555555556), (196,): ((133,), 0.3333333333333333), (162,): ((182,), 0.45454545454545453), (113,): ((182,), 0.4166666666666667), (166,): ((171,), 0.35294117647058826), (171,): ((166,), 0.6), (144,): ((152,), 0.35714285714285715), (152,): ((144,), 0.45454545454545453), (135,): ((138,), 0.3), (138,): ((135,), 0.5), (187,): ((133,), 0.38461538461538464), (188,): ((135,), 0.3333333333333333)}


In [11]:
# Flatten the rules dict {LHS: (RHS, confidence)} into one [LHS, RHS, Conf] row per rule.
results = [[lhs, rhs, conf] for lhs, (rhs, conf) in rules.items()]
# Name the columns in the constructor: assigning df_res.columns afterwards
# raises ValueError (length mismatch) when no rule passed the threshold,
# because an empty frame has zero columns.
df_res = pd.DataFrame(results, columns=['LHS', 'RHS', 'Conf'])

In [12]:
# Show the rules ordered from highest to lowest confidence.
display(df_res.sort_values(by='Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf
9,"(171,)","(166,)",0.6
0,"(149,)","(169,)",0.555556
2,"(142,)","(133,)",0.555556
3,"(136,)","(135,)",0.555556
4,"(111,)","(196,)",0.555556
13,"(138,)","(135,)",0.5
6,"(162,)","(182,)",0.454545
11,"(152,)","(144,)",0.454545
7,"(113,)","(182,)",0.416667
14,"(187,)","(133,)",0.384615


#### Calculation of Lift  

In [13]:
# Lift of a rule = confidence / support(RHS), where support(RHS) is the
# fraction of invoices that contain every item of the RHS.
n_all = len(invoices)
# Convert each basket to a set once, instead of once per rule.
baskets = [set(items) for items in invoices]

lift = []
for row in range(len(df_res)):
    rhs_items = set(df_res.at[row, 'RHS'])
    # Number of invoices whose basket is a superset of the RHS itemset.
    n_rhs = sum(basket >= rhs_items for basket in baskets)
    lift.append(df_res.at[row, 'Conf'] / (n_rhs / n_all))

df_res['Lift'] = lift

In [14]:
# Show the rules again, now including the Lift column, ordered by confidence (descending).
display(df_res.sort_values(by='Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf,Lift
9,"(171,)","(166,)",0.6,3.564706
0,"(149,)","(169,)",0.555556,4.007937
2,"(142,)","(133,)",0.555556,3.300654
3,"(136,)","(135,)",0.555556,2.805556
4,"(111,)","(196,)",0.555556,3.740741
13,"(138,)","(135,)",0.5,2.525
6,"(162,)","(182,)",0.454545,4.590909
11,"(152,)","(144,)",0.454545,3.279221
7,"(113,)","(182,)",0.416667,4.208333
14,"(187,)","(133,)",0.384615,2.285068


In [15]:
# Print the rule with the highest confidence, then the rule with the highest lift.
# idxmax() returns the index label of the first maximum; .loc fetches that row.
print( df_res.loc[ df_res['Conf'].idxmax() ])
print( df_res.loc[ df_res['Lift'].idxmax() ])

LHS       (171,)
RHS       (166,)
Conf         0.6
Lift    3.564706
Name: 9, dtype: object
LHS       (162,)
RHS       (182,)
Conf    0.454545
Lift    4.590909
Name: 6, dtype: object


**Ans.  
Max Conf: 0.60, LHS: 171, RHS: 166  
Max Lift: 4.59, LHS: 162, RHS: 182**