### dm-11-practice2  

#### Import libraries  

In [1]:
import pandas as pd
import pyfpgrowth  # https://fp-growth.readthedocs.io/en/latest/

#### Parameters  

In [2]:
# Input transactions file: one row per (InvoiceNo, ItemNo) pair.
csv_in = "dm-12-quiz.csv"

#### Read CSV file  

In [3]:
# Load the transaction list: one row per (InvoiceNo, ItemNo) pair.
# The file is comma-separated with a header on the first line, which is what
# pandas assumes by default, so no extra parsing arguments are needed.
df = pd.read_csv(csv_in)
print(df.shape)
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()
display(df.head())

(318, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318 entries, 0 to 317
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   InvoiceNo  318 non-null    object
 1   ItemNo     318 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 5.1+ KB
None


Unnamed: 0,InvoiceNo,ItemNo
0,T001,5
1,T001,4
2,T001,6
3,T002,2
4,T003,2


#### Check the frequency of each ItemNo  

In [4]:
# Count how many rows mention each item; value_counts() lists the most
# frequent item first.
top_sc = df['ItemNo'].value_counts()
print(len(top_sc))      # number of distinct items
print(top_sc.iloc[:5])  # the five most frequent items

7
2    74
3    60
4    48
1    47
6    37
Name: ItemNo, dtype: int64


#### Format transaction data  
Before:  
```
invoice1, stockcode1, ...  
invoice1, stockcode2, ...  
invoice2, stockcode1, ...  
invoice3, stockcode1, ...  
invoice3, stockcode3, ...  
```

After:  
```
[  
  [stockcode1, stockcode2],    
  [stockcode1],  
  [stockcode1, stockcode3],  
  ...  
]  
```

and each stockcode should be an integer.  

#### Grouping transaction data by InvoiceNo  

In [5]:
# Build one basket per invoice: the distinct ItemNo values that appear on it.
# groupby() yields (InvoiceNo, sub-frame) pairs; the set() drops items that are
# listed more than once on the same invoice.
invoices = [
    list(set(invoice_rows['ItemNo']))
    for _, invoice_rows in df.groupby('InvoiceNo')
]
print(len(invoices))
print(invoices[:3])

150
[[4, 5, 6], [2], [2, 3, 7]]


#### Market Basket Analysis by FP-Growth  

In [6]:
# Mine frequent itemsets from the baskets with FP-Growth.
# The second argument (5) is pyfpgrowth's support threshold -- presumably the
# minimum number of invoices an itemset must appear in; confirm in the
# pyfpgrowth documentation before tuning it.
%time patterns = pyfpgrowth.find_frequent_patterns(invoices, 5)

CPU times: user 865 µs, sys: 145 µs, total: 1.01 ms
Wall time: 1.01 ms


In [7]:
#print(patterns)

In [8]:
# Derive association rules from the frequent itemsets.
# The second argument (0.3) is pyfpgrowth's confidence threshold; every rule
# in the printed result has a confidence of at least 0.3.
%time rules = pyfpgrowth.generate_association_rules(patterns, 0.3)

CPU times: user 96 µs, sys: 0 ns, total: 96 µs
Wall time: 98.7 µs


In [9]:
# Raw rule mapping, shaped as {LHS tuple: (RHS tuple, confidence)}.
# NOTE(review): because this is a dict keyed by LHS, it can hold only one
# RHS per LHS -- confirm that losing any alternative rules is acceptable.
print(rules)

{(1, 3): ((2,), 0.3125), (1, 2): ((4,), 0.35), (3,): ((2,), 0.3448275862068966), (2,): ((4,), 0.35135135135135137)}


In [10]:
# Flatten the rule mapping {LHS: (RHS, confidence)} into one row per rule.
results = [[lhs, rhs, conf] for lhs, (rhs, conf) in rules.items()]
# Name the columns in the constructor rather than assigning df_res.columns
# afterwards: when no rule passes the confidence threshold the frame built
# from an empty list has zero columns, and renaming them to three names would
# raise a length-mismatch ValueError.
df_res = pd.DataFrame(results, columns=['LHS', 'RHS', 'Conf'])

In [11]:
# Rank the mined rules from the most to the least confident.
display(df_res.sort_values('Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf
3,"(2,)","(4,)",0.351351
1,"(1, 2)","(4,)",0.35
2,"(3,)","(2,)",0.344828
0,"(1, 3)","(2,)",0.3125


#### Calculation of Lift  

In [12]:
# Lift of a rule = confidence / support(RHS), where support(RHS) is the share
# of all invoices that contain every item of the rule's right-hand side.
n_all = len(invoices)
baskets = [set(items) for items in invoices]  # build each invoice set once

lift = []
for consequent, confidence in zip(df_res['RHS'], df_res['Conf'].to_numpy()):
    wanted = set(consequent)
    # Number of invoices that include the whole right-hand side.
    n_containing = sum(1 for basket in baskets if basket >= wanted)
    lift.append(confidence / (n_containing / n_all))

df_res['Lift'] = lift

In [13]:
# Same ranking by confidence, now including the Lift column.
display(df_res.sort_values('Conf', ascending=False))

Unnamed: 0,LHS,RHS,Conf,Lift
3,"(2,)","(4,)",0.351351,1.097973
1,"(1, 2)","(4,)",0.35,1.09375
2,"(3,)","(2,)",0.344828,0.698975
0,"(1, 3)","(2,)",0.3125,0.633446


In [14]:
# Print the top rule under each metric, confidence first and then lift.
# idxmax() gives the row label of the (first) maximum, which .loc looks up.
for metric in ('Conf', 'Lift'):
    print(df_res.loc[df_res[metric].idxmax()])

LHS         (2,)
RHS         (4,)
Conf    0.351351
Lift    1.097973
Name: 3, dtype: object
LHS         (2,)
RHS         (4,)
Conf    0.351351
Lift    1.097973
Name: 3, dtype: object


**Ans.  
Max Conf: 0.35, LHS: 2, RHS: 4  
Max Lift: 1.10, LHS: 2, RHS: 4**