### Sample program for Association Analysis (Market Basket Analysis)   

#### Import libraries  

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

#### Parameters  

In [None]:
# csv_in is the file loaded by pd.read_csv below; csv_out is where the
# DataFrame is written by to_csv at the end of the notebook.
csv_in = 'online_retail_small.csv'
csv_out = 'online_retail_small_cleaned.csv'

#### Read CSV file  

In [None]:
# Load the input file; its first line supplies the column names (header=0)
# and no leading lines are skipped.
df_all = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df_all.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df_all.info()
display(df_all.head())

#### Check missing values  

In [None]:
# Keep every row in which at least one column is missing, then report how
# many such rows exist and preview the first few.
df_missing = df_all.loc[df_all.isna().any(axis='columns')]
print(len(df_missing))
display(df_missing.head())

#### Delete rows with missing values  

In [None]:
# Drop every row that has a missing value in any column and renumber the
# remaining rows from 0 so the index has no gaps.
df = df_all.dropna().reset_index(drop=True)
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df.info()
display(df.head())

#### Fix data type of columns  

In [None]:
# Parse the invoice timestamps into datetimes and store customer IDs as
# integers.
# NOTE(review): CustomerID was presumably read as float because the raw file
# had missing values in that column - confirm against the info() output above.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['CustomerID'] = df['CustomerID'].astype('int')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df.info()
display(df.head())

#### Check canceled invoices  

In [None]:
# Store the first character of each invoice number (as text) in a helper
# column; the following cells use it to separate invoice types.
df['Cancel'] = df['InvoiceNo'].astype(str).str[0]
display(df.head())

In [None]:
# Tally how many rows carry each leading invoice-number character.
cancel_counts = df['Cancel'].value_counts()
print(cancel_counts)

#### Use only non-canceled invoices  

In [None]:
# Keep only the rows whose invoice number begins with '5'.
# NOTE(review): this assumes every non-canceled invoice number starts with
# '5' and canceled ones start with another character - confirm against the
# value counts printed above.
df = df.loc[df['Cancel'].eq('5')]
print(df.shape)

#### How many invoices, items, customers, countries?  

In [None]:
# Row counts per distinct value, most frequent first (value_counts sorts in
# descending order by default).
top_iv = df['InvoiceNo'].value_counts()
top_sc = df['StockCode'].value_counts()
top_cs = df['CustomerID'].value_counts()
top_ct = df['Country'].value_counts()

# For each column: the number of distinct values, then the five most frequent.
for caption, counts in (
    ('#Invoices:', top_iv),
    ('#StockCode:', top_sc),
    ('#CustomerID:', top_cs),
    ('#Country:', top_ct),
):
    print(caption, counts.size)
    print(counts.head())

In [None]:
# Bar chart of the n_plt invoices with the most rows.
n_plt = 20
# Both the labels and the bar heights are sliced with n_plt so they always
# have the same length; .iloc makes the slice explicitly positional.
# The labels are converted to strings so each bar gets its own category tick.
top_iv_idx = top_iv.iloc[:n_plt].index.astype('str')
plt.bar(top_iv_idx, top_iv.iloc[:n_plt])
plt.xticks(rotation=90)
plt.xlabel('InvoiceNo')
plt.ylabel('#transactions')
plt.show()

In [None]:
# Bar chart of the n_plt stock codes that appear in the most rows.
n_plt = 20
plt.bar(top_sc.index[:n_plt], top_sc.head(n_plt))
plt.xticks(rotation=90)
plt.xlabel('StockCode')
plt.ylabel('#transactions')
plt.show()

In [None]:
# Bar chart of the n_plt customers with the most rows.
n_plt = 20
# Both the labels and the bar heights are sliced with n_plt so they always
# have the same length. The index holds integer customer IDs, so .iloc is
# used to make clear the slice is by position, not by ID value.
# The IDs are converted to strings so each bar gets its own category tick.
top_cs_idx = top_cs.iloc[:n_plt].index.astype('str')
plt.bar(top_cs_idx, top_cs.iloc[:n_plt])
plt.xticks(rotation=90)
plt.xlabel('CustomerID')
plt.ylabel('#transactions')
plt.show()

In [None]:
# Bar chart of the n_plt countries that appear in the most rows.
n_plt = 20
plt.bar(top_ct.index[:n_plt], top_ct.head(n_plt))
plt.xticks(rotation=90)
plt.xlabel('Country')
plt.ylabel('#transactions')
plt.show()

#### Basket Analysis (Jaccard, support, confidence, lift) of the top two items  

##### Get StockCode for the top two items  

In [None]:
# top_sc is ordered by descending count, so positions 0 and 1 of its index
# are the two most frequent stock codes.
top_sc1, top_sc2 = top_sc.index[0], top_sc.index[1]
print(top_sc1, top_sc2)

##### Collect invoices of these items  

In [None]:
# Rows belonging to each of the two selected stock codes.
df_sc1 = df.loc[df['StockCode'].eq(top_sc1)]
df_sc2 = df.loc[df['StockCode'].eq(top_sc2)]

In [None]:
# Distinct invoice numbers: all invoices, and those containing each item.
iv_all = set(df['InvoiceNo'])
iv_sc1 = set(df_sc1['InvoiceNo'])
iv_sc2 = set(df_sc2['InvoiceNo'])
# Invoices containing both items, and invoices containing at least one.
iv_both = iv_sc1.intersection(iv_sc2)
iv_either = iv_sc1.union(iv_sc2)

In [None]:
# Sizes of the invoice sets; these are the inputs to every metric below.
n_all, n_sc1, n_sc2, n_both, n_either = map(
    len, (iv_all, iv_sc1, iv_sc2, iv_both, iv_either)
)
print(n_all, n_sc1, n_sc2, n_both, n_either)

##### Jaccard coefficient  

In [None]:
# Jaccard coefficient: invoices containing both items divided by invoices
# containing at least one of them.
jaccard = n_both / n_either
print('Jac(sc1 => sc2):', jaccard)

##### Support (支持度)  

In [None]:
# Support: share of all invoices that contain both items.
support = n_both / n_all
print('Supp(sc1 => sc2):', support)

##### Confidence (確信度, 信頼度)  

In [None]:
# Confidence of X => Y: among the invoices containing X, the share that also
# contain Y. The labels use the same "=>" rule notation as the other metrics.
print('Conf(sc1 => sc2):', n_both / n_sc1)  # Confidence based on top_sc1
print('Conf(sc2 => sc1):', n_both / n_sc2)  # Confidence based on top_sc2

##### Lift (リフト値)  

In [None]:
# a is the share of sc2 invoices that also contain sc1; b is the share of all
# invoices that contain sc1. Their ratio equals
# n_both * n_all / (n_sc1 * n_sc2), which is symmetric in the two items, so
# it is also the lift of sc1 => sc2.
a, b = n_both / n_sc2, n_sc1 / n_all
print('Lift(sc1 => sc2):', a / b)

#### Write the current data to a CSV file  

In [None]:
# Remove the helper 'Cancel' column, then save the frame without writing the
# row index as an extra column.
df = df.drop('Cancel', axis='columns')
df.to_csv(csv_out, index=False)