In [23]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from sklearn.preprocessing import LabelEncoder


In [24]:
pd.set_option('display.max_row', 111)
pd.set_option('display.max_column', 111)
plt.style.use('dark_background')

data = pd.read_csv("./KaDo.csv")
base_df = data.copy()

In [25]:
# Delete unused columns, drop NAs, create additionnal columns, and only keep TICKET_IDS with multiple product which are not the same.
def preprocessing_df(df):
    df.drop(columns=['MOIS_VENTE', 'PRIX_NET'], axis=1, inplace=True)
    df.dropna(axis='columns')
    encoder = LabelEncoder()
    df['ALL_LIBELLES'] = None
    df['CODE_LIBELLE'] = encoder.fit_transform(df['LIBELLE'])
    df['CODE_LIBELLE'] = df['CODE_LIBELLE'].apply(lambda x: '_'+str(x))
    # Keep only ticket ids with multiple product
    df = df.drop_duplicates()
    single_tickets = df.duplicated(subset='TICKET_ID', keep=False)
    df = df[single_tickets]
    return df

df = preprocessing_df(base_df)

In [26]:
# Keep in case
# all_ticket_ids = df['TICKET_ID'].tolist()
# all_libelles_list = []
# for ticket_id in all_ticket_ids:
#     all_libelles = df[df['TICKET_ID'] == ticket_id]['LIBELLE'].tolist()
#     all_libelles_list.append(all_libelles)
# df['ALL_LIBELLE'] = all_libelles_list

In [27]:
# Separate two dataframes, one for calculating the association rules and the other for looking up the libelles(description) to display on our recommendations
def get_products_orders_df(df):
    orders = df[['TICKET_ID', 'CODE_LIBELLE']]
    products = df[['CODE_LIBELLE', 'LIBELLE']].copy()

    # Drop duplicated products
    products = products[~products.duplicated()]
    products = products[~products.duplicated(subset=['CODE_LIBELLE'])]

    # Set the index to CODE_LIBELLE
    products = products.set_index('CODE_LIBELLE')
    # Convert to Series for easier lookups
    products = products['LIBELLE']
    orders = orders.groupby('TICKET_ID')['CODE_LIBELLE'].apply(list).reset_index()

    return orders, products

orders_df, products_df = get_products_orders_df(df)

In [28]:
# orders = orders.groupby('TICKET_ID')['CODE_LIBELLE'].apply(list).reset_index()

In [29]:
# fit the TransactionEncoder and do the transformation
def encode_orders_df(orders_df):
    te = TransactionEncoder()
    orders_1hot = te.fit(orders_df['CODE_LIBELLE']).transform(orders_df['CODE_LIBELLE'])
    orders_1hot = pd.DataFrame(orders_1hot, columns=te.columns_)
    return orders_1hot

orders_1hot = encode_orders_df(orders_df)

In [30]:
# APRIORI METHOD
# ap = apriori(orders_1hot, min_support=0.0001, max_len=10, use_colnames=True)
# ap

In [31]:
%%timeit -n1 -r1

# FPGROWTH METHOD, cf: https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/

# the orders_1hot value should be the return of encode_orders_df function
fpgrowth(orders_1hot, min_support=0.001, max_len=5, use_colnames=True)

46.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [32]:
fp = fpgrowth(orders_1hot, min_support=0.001, max_len=5, use_colnames=True)
fp

Unnamed: 0,support,itemsets
0,0.007102,(_733)
1,0.002401,(_29)
2,0.001738,(_469)
3,0.002097,(_1279)
4,0.006841,(_687)
...,...,...
1369,0.001329,"(_707, _859)"
1370,0.001282,"(_707, _270, _859)"
1371,0.001024,"(_1469, _1413)"
1372,0.001124,"(_1363, _1448)"


In [33]:
rules = association_rules(fp, metric="lift", min_threshold=5)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(_733),(_742),0.007102,0.007436,0.001254,0.176562,23.745132,0.001201,1.205391
1,(_742),(_733),0.007436,0.007102,0.001254,0.168648,23.745132,0.001201,1.194317
2,(_733),(_1358),0.007102,0.008315,0.001048,0.14749,17.738715,0.000988,1.163254
3,(_1358),(_733),0.008315,0.007102,0.001048,0.125988,17.738715,0.000988,1.136023
4,(_1345),(_1344),0.009621,0.014105,0.002069,0.215102,15.249597,0.001934,1.25608


In [34]:
def predict(antecedent, rules, max_results=10):
    
    # get the rules for this antecedent
    preds = rules[rules['antecedents'] == antecedent]
    
    # a way to convert a frozen set with one element to string
    preds = preds['consequents'].apply(iter).apply(next)
    
    return preds[:max_results]

In [35]:
preds = predict({'_726'}, rules)
preds

238     _712
1427    _728
1806    _729
1814    _725
1819    _729
1832    _728
1837    _730
1855    _728
1865    _725
1870    _728
Name: consequents, dtype: object

In [36]:
print('Original product :', products_df['_726'], '\n')

print('Recommended products :')
for stockid in set(preds):  
    print(products_df[stockid])

Original product : GD JDM4 GRENADE FL200ML 

Recommended products :
GD FL200ML JDM PAMPLEMOUSSE
GD JDM4 PAMPLEMOUSSE FL 200ML
GD JDM4 CIT VERT FL 200ML
GD JDM4 LOTUS FL200ML
GD JDM4 ORANGE FL 200ML
