In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder
from sklearn.preprocessing import LabelEncoder


In [2]:
# Display options: show up to 111 rows/columns before pandas truncates output.
# Full option names are used because pandas resolves abbreviated names by
# pattern matching, which can break if similarly named options are added.
pd.set_option('display.max_rows', 111)
pd.set_option('display.max_columns', 111)
plt.style.use('dark_background')

DATA_PATH = "./KaDo.csv"
N_ROWS = 5000  # size of the working sample; the full file stays in `data`

data = pd.read_csv(DATA_PATH)
# Slice first, then copy: `df` is an independent frame and only N_ROWS rows
# are duplicated instead of the whole dataset.
df = data.iloc[:N_ROWS].copy()

In [3]:
# Prepare the transaction lines:
#   1. drop the columns this analysis does not use,
#   2. drop rows that lack a ticket id or a product label,
#   3. encode each product label as a string code,
#   4. keep only tickets that contain at least two distinct lines.
df = df.drop(columns=['MOIS_VENTE', 'PRIX_NET'])
# The previous `df.dropna(axis='columns')` discarded its result, so nothing was
# removed. Rows (not columns) are dropped here, and only on the fields that the
# rest of the notebook relies on.
df = df.dropna(subset=['TICKET_ID', 'LIBELLE'])
# df['NB OF RELATIONS'] = 0
# df['RELATED CLI_IDs'] = None
encoder = LabelEncoder()
df['ALL_LIBELLES'] = None
df['CODE_LIBELLE'] = encoder.fit_transform(df['LIBELLE'])
# Prefix the integer code with '_' so that product codes are strings.
df['CODE_LIBELLE'] = '_' + df['CODE_LIBELLE'].astype(str)
# Keep only ticket ids with multiple products: remove exact duplicate lines
# first, then keep the rows whose TICKET_ID still appears more than once.
df = df.drop_duplicates()
in_multi_line_ticket = df.duplicated(subset='TICKET_ID', keep=False)
df = df[in_multi_line_ticket]
# NB: for ticket id 36417517, the products bought together share a word in their label (LILAS).
# UPDATE: same observation for ticket id 32975823.
# It may be worth concatenating MAILLE and LIBELLE to look for such similarities.
df.head(30)

Unnamed: 0,TICKET_ID,FAMILLE,UNIVERS,MAILLE,LIBELLE,CLI_ID,ALL_LIBELLES,CODE_LIBELLE
0,35592159,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 PAMPLEMOUSSE FL 200ML,1490281,,_367
2,35592159,SOINS DU VISAGE,VIS_CJOUR Jeunes Specifique,VIS_JEUNE_ET_LEVRE,CR JR PARF BIO.SPE AC.SENT.50ML,1490281,,_120
3,35592159,SOINS DU VISAGE,VIS_DEMAQ AAAR,VIS_AAAR_DEMAQLOTION,EAU MICELLAIRE 3 THES FL200ML,1490281,,_201
4,35592159,HYGIENE,HYG_DOUCHE JARDINMONDE,HYG_JDM,GD JDM4 TIARE FL 200ML,1490281,,_369
5,35509899,PARFUMAGE,PARF_EAUX PARFUMS,PARF_PARFUM,EDT UN MATIN AU JARDIN 100ML MUGUET,13290776,,_232
6,35509899,SOINS DU CORPS,CORPS_LAIT HUILE PLAISIRNATURE,CORPS_HYDR_LAIT_HUILE,LAIT VELOUTE COCO PN2 400ML,13290776,,_473
7,36417517,HYGIENE,HYG_DOUCHE PARFUMS,HYG_PARFUMEE,GD LILAS FP FL200ML,13290776,,_370
10,36417517,SOINS DU CORPS,CORPS_LAIT HUILE PARFUMS,CORPS_HYDR_LAIT_HUILE,LAIT LILAS FP FL200ML,13290776,,_462
12,36417517,PARFUMAGE,PARF_EAUX PARFUMS,PARF_PARFUM,EDT UN MATIN AU JARDIN 100ML LILAS,13290776,,_258
15,33064616,SOINS DU VISAGE,VIS_CJOUR Jeunes Specifique,VIS_JEUNE_ET_LEVRE,CR JOUR PX/MIX HYDRA/VEG P50ML,20200041,,_116


In [4]:
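# Kept for reference (currently unused): builds, for every row, the list of all LIBELLE values sharing that row's TICKET_ID.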
# all_ticket_ids = df['TICKET_ID'].tolist()
# all_libelles_list = []
# for ticket_id in all_ticket_ids:
#     all_libelles = df[df['TICKET_ID'] == ticket_id]['LIBELLE'].tolist()
#     all_libelles_list.append(all_libelles)
# df['ALL_LIBELLE'] = all_libelles_list

In [5]:
# Split the cleaned lines into two views:
#   - `orders`   : (ticket id, product code) pairs, the input for the association rules;
#   - `products` : product code -> label (LIBELLE) lookup, used to display recommendations.
orders = df.loc[:, ['TICKET_ID', 'CODE_LIBELLE']]

# Remove exact duplicates, then keep a single row per product code, index the
# result by CODE_LIBELLE and reduce it to the LIBELLE Series for easy lookups.
products = (
    df[['CODE_LIBELLE', 'LIBELLE']]
    .drop_duplicates()
    .drop_duplicates(subset=['CODE_LIBELLE'])
    .set_index('CODE_LIBELLE')['LIBELLE']
)

In [6]:
def string_list(x):
    """Return a list holding ``str(item)`` for every item of the iterable ``x``."""
    return list(map(str, x))

# Build one row per ticket holding the list of product codes bought together.
# The grouping starts from `df` instead of reassigning `orders` from itself, so
# re-running this cell yields the same frame rather than nesting the lists.
orders = (
    df[['TICKET_ID', 'CODE_LIBELLE']]
    .groupby('TICKET_ID')['CODE_LIBELLE']
    .apply(list)
    .reset_index()
)
orders.head()

Unnamed: 0,TICKET_ID,CODE_LIBELLE
0,32938156,"[_340, _457, _201]"
1,32944029,"[_690, _689]"
2,32948566,"[_718, _702, _705, _481, _479, _426, _564, _53..."
3,32949078,"[_183, _466, _250]"
4,32950212,"[_0, _211, _560, _558, _561, _411]"


In [7]:
# One-hot encode the baskets: one row per ticket, one boolean column per product code.
baskets = orders['CODE_LIBELLE']
te = TransactionEncoder()
orders_1hot = pd.DataFrame(te.fit_transform(baskets), columns=te.columns_)
orders_1hot.head()

Unnamed: 0,_0,_1,_10,_100,_101,_102,_104,_105,_106,_107,_108,_109,_11,_110,_111,_112,_113,_114,_115,_116,_117,_119,_12,_120,_121,_122,_123,_124,_125,_126,_127,_128,_129,_13,_130,_131,_132,_133,_134,_135,_136,_137,_138,_139,_14,_140,_141,_142,_143,_144,_145,_146,_147,_148,_149,...,_764,_765,_766,_767,_768,_769,_77,_770,_771,_772,_773,_774,_775,_776,_777,_778,_779,_78,_780,_781,_782,_783,_784,_785,_786,_787,_788,_789,_79,_790,_791,_792,_793,_8,_80,_81,_82,_83,_84,_85,_86,_87,_88,_89,_9,_90,_91,_92,_93,_94,_95,_96,_97,_98,_99
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
# Apriori: mine itemsets of at most two products from the one-hot encoded tickets.
apriori_options = {'min_support': 0.0001, 'max_len': 2, 'use_colnames': True}
ap = apriori(orders_1hot, **apriori_options)
ap

Unnamed: 0,support,itemsets
0,0.006706,(_0)
1,0.000838,(_1)
2,0.002515,(_10)
3,0.001676,(_100)
4,0.005029,(_101)
...,...,...
6082,0.000838,"(_88, _91)"
6083,0.001676,"(_91, _89)"
6084,0.000838,"(_96, _89)"
6085,0.000838,"(_9, _99)"


In [22]:
%%timeit -n1 -r1

# FPGROWTH METHOD, cf: https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/
# Timing only: -n1 -r1 runs the call once (one loop, one repeat); the returned
# frame is not assigned to any name in this cell.
fpgrowth(orders_1hot, min_support=0.0001, max_len=2, use_colnames=True)

259 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [23]:
# FP-Growth, cf: https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/fpgrowth/
# Same thresholds as the Apriori cell: itemsets of at most two products.
fpgrowth_options = {'min_support': 0.0001, 'max_len': 2, 'use_colnames': True}
fp = fpgrowth(orders_1hot, **fpgrowth_options)
fp

Unnamed: 0,support,itemsets
0,0.015088,(_201)
1,0.010059,(_457)
2,0.000838,(_340)
3,0.012573,(_690)
4,0.007544,(_689)
...,...,...
6082,0.000838,"(_133, _83)"
6083,0.000838,"(_133, _175)"
6084,0.000838,"(_235, _364)"
6085,0.000838,"(_524, _250)"


In [24]:
# Derive association rules from the FP-Growth itemsets, filtered on the lift metric.
min_lift = 10
rules = association_rules(fp, min_threshold=min_lift, metric="lift")
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(_457),(_201),0.010059,0.015088,0.002515,0.25,16.569444,0.002363,1.313216
1,(_201),(_457),0.015088,0.010059,0.002515,0.166667,16.569444,0.002363,1.18793
2,(_457),(_162),0.010059,0.015088,0.001676,0.166667,11.046296,0.001525,1.181894
3,(_162),(_457),0.015088,0.010059,0.001676,0.111111,11.046296,0.001525,1.113684
4,(_457),(_340),0.010059,0.000838,0.000838,0.083333,99.416667,0.00083,1.089995


In [25]:
def predict(antecedent, rules, max_results= 6, sort_by=None):
    """Return the consequent items of the rules whose antecedent matches.

    Parameters
    ----------
    antecedent : str or iterable of hashable
        A single item, or a collection of items (set, frozenset, list, tuple).
        It is converted to a ``frozenset`` and compared for equality with each
        entry of ``rules['antecedents']``.
    rules : pandas.DataFrame
        Frame with ``antecedents`` and ``consequents`` columns holding sets
        (e.g. the output of ``association_rules``).
    max_results : int, default 6
        Maximum number of entries returned.
    sort_by : str, optional
        Column of ``rules`` used to rank the matching rules, highest value
        first. When ``None`` (default) the row order of ``rules`` is kept.

    Returns
    -------
    pandas.Series
        One item per matching rule, carrying the index of that rule in
        ``rules``. Only one element is taken from each consequent set, so a
        multi-item consequent is reduced to a single item.
    """
    # A bare string must be wrapped, otherwise frozenset() splits it into characters.
    if isinstance(antecedent, str):
        target = frozenset([antecedent])
    else:
        target = frozenset(antecedent)

    # Compare element by element so the match does not depend on how pandas
    # interprets a set, list or string on the right-hand side of ``==``.
    is_match = rules['antecedents'].map(lambda items: items == target).astype(bool)
    matches = rules.loc[is_match]

    if sort_by is not None:
        matches = matches.sort_values(sort_by, ascending=False, kind='stable')

    # Unwrap each consequent set into one plain item.
    items = matches['consequents'].map(lambda consequent: next(iter(consequent)))

    return items.iloc[:max_results]

In [26]:
# Look up the consequents of the rules whose antecedent is exactly product '_457'.
preds = predict(antecedent={'_457'}, rules=rules)
preds

0       _201
2       _162
4       _340
1156    _486
1575    _110
1590    _101
Name: consequents, dtype: object

In [29]:
# Translate the product codes back to their labels through the `products` lookup.
source_code = '_457'
print('Original product :', products.loc[source_code], '\n')

print('Recommended products :')
for code in preds.tolist():
    print(products.loc[code])

Original product : LAIT DEMAQ 3 THES FL200ML 

Recommended products :
EAU MICELLAIRE 3 THES FL200ML
CREME NUIT AntiAge GLOBAL 50ml
FDT ROSE ROS200 TT CLAIR CN3FL30
LOTION 3 THES FL200ML
CR FONDAMENT PEAU SECHE  AAG50
CORRECT A LA ROSE TT CLAIR CN3 5G
