In [90]:
import re
import pyfpgrowth
import pandas as pd
import numpy as np
import plotly.express as px
import networkx as nx
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from sklearn.preprocessing import LabelEncoder

In [91]:
# загружаем исходные данные
data = pd.read_csv('./data/Groceries_dataset.csv')
data.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [92]:
# проверяем данные на наличие пропусков
is_nan = data.isna().sum()
is_nan

Member_number      0
Date               0
itemDescription    0
dtype: int64

In [93]:
# конвертируем наименования товаров в числовые метки
labelencoder = LabelEncoder()
encode = labelencoder.fit_transform(data['itemDescription'])
data.drop(['itemDescription'], inplace=True, axis=1)
data['itemDescription'] = encode
display(data)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,156
1,2552,05-01-2015,164
2,2300,19-09-2015,109
3,1187,12-12-2015,102
4,3037,01-02-2015,164
...,...,...,...
38760,4471,08-10-2014,135
38761,2022,23-02-2014,19
38762,1097,16-04-2014,17
38763,1510,03-12-2014,64


In [94]:
# конвертируем в формат, который принимает pyfpgrowth
# ожидается, что будет передан двумерный массив с товарами в корзинах
# Идентификаторы товаров являются целыми числами
data.itemDescription = data.itemDescription.transform(lambda x: [x])
transactions = [item for item in data.groupby(['Member_number','Date']).sum()['itemDescription'].reset_index(drop=True)]
transactions

[[130, 164, 132, 165],
 [164, 105, 128],
 [20, 92],
 [130, 73],
 [138, 108],
 [56, 40],
 [130, 164, 122],
 [164, 138],
 [8, 162],
 [56, 138, 160],
 [63, 102],
 [15, 164],
 [156, 150],
 [16, 145],
 [130, 122],
 [123, 45],
 [61, 43],
 [122, 122],
 [46, 32],
 [20, 59],
 [102, 73],
 [109, 164, 156],
 [122, 119, 28],
 [102, 133],
 [164, 28, 103, 122],
 [123, 164, 105],
 [122, 122],
 [160, 88],
 [12, 140, 122],
 [164, 120],
 [133, 134],
 [164, 56, 27, 56, 53, 28, 11, 122],
 [68, 83, 84, 44],
 [156, 138, 165, 123, 165, 49, 163, 107],
 [70, 165, 95],
 [156, 165],
 [105, 77],
 [21, 33],
 [109, 34, 1],
 [78, 135],
 [12, 143],
 [56, 12],
 [109, 18, 122],
 [41, 105],
 [164, 56, 18, 65, 70, 102, 165],
 [30, 19, 12, 122],
 [56, 165],
 [156, 123, 122, 99, 133],
 [63, 14],
 [115, 164],
 [61, 164],
 [162, 90, 49, 121],
 [123, 89],
 [102, 93],
 [69, 94, 12],
 [164, 19],
 [156, 63],
 [93, 160],
 [164, 130, 11, 165],
 [20, 37],
 [68, 16],
 [164, 122],
 [30, 164],
 [128, 8],
 [64, 122, 28],
 [1, 11],
 [27,

In [105]:
# поиск шаблонов выше порога поддержки
patterns = pyfpgrowth.find_frequent_patterns(transactions, 6/len(data))
# поиск , которые связаны с другим с определенной минимальной вероятностью
rules = pyfpgrowth.generate_association_rules(patterns, 0.25)
print('Rules:')
display(rules)

Rules:


{(114,): ((141,), 1.0),
 (79,): ((12, 104, 122), 1.0),
 (12, 79): ((104, 122), 0.5),
 (79, 104): ((12, 122), 1.0),
 (79, 122): ((12, 104), 1.0),
 (12, 79, 104): ((122,), 1.0),
 (12, 79, 122): ((104,), 1.0),
 (12, 104, 122): ((), 0.5),
 (79, 104, 122): ((12,), 1.0),
 (3, 4): ((8, 49, 56, 67, 88, 102), 1.0),
 (4, 56): ((), 0.5),
 (4, 67): ((3, 8, 49, 56, 88, 102), 1.0),
 (3, 4, 56): ((8, 49, 67, 88, 102), 0.5),
 (3, 4, 67): ((8, 49, 56, 88, 102), 1.0),
 (3, 56, 67): ((), 0.5),
 (4, 56, 67): ((3, 8, 49, 88, 102), 0.5),
 (4, 88): ((3, 8, 49, 56, 67, 102), 1.0),
 (3, 88): ((8, 49, 56, 67, 102), 1.0),
 (3, 4, 88): ((8, 49, 56, 67, 102), 1.0),
 (3, 67, 88): ((8, 49, 56, 102), 1.0),
 (4, 67, 88): ((3, 8, 49, 56, 102), 1.0),
 (4, 56, 88): ((3, 8, 49, 67, 102), 0.5),
 (56, 67, 88): ((8, 49, 102), 1.0),
 (3, 56, 88): ((8, 49, 67, 102), 0.5),
 (3, 4, 56, 67): ((8, 49, 88, 102), 1.0),
 (3, 4, 56, 88): ((8, 49, 67, 102), 1.0),
 (3, 4, 67, 88): ((8, 49, 56, 102), 1.0),
 (3, 56, 67, 88): ((8, 49, 102)