In [158]:
import itertools
import os

from collections import Counter

import pandas as pd
import numpy as np

from efficient_apriori import apriori

# Продажи магазина

In [52]:
# Directory holding the dataset, plus the two files read from it below.
data_folder = './baskets'
info_file_path, data_file_path = (
    os.path.join(data_folder, file_name)
    for file_name in ('baskets-info.txt', 'baskets.csv')
)

In [53]:
# Read the whole dataset description file into `info`; the file is decoded as cp1251 text.
with open(info_file_path, 'r', encoding='cp1251') as f:
    info = f.read()

> {{info}}

In [54]:
# Load the raw basket file as a list of lines (decoded as cp1251);
# readlines() keeps the trailing newline on each line, they are stripped in a later cell.
with open(data_file_path, 'r', encoding='cp1251') as f:
    lines = f.readlines()

In [55]:
# Strip surrounding whitespace from every raw line and drop the lines that end up empty.
norm_line = [stripped for stripped in (raw.strip() for raw in lines) if stripped]

In [56]:
# Parse each comma-separated line into a list of integers: one inner list per basket.
baskets = [
    [int(token) for token in line.strip().split(',')]
    for line in norm_line
]

In [57]:
# Number of baskets (transactions); stat_supp divides by this value to turn a count into support.
total_baskets = len(baskets)

In [58]:
# Flatten all baskets into one list of items and count how often each item occurs.
elements = list(itertools.chain.from_iterable(baskets))
element_counter = Counter(elements)
# Keys and values are taken in the same (insertion) order, so position i of
# elements_values corresponds to position i of elements_counts.
elements_values = list(element_counter)
elements_counts = list(element_counter.values())

In [67]:
l_num = 100000


def _stat_edge(size, part):
    """Return the cut-off index for the `part` share of `size` sorted values."""
    # The multiply-then-divide by l_num is kept exactly as in the original
    # expression so the resulting index does not change.
    return int((l_num * part * size) / l_num)


def stat_sum(elements, part=1.):
    """Sum the lowest `part` share of `elements`.

    Args:
        elements: iterable of numeric values (e.g. per-item occurrence counts).
        part: fraction of the ascending-sorted values to include; 1.0 sums all.

    Returns:
        The numpy sum of the first ``int(part * len(elements))`` sorted values.
    """
    ordered = sorted(elements)
    return np.sum(ordered[:_stat_edge(len(ordered), part)])


def stat_count(elements, part=1.):
    """Return the value located at the `part` boundary of the sorted `elements`.

    Args:
        elements: non-empty iterable of numeric values.
        part: fraction in ascending order; 1.0 returns the largest value.

    Returns:
        The element at index ``int(part * len(elements))`` of the sorted values,
        clamped to the last element.

    Raises:
        IndexError: if `elements` is empty.
    """
    ordered = sorted(elements)
    if not ordered:
        raise IndexError('stat_count() requires at least one element')
    # part == 1.0 yields an index one past the end; the unclamped lookup raised
    # IndexError for the default argument, so clamp to the last valid index.
    edge = min(_stat_edge(len(ordered), part), len(ordered) - 1)
    return ordered[edge]


def stat_supp(elements, part=1.):
    """Return stat_count(elements, part) divided by the global `total_baskets`."""
    return stat_count(elements, part) / total_baskets

**Статистика**

Количество транзакций: `{{total_baskets}}`  
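По количеству вхождений (строки `topN` считаются по счётчикам, отсортированным по возрастанию: sum — сумма N% наименьших счётчиков, count — значение на границе этой доли, supp — count, делённый на число транзакций):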
- **count**: `{{np.sum(elements_counts)}}`
- **unique**: `{{np.unique(elements_values).shape[0]}}`
- **min**: `{{np.min(elements_counts)}}`, элемент `{{elements_values[np.argmin(elements_counts)]}}`
- **max**: `{{np.max(elements_counts)}}`, элемент `{{elements_values[np.argmax(elements_counts)]}}`
- **std**: `{{np.std(elements_counts)}}`  
- **mean**: `{{np.mean(elements_counts)}}`
- **top10**: sum `{{stat_sum(elements_counts, 0.1)}}`, count `{{stat_count(elements_counts, 0.1)}}`, supp `{{stat_supp(elements_counts, 0.1)}}`
- **top25**: sum `{{stat_sum(elements_counts, 0.25)}}`, count `{{stat_count(elements_counts, 0.25)}}`, supp `{{stat_supp(elements_counts, 0.25)}}`
- **top50**: sum `{{stat_sum(elements_counts, 0.5)}}`, count `{{stat_count(elements_counts, 0.5)}}`, supp `{{stat_supp(elements_counts, 0.5)}}`
- **top75**: sum `{{stat_sum(elements_counts, 0.75)}}`, count `{{stat_count(elements_counts, 0.75)}}`, supp `{{stat_supp(elements_counts, 0.75)}}`
- **top90**: sum `{{stat_sum(elements_counts, 0.9)}}`, count `{{stat_count(elements_counts, 0.9)}}`, supp `{{stat_supp(elements_counts, 0.9)}}`
- **top95**: sum `{{stat_sum(elements_counts, 0.95)}}`, count `{{stat_count(elements_counts, 0.95)}}`, supp `{{stat_supp(elements_counts, 0.95)}}`
- **top98**: sum `{{stat_sum(elements_counts, 0.98)}}`, count `{{stat_count(elements_counts, 0.98)}}`, supp `{{stat_supp(elements_counts, 0.98)}}`

In [121]:
def show_top_itemset(itemset, top=5):
    """Print, for every itemset size, the `top` entries with the largest counts.

    Args:
        itemset: mapping of size -> mapping of item tuple -> count.
        top: maximum number of entries printed per size.
    """
    for size, counts in itemset.items():
        print(f'# Size {size}')
        # Highest count first; ties keep the mapping's original order.
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
        for items, count in ranked[:top]:
            labels = [str(item) for item in items]
            print(f'Basket({",".join(labels)}): {count}')

### Sample 1
Min support `0.01`  
Min confidence `0.5`

In [122]:
# Mine itemsets and rules with min support 0.01 and min confidence 0.5;
# verbosity=1 makes apriori print the progress log shown below.
itemsets_1_50, rules_1_50 = apriori(baskets, min_support=0.01,  min_confidence=0.5, verbosity=1)

Generating itemsets.
 Counting itemsets of length 1.
  Found 16470 candidate itemsets of length 1.
  Found 70 large itemsets of length 1.
 Counting itemsets of length 2.
  Found 2415 candidate itemsets of length 2.
  Found 58 large itemsets of length 2.
 Counting itemsets of length 3.
  Found 37 candidate itemsets of length 3.
  Found 25 large itemsets of length 3.
 Counting itemsets of length 4.
  Found 6 candidate itemsets of length 4.
  Found 6 large itemsets of length 4.
 Counting itemsets of length 5.
  Found 0 candidate itemsets of length 5.
Itemset generation terminated.

Generating rules from itemsets.
 Generating rules of size 2.
 Generating rules of size 3.
 Generating rules of size 4.
Rule generation terminated.



In [123]:
# Print the five highest-count itemsets of each size (default top=5).
show_top_itemset(itemsets_1_50)

# Size 1
Basket(39): 50675
Basket(48): 42135
Basket(38): 15596
Basket(32): 15167
Basket(41): 14945
# Size 2
Basket(39,48): 29142
Basket(39,41): 11414
Basket(38,39): 10345
Basket(41,48): 9018
Basket(32,39): 8455
# Size 3
Basket(39,41,48): 7366
Basket(38,39,48): 6102
Basket(32,39,48): 5402
Basket(38,39,41): 3051
Basket(38,41,48): 2374
# Size 4
Basket(38,39,41,48): 1991
Basket(32,39,41,48): 1646
Basket(32,38,39,48): 1236
Basket(38,39,48,170): 1193
Basket(36,38,39,48): 1080


In [124]:
# Display the complete mapping: itemset size -> {item tuple: count}.
itemsets_1_50

{1: {(9,): 1372,
  (19,): 1005,
  (31,): 920,
  (32,): 15167,
  (36,): 2936,
  (37,): 1074,
  (38,): 15596,
  (39,): 50675,
  (41,): 14945,
  (45,): 911,
  (48,): 42135,
  (49,): 1120,
  (60,): 1489,
  (65,): 4472,
  (78,): 1060,
  (79,): 1600,
  (89,): 3837,
  (101,): 2237,
  (110,): 2794,
  (117,): 1026,
  (123,): 1302,
  (147,): 1779,
  (161,): 1010,
  (170,): 3099,
  (175,): 970,
  (179,): 998,
  (185,): 1376,
  (201,): 1133,
  (225,): 3257,
  (237,): 3032,
  (242,): 911,
  (249,): 1160,
  (255,): 1474,
  (258,): 987,
  (264,): 895,
  (270,): 1734,
  (271,): 2094,
  (286,): 1183,
  (301,): 1204,
  (310,): 2594,
  (338,): 1274,
  (413,): 1880,
  (438,): 1863,
  (475,): 2167,
  (479,): 926,
  (522,): 974,
  (533,): 1487,
  (548,): 1137,
  (589,): 1119,
  (592,): 1227,
  (604,): 1209,
  (677,): 1110,
  (740,): 1181,
  (783,): 965,
  (824,): 1210,
  (956,): 911,
  (1004,): 1102,
  (1146,): 1426,
  (1327,): 1786,
  (1393,): 1161,
  (2238,): 1715,
  (2958,): 904,
  (3270,): 950,
  (10515

### Sample 2
Min support `0.01`  
Min confidence `0.8`

In [91]:
# Same min support (0.01) as Sample 1 but a stricter min confidence of 0.8;
# verbosity=1 makes apriori print the progress log shown below.
itemsets_1_80, rules_1_80 = apriori(baskets, min_support=0.01,  min_confidence=0.8, verbosity=1)

Generating itemsets.
 Counting itemsets of length 1.
  Found 16470 candidate itemsets of length 1.
  Found 70 large itemsets of length 1.
 Counting itemsets of length 2.
  Found 2415 candidate itemsets of length 2.
  Found 58 large itemsets of length 2.
 Counting itemsets of length 3.
  Found 37 candidate itemsets of length 3.
  Found 25 large itemsets of length 3.
 Counting itemsets of length 4.
  Found 6 candidate itemsets of length 4.
  Found 6 large itemsets of length 4.
 Counting itemsets of length 5.
  Found 0 candidate itemsets of length 5.
Itemset generation terminated.

Generating rules from itemsets.
 Generating rules of size 2.
 Generating rules of size 3.
 Generating rules of size 4.
Rule generation terminated.



In [127]:
# Print the five highest-count itemsets of each size; the printed result is the same as for Sample 1.
show_top_itemset(itemsets_1_80)

# Size 1
Basket(39): 50675
Basket(48): 42135
Basket(38): 15596
Basket(32): 15167
Basket(41): 14945
# Size 2
Basket(39,48): 29142
Basket(39,41): 11414
Basket(38,39): 10345
Basket(41,48): 9018
Basket(32,39): 8455
# Size 3
Basket(39,41,48): 7366
Basket(38,39,48): 6102
Basket(32,39,48): 5402
Basket(38,39,41): 3051
Basket(38,41,48): 2374
# Size 4
Basket(38,39,41,48): 1991
Basket(32,39,41,48): 1646
Basket(32,38,39,48): 1236
Basket(38,39,48,170): 1193
Basket(36,38,39,48): 1080


In [126]:
# Display the rules returned for min confidence 0.8.
rules_1_80

[{36} -> {38},
 {37} -> {38},
 {110} -> {38},
 {170} -> {38},
 {286} -> {38},
 {36, 39} -> {38},
 {36, 48} -> {38},
 {39, 110} -> {38},
 {39, 170} -> {38},
 {48, 110} -> {38},
 {48, 170} -> {38},
 {41, 48} -> {39},
 {48, 225} -> {39},
 {36, 39, 48} -> {38},
 {38, 41, 48} -> {39},
 {39, 48, 110} -> {38},
 {39, 48, 170} -> {38}]

### Sample 3
Min support `0.3`  
Min confidence `0.5`

In [153]:
# Mine itemsets and rules with min support 0.3 and min confidence 0.5.
# NOTE(review): the `_20_80` suffix suggests support 0.2 / confidence 0.8, which does not
# match the values actually passed here — consider renaming (e.g. `_30_50`) in all cells that use it.
itemsets_20_80, rules_20_80 = apriori(baskets, min_support=0.3,  min_confidence=0.5)

In [154]:
# Print the highest-count itemsets of each size found at min support 0.3.
show_top_itemset(itemsets_20_80)

# Size 1
Basket(39): 50675
Basket(48): 42135
# Size 2
Basket(39,48): 29142


In [155]:
# Display the rules returned for min support 0.3 / min confidence 0.5.
rules_20_80

[{48} -> {39}, {39} -> {48}]

# Детали