In [108]:
import random

import numpy as np  # noqa
import pandas as pd
import plotly.express as px  # noqa
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder

In [109]:
df = pd.read_csv("../data/supermarket.group.csv")
# df['Product'] = [x.split(';') for x in df['Product']]
# df.to_csv("../data/supermarket_new.csv", index=False)

In [110]:
df.head()

Unnamed: 0,TransactionID,Product
0,160698,"['ketchups', 'sauces', 'adjika', 'pasta', 'tea']"
1,160747,"['pasta', 'honey', 'tea']"
2,161217,"['ketchups', 'sauces', 'adjika', 'pasta', 'che..."
3,161243,"['ketchups', 'sauces', 'adjika', 'pasta', 'che..."
4,161354,"['ketchups', 'sauces', 'adjika', 'pasta', 'tea']"


In [111]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44 entries, 0 to 43
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   TransactionID  44 non-null     int64 
 1   Product        44 non-null     object
dtypes: int64(1), object(1)
memory usage: 832.0+ bytes


In [112]:
te = TransactionEncoder()
transaction_list = df["Product"].apply(eval).tolist()
te_ary = te.fit_transform(transaction_list)
df_processed = pd.DataFrame(te_ary, columns=te.columns_)

In [113]:
df.shape

(44, 2)

# Apriori and Association rules

In [114]:
frequent_itemsets: pd.DataFrame = apriori(
    df_processed, min_support=0.1, use_colnames=True
)
aso_rules: pd.DataFrame = association_rules(
    frequent_itemsets, metric="lift", min_threshold=0.5
)

In [115]:
print("Frequent Itemsets\n", frequent_itemsets.head())
print("Rules\n", aso_rules.head())

Frequent Itemsets
     support    itemsets
0  0.522727    (adjika)
1  0.431818   (cheeses)
2  0.318182  (crackers)
3  0.500000     (honey)
4  0.522727  (ketchups)
Rules
   antecedents consequents  antecedent support  consequent support   support  \
0    (adjika)   (cheeses)            0.522727            0.431818  0.227273   
1   (cheeses)    (adjika)            0.431818            0.522727  0.227273   
2    (adjika)     (honey)            0.522727            0.500000  0.204545   
3     (honey)    (adjika)            0.500000            0.522727  0.204545   
4    (adjika)  (ketchups)            0.522727            0.522727  0.522727   

   confidence      lift  leverage  conviction  zhangs_metric  
0    0.434783  1.006865  0.001550    1.005245       0.014286  
1    0.526316  1.006865  0.001550    1.007576       0.012000  
2    0.391304  0.782609 -0.056818    0.821429      -0.367893  
3    0.409091  0.782609 -0.056818    0.807692      -0.357143  
4    1.000000  1.913043  0.249483       

# Сравнение результаты с различными параметрами

In [116]:
def test_aso_params(df_processed, support_values=None, confidence_values=None):
    if confidence_values is None:
        confidence_values = [0.5, 0.6, 0.7]
    if support_values is None:
        support_values = [0.1, 0.2, 0.3]
    results = []
    for min_support in support_values:
        for min_confidence in confidence_values:
            frequent_itemsets = apriori(
                df_processed, min_support=min_support, use_colnames=True
            )
            rules = association_rules(
                frequent_itemsets, metric="confidence", min_threshold=min_confidence
            )
            results.append((min_support, min_confidence, rules))
    return results


def compare_aso_rules(results):
    for result in results:
        min_support, min_confidence, rules = result
        print(f"\nParameters: Min Support={min_support}, Min Confidence={min_confidence}")
        print("Number of Rules:", len(rules))
        print("Association Rules:")
        print(rules.head(5))
        print("--------------------------------------")

In [117]:
association_rules_list = test_aso_params(df_processed)
compare_aso_rules(association_rules_list)


Parameters: Min Support=0.1, Min Confidence=0.5
Number of Rules: 300
Association Rules:
  antecedents consequents  antecedent support  consequent support   support  \
0   (cheeses)    (adjika)            0.431818            0.522727  0.227273   
1    (adjika)  (ketchups)            0.522727            0.522727  0.522727   
2  (ketchups)    (adjika)            0.522727            0.522727  0.522727   
3    (adjika)     (pasta)            0.522727            0.545455  0.454545   
4     (pasta)    (adjika)            0.545455            0.522727  0.454545   

   confidence      lift  leverage  conviction  zhangs_metric  
0    0.526316  1.006865  0.001550    1.007576       0.012000  
1    1.000000  1.913043  0.249483         inf       1.000000  
2    1.000000  1.913043  0.249483         inf       1.000000  
3    0.869565  1.594203  0.169421    3.484848       0.780952  
4    0.833333  1.594203  0.169421    2.863636       0.820000  
--------------------------------------

Parameters: Min Su

# Ответы на вопросы

### Какой товар с наибольшей достоверностью берут с вафлями

In [118]:
waffles_rules = aso_rules[aso_rules["antecedents"].apply(lambda x: "waffles" in x)]
most_likely_product = (
    waffles_rules.iloc[0]["consequents"]
    if not waffles_rules.empty
    else "No association rules found for waffles."
)
print(most_likely_product)

frozenset({'cheeses'})


### Человек взял мед и сыры, какой один из товаров он скорее всего не возьмёт

In [119]:
honey_cheeses_rules = aso_rules[
    aso_rules["consequents"].apply(lambda x: {"honey", "cheeses"}.issubset(x))
]
least_likely_product = (
    honey_cheeses_rules.iloc[0]["antecedents"]
    if not honey_cheeses_rules.empty
    else "No association rules found for honey and cheeses."
)
print(least_likely_product)

frozenset({'tea'})


### 5 самых популярных наборов товаров (в наборе может быть один или несколько товаров)

In [120]:
popular_sets = frequent_itemsets.sort_values(by="support", ascending=False).head(5)
print(popular_sets) if not popular_sets.empty else print("No frequent itemsets found.")

    support    itemsets
7  0.750000       (tea)
5  0.545455     (pasta)
0  0.522727    (adjika)
4  0.522727  (ketchups)
6  0.522727    (sauces)


### Описать 4-5 ассоциативных правил, полученных при реализации алгоритмов

In [127]:
top_rules = aso_rules.head(10)
for idx, rule in top_rules.iterrows():
    print(f"\n{idx + 1}.Rule:")
    print("\tAntecedents:", list(rule["antecedents"]))
    print("\tConsequents:", list(rule["consequents"]))
    print("\tSupport:", rule["support"])
    print("\tConfidence:", rule["confidence"])


1.Rule:
	Antecedents: ['adjika']
	Consequents: ['cheeses']
	Support: 0.22727272727272727
	Confidence: 0.43478260869565216

2.Rule:
	Antecedents: ['cheeses']
	Consequents: ['adjika']
	Support: 0.22727272727272727
	Confidence: 0.5263157894736842

3.Rule:
	Antecedents: ['adjika']
	Consequents: ['honey']
	Support: 0.20454545454545456
	Confidence: 0.391304347826087

4.Rule:
	Antecedents: ['honey']
	Consequents: ['adjika']
	Support: 0.20454545454545456
	Confidence: 0.4090909090909091

5.Rule:
	Antecedents: ['adjika']
	Consequents: ['ketchups']
	Support: 0.5227272727272727
	Confidence: 1.0

6.Rule:
	Antecedents: ['ketchups']
	Consequents: ['adjika']
	Support: 0.5227272727272727
	Confidence: 1.0

7.Rule:
	Antecedents: ['adjika']
	Consequents: ['pasta']
	Support: 0.45454545454545453
	Confidence: 0.8695652173913043

8.Rule:
	Antecedents: ['pasta']
	Consequents: ['adjika']
	Support: 0.45454545454545453
	Confidence: 0.8333333333333334

9.Rule:
	Antecedents: ['adjika']
	Consequents: ['sauces']
	Su

# Предсказание выбора следующего продукта(ов)

In [122]:
def predict_next_products(current_products, rules, top_next_n=2):
    predicted_products = []
    for item in current_products:
        filtered_rules = rules[
            rules["antecedents"].apply(lambda x: item in x)
        ]
        if not filtered_rules.empty:
            sorted_rules = filtered_rules.sort_values(by="lift", ascending=False)
            top_consequents = list(sorted_rules["consequents"][:top_next_n - 1])
            predicted_products.extend(top_consequents)
    predicted_products = [list(x) for x in set(predicted_products)]
    return predicted_products

In [123]:
unique_products = df['Product'].apply(lambda x: eval(x)).explode().unique()
df_predict_next = []
for _ in range(len(unique_products)):
    current_products = random.choices(unique_products, k=random.randint(1, 3))
    predicted_next_products = predict_next_products(current_products, aso_rules)
    data = {"Current Products": current_products, "Predicted Products": predicted_next_products}
    df_predict_next.append(pd.Series(data))
df_predict_next

[Current Products             [ketchups]
 Predicted Products    [[pasta, sauces]]
 dtype: object,
 Current Products                                [pasta, tea]
 Predicted Products    [[adjika, sauces, cheeses], [waffles]]
 dtype: object,
 Current Products                                  [honey, cheeses]
 Predicted Products    [[adjika, pasta], [ketchups, pasta, sauces]]
 dtype: object,
 Current Products      [cheeses, cheeses]
 Predicted Products     [[adjika, pasta]]
 dtype: object,
 Current Products                          [honey]
 Predicted Products    [[ketchups, pasta, sauces]]
 dtype: object,
 Current Products         [waffles]
 Predicted Products    [[crackers]]
 dtype: object,
 Current Products                             [pasta, waffles]
 Predicted Products    [[adjika, sauces, cheeses], [crackers]]
 dtype: object,
 Current Products           [tea, adjika, crackers]
 Predicted Products    [[pasta, sauces], [waffles]]
 dtype: object,
 Current Products                [adjika, 