In [None]:
import pandas as pd

In [None]:
# Load the online-retail transactions from the repo-relative CSV and preview the first rows.
data = pd.read_csv('csv/online_retail.csv')
data.head()

Уникальные значения товаров

In [None]:
# Count how many rows carry each distinct product description.
data['Description'].value_counts()

## Очистка данных
На данном этапе нужно очистить данные, чтобы с ними можно было работать. Проверим значения цены.
Все строки, у которых цена отсутствует, равна нулю или отрицательна, должны быть удалены.

In [None]:
# Preview the rows whose unit price is zero or negative (candidates for removal).
non_positive_price = data['UnitPrice'].le(0)
data[non_positive_price]

In [None]:
# Keep only rows with a strictly positive unit price.
# .copy() turns the filtered slice into an independent frame: later cells add
# columns to `data`, and assigning into a slice of another frame raises
# SettingWithCopyWarning (and may not write where expected) in pandas < 3.
data = data[data['UnitPrice'] > 0].copy()

In [None]:
# Sanity check for the price filter: this must display an empty frame.
# The condition is `<= 0` (not `< 0`) so that zero-priced rows, which the
# filter is also meant to remove, are verified as gone too.
data[data['UnitPrice'] <= 0]

Добавим колонку с куплей/продажей

In [None]:
# Tag every row by the sign of Quantity: negative -> 'Sold', anything else -> 'Bought'.
# NOTE(review): negative quantities presumably represent returns/cancellations — confirm the labels.
is_negative_qty = data['Quantity'].lt(0)
data['Transaction'] = is_negative_qty.map({True: 'Sold', False: 'Bought'})
data['Transaction'].value_counts()

## Топ-10 покупателей по числу проданных товаров

In [None]:
# Sum Quantity per customer, rank the totals from largest to smallest
# and keep the first ten entries.
customer_totals = data.groupby('CustomerID')['Quantity'].agg('sum')
sorted_totals = customer_totals.sort_values(ascending=False)
top_customers = sorted_totals.iloc[:10]
top_customers

In [None]:
# Parse the invoice timestamp once, store it back, and derive two helper
# columns from it: the calendar date and the clock time as 'HH:MM:SS' text.
invoice_ts = pd.to_datetime(data['InvoiceDate'])
data['InvoiceDate'] = invoice_ts
data['Date'] = invoice_ts.dt.date
data['Time'] = invoice_ts.dt.strftime('%H:%M:%S')
data

In [None]:
# Create a new column with time-of-day labels, bucketing the invoice hour
# into left-closed intervals:
#   [0, 6)   Early Morning    [6, 12)  Morning    [12, 14) Noon
#   [14, 18) Afternoon        [18, 22) Evening    [22, 24) Night
# pd.cut is vectorised (no per-row Python loop) and returns an ordered
# categorical, so the labels keep their chronological order when sorted or
# plotted. Rows with a missing timestamp stay NaN instead of silently
# falling through every comparison and being labelled 'Night'.
time = data['InvoiceDate'].dt.hour
data['Time of Day'] = pd.cut(
    time,
    bins=[0, 6, 12, 14, 18, 22, 24],
    labels=['Early Morning', 'Morning', 'Noon', 'Afternoon', 'Evening', 'Night'],
    right=False,
)
data

In [None]:
import seaborn as sns
# Bar chart of the number of rows per time-of-day label.
# The frame is passed by keyword: a positional first argument is only
# interpreted as `data` from seaborn 0.12 on; older releases bind it to `x`
# and then fail because `x` is given twice.
sns.countplot(data=data, x='Time of Day')

# Associations (Market basket analysis)

In [None]:
import pandas as pd

In [None]:
# Load the groceries purchase records from the repo-relative CSV and preview the first rows.
df = pd.read_csv('csv/groceries_dataset.csv')
df.head()

In [None]:
# Build a basket key "<member>_<date>": all rows sharing a member number and
# a date are treated as one transaction.
member_id = df['Member_number'].astype(str)
purchase_date = df['Date'].astype(str)
df['transaction'] = member_id + '_' + purchase_date
df

In [None]:
# Cross-tabulate transactions (rows) against items (columns); each cell holds
# how many times the item occurs in that transaction.
df2 = pd.crosstab(index=df['transaction'], columns=df['itemDescription'])
df2.head()

In [None]:
def encode(item_freq):
    """Return True when ``item_freq`` is greater than zero, otherwise False."""
    return True if item_freq > 0 else False

# Turn the count matrix into the boolean "item present in transaction" matrix.
# The element-wise comparison gives the same result as mapping `encode` over
# every cell, but runs vectorised and avoids DataFrame.applymap, which is
# deprecated since pandas 2.1.
basket_input = df2 > 0
basket_input

## Apriori

In [None]:
from mlxtend.frequent_patterns import apriori, fpgrowth
from mlxtend.frequent_patterns import association_rules

# Mine frequent itemsets with Apriori (minimum support 0.004, itemsets shown
# with item names rather than column indices), then derive association rules
# evaluated by lift.
# NOTE(review): no min_threshold is passed, so the library's default cut-off
# is applied to the lift values — confirm that is the intended threshold.
apriori_items = apriori(df=basket_input, min_support=0.004, use_colnames=True)
apriori_rules = association_rules(df=apriori_items, metric="lift")

In [None]:
# Show the eight strongest rules, ranked by support, then confidence, then lift
# (all descending).
ranking_keys = ["support", "confidence", "lift"]
apriori_rules.sort_values(by=ranking_keys, ascending=False).head(8)

## FPGrowth

In [None]:
# Mine frequent itemsets with FP-Growth (minimum support 0.001, at most five
# items per set), then keep only the itemsets containing more than two items.
fpgrowth_items: pd.DataFrame = fpgrowth(
    basket_input,
    min_support=0.001,
    use_colnames=True,
    max_len=5,
    verbose=True,
)
has_three_or_more = fpgrowth_items['itemsets'].map(len) > 2
fpgrowth_items = fpgrowth_items[has_three_or_more]

In [None]:
# Show the eight remaining itemsets with the highest support.
fpgrowth_items.sort_values(by="support", ascending=False).iloc[:8]