In [1]:
import pandas as pd
import mlxtend
from mlxtend.frequent_patterns import apriori, association_rules

# Load the raw transactions from the comma-separated export.
data = pd.read_csv("online-retail-dataset.csv", delimiter=",")
# Preview the first rows to sanity-check the columns.
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [2]:
# Basic clean-up of the raw transactions, written as a single method chain.
data = (
    data
    # Trim stray whitespace around the product descriptions.
    .assign(Description=lambda frame: frame['Description'].str.strip())
    # Drop rows that have no invoice number.
    .dropna(axis=0, subset=['InvoiceNo'])
    # Treat invoice numbers as text so they can be pattern-matched below.
    .assign(InvoiceNo=lambda frame: frame['InvoiceNo'].astype('str'))
    # Keep only invoices whose number does not contain 'C'
    # (presumably credit/cancellation invoices - confirm against the dataset docs).
    .loc[lambda frame: ~frame['InvoiceNo'].str.contains('C')]
)

data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# Pivot the transactions into one row per invoice and one column per product.
# The analysis is restricted to one country at a time.

def build_basket(transactions, country):
    """Build the invoice-by-product quantity matrix for a single country.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame with at least the columns 'Country', 'InvoiceNo',
        'Description' and 'Quantity'.
    country : str
        Value of the 'Country' column to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'InvoiceNo', one column per 'Description'; each cell is the
        summed 'Quantity' of that product on that invoice, with 0 where the
        product does not appear on the invoice.
    """
    return (
        transactions[transactions['Country'] == country]
        .groupby(['InvoiceNo', 'Description'])['Quantity']
        .sum()
        # unstack() already leaves 'InvoiceNo' as the index, so no
        # reset_index()/set_index() round trip is needed.
        .unstack()
        .fillna(0)
    )


# Transactions in France
basket_France = build_basket(data, "France")

# Transactions in the UK
basket_UK = build_basket(data, "United Kingdom")

In [4]:
# Convert every positive quantity to 1 and every other value to 0.

def hot_encode(x):
    """Binarise a basket cell: 1 if the quantity is positive, otherwise 0.

    Always returns an int. Values strictly between 0 and 1, and NaN, used to
    fall through both branches and yield None; positive fractions now map to 1
    and NaN maps to 0.

    Parameters
    ----------
    x : int or float
        Summed quantity of one product on one invoice.

    Returns
    -------
    int
        1 when ``x > 0``, else 0.
    """
    return 1 if x > 0 else 0
    
# Binarise both baskets cell by cell with hot_encode.
basket_France = basket_France.applymap(hot_encode)
basket_UK = basket_UK.applymap(hot_encode)
# Keep the scratch name bound as before: it ends up referring to the UK basket.
basket_encoded = basket_UK

In [5]:
# Mine the itemsets of the French baskets that reach a support of at least 5%,
# then derive association rules with their support, confidence and lift.

MIN_SUPPORT = 0.05  # minimum support passed to apriori (5%)
MIN_LIFT = 1        # minimum value of the "lift" metric for a rule to be kept

# Build the model
frq_items = apriori(basket_France, min_support=MIN_SUPPORT, use_colnames=True)
# Collect the rules in a DataFrame, strongest first (confidence, then lift)
rules = association_rules(frq_items, metric="lift", min_threshold=MIN_LIFT)
rules = rules.sort_values(['confidence', 'lift'], ascending=[False, False])
rules



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
44,(JUMBO BAG WOODLAND ANIMALS),(POSTAGE),0.076531,0.765306,0.076531,1.000000,1.306667,0.017961,inf
258,"(PLASTERS IN TIN CIRCUS PARADE, RED TOADSTOOL ...",(POSTAGE),0.051020,0.765306,0.051020,1.000000,1.306667,0.011974,inf
270,"(RED TOADSTOOL LED NIGHT LIGHT, PLASTERS IN TI...",(POSTAGE),0.053571,0.765306,0.053571,1.000000,1.306667,0.012573,inf
300,"(SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...",(SET/6 RED SPOTTY PAPER PLATES),0.102041,0.127551,0.099490,0.975000,7.644000,0.086474,34.897959
302,"(SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...",(SET/6 RED SPOTTY PAPER CUPS),0.102041,0.137755,0.099490,0.975000,7.077778,0.085433,34.489796
...,...,...,...,...,...,...,...,...,...
37,(POSTAGE),(JAM MAKING SET PRINTED),0.765306,0.053571,0.051020,0.066667,1.244444,0.010022,1.014031
27,(POSTAGE),(CIRCUS PARADE CHILDRENS EGG CUP),0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297
97,(POSTAGE),(PARTY BUNTING),0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297
227,(POSTAGE),"(LUNCH BAG RED RETROSPOT, LUNCH BAG WOODLAND)",0.765306,0.056122,0.051020,0.066667,1.187879,0.008070,1.011297
