# Task 4


In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gsp import *

**Loading the dataset**

For this task we use the original dataset, after having cleaning it. In fact we are now importing the dataset exported from the Task 1 notebook, after assessing and improving data quality (handling missing values and outliers).

In [3]:
df = pd.read_csv('dataset/df.csv', sep=',', index_col=0)

In [4]:
df.head()

Unnamed: 0,BasketID,BasketDate,Sale,CustomerID,CustomerCountry,ProdID,ProdDescr,Qta,Sale_per_Qta
0,536365,2010-01-12 08:26:00,2.55,17850.0,United Kingdom,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,15.3
1,536365,2010-01-12 08:26:00,3.39,17850.0,United Kingdom,71053,WHITE METAL LANTERN,6,20.34
2,536365,2010-01-12 08:26:00,2.75,17850.0,United Kingdom,84406B,CREAM CUPID HEARTS COAT HANGER,8,22.0
3,536365,2010-01-12 08:26:00,3.39,17850.0,United Kingdom,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,20.34
4,536365,2010-01-12 08:26:00,3.39,17850.0,United Kingdom,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,20.34


## Modelling sequences

Now we model each customer as a sequence of baskets.

In [5]:
baskets = df.groupby('CustomerID')['BasketID'].unique().apply(list)
baskets.head()

CustomerID
12347.0    [537626, 542237, 549222, 556201, 562032, 57351...
12348.0                             [539318, 541998, 548955]
12349.0                                             [577609]
12350.0                                             [543037]
12352.0    [544156, 545323, 546869, 547390, 567505, 56869...
Name: BasketID, dtype: object

In [6]:
min_baskets = 2

In [7]:
customers_to_drop = baskets[baskets.apply(len) < min_baskets].index

In [8]:
baskets.drop(index=customers_to_drop, inplace=True)

This is a relational representation: each row of the dataframe has a basket ID and one product bought. For our analysis, we need the transactions: the list of product bought for each basket. 

In [9]:
transactions = df[df.CustomerID.isin(baskets.index)].groupby('BasketID')['ProdID'].unique().apply(list)
transactions.head()

BasketID
536365       [85123A, 71053, 84406B, 84029G, 84029E, 21730]
536366                                              [22633]
536367    [22745, 22748, 22749, 22310, 84969, 22623, 217...
536368                         [22960, 22913, 22912, 22914]
536369                                              [21756]
Name: ProdID, dtype: object

In [10]:
df2 = pd.DataFrame(columns=['CustomerID', 'basket_list'])
for CustomerID in baskets.index:
    customer_list = []
    for BasketID in baskets.loc[CustomerID]:
        customer_list.append(transactions.loc[BasketID])
    current_customer = pd.DataFrame(data= {'CustomerID': CustomerID, 'basket_list': [customer_list]})
    df2 = pd.concat([df2, current_customer])

In [11]:
df2.set_index('CustomerID', inplace=True)

In [12]:
df2

Unnamed: 0_level_0,basket_list
CustomerID,Unnamed: 1_level_1
12347.0,"[[85116, 22375, 71477, 22771, 22772, 22773, 22..."
12348.0,"[[84991, 21213, 22952, 21977], [21726], [22437]]"
12352.0,"[[21380, 22064, 21232, 22646, 22779, 22654, 21..."
12356.0,"[[22138, 22062, 22066, 22131, 22195, 22937, 84..."
12358.0,"[[15060B, 22059, 37447, 15056P, 15056N, 20679,..."
...,...
18273.0,"[[79302M], [79302M]]"
18282.0,"[[23295, 22089, 21108, 21109], [22699, 22818, ..."
18283.0,"[[22356, 20726, 22384, 22386, 20717, 20718, 85..."
18287.0,"[[22755, 22754, 22753, 22756, 22758, 22757, 22..."


In [13]:
dataset = []
for row in df2.values:
    dataset.append(row[0])

In [14]:
help(apriori)

Help on function apriori in module gsp:

apriori(dataset, minSupport, verbose=False)



In [15]:
print("Number of input sequences: ", len(dataset))
print("Total number of events: ", sum([len(seq) for seq in dataset])) 

Number of input sequences:  2720
Total number of events:  17378


In [17]:
for y in [5, 10, 15, 20, 40, 50]:
    print((len(dataset)*y) / 100.0)

136.0
272.0
408.0
544.0
1088.0
1360.0


In [18]:
itemsets_5 = apriori(dataset, minSupport=136, verbose=False) # run partita alle 13.10

In [26]:
itemsets_5

[([['15036']], 137),
 ([['15056N']], 142),
 ([['16161P']], 157),
 ([['20676']], 154),
 ([['20711']], 136),
 ([['20712']], 208),
 ([['20713']], 185),
 ([['20718']], 199),
 ([['20719']], 202),
 ([['20723']], 191),
 ([['20724']], 276),
 ([['20725']], 465),
 ([['20726']], 336),
 ([['20727']], 410),
 ([['20728']], 411),
 ([['20914']], 288),
 ([['20969']], 140),
 ([['20971']], 233),
 ([['20972']], 261),
 ([['20973']], 168),
 ([['20974']], 165),
 ([['20975']], 196),
 ([['20981']], 137),
 ([['20983']], 152),
 ([['21034']], 373),
 ([['21080']], 360),
 ([['21086']], 146),
 ([['21094']], 152),
 ([['21121']], 198),
 ([['21122']], 178),
 ([['21124']], 152),
 ([['21136']], 198),
 ([['21154']], 146),
 ([['21155']], 179),
 ([['21156']], 179),
 ([['21164']], 144),
 ([['21165']], 150),
 ([['21166']], 243),
 ([['21169']], 162),
 ([['21172']], 204),
 ([['21174']], 192),
 ([['21175']], 286),
 ([['21181']], 295),
 ([['21210']], 203),
 ([['21212']], 520),
 ([['21213']], 238),
 ([['21231']], 168),
 ([['21232'

In [19]:
itemsets_10 = apriori(dataset, minSupport=272, verbose=False)

In [27]:
itemsets_10

[([['20724']], 276),
 ([['20725']], 465),
 ([['20726']], 336),
 ([['20727']], 410),
 ([['20728']], 411),
 ([['20914']], 288),
 ([['21034']], 373),
 ([['21080']], 360),
 ([['21175']], 286),
 ([['21181']], 295),
 ([['21212']], 520),
 ([['21485']], 321),
 ([['21733']], 309),
 ([['21754']], 324),
 ([['21755']], 296),
 ([['21790']], 365),
 ([['21791']], 282),
 ([['21889']], 282),
 ([['21915']], 283),
 ([['21931']], 282),
 ([['21977']], 327),
 ([['22077']], 335),
 ([['22086']], 476),
 ([['22111']], 352),
 ([['22112']], 317),
 ([['22114']], 280),
 ([['22138']], 460),
 ([['22139']], 405),
 ([['22144']], 272),
 ([['22149']], 282),
 ([['22178']], 319),
 ([['22197']], 308),
 ([['22382']], 426),
 ([['22383']], 378),
 ([['22384']], 395),
 ([['22386']], 306),
 ([['22411']], 312),
 ([['22457']], 487),
 ([['22469']], 462),
 ([['22470']], 407),
 ([['22551']], 286),
 ([['22554']], 299),
 ([['22558']], 293),
 ([['22621']], 300),
 ([['22629']], 283),
 ([['22662']], 276),
 ([['22666']], 409),
 ([['22697']]

In [20]:
itemsets_15 = apriori(dataset, minSupport=408, verbose=False)

In [28]:
itemsets_15

[([['20725']], 465),
 ([['20727']], 410),
 ([['20728']], 411),
 ([['21212']], 520),
 ([['22086']], 476),
 ([['22138']], 460),
 ([['22382']], 426),
 ([['22457']], 487),
 ([['22469']], 462),
 ([['22666']], 409),
 ([['22720']], 528),
 ([['22960']], 462),
 ([['22961']], 434),
 ([['23203']], 432),
 ([['23245']], 416),
 ([['23298']], 508),
 ([['47566']], 570),
 ([['84879']], 517),
 ([['85099B']], 506),
 ([['85123A']], 624)]

In [21]:
itemsets_20 = apriori(dataset, minSupport=544, verbose=False)

In [29]:
itemsets_20

[([['47566']], 570), ([['85123A']], 624)]

In [None]:
# itemsets_25 = apriori(dataset, minSupport=680, verbose=False)

In [None]:
# itemsets_30 = apriori(dataset, minSupport=816, verbose=False)

In [24]:
print('5% of support: ', str(len(itemsets_5)))
print('10% of support: ' , str(len(itemsets_10)))
print('15% of support: ' , str(len(itemsets_15)))
print('20% of support: ' , str(len(itemsets_20)))
#print('25% of support: ' str(len(itemsets_25)))
#print('30% of support: ' str(len(itemsets_30)))

5% of support:  749
10% of support:  94
15% of support:  20
20% of support:  2


In [25]:
for pattern in itemsets_5:
    if len(pattern) > 1:
        print(pattern[0])
        print(pattern[1])

[['15036']]
137
[['15056N']]
142
[['16161P']]
157
[['20676']]
154
[['20711']]
136
[['20712']]
208
[['20713']]
185
[['20718']]
199
[['20719']]
202
[['20723']]
191
[['20724']]
276
[['20725']]
465
[['20726']]
336
[['20727']]
410
[['20728']]
411
[['20914']]
288
[['20969']]
140
[['20971']]
233
[['20972']]
261
[['20973']]
168
[['20974']]
165
[['20975']]
196
[['20981']]
137
[['20983']]
152
[['21034']]
373
[['21080']]
360
[['21086']]
146
[['21094']]
152
[['21121']]
198
[['21122']]
178
[['21124']]
152
[['21136']]
198
[['21154']]
146
[['21155']]
179
[['21156']]
179
[['21164']]
144
[['21165']]
150
[['21166']]
243
[['21169']]
162
[['21172']]
204
[['21174']]
192
[['21175']]
286
[['21181']]
295
[['21210']]
203
[['21212']]
520
[['21213']]
238
[['21231']]
168
[['21232']]
251
[['21314']]
182
[['21430']]
159
[['21479']]
217
[['21481']]
215
[['21484']]
182
[['21485']]
321
[['21497']]
148
[['21498']]
197
[['21499']]
151
[['21500']]
141
[['21506']]
147
[['21533']]
187
[['21535']]
182
[['21558']]
143
[['215

In [30]:
def get_description(productID):
    return df[df.ProdID == productID].ProdDescr.values[0]

def itemset_to_descriptions(itemset):
    itemset_with_descriptions = []
    for sequence in itemset:
        
        set_of_products = sequence[0]
        support = sequence[1]
        
        pattern = []
        set_of_descriptions = []
        
        for products in set_of_products:
            description_sequences = []
            
            for product in products:
                product_descr = get_description(product)
                description_sequences.append(product_descr)
            
            set_of_descriptions.append(description_sequences)
        
        pattern = set_of_descriptions, support
        itemset_with_descriptions.append(pattern)
        
    return itemset_with_descriptions

In [33]:
prova = itemset_to_descriptions(itemsets_5)

In [34]:
prova

[([['ASSORTED COLOURS SILK FAN']], 137),
 ([['EDWARDIAN PARASOL NATURAL']], 142),
 ([['WRAP ENGLISH ROSE ']], 157),
 ([['RED RETROSPOT BOWL']], 154),
 ([['JUMBO BAG TOYS ']], 136),
 ([['JUMBO BAG WOODLAND ANIMALS']], 208),
 ([['JUMBO BAG OWLS']], 185),
 ([['RED RETROSPOT SHOPPER BAG']], 199),
 ([['WOODLAND CHARLOTTE BAG']], 202),
 ([['STRAWBERRY CHARLOTTE BAG']], 191),
 ([['RED RETROSPOT CHARLOTTE BAG']], 276),
 ([['LUNCH BAG RED RETROSPOT']], 465),
 ([['LUNCH BAG WOODLAND']], 336),
 ([['LUNCH BAG  BLACK SKULL.']], 410),
 ([['LUNCH BAG CARS BLUE']], 411),
 ([['SET/5 RED RETROSPOT LID GLASS BOWLS']], 288),
 ([['RED FLORAL FELTCRAFT SHOULDER BAG']], 140),
 ([['PINK BLUE FELT CRAFT TRINKET BOX']], 233),
 ([['PINK CREAM FELT CRAFT TRINKET BOX ']], 261),
 ([['12 PENCIL SMALL TUBE WOODLAND']], 168),
 ([['12 PENCILS SMALL TUBE SKULL']], 165),
 ([['12 PENCILS SMALL TUBE RED RETROSPOT']], 196),
 ([['12 PENCILS TALL TUBE WOODLAND']], 137),
 ([['12 PENCILS TALL TUBE RED RETROSPOT']], 152),
 ([['R