This notebook performs Market Basket Analysis on the purchase dataset to establish the purchasing patterns of product categories for each customer segment.

The references used for this notebook were the following:
- https://pythondata.com/market-basket-analysis-with-python-and-pandas/?fbclid=IwAR0HqIYj8kJcd8xcI0HkVzUOohSvfsp-N7t61cj3rf-FwfffXteZ19WSP3E
- https://towardsdatascience.com/mba-for-breakfast-4c18164ef82b
    
Inspiration was based on the paper:
- Rodrigues, F., & Ferreira, B. (2016). Product recommendation based on shared Customer's Behaviour. Procedia Computer Science, 100, 136-146. doi:10.1016/j.procs.2016.09.133

In [1]:
! pip install pandas mlxtend



In [2]:
# Loading basic needed libraries
import pandas as pd
import gc
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Loading libraries for S3 bucket connection
import boto3
import io
from io import StringIO,BytesIO, TextIOWrapper
import gzip

# S3 handles: a low-level client and a resource object.
# NOTE(review): neither is referenced by the cells visible in this notebook;
# pandas is given the s3:// URLs directly.
client = boto3.client(service_name='s3')
resource = boto3.resource(service_name='s3')

In [3]:
# Load the purchase dataset produced by the notebook purchase_dataset_gathering.ipynb
purchase_df = pd.read_csv(
    filepath_or_buffer='s3://myaws-capstone-bucket/eCommerce_purchase_data.csv'
)
purchase_df.head()

Unnamed: 0,user_id,user_session,event_time,category_code,category_id,brand,product_id,category
0,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564,2019-10-01 00:02:14 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone
1,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68,2019-10-01 00:04:37 UTC,electronics.smartphone,2053013555631882655,apple,1002532,2053013555631882655_electronics.smartphone
2,555332717,1dea3ee2-2ded-42e8-8e7a-4e2ad6ae942f,2019-10-01 00:07:07 UTC,furniture.bathroom.toilet,2053013557418656265,santeri,13800054,2053013557418656265_furniture.bathroom.toilet
3,524601178,2af9b570-0942-4dcd-8f25-4d84fba82553,2019-10-01 00:09:26 UTC,electronics.audio.headphone,2053013554658804075,apple,4804055,2053013554658804075_electronics.audio.headphone
4,551377651,3c80f0d6-e9ec-4181-8c5c-837a30be2d68,2019-10-01 00:09:54 UTC,electronics.audio.headphone,2053013554658804075,apple,4804056,2053013554658804075_electronics.audio.headphone


In [4]:
# Number of distinct values in each column of the purchase data
purchase_df.nunique()

user_id          1817173
user_session     4544395
event_time       4415560
category_code        139
category_id          932
brand               4081
product_id         96037
category             932
dtype: int64

In [5]:
# (rows, columns) of the purchase data
purchase_df.shape

(5707926, 8)

In [6]:
# Load the K-means cluster assignment (user_id -> cluster) so that customers can be
# segmented and Market Basket Analysis can be performed per segment.
# The dataset came from the notebook Customer_Segmentation_Clustering.ipynb
cluster_df = pd.read_csv(
    filepath_or_buffer='s3://myaws-capstone-bucket/data/cluster_segments.csv'
)
cluster_df.nunique()

user_id    914574
cluster         6
dtype: int64

In [7]:
# Inner join on user_id: keeps only purchases of users that have a cluster label
# and attaches that label as the `cluster` column.
purchase_df = purchase_df.merge(cluster_df, how='inner', on='user_id')
purchase_df.nunique()

user_id           914574
user_session     3641796
event_time       3828804
category_code        139
category_id          930
brand               3959
product_id         89325
category             930
cluster                6
dtype: int64

In [8]:
# In the Kaggle dataset each row represents one purchased item, so every row
# gets a quantity of 1.
purchase_df = purchase_df.assign(quantity=1)
purchase_df.head()

Unnamed: 0,user_id,user_session,event_time,category_code,category_id,brand,product_id,category,cluster,quantity
0,543272936,8187d148-3c41-46d4-b0c0-9c08cd9dc564,2019-10-01 00:02:14 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone,3,1
1,543272936,3591a683-59b0-41d0-94b7-fbc381401119,2019-10-01 03:42:37 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone,3,1
2,543272936,4ab63ddd-717a-435b-93cd-934176ecfc0e,2019-10-02 00:38:12 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone,3,1
3,543272936,d5de376a-9d8a-4e66-9fcd-74d00a384daa,2019-10-02 00:43:45 UTC,electronics.telephone,2053013555531219353,panasonic,11300010,2053013555531219353_electronics.telephone,3,1
4,543272936,5fa53005-2891-4880-a4ac-aa039b71c37c,2019-10-02 22:42:22 UTC,electronics.smartphone,2053013555631882655,samsung,1004856,2053013555631882655_electronics.smartphone,3,1


In [9]:
# One purchase frame per cluster segment: c1..c6 hold cluster labels 0..5, in that order
c1, c2, c3, c4, c5, c6 = (
    purchase_df.loc[purchase_df.cluster == label] for label in range(6)
)

#### Market Basket Analysis on different cluster segments

In [10]:
# Market basket for segment c1: one row per user, one column per category,
# each cell holding the summed quantity (0 where the user never bought the category).
market_basket = (
    c1.groupby(['user_id', 'category'])['quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('user_id')
)
market_basket.head()

category,2053013551865397438_sport.trainer,2053013551882174655_construction.tools.welding,2053013551898951873_construction.tools.light,2053013551907340482_sport.ski,2053013551924117699_sport.ski,2053013551932506308_construction.tools.drill,2053013551940894917_computers.desktop,2053013551966060743_kids.carriage,2053013552058335434_appliances.kitchen.meat_grinder,2053013552125444301_appliances.environment.vacuum,...,2232732135146389852_apparel.trousers,2232732135213498718_apparel.trousers,2232732135960084850_apparel.costume,2232732137285484952_furniture.bathroom.bath,2232732137780412838_furniture.kitchen.table,2232732137855910312_apparel.pajamas,2232732138325672372_apparel.costume,2234185357446873711_apparel.shirt,2282652065861730868_stationery.paper,2292044075982913587_furniture.bathroom.bath
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
237470903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
237973968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272115549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297538443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297633028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Encoding dataset to represent when a category is purchased - 0 means no and 1 means yes
# (This helper is redefined in every cluster section of the notebook.)
def encode_data(value):
    """Map a purchase count to a 0/1 "category was purchased" flag.

    Parameters
    ----------
    value : int or float
        Quantity found in one cell of the market basket table.

    Returns
    -------
    int
        1 when ``value`` is greater than zero, otherwise 0.
    """
    # A single comparison covers every input: any positive value counts as a
    # purchase, everything else (zero, negatives, NaN) maps to 0. The function
    # therefore never falls through and returns None.
    return 1 if value > 0 else 0

# Turn the per-category purchase counts into 0/1 flags, cell by cell
market_basket = market_basket.applymap(func=encode_data)

In [12]:
# Applying the apriori function to find the categories that are frequently bought
# together by the same user (apriori looks at co-occurrence, not purchase order).

# A support of 0.01 keeps the category sets that appear for 1% of the users or more;
# 1% was the support level that provided the largest amount of categories.
# max_len=2 limits the search to single categories and pairs.
categorysets = apriori(
    market_basket,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)
categorysets

Unnamed: 0,support,itemsets
0,0.020922,(2053013554415534427_electronics.video.tv)
1,0.09236,(2053013554658804075_electronics.audio.headphone)
2,0.015779,(2053013554725912943_appliances.kitchen.coffee...
3,0.157781,(2053013555631882655_electronics.smartphone)
4,0.016744,(2053013558920217191_computers.notebook)
5,0.025185,(2053013563835941749_appliances.kitchen.refrig...
6,0.070713,(2232732079706079299_sport.bicycle)
7,0.010187,(2232732091391410500_appliances.kitchen.blender)
8,0.023472,(2232732091718566220_appliances.kitchen.refrig...
9,0.030661,(2232732092297380188_appliances.kitchen.washer)


The next step is to derive the association rules from the frequent category sets found above. The `association_rules` function lets us evaluate each rule with three metrics:

- support - the percentage of baskets that contain a specific combination of products, relative to the total number of baskets.
- confidence - this metric measures how dependent one product in the rule is on the other: the probability that the consequent is purchased given that the antecedent was purchased.
- lift - it measures the ratio between the confidence of the rule and the expected confidence (the support of the consequent). A lift above 1 means the products are bought together more often than expected if they were independent.

Definitions based on article: https://towardsdatascience.com/mba-for-breakfast-4c18164ef82b

In [13]:
# Build the association rules (lift of at least 0.5), tag them with the segment
# name and list the rules with the highest confidence, then lift, first.
rules = (
    association_rules(categorysets, metric="lift", min_threshold=0.5)
    .assign(cluster='c1')
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,cluster
8,(2232732079706079299_sport.bicycle),(2232732093077520756_construction.tools.light),0.070713,0.635424,0.041717,0.589947,0.928431,-0.003216,0.889095,c1
6,(2053013563835941749_appliances.kitchen.refrig...,(2232732093077520756_construction.tools.light),0.025185,0.635424,0.012108,0.480769,0.756612,-0.003895,0.702147,c1
15,(2232732103101907535_electronics.clocks),(2232732093077520756_construction.tools.light),0.031446,0.635424,0.01427,0.453793,0.714159,-0.005712,0.667469,c1
12,(2232732101063475749_appliances.environment.va...,(2232732093077520756_construction.tools.light),0.031266,0.635424,0.011812,0.377801,0.594565,-0.008055,0.585948,c1
4,(2053013555631882655_electronics.smartphone),(2232732093077520756_construction.tools.light),0.157781,0.635424,0.055102,0.349231,0.549604,-0.045156,0.560224,c1
2,(2053013554658804075_electronics.audio.headphone),(2232732093077520756_construction.tools.light),0.09236,0.635424,0.030997,0.335616,0.528176,-0.02769,0.548743,c1
10,(2232732099754852875_appliances.personal.massa...,(2232732093077520756_construction.tools.light),0.073107,0.635424,0.024221,0.331307,0.521396,-0.022233,0.545208,c1
0,(2053013554658804075_electronics.audio.headphone),(2053013555631882655_electronics.smartphone),0.09236,0.157781,0.012933,0.140028,0.887481,-0.00164,0.979356,c1
5,(2232732093077520756_construction.tools.light),(2053013555631882655_electronics.smartphone),0.635424,0.157781,0.055102,0.086717,0.549604,-0.045156,0.922188,c1
1,(2053013555631882655_electronics.smartphone),(2053013554658804075_electronics.audio.headphone),0.157781,0.09236,0.012933,0.081968,0.887481,-0.00164,0.98868,c1


In [14]:
# Persist the c1 rules to S3 (the row index is not written to the CSV)
rules.to_csv(
    path_or_buf='s3://myaws-capstone-bucket/data/basket_market_analysis/c1_segment_Arules.csv',
    index=False,
)

In [15]:
# Market basket for segment c2: one row per user, one column per category,
# each cell holding the summed quantity (0 where the user never bought the category).
market_basket = (
    c2.groupby(['user_id', 'category'])['quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('user_id')
)
market_basket.head()

category,2053013551907340482_sport.ski,2053013551932506308_construction.tools.drill,2053013552167387343_appliances.environment.climate,2053013552226107603_appliances.environment.fan,2053013552259662037_computers.components.power_supply,2053013552293216471_appliances.environment.air_heater,2053013552326770905_appliances.environment.water_heater,2053013552351936731_appliances.environment.air_conditioner,2053013552469377249_apparel.jeans,2053013552570040549_electronics.video.projector,...,2232732115617710964_apparel.shoes.keds,2232732116137804674_apparel.underwear,2232732116498514828_apparel.jeans,2232732117446427558_apparel.shoes,2232732120994807810_apparel.shoes,2232732121070305284_construction.tools.saw,2232732124283142224_apparel.shirt,2232732129735737558_sport.ski,2232732133988761920_apparel.sock,2232732135054115162_apparel.trousers
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
453908841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
456810531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
461023190,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
467576236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
470262816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Encoding dataset to represent when a category is purchased - 0 means no and 1 means yes
# (This helper is redefined in every cluster section of the notebook.)
def encode_data(value):
    """Map a purchase count to a 0/1 "category was purchased" flag.

    Parameters
    ----------
    value : int or float
        Quantity found in one cell of the market basket table.

    Returns
    -------
    int
        1 when ``value`` is greater than zero, otherwise 0.
    """
    # A single comparison covers every input: any positive value counts as a
    # purchase, everything else (zero, negatives, NaN) maps to 0. The function
    # therefore never falls through and returns None.
    return 1 if value > 0 else 0

# Turn the per-category purchase counts into 0/1 flags, cell by cell
market_basket = market_basket.applymap(func=encode_data)

In [17]:
# Applying the apriori function to find the categories that are frequently bought
# together by the same user (apriori looks at co-occurrence, not purchase order).

# A support of 0.01 keeps the category sets that appear for 1% of the users or more;
# 1% was the support level that provided the largest amount of categories.
# max_len=2 limits the search to single categories and pairs.
categorysets = apriori(
    market_basket,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)
categorysets

Unnamed: 0,support,itemsets
0,0.021264,(2053013553341792533_electronics.clocks)
1,0.017234,(2053013553945772349_electronics.audio.subwoofer)
2,0.017834,(2053013553970938175_auto.accessories.player)
3,0.046815,(2053013554415534427_electronics.video.tv)
4,0.05376,(2053013554658804075_electronics.audio.headphone)
5,0.012432,(2053013554776244595_appliances.kitchen.microw...
6,0.010718,(2053013555262783879_appliances.kitchen.blender)
7,0.010289,(2053013555573162395_electronics.telephone)
8,0.488982,(2053013555631882655_electronics.smartphone)
9,0.046472,(2053013558920217191_computers.notebook)


In [18]:
# Build the association rules (lift of at least 0.5), tag them with the segment
# name and list the rules with the highest confidence, then lift, first.
rules = (
    association_rules(categorysets, metric="lift", min_threshold=0.5)
    .assign(cluster='c2')
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,cluster
0,(2053013554658804075_electronics.audio.headphone),(2053013555631882655_electronics.smartphone),0.05376,0.488982,0.01912,0.355662,0.727351,-0.007167,0.793089,c2
1,(2053013555631882655_electronics.smartphone),(2053013554658804075_electronics.audio.headphone),0.488982,0.05376,0.01912,0.039102,0.727351,-0.007167,0.984746,c2


In [19]:
# Persist the c2 rules to S3 (the row index is not written to the CSV)
rules.to_csv(
    path_or_buf='s3://myaws-capstone-bucket/data/basket_market_analysis/c2_segment_Arules.csv',
    index=False,
)

In [20]:
# Market basket for segment c3: one row per user, one column per category,
# each cell holding the summed quantity (0 where the user never bought the category).
market_basket = (
    c3.groupby(['user_id', 'category'])['quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('user_id')
)
market_basket.head()

category,2053013551865397438_sport.trainer,2053013551882174655_construction.tools.welding,2053013551898951873_construction.tools.light,2053013551907340482_sport.ski,2053013551924117699_sport.ski,2053013551932506308_construction.tools.drill,2053013551940894917_computers.desktop,2053013551966060743_kids.carriage,2053013552058335434_appliances.kitchen.meat_grinder,2053013552125444301_appliances.environment.vacuum,...,2232732134299140424_apparel.tshirt,2232732134366249290_apparel.tshirt,2232732134441746764_apparel.tshirt,2232732134987006296_electronics.audio.headphone,2232732135054115162_apparel.trousers,2232732135146389852_apparel.trousers,2232732135213498718_apparel.trousers,2232732135960084850_apparel.costume,2232732137855910312_apparel.pajamas,2234185357446873711_apparel.shirt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
145611266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
259560538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
267054723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
311304884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
350787985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Encoding dataset to represent when a category is purchased - 0 means no and 1 means yes
# (This helper is redefined in every cluster section of the notebook.)
def encode_data(value):
    """Map a purchase count to a 0/1 "category was purchased" flag.

    Parameters
    ----------
    value : int or float
        Quantity found in one cell of the market basket table.

    Returns
    -------
    int
        1 when ``value`` is greater than zero, otherwise 0.
    """
    # A single comparison covers every input: any positive value counts as a
    # purchase, everything else (zero, negatives, NaN) maps to 0. The function
    # therefore never falls through and returns None.
    return 1 if value > 0 else 0

# Turn the per-category purchase counts into 0/1 flags, cell by cell
market_basket = market_basket.applymap(func=encode_data)

In [22]:
# Applying the apriori function to find the categories that are frequently bought
# together by the same user (apriori looks at co-occurrence, not purchase order).

# A support of 0.01 keeps the category sets that appear for 1% of the users or more;
# 1% was the support level that provided the largest amount of categories.
# max_len=2 limits the search to single categories and pairs.
categorysets = apriori(
    market_basket,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)
categorysets

Unnamed: 0,support,itemsets
0,0.018749,(2053013554415534427_electronics.video.tv)
1,0.052347,(2053013554658804075_electronics.audio.headphone)
2,0.012429,(2053013554776244595_appliances.kitchen.microw...
3,0.014302,(2053013554834964853_appliances.kitchen.kettle)
4,0.012631,(2053013555262783879_appliances.kitchen.blender)
5,0.225255,(2053013555631882655_electronics.smartphone)
6,0.019085,(2053013557477376525_furniture.bathroom.bath)
7,0.014542,(2053013560530830019_electronics.camera.video)
8,0.011584,(2053013563810775923_appliances.kitchen.washer)
9,0.036489,(2053013565983425517_appliances.environment.va...


In [23]:
# Build the association rules (lift of at least 0.5), tag them with the segment
# name and list the rules with the highest confidence, then lift, first.
rules = (
    association_rules(categorysets, metric="lift", min_threshold=0.5)
    .assign(cluster='c3')
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,cluster
1,(2232732093077520756_construction.tools.light),(2053013555631882655_electronics.smartphone),0.187085,0.225255,0.025588,0.13677,0.607176,-0.016554,0.897495,c3
0,(2053013555631882655_electronics.smartphone),(2232732093077520756_construction.tools.light),0.225255,0.187085,0.025588,0.113594,0.607176,-0.016554,0.917091,c3


In [24]:
# Persist the c3 rules to S3 (the row index is not written to the CSV)
rules.to_csv(
    path_or_buf='s3://myaws-capstone-bucket/data/basket_market_analysis/c3_segment_Arules.csv',
    index=False,
)

In [25]:
# Market basket for segment c4: one row per user, one column per category,
# each cell holding the summed quantity (0 where the user never bought the category).
market_basket = (
    c4.groupby(['user_id', 'category'])['quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('user_id')
)
market_basket.head()

category,2053013551857008829_apparel.shoes,2053013551865397438_sport.trainer,2053013551882174655_construction.tools.welding,2053013551898951873_construction.tools.light,2053013551907340482_sport.ski,2053013551924117699_sport.ski,2053013551932506308_construction.tools.drill,2053013551940894917_computers.desktop,2053013551966060743_kids.carriage,2053013552058335434_appliances.kitchen.meat_grinder,...,2232732135054115162_apparel.trousers,2232732135146389852_apparel.trousers,2232732135213498718_apparel.trousers,2232732135960084850_apparel.costume,2232732137285484952_furniture.bathroom.bath,2232732137780412838_furniture.kitchen.table,2232732137855910312_apparel.pajamas,2232732138325672372_apparel.costume,2234185357446873711_apparel.shirt,2292044075982913587_furniture.bathroom.bath
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300613484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
389186962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
390878829,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
395303394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404851685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Encoding dataset to represent when a category is purchased - 0 means no and 1 means yes
# (This helper is redefined in every cluster section of the notebook.)
def encode_data(value):
    """Map a purchase count to a 0/1 "category was purchased" flag.

    Parameters
    ----------
    value : int or float
        Quantity found in one cell of the market basket table.

    Returns
    -------
    int
        1 when ``value`` is greater than zero, otherwise 0.
    """
    # A single comparison covers every input: any positive value counts as a
    # purchase, everything else (zero, negatives, NaN) maps to 0. The function
    # therefore never falls through and returns None.
    return 1 if value > 0 else 0

# Turn the per-category purchase counts into 0/1 flags, cell by cell
market_basket = market_basket.applymap(func=encode_data)

In [27]:
# Applying the apriori function to find the categories that are frequently bought
# together by the same user (apriori looks at co-occurrence, not purchase order).

# A support of 0.01 keeps the category sets that appear for 1% of the users or more;
# 1% was the support level that provided the largest amount of categories.
# max_len=2 limits the search to single categories and pairs.
categorysets = apriori(
    market_basket,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)
categorysets

Unnamed: 0,support,itemsets
0,0.026372,(2053013553056579841_computers.peripherals.pri...
1,0.010340,(2053013553140465927_kids.toys)
2,0.012053,(2053013553325015316_appliances.kitchen.toster)
3,0.026933,(2053013553341792533_electronics.clocks)
4,0.047302,(2053013554415534427_electronics.video.tv)
...,...,...
72,0.030855,(2232732101063475749_appliances.environment.va...
73,0.023088,"(2232732101407408685_apparel.shoes.slipons, 22..."
74,0.012568,(2232732102103663163_furniture.bedroom.blanket...
75,0.051596,"(2232732093077520756_construction.tools.light,..."


In [28]:
# Build the association rules (lift of at least 0.5), tag them with the segment
# name and list the rules with the highest confidence, then lift, first.
rules = (
    association_rules(categorysets, metric="lift", min_threshold=0.5)
    .assign(cluster='c4')
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,cluster
46,(2053013560530830019_electronics.camera.video),(2232732093077520756_construction.tools.light),0.015663,0.736625,0.013478,0.860471,1.168126,0.001940,1.887600,c4
87,(2232732103831716449_apparel.shoes),(2232732093077520756_construction.tools.light),0.028089,0.736625,0.023872,0.849856,1.153717,0.003181,1.754156,c4
50,(2053013563835941749_appliances.kitchen.refrig...,(2232732093077520756_construction.tools.light),0.046165,0.736625,0.038586,0.835843,1.134693,0.004580,1.604413,c4
63,(2232732081585127530_construction.components.f...,(2232732093077520756_construction.tools.light),0.017926,0.736625,0.013889,0.774802,1.051827,0.000684,1.169526,c4
54,(2232732079706079299_sport.bicycle),(2232732093077520756_construction.tools.light),0.139938,0.736625,0.105584,0.754508,1.024278,0.002503,1.072848,c4
...,...,...,...,...,...,...,...,...,...,...
47,(2232732093077520756_construction.tools.light),(2053013560530830019_electronics.camera.video),0.736625,0.015663,0.013478,0.018297,1.168126,0.001940,1.002683,c4
65,(2232732093077520756_construction.tools.light),(2232732082063278200_electronics.clocks),0.736625,0.016962,0.012764,0.017327,1.021539,0.000269,1.000372,c4
49,(2232732093077520756_construction.tools.light),(2053013563810775923_appliances.kitchen.washer),0.736625,0.022842,0.012648,0.017171,0.751705,-0.004178,0.994229,c4
83,(2232732093077520756_construction.tools.light),(2232732102103663163_furniture.bedroom.blanket),0.736625,0.017960,0.012568,0.017061,0.949932,-0.000662,0.999085,c4


In [29]:
# Persist the c4 rules to S3 (the row index is not written to the CSV)
rules.to_csv(
    path_or_buf='s3://myaws-capstone-bucket/data/basket_market_analysis/c4_segment_Arules.csv',
    index=False,
)

In [30]:
# Market basket for segment c5: one row per user, one column per category,
# each cell holding the summed quantity (0 where the user never bought the category).
market_basket = (
    c5.groupby(['user_id', 'category'])['quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('user_id')
)
market_basket.head()

category,2053013551857008829_apparel.shoes,2053013551865397438_sport.trainer,2053013551882174655_construction.tools.welding,2053013551898951873_construction.tools.light,2053013551907340482_sport.ski,2053013551924117699_sport.ski,2053013551932506308_construction.tools.drill,2053013551940894917_computers.desktop,2053013551966060743_kids.carriage,2053013552058335434_appliances.kitchen.meat_grinder,...,2232732134441746764_apparel.tshirt,2232732134987006296_electronics.audio.headphone,2232732135054115162_apparel.trousers,2232732135146389852_apparel.trousers,2232732135213498718_apparel.trousers,2232732135960084850_apparel.costume,2232732137285484952_furniture.bathroom.bath,2232732137780412838_furniture.kitchen.table,2232732137855910312_apparel.pajamas,2234185357446873711_apparel.shirt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
192078182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200985178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
221480173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
263137999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288246633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Encoding dataset to represent when a category is purchased - 0 means no and 1 means yes
# (This helper is redefined in every cluster section of the notebook.)
def encode_data(value):
    """Map a purchase count to a 0/1 "category was purchased" flag.

    Parameters
    ----------
    value : int or float
        Quantity found in one cell of the market basket table.

    Returns
    -------
    int
        1 when ``value`` is greater than zero, otherwise 0.
    """
    # A single comparison covers every input: any positive value counts as a
    # purchase, everything else (zero, negatives, NaN) maps to 0. The function
    # therefore never falls through and returns None.
    return 1 if value > 0 else 0

# Turn the per-category purchase counts into 0/1 flags, cell by cell
market_basket = market_basket.applymap(func=encode_data)

In [32]:
# Applying the apriori function to find the categories that are frequently bought
# together by the same user (apriori looks at co-occurrence, not purchase order).

# A support of 0.01 keeps the category sets that appear for 1% of the users or more;
# 1% was the support level that provided the largest amount of categories.
# max_len=2 limits the search to single categories and pairs.
categorysets = apriori(
    market_basket,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)
categorysets

Unnamed: 0,support,itemsets
0,0.013283,(2053013553056579841_computers.peripherals.pri...
1,0.012714,(2053013553341792533_electronics.clocks)
2,0.039035,(2053013554415534427_electronics.video.tv)
3,0.055223,(2053013554658804075_electronics.audio.headphone)
4,0.012927,(2053013554725912943_appliances.kitchen.coffee...
5,0.231528,(2053013555631882655_electronics.smartphone)
6,0.010618,(2053013557477376525_furniture.bathroom.bath)
7,0.016277,(2053013558920217191_computers.notebook)
8,0.026566,(2053013563810775923_appliances.kitchen.washer)
9,0.010338,(2053013563911439225_appliances.kitchen.refrig...


In [33]:
# Build the association rules (lift of at least 0.5), tag them with the segment
# name and list the rules with the highest confidence, then lift, first.
rules = (
    association_rules(categorysets, metric="lift", min_threshold=0.5)
    .assign(cluster='c5')
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,cluster
2,(2053013555631882655_electronics.smartphone),(2232732093077520756_construction.tools.light),0.231528,0.490987,0.071175,0.307414,0.626115,-0.042502,0.734945,c5
8,(2232732101063475749_appliances.environment.va...,(2232732093077520756_construction.tools.light),0.045534,0.490987,0.012958,0.284584,0.579615,-0.009398,0.711492,c5
6,(2232732099754852875_appliances.personal.massa...,(2232732093077520756_construction.tools.light),0.046704,0.490987,0.013078,0.280027,0.570334,-0.009853,0.706988,c5
4,(2232732079706079299_sport.bicycle),(2232732093077520756_construction.tools.light),0.087812,0.490987,0.024035,0.273708,0.557465,-0.01908,0.700838,c5
0,(2053013554658804075_electronics.audio.headphone),(2053013555631882655_electronics.smartphone),0.055223,0.231528,0.010578,0.191558,0.827364,-0.002207,0.950559,c5
3,(2232732093077520756_construction.tools.light),(2053013555631882655_electronics.smartphone),0.490987,0.231528,0.071175,0.144963,0.626115,-0.042502,0.898759,c5
5,(2232732093077520756_construction.tools.light),(2232732079706079299_sport.bicycle),0.490987,0.087812,0.024035,0.048952,0.557465,-0.01908,0.95914,c5
1,(2053013555631882655_electronics.smartphone),(2053013554658804075_electronics.audio.headphone),0.231528,0.055223,0.010578,0.045689,0.827364,-0.002207,0.99001,c5
7,(2232732093077520756_construction.tools.light),(2232732099754852875_appliances.personal.massa...,0.490987,0.046704,0.013078,0.026637,0.570334,-0.009853,0.979384,c5
9,(2232732093077520756_construction.tools.light),(2232732101063475749_appliances.environment.va...,0.490987,0.045534,0.012958,0.026392,0.579615,-0.009398,0.980339,c5


In [34]:
# Persist the c5 rules to S3 (the row index is not written to the CSV)
rules.to_csv(
    path_or_buf='s3://myaws-capstone-bucket/data/basket_market_analysis/c5_segment_Arules.csv',
    index=False,
)

In [35]:
# Market basket for segment c6: one row per user, one column per category,
# each cell holding the summed quantity (0 where the user never bought the category).
market_basket = (
    c6.groupby(['user_id', 'category'])['quantity']
    .sum()
    .unstack()
    .reset_index()
    .fillna(0)
    .set_index('user_id')
)
market_basket.head()

category,2053013551857008829_apparel.shoes,2053013551865397438_sport.trainer,2053013551882174655_construction.tools.welding,2053013551898951873_construction.tools.light,2053013551907340482_sport.ski,2053013551924117699_sport.ski,2053013551932506308_construction.tools.drill,2053013551940894917_computers.desktop,2053013551966060743_kids.carriage,2053013552058335434_appliances.kitchen.meat_grinder,...,2232732135054115162_apparel.trousers,2232732135146389852_apparel.trousers,2232732135213498718_apparel.trousers,2232732135960084850_apparel.costume,2232732137285484952_furniture.bathroom.bath,2232732137780412838_furniture.kitchen.table,2232732137855910312_apparel.pajamas,2234185357446873711_apparel.shirt,2282652065861730868_stationery.paper,2292044075982913587_furniture.bathroom.bath
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
128968633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299358698,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
301943177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
320738065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
354303483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Encoding dataset to represent when a category is purchased - 0 means no and 1 means yes
# (This helper is redefined in every cluster section of the notebook.)
def encode_data(value):
    """Map a purchase count to a 0/1 "category was purchased" flag.

    Parameters
    ----------
    value : int or float
        Quantity found in one cell of the market basket table.

    Returns
    -------
    int
        1 when ``value`` is greater than zero, otherwise 0.
    """
    # A single comparison covers every input: any positive value counts as a
    # purchase, everything else (zero, negatives, NaN) maps to 0. The function
    # therefore never falls through and returns None.
    return 1 if value > 0 else 0

# Turn the per-category purchase counts into 0/1 flags, cell by cell
market_basket = market_basket.applymap(func=encode_data)

In [37]:
# Applying the apriori function to find the categories that are frequently bought
# together by the same user (apriori looks at co-occurrence, not purchase order).

# A support of 0.01 keeps the category sets that appear for 1% of the users or more;
# 1% was the support level that provided the largest amount of categories.
# max_len=2 limits the search to single categories and pairs.
categorysets = apriori(
    market_basket,
    min_support=0.01,
    use_colnames=True,
    max_len=2,
)
categorysets

Unnamed: 0,support,itemsets
0,0.011955,(2053013552259662037_computers.components.powe...
1,0.011156,(2053013552293216471_appliances.environment.ai...
2,0.014556,(2053013553056579841_computers.peripherals.pri...
3,0.01816,(2053013554415534427_electronics.video.tv)
4,0.03668,(2053013554658804075_electronics.audio.headphone)
5,0.01805,(2053013554834964853_appliances.kitchen.kettle)
6,0.017408,(2053013555220840837_appliances.kitchen.juicer)
7,0.058898,(2053013555631882655_electronics.smartphone)
8,0.011454,(2053013555724157349_sport.bicycle)
9,0.017549,(2053013557452210699_electronics.clocks)


In [38]:
# Build the association rules (lift of at least 0.5), tag them with the segment
# name and list the rules with the highest confidence, then lift, first.
rules = (
    association_rules(categorysets, metric="lift", min_threshold=0.5)
    .assign(cluster='c6')
    .sort_values(by=['confidence', 'lift'], ascending=False)
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,cluster
0,(2053013555631882655_electronics.smartphone),(2232732093077520756_construction.tools.light),0.058898,0.215067,0.016781,0.284916,1.324779,0.004114,1.09768,c6
5,(2232732103831716449_apparel.shoes),(2232732079706079299_sport.bicycle),0.049121,0.137304,0.010796,0.219777,1.600661,0.004051,1.105704,c6
2,(2232732079706079299_sport.bicycle),(2232732093077520756_construction.tools.light),0.137304,0.215067,0.021388,0.155769,0.72428,-0.008142,0.929761,c6
3,(2232732093077520756_construction.tools.light),(2232732079706079299_sport.bicycle),0.215067,0.137304,0.021388,0.099446,0.72428,-0.008142,0.957962,c6
4,(2232732079706079299_sport.bicycle),(2232732103831716449_apparel.shoes),0.137304,0.049121,0.010796,0.078626,1.600661,0.004051,1.032023,c6
1,(2232732093077520756_construction.tools.light),(2053013555631882655_electronics.smartphone),0.215067,0.058898,0.016781,0.078027,1.324779,0.004114,1.020748,c6


In [39]:
# Persist the c6 rules to S3 (the row index is not written to the CSV)
rules.to_csv(
    path_or_buf='s3://myaws-capstone-bucket/data/basket_market_analysis/c6_segment_Arules.csv',
    index=False,
)