In [500]:
import pandas as pd
from functools import reduce

In [550]:
shopping = pd.read_csv("datasets/Market_Basket_Optimisation.csv", header=None, sep="\t")
s = pd.read_csv("datasets/Market_Basket_Optimisation.csv", header=None, sep="\t")

In [502]:
shopping
#shopping.head()
shopping.shape

(7501, 1)

In [551]:
shopping.rename(columns={0: 'items'}, inplace=True)
s.rename(columns={0: 'items'}, inplace=True)

In [504]:
shopping

Unnamed: 0,items
0,"shrimp,almonds,avocado,vegetables mix,green gr..."
1,"burgers,meatballs,eggs"
2,chutney
3,"turkey,avocado"
4,"mineral water,milk,energy bar,whole wheat rice..."
...,...
7496,"butter,light mayo,fresh bread"
7497,"burgers,frozen vegetables,eggs,french fries,ma..."
7498,chicken
7499,"escalope,green tea"


### Clean items

In [505]:
def clean_items(x):
    items = x['items'].lower() #make all lowercase
    items = list(set(items.split(","))) #remove duplicates
    items = sorted(items) #sort
    items = [item.replace(' ', '-') for item in items]
    items = ','.join(items)
    return items

In [506]:
shopping['clean_items'] = shopping.apply(clean_items, axis=1)

In [507]:
shopping

Unnamed: 0,items,clean_items
0,"shrimp,almonds,avocado,vegetables mix,green gr...","almonds,antioxydant-juice,avocado,cottage-chee..."
1,"burgers,meatballs,eggs","burgers,eggs,meatballs"
2,chutney,chutney
3,"turkey,avocado","avocado,turkey"
4,"mineral water,milk,energy bar,whole wheat rice...","energy-bar,green-tea,milk,mineral-water,whole-..."
...,...,...
7496,"butter,light mayo,fresh bread","butter,fresh-bread,light-mayo"
7497,"burgers,frozen vegetables,eggs,french fries,ma...","burgers,eggs,french-fries,frozen-vegetables,gr..."
7498,chicken,chicken
7499,"escalope,green tea","escalope,green-tea"


### Get number of items in each set

In [508]:
def length(x):
    return len(x['clean_items'].split(","))

In [509]:
shopping['length'] = shopping.apply(length, axis=1)

In [510]:
shopping.sort_values('clean_items')

Unnamed: 0,items,clean_items,length
694,"burgers,chocolate,shrimp,whole wheat pasta,gro...","-asparagus,burgers,chocolate,energy-bar,ground...",8
7453,almonds,almonds,1
253,almonds,almonds,1
0,"shrimp,almonds,avocado,vegetables mix,green gr...","almonds,antioxydant-juice,avocado,cottage-chee...",20
1774,"burgers,ground beef,spaghetti,avocado,milk,oli...","almonds,avocado,burgers,cake,chocolate,frozen-...",13
...,...,...,...
3553,yogurt cake,yogurt-cake,1
402,yogurt cake,yogurt-cake,1
5611,yogurt cake,yogurt-cake,1
5427,yogurt cake,yogurt-cake,1


### Get frequency of each item

In [511]:
item_freq = shopping[['clean_items', 'items']].groupby('clean_items').count()

In [512]:
item_freq = item_freq.reset_index().rename(columns={'items': 'count'})

In [513]:
item_freq

Unnamed: 0,clean_items,count
0,"-asparagus,burgers,chocolate,energy-bar,ground...",1
1,almonds,2
2,"almonds,antioxydant-juice,avocado,cottage-chee...",1
3,"almonds,avocado,burgers,cake,chocolate,frozen-...",1
4,"almonds,avocado,burgers,cauliflower,green-tea,...",1
...,...,...
5149,whole-wheat-pasta,7
5150,"whole-wheat-pasta,zucchini",1
5151,whole-wheat-rice,18
5152,yams,8


In [514]:
shopping = shopping.merge(item_freq, left_on='clean_items', right_on='clean_items')

In [515]:
#shopping.sort_values('count', ascending=False).head(25)

### Calculate support

In [516]:
shopping.shape[0]

7501

In [517]:
def support(x):
    return x['count']/shopping.shape[0]

In [518]:
shopping['support'] = shopping.apply(support, axis=1)

In [519]:
duplicates = shopping.clean_items.duplicated()
shopping = shopping.loc[~duplicates]

In [520]:
shopping.sort_values(['support', 'length'], ascending=[False, False])

Unnamed: 0,items,clean_items,length,count,support
191,cookies,cookies,1,223,0.029729
56,french fries,french-fries,1,132,0.017598
700,escalope,escalope,1,101,0.013465
424,mineral water,mineral-water,1,93,0.012398
540,eggs,eggs,1,90,0.011998
...,...,...,...,...,...
6601,hand protein bar,hand-protein-bar,1,1,0.000133
6897,bacon,bacon,1,1,0.000133
7027,flax seed,flax-seed,1,1,0.000133
7141,green grapes,green-grapes,1,1,0.000133


In [521]:
shopping

Unnamed: 0,items,clean_items,length,count,support
0,"shrimp,almonds,avocado,vegetables mix,green gr...","almonds,antioxydant-juice,avocado,cottage-chee...",20,1,0.000133
1,"burgers,meatballs,eggs","burgers,eggs,meatballs",3,1,0.000133
2,chutney,chutney,1,2,0.000267
4,"turkey,avocado","avocado,turkey",2,2,0.000267
6,"mineral water,milk,energy bar,whole wheat rice...","energy-bar,green-tea,milk,mineral-water,whole-...",5,1,0.000133
...,...,...,...,...,...
7496,"burgers,salmon,pancakes,french fries,frozen sm...","burgers,french-fries,fresh-bread,frozen-smooth...",7,1,0.000133
7497,"turkey,burgers,dessert wine,shrimp,pasta,tomat...","burgers,dessert-wine,frozen-smoothie,milk,oil,...",12,1,0.000133
7498,"butter,light mayo,fresh bread","butter,fresh-bread,light-mayo",3,1,0.000133
7499,"burgers,frozen vegetables,eggs,french fries,ma...","burgers,eggs,french-fries,frozen-vegetables,gr...",6,1,0.000133


In [540]:
def recommend(x, search):
    search = set([a.lower().replace(' ', '-') for a in search])
    basket = set(x['clean_items'].split(","))
    return ','.join(list(basket.difference(search))) if search.issubset(basket) else ''

In [548]:
shopping['recommended'] = shopping.apply(recommend, args=(['eggs', 'turkey'], ), axis=1)

In [549]:
shopping.loc[shopping['recommended'] != '']

Unnamed: 0,items,clean_items,length,count,support,recommended
414,"turkey,burgers,mineral water,eggs,cooking oil","burgers,cooking-oil,eggs,mineral-water,turkey",5,1,0.000133,"mineral-water,cooking-oil,burgers"
534,"turkey,fresh tuna,tomatoes,spaghetti,mineral w...","black-tea,chicken,eggs,extra-dark-chocolate,fr...",10,1,0.000133,"extra-dark-chocolate,mineral-water,fresh-tuna,..."
999,"turkey,burgers,ground beef,chocolate,soup,almo...","almonds,burgers,chocolate,cottage-cheese,eggs,...",9,1,0.000133,"chocolate,cottage-cheese,ground-beef,soup,hot-..."
1237,"turkey,burgers,chocolate,olive oil,eggs,cookin...","burgers,chocolate,cooking-oil,corn,eggs,olive-...",7,1,0.000133,"chocolate,cooking-oil,burgers,olive-oil,corn"
1260,"turkey,eggs,chocolate,frozen smoothie,tomato j...","chocolate,eggs,frozen-smoothie,tomato-juice,tu...",5,1,0.000133,"chocolate,tomato-juice,frozen-smoothie"
...,...,...,...,...,...,...
7196,"turkey,burgers,spaghetti,milk,eggs,honey","burgers,eggs,honey,milk,spaghetti,turkey",6,1,0.000133,"milk,spaghetti,honey,burgers"
7266,"turkey,green grapes,eggs","eggs,green-grapes,turkey",3,1,0.000133,green-grapes
7372,"turkey,frozen vegetables,mineral water,eggs,ch...","chocolate,eggs,fresh-bread,frozen-vegetables,m...",6,1,0.000133,"chocolate,mineral-water,frozen-vegetables,fres..."
7481,"turkey,burgers,grated cheese,shrimp,pasta,spag...","burgers,chocolate,eggs,grated-cheese,mineral-w...",12,1,0.000133,"chocolate,grated-cheese,mineral-water,yogurt-c..."


# Piping

In [574]:
def Eclat(df, min_len, search):
    
    original_length = df.shape[0]
    
    def clean_items(x):
        items = x['items'].lower() #make all lowercase
        items = list(set(items.split(","))) #remove duplicates
        items = sorted(items) #sort
        items = [item.replace(' ', '-') for item in items]
        items = ','.join(items)
        return items 
    
    df['clean_items'] = df.apply(clean_items, axis=1)

    def length(x):
        return len(x['clean_items'].split(","))    
    df['length'] = df.apply(length, axis=1)
    
    item_freq = df[['clean_items', 'items']].groupby('clean_items').count()
    item_freq = item_freq.reset_index().rename(columns={'items': 'count'})
    df = df.merge(item_freq, left_on='clean_items', right_on='clean_items')
    
    dupes = df.duplicated() #remvoe duplicates
    df = df.loc[~dupes]
    
    def support(x):
        return x['count']/original_length    
    
    df['support'] = df.apply(support, axis=1)
    
    df =  df.loc[df['length'] > min_len].sort_values(['support', 'length'], ascending=[False, False])

    def recommend(x, search):
        search = set([item.lower().replace(' ', '-') for item in search])
        basket = set(x['clean_items'].split(","))
        return ','.join(list(basket.difference(search))) if search.issubset(basket) else '' 
    
    df['recommended'] = df.apply(recommend, args =(search,), axis=1)
    return df.loc[df['recommended'] != '']

In [575]:
s.pipe(Eclat, min_len=2, search=['burgers'])

Unnamed: 0,items,clean_items,length,count,support,recommended
2279,"burgers,eggs,french fries","burgers,eggs,french-fries",3,9,0.001200,"eggs,french-fries"
3643,"burgers,eggs,green tea","burgers,eggs,green-tea",3,3,0.000400,"eggs,green-tea"
4060,"burgers,mineral water,eggs","burgers,eggs,mineral-water",3,3,0.000400,"eggs,mineral-water"
4019,"burgers,eggs,french fries,pancakes","burgers,eggs,french-fries,pancakes",4,2,0.000267,"eggs,pancakes,french-fries"
867,"burgers,escalope,shallot","burgers,escalope,shallot",3,2,0.000267,"shallot,escalope"
...,...,...,...,...,...,...
7304,"burgers,escalope,pasta","burgers,escalope,pasta",3,1,0.000133,"escalope,pasta"
7345,"burgers,fresh tuna,chocolate","burgers,chocolate,fresh-tuna",3,1,0.000133,"chocolate,fresh-tuna"
7374,"burgers,mineral water,chocolate bread","burgers,chocolate-bread,mineral-water",3,1,0.000133,"chocolate-bread,mineral-water"
7377,"turkey,burgers,pet food","burgers,pet-food,turkey",3,1,0.000133,"turkey,pet-food"


In [368]:
def apriori(df, result_count):
    #df = [(a[1].recommended.split(","), a[1].support) for a in df[['support', 'recommended']].iterrows()]
    df = [item.split(",") for item in df['recommended']]
    df = reduce(lambda x,y: x+y, df)
    
    df = pd.DataFrame([{'item': i, 'count':df.count(i)} for i in df])
    dupes = df.duplicated()
    df = df[~dupes]
    #df = df.sort_values('count', ascending=False)
    return df.head(result_count)

In [394]:
#s.pipe(Eclat, min_len=2, search=['frozen-vegetables', 'salmon']).pipe(apriori, result_count=10)

In [279]:
movies = pd.read_csv("movies_viewing.csv")

In [379]:
users = movies[['userId', 'title']].userId.value_counts()

3