# Association Rules

In [180]:
import pandas as pd
from functools import reduce

In [2]:
movies = pd.read_csv("AssociationRules/movies_viewing.csv")

In [5]:
#get the userId and movie
movies_sub = movies[['userId', 'title']] 

In [47]:
users = list(set(movies_sub.userId.values))

In [156]:
watchlist = []
for user in users[0:100000]:
    watchlist.append(','.join(list(set(movies_sub.loc[movies_sub.userId == user].title.values))))

Unnamed: 0,userId,title
0,1,Three Colors: Red
1,11,Three Colors: Red
2,22,Three Colors: Red
3,24,Three Colors: Red
4,29,Three Colors: Red


In [157]:
#pd.DataFrame(watchlist).to_csv("watchlist.csv")
netflix = pd.DataFrame(watchlist)

Unnamed: 0,0
0,"License to Wed,Caesar Must Die,Sleepless in Se..."
1,"Monty Python and the Holy Grail,Jarhead,Hero,B..."
2,"Monsoon Wedding,Sleepless in Seattle,Reservoir..."
3,"The 13th Warrior,Reign Over Me,The Departed,Sh..."
4,"Terminator 3: Rise of the Machines,Star Trek I..."
...,...
99995,"The Endless Summer,Clean, Shaven,20,000 League..."
99996,"Jacob's Ladder,The 13th Warrior,The Passion of..."
99997,"Mrs. Doubtfire,Das Boot,Shaft,M,The Grapes of ..."
99998,"Dogville,Monsoon Wedding,The Endless Summer,Co..."


### Make Pipelines

In [106]:
netflix = pd.read_csv("watchlist.csv", header=None, sep="\t")

In [120]:
netflix = pd.DataFrame(netflix.apply(steve, axis=1))

In [121]:
def rename_cols(df):
    df.rename(columns={0: 'items'}, inplace=True)
    return df

In [122]:
def cleanItems(df):
    def clean_items(x):
        items = x['items'].lower() #make all lowercase
        items = list(set(items.split(","))) #remove duplicates
        items = sorted(items) #sort
        items = [item.replace(' ', '-') for item in items]
        items = ','.join(items)
        return items

    df['clean_items'] = df.apply(clean_items, axis=1)
    return df

In [129]:
def itemsLength(df):
    def length(x):
        return len(x['clean_items'].split(","))
    df['length'] = df.apply(length, axis=1)
    return df

In [124]:
def ItemFreq(df):
    item_freq = df[['clean_items', 'items']].groupby('clean_items').count()
    item_freq = item_freq.reset_index().rename(columns={'items': 'count'})
    df = df.merge(item_freq, left_on='clean_items', right_on='clean_items')
    return df

In [168]:
def calculateSupport(df):
    def support(x):
        return x['count']/df.shape[0]
    df['support'] = df.apply(support, axis=1)
    return df

In [126]:
def removeDuplicates(df):
    duplicates = df.clean_items.duplicated()
    df = df.loc[~duplicates]
    return df

In [127]:
def recommendItems(df, search):
    def recommend(x, search):
        search = set([a.lower().replace(' ', '-') for a in search])
        basket = set(x['clean_items'].split(","))
        return ','.join(list(basket.difference(search))) if search.issubset(basket) else ''
    
    df['recommended'] = df.apply(recommend, args=(search, ), axis=1)
    return df.loc[df['recommended'] != '']

In [128]:
def recommendedItems(df):
    return list(df.recommened.values)

In [None]:
### Run Pipelines

In [183]:
collection = netflix.pipe(rename_cols).pipe(cleanItems).pipe(
    itemsLength
).pipe(ItemFreq).pipe(calculateSupport).pipe(
    removeDuplicates
).pipe(recommendItems, search=['Rocky II'])

In [194]:
def apriori(df, result_count):
    df = [item.split(",") for item in df['recommended']] #convert recommended to list
    df = reduce(lambda x,y: x+y, df) #merge all lists into a single list
    df = pd.DataFrame([{'item': i, 'count':df.count(i)} for i in df]) #count each list item to see how many times it occurs
    #dupes = df.duplicated() #identify duplicated
    #df = df[~dupes] #remove duplicates
    #df = df.sort_values('count', ascending=False)
    return df

In [195]:
collection.pipe(apriori, 10)

Unnamed: 0,item,count
0,human-nature,489
1,kolya,1222
2,the-wild-angels,108
3,run-lola-run,1309
4,shock,833
...,...,...
655532,the-men,704
655533,onibaba,250
655534,the-chorus,453
655535,jungle-fever,399


### Alternatives to getting unique values

In [20]:
foo = pd.DataFrame(dict(key=['a', 'a', 'b', 'a', 'c', 'b', 'a', 'a'], score=[1, 2, 12, 12, 14, 12, 3, 0]))

In [35]:
duplicates = foo.key.duplicated()

In [39]:
foo.loc[~duplicates].key.valuesfoo

array(['a', 'b', 'c'], dtype=object)

In [50]:
abc = list(set(foo.key.values))

In [75]:
empty = []
for l in abc:
    empty.append(set(foo.loc[foo.key == l].score.values))

In [76]:
empty

[{12}, {0, 1, 2, 3, 12}, {14}]