### Load Packages

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from tensorflow.keras.utils import get_file

  from ._conv import register_converters as _register_converters


### Download Data

In [2]:
ROOT_URL = "https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/"

fpath = get_file("movies.csv", ROOT_URL + "movies.csv")
movie_df = pd.read_csv(fpath)

fpath = get_file("genres.csv", ROOT_URL + "genres.csv")
genre_df = pd.read_csv(fpath)

fpath = get_file("ratings.csv", ROOT_URL + 'ratings.csv')
rating_df = pd.read_csv(fpath)

In [3]:
movie_df.head(3)

Unnamed: 0,id,title,release_year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995


In [4]:
genre_df.head(3)

Unnamed: 0,movie_id,genre
0,1,Adventure
1,1,Animation
2,1,Children


In [5]:
rating_df.head(3)

Unnamed: 0,user_id,movie_id,rating,rated_at
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819


## Mining Frequent Pattern
---

### 1. convert from rating data to Basket data

***caution*** 

: there is no basket data form in movielens. let's assume that item sets that the customer has given a rating make up a basket.


In [6]:
baskets = (
    rating_df
    [rating_df.rating>=4.] 
    .groupby('user_id')    
    ['movie_id']           
    .apply(set)            
)
baskets.head(3)

user_id
1    {3081, 1036, 541, 4128, 5171, 2100, 1079, 1090...
2    {3, 260, 2948, 2951, 1544, 1673, 266, 908, 245...
3    {3072, 1, 2054, 1544, 2571, 3098, 2076, 541, 3...
Name: movie_id, dtype: object

### 2. Frequent Pattern using Apriori Algorithm

reference : [wiki - Apriori Algorithm](https://en.wikipedia.org/wiki/Apriori_algorithm)

In [70]:
from functools import reduce
from itertools import combinations

def apriori(baskets, min_support=0.1, max_size=3):
    """
    Find Frequent set to perform market basket analysis
    
    :param baskets: pd.Series, Each value represents one basket 
    :param min_support: minimum support to be selected as a frequent set
    :param max_size: maximum size of frequent set
    :return: pd.Dataframe,
        | columns | description    |
        | ----    |  ----          |
        |itemsets |  frequent set  |
        |support  |  support value |
    """
    
    # Candidates for frequent sets of one item
    print("collect all items")
    all_items = reduce(lambda x,y: x|y, baskets)
    candidates = [frozenset([item]) for item in all_items]
    
    num_basket = len(baskets)
    frequent_sets = {}
    for i in range(1, max_size+1):
        print(f"{i}th step...")
        support_dict = {}    
        for candidate in tqdm(candidates):
            if len(candidate) != i:
                continue
            # Calculate support for the frequent set candidate 
            num_included = sum(baskets >= candidate)
            support = num_included / num_basket
            
            if support >= min_support:
                support_dict[candidate] = support

        # Add item sets over min support
        frequent_sets.update(support_dict)

        # Generate next step frequent set candidates
        candidates = {frozenset(a|b) 
                      for a, b in combinations(support_dict.keys(), 2)}
    
    # frequent sets to Dataframe
    freq_df = pd.Series(frequent_sets, name='support')
    freq_df.index.name = 'itemsets'
    freq_df = freq_df.reset_index()
    return freq_df

In [71]:
freq_df = apriori(baskets, min_support=0.1, max_size=3)

collect all items


  0%|          | 2/20720 [00:00<22:12, 15.54it/s]

1th step...


100%|██████████| 20720/20720 [21:27<00:00, 16.09it/s]
  0%|          | 2/6441 [00:00<08:47, 12.20it/s]

2th step...


100%|██████████| 6441/6441 [07:57<00:00, 13.49it/s]
  0%|          | 26/26859 [00:00<02:39, 168.37it/s]

3th step...


100%|██████████| 26859/26859 [04:43<00:00, 94.75it/s] 


### 3. analyze baskets using association rule



* **Support**
    
    * measure the abundance or frequency of an itemset in a database
    * $S(X, Y) = \frac{Freq(X,Y)}{N}$

* **Confidence**

    * the probability of seeing the consequent in a transaction given that it also contains the antecedent
    * $C(X \rightarrow Y) = \frac{Freq(X,Y)}{Freq(X)} = \frac{S(X,Y)}{S(X)}$
    
* **Lift**

    * measure how much more often the antecedent and consequent of a rule A->C occur together than we would expect if they were statistically independent
    * $L(X\rightarrow Y) = \frac{C(X \rightarrow Y)}{S(Y)}$

In [76]:
def find_association_rules(freq_df, min_confidence=0.5):
    """
    find all of the association rules derived through the frequent sets
    
    :param freq_df: the return value from apriori method
    :param min_confidence: minimum confidence to be selected as a assocation rule
    """
    rules = []
    for idx, row in freq_df.iterrows():
        X = row.itemsets
        support_X = row.support

        for idx, case in freq_df[freq_df.itemsets > X].iterrows():
            Y = case.itemsets - X

            support_XY = case.support
            confidence_XY = support_XY / support_X
            
            if confidence_XY<=min_confidence:
                continue
            
            support_Y = freq_df.loc[freq_df.itemsets==Y,"support"].values[0]

            lift_XY = confidence_XY / support_Y
            rules.append({
                "antecedents": X,
                "consequents": Y,
                "support": support_XY,
                "confidence": confidence_XY,
                "lift": lift_XY
            })
        
    association_df = pd.DataFrame(rules)
    return association_df

### Sampling Association Rules found

In [77]:
id2title = dict(zip(movie_df.id.values, movie_df.title.values))

samples = (
    find_association_rules(freq_df, min_confidence=0.5)
    .sort_values('lift',ascending=False)
    .iloc[:10]
)
samples.antecedents = samples.antecedents.map(lambda items : {id2title[i] for i in items})
samples.consequents = samples.consequents.map(lambda items : {id2title[i] for i in items})

samples

Unnamed: 0,antecedents,consequents,support,confidence,lift
231,"{Lord of the Rings: The Two Towers, The}","{Lord of the Rings: The Return of the King, Th...",0.135103,0.733587,4.911905
274,"{Lord of the Rings: The Return of the King, Th...","{Lord of the Rings: The Two Towers, The}",0.135103,0.904614,4.911905
250,"{Lord of the Rings: The Two Towers, The, Lord ...","{Lord of the Rings: The Return of the King, The}",0.135103,0.8533,4.908089
238,"{Lord of the Rings: The Return of the King, The}","{Lord of the Rings: The Two Towers, The, Lord ...",0.135103,0.777098,4.908089
245,"{Lord of the Rings: The Two Towers, The, Matri...","{Lord of the Rings: The Return of the King, The}",0.104609,0.843302,4.850582
239,"{Lord of the Rings: The Return of the King, The}","{Lord of the Rings: The Two Towers, The, Matri...",0.104609,0.601697,4.850582
232,"{Lord of the Rings: The Two Towers, The}","{Lord of the Rings: The Return of the King, Th...",0.104609,0.568007,4.765104
305,"{Lord of the Rings: The Return of the King, Th...","{Lord of the Rings: The Two Towers, The}",0.104609,0.877578,4.765104
128,{Aliens},{Alien},0.10599,0.752451,4.586514
141,{Alien},{Aliens},0.10599,0.646053,4.586514
