### Load Packages

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from tensorflow.keras.utils import get_file

### Download Data

In [None]:
# Base URL under which the three CSV files are hosted.
ROOT_URL = "https://craftsangjae.s3.ap-northeast-2.amazonaws.com/data/"

# Movie table: get_file hands back a local path, which pandas then parses.
fpath = get_file(fname="movies.csv", origin=ROOT_URL + "movies.csv")
movie_df = pd.read_csv(fpath)

# Genre table.
fpath = get_file(fname="genres.csv", origin=ROOT_URL + "genres.csv")
genre_df = pd.read_csv(fpath)

# Rating table (the basket construction below reads it).
fpath = get_file(fname="ratings.csv", origin=ROOT_URL + "ratings.csv")
rating_df = pd.read_csv(fpath)

## Mining Frequent Patterns
---

### 1. Convert rating data to basket data

In [None]:
# Build one basket per user: the set of movie ids that the user rated
# 4.0 or higher.
baskets = (
    rating_df
    .loc[rating_df["rating"] >= 4.0]   # keep only the highly rated rows
    .groupby("user_id")["movie_id"]    # collect movie ids per user
    .apply(set)                        # one set (basket) per user
)
baskets.head(3)

### 2. Find frequent patterns using the Apriori algorithm

In [None]:
from functools import reduce
from itertools import combinations

def apriori(baskets, min_support=0.1, max_size=3):
    """
    Mine frequent itemsets from transaction baskets with the Apriori algorithm.

    Parameters
    ----------
    baskets : iterable of set-like
        One collection of hashable items per transaction, e.g. a pandas
        Series (or a plain list) of sets.
    min_support : float, default 0.1
        Minimum fraction of baskets that must contain an itemset for it to
        be kept (inclusive threshold).
    max_size : int, default 3
        Largest itemset size to mine. No itemset with more than `max_size`
        items is ever returned.

    Returns
    -------
    pandas.DataFrame
        One row per frequent itemset with the columns ``itemsets``
        (a frozenset of items) and ``support`` (fraction of baskets that
        contain the itemset). The frame is empty, but still has both
        columns, when nothing is frequent or `baskets` is empty.
    """
    # Materialise once: the baskets are scanned again for every candidate.
    baskets = list(baskets)
    num_basket = len(baskets)

    frequent_sets = {}
    if num_basket > 0:
        # Level 1 candidates: every distinct item as a singleton itemset.
        print("collect all items")
        all_items = set().union(*baskets)
        candidates = [frozenset([item]) for item in all_items]
    else:
        # No transactions: there is nothing to count (and no support to divide).
        candidates = []

    for i in range(1, max_size + 1):
        if not candidates:
            # No candidate of this size survived, so no larger one can either.
            break

        print(f"{i}th step...")
        support_dict = {}
        for candidate in tqdm(candidates):
            # Support = share of baskets that contain every item of the candidate.
            num_included = sum(map(candidate.issubset, baskets))
            support = num_included / num_basket

            if support >= min_support:
                # Keep only the candidates that reach the support threshold.
                support_dict[candidate] = support

        # Record the frequent itemsets found at this level.
        frequent_sets.update(support_dict)

        if i == max_size:
            # Do not build candidates that would never be evaluated.
            break

        # Self-join the frequent i-itemsets into (i+1)-item candidates.
        next_candidates = set()
        for a, b in combinations(support_dict, 2):
            union = a | b
            if len(union) != i + 1:
                # Joining two i-itemsets may yield up to 2*i items; only
                # unions that grow by exactly one item belong to the next level.
                continue
            # Apriori pruning: every i-item subset must itself be frequent.
            if all(frozenset(subset) in support_dict
                   for subset in combinations(union, i)):
                next_candidates.add(union)
        candidates = next_candidates

    # Frequent itemsets as a two-column DataFrame (explicit dtypes keep the
    # empty result well-formed).
    freq_df = pd.DataFrame({
        "itemsets": pd.Series(list(frequent_sets.keys()), dtype=object),
        "support": pd.Series(list(frequent_sets.values()), dtype=float),
    })
    return freq_df

In [None]:
# Frequent itemsets of at most two movies, using a support threshold of 0.1.
freq_df = apriori(baskets, max_size=2, min_support=0.1)

### 3. Analyze baskets using association rules

In [None]:
def find_association_rules(freq_df, min_confidence=0.5):
    """
    Derive association rules X -> Y from a table of frequent itemsets.

    For every pair of itemsets in `freq_df` where X is a proper subset of a
    larger itemset Z, the rule X -> Y with Y = Z - X is scored with:

    * support    = support(Z)
    * confidence = support(Z) / support(X)
    * lift       = confidence / support(Y)

    Parameters
    ----------
    freq_df : pandas.DataFrame
        Frequent itemsets with the columns ``itemsets`` (set-like) and
        ``support`` (float), as returned by `apriori`.
    min_confidence : float, default 0.5
        Minimum confidence a rule must reach to be kept (inclusive, like
        `min_support` in `apriori`).

    Returns
    -------
    pandas.DataFrame
        One row per rule with the columns ``antecedents``, ``consequents``,
        ``support``, ``confidence`` and ``lift``. The columns are present
        even when no rule qualifies, so the result can always be sorted.
        ``lift`` is NaN when the consequent's support is not listed in
        `freq_df`.
    """
    columns = ["antecedents", "consequents", "support", "confidence", "lift"]

    itemsets = [frozenset(itemset) for itemset in freq_df["itemsets"]]
    supports = [float(support) for support in freq_df["support"]]

    # Support lookup by itemset; on duplicates the first occurrence wins.
    support_of = {}
    for itemset, support in zip(itemsets, supports):
        support_of.setdefault(itemset, support)

    rules = []
    for X, support_X in zip(itemsets, supports):
        for XY, support_XY in zip(itemsets, supports):
            if not X < XY:
                # Only proper supersets of X can extend it into a rule.
                continue

            confidence_XY = support_XY / support_X
            if confidence_XY < min_confidence:
                continue

            Y = XY - X
            support_Y = support_of.get(Y)
            # Lift is undefined when the consequent's support is unknown or zero.
            lift_XY = confidence_XY / support_Y if support_Y else float("nan")

            rules.append({
                "antecedents": X,
                "consequents": Y,
                "support": support_XY,
                "confidence": confidence_XY,
                "lift": lift_XY
            })

    association_df = pd.DataFrame(rules, columns=columns)
    return association_df

In [None]:
# Rank the rules found with a 0.5 confidence threshold by lift and show
# the first ten.
(
    find_association_rules(freq_df, min_confidence=0.5)
    .sort_values(by="lift", ascending=False)
    .head(10)
)