### 연관규칙 지표 설명
#### 지지도(Support)
지지도는 전체 거래 중 특정 아이템 집합이 함께 발생하는 비율

$ \text{Support}(A \rightarrow B) = P(A \cap B) = \frac{\text{Number of transactions containing both } A \text{ and } B}{\text{Total number of transactions}} $

#### 신뢰도(Confidence)
신뢰도는 한 아이템 집합이 주어졌을 때 다른 아이템 집합이 함께 발생할 조건부 확률 

$ \text{Confidence}(A \rightarrow B) = P(B | A) = \frac{P(A \cap B)}{P(A)} = \frac{\text{Number of transactions containing both } A \text{ and } B}{\text{Number of transactions containing } A} $

#### 향상도(Lift)
향상도는 두 아이템 집합이 서로 얼마나 관련이 있는지를 측정

$ \text{Lift}(A \rightarrow B) = \frac{P(A \cap B)}{P(A)P(B)} = \frac{\text{Confidence}(A \rightarrow B)}{P(B)} = \frac{\text{Number of transactions containing both } A \text{ and } B \times \text{Total number of transactions}}{\text{Number of transactions containing } A \times \text{Number of transactions containing } B} $


<img src="https://github.com/dandacompany/kmu-practical-ds-2024/blob/main/lecture07/asso_rule1.png?raw=true">

<img src="https://github.com/dandacompany/kmu-practical-ds-2024/blob/main/lecture07/asso_rule2.png?raw=true">

#### 데이터 준비



In [11]:
import sys
import os
sys.path.append("..") 

In [12]:
import pandas as pd
from glob import glob

In [13]:
from lib.kdata import KaggleDownloader



In [14]:
KaggleDownloader().setup('{"username":"dantekwak","key":"ff47b95b8b596c2888b7644e3b7fc836"}')



In [15]:
dataset_name = "heeraldedhia/groceries-dataset"

In [16]:
KaggleDownloader().download(dataset=dataset_name)

Dataset URL: https://www.kaggle.com/datasets/heeraldedhia/groceries-dataset


In [17]:
df = pd.read_csv(glob("./datasets/groceries-dataset/*.csv")[0])
df.head()

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


#### 데이터 전처리

In [18]:
df.Member_number = df.Member_number.astype(str)
df.Date = pd.to_datetime(df.Date)

  df.Date = pd.to_datetime(df.Date)


In [19]:
from dateutil import parser

#### 데이터 시각화 및 EDA

In [20]:
gantt_df = df.groupby(['Member_number']).agg(
    CUSTID=('Member_number', 'first'),
    Start=('Date', 'min'),
    Finish=('Date', 'max'),
    Period=('Date', lambda x: (x.max() - x.min()).days + 1)
).reset_index(drop=True).sort_values(by='Period', ascending=False)
gantt_df.head()


Unnamed: 0,CUSTID,Start,Finish,Period
2570,3641,2014-01-02,2015-12-27,725
159,1165,2014-01-04,2015-12-29,725
1918,2974,2014-01-01,2015-12-26,725
3274,4364,2014-01-02,2015-12-24,722
2073,3133,2014-01-04,2015-12-25,721


In [21]:
import plotly.express as px
# 간트 차트 그리기
fig = px.timeline(gantt_df.head(300), x_start='Start', x_end='Finish', y='CUSTID')
fig.update_yaxes(categoryorder='total ascending')
fig.update_layout(height=500)
fig.show()

fig2 = px.timeline(gantt_df.iloc[300:600], x_start='Start', x_end='Finish', y='CUSTID')
fig2.update_yaxes(categoryorder='total ascending')
fig2.update_layout(height=500)
fig2.show()

In [22]:
# Period의 분포
fig = px.histogram(gantt_df, x='Period', nbins=200, title='이용기간 분포')
fig.show()

In [23]:
gantt_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Period,3898.0,375.504618,205.540836,1.0,225.0,406.0,547.0,725.0


In [24]:
transaction_by_1d_df = df.groupby(['Member_number', 'Date']).agg(
    Items=('itemDescription', lambda x: list(x)),
    ItemCount=('itemDescription', 'size')
).reset_index()
transaction_by_1d_df.head()

Unnamed: 0,Member_number,Date,Items,ItemCount
0,1000,2014-06-24,"[whole milk, pastry, salty snack]",3
1,1000,2015-03-15,"[sausage, whole milk, semi-finished bread, yog...",4
2,1000,2015-05-27,"[soda, pickled vegetables]",2
3,1000,2015-07-24,"[canned beer, misc. beverages]",2
4,1000,2015-11-25,"[sausage, hygiene articles]",2


In [25]:
import plotly.express as px

fig = px.bar(transaction_by_1d_df, x=transaction_by_1d_df.ItemCount.value_counts().index, y=transaction_by_1d_df.ItemCount.value_counts().values, labels={'x':'ItemCount', 'y':'Count'})
fig.update_layout(title='ItemCount 분포', xaxis_title='함께 구매한 상품수', yaxis_title='빈도수', width=1000, height=500)
fig.update_yaxes(tickmode='array', tickvals=[i for i in range(0, max(transaction_by_1d_df.ItemCount.value_counts().values)+1, 1000)], ticktext=[str(i) for i in range(0, max(transaction_by_1d_df.ItemCount.value_counts().values)+1, 1000)])

fig.update_traces(text=transaction_by_1d_df.ItemCount.value_counts().values, textposition='outside')
fig.show()




#### 패키지를 이용한 연관지표 자동계산

In [30]:
#!pip install mlxtend

In [27]:
# Eclat (Equivalence Class Clustering and bottom-up Lattice Traversal) 알고리즘 내장 (대규모 데이터셋에 효과적)

In [31]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


In [32]:
# 데이터 준비
dataset = transaction_by_1d_df.Items.tolist()


In [33]:
# 모델 준비
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
as_df = pd.DataFrame(te_ary, columns=te.columns_)


In [34]:
# 연관 규칙 생성
frequent_itemsets = apriori(as_df, min_support = 0.0045, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="conviction", min_threshold=0.001)

In [35]:
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.021386,(UHT-milk)
1,0.008087,(baking powder)
2,0.033950,(beef)
3,0.021787,(berries)
4,0.016574,(beverages)
...,...,...
140,0.005814,"(soda, yogurt)"
141,0.008220,"(whole milk, tropical fruit)"
142,0.005213,"(tropical fruit, yogurt)"
143,0.004611,"(whole milk, whipped/sour cream)"


In [36]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(whole milk),(beef),0.157923,0.033950,0.004678,0.029623,0.872548,-0.000683,0.995541,-0.147821
1,(beef),(whole milk),0.033950,0.157923,0.004678,0.137795,0.872548,-0.000683,0.976656,-0.131343
2,(bottled beer),(other vegetables),0.045312,0.122101,0.004678,0.103245,0.845568,-0.000854,0.978973,-0.160585
3,(other vegetables),(bottled beer),0.122101,0.045312,0.004678,0.038314,0.845568,-0.000854,0.992724,-0.172212
4,(bottled beer),(whole milk),0.045312,0.157923,0.007151,0.157817,0.999330,-0.000005,0.999874,-0.000702
...,...,...,...,...,...,...,...,...,...,...
95,(yogurt),(tropical fruit),0.085879,0.067767,0.005213,0.060700,0.895720,-0.000607,0.992477,-0.112970
96,(whole milk),(whipped/sour cream),0.157923,0.043708,0.004611,0.029200,0.668077,-0.002291,0.985056,-0.371073
97,(whipped/sour cream),(whole milk),0.043708,0.157923,0.004611,0.105505,0.668077,-0.002291,0.941399,-0.341907
98,(whole milk),(yogurt),0.157923,0.085879,0.011161,0.070673,0.822940,-0.002401,0.983638,-0.203508


| 지표           | 장점                                                       | 단점                                                           |
|----------------|------------------------------------------------------------|----------------------------------------------------------------|
| 지지도(Support) | 가장 기본적인 지표로, 규칙이 데이터 세트 전체에서 얼마나 자주 발생했음을 나타냄 | 지지도만으로는 아이템 간의 실제 의존성을 정확히 파악하기 어려움 |
| 신뢰도(Confidence) | 한 아이템 집합이 주어졌을 때 다른 아이템 집합이 얼마나 자주 발생했음을 보여줌 | 결과 아이템의 전체 발생 빈도를 고려하지 않아, 오해할 수 있음  |
| 향상도(Lift)     | 두 아이템 집합의 독립성 대비 상호 의존성을 직접적으로 측정함.<br/> 1보다 크면 양의 연관, 1이면 독립, 1보다 작으면 음의 연관이 있음을 나타냄 | 두 아이템의 지지도가 매우 낮은 경우에도 높은 향상도를 나타낼 수 있어, .<br/>실제 중요도를 과대평가할 가능성 있음 |
| Leverage        | 두 아이템의 독립적인 예상 지지도와 실제 지지도의 차이를 보여줌   | 값의 범위가 작아 비교적 미세한 차이를 구분하기 어려움           |
| Conviction      | 한 아이템이 결과 아이템 없이 발생할 확률에 대한 비율을 나타내며, .<br/> 아이템 간 의존도의 강도를 직관적으로 파악할 수 있음 | 다른 지표들에 비해 계산이 복잡하고 이해하기 어려움               |
| Zhang’s Metric  | 규칙의 예상 신뢰도와 실제 신뢰도의 차이를 고려하며, .<br/>방향성을 포함한 연관성의 강도를 측정함 | 다른 지표들에 비해 덜 일반적으로 사용되어 분석가들 사이에서 덜 알려짐 |

#### 연관지표 메뉴얼 계산

In [37]:
total_transaction_count = transaction_by_1d_df.shape[0]

In [38]:
beef_whole_milk_intersection_count =  transaction_by_1d_df.Items.apply(lambda items : ('beef' in items) & ('whole milk' in items)).sum()
beef_whole_milk_intersection_count

70

In [39]:
support = beef_whole_milk_intersection_count / total_transaction_count
support

0.004678206242063757

In [40]:
antecedent_support = transaction_by_1d_df.Items.apply(lambda items : ( 'beef' in items)).sum()  / total_transaction_count
antecedent_support

0.03395041101383412

In [41]:
consequents_support = transaction_by_1d_df.Items.apply(lambda items : ( 'whole milk' in items)).sum()  / total_transaction_count
consequents_support


0.15792287642852368

In [42]:
confidence = support / antecedent_support
confidence

0.1377952755905512

In [43]:
lift = confidence / consequents_support
lift

0.8725479088706803

In [44]:
leverage = consequents_support - antecedent_support
leverage



0.12397246541468956

In [45]:
conviction = 1 - (1 - confidence) / (1 - antecedent_support)
conviction



0.10749434165765592

In [46]:
# Leverage = P(A∩B)−P(A)P(B) : 두 아이템 집합 A와 B가 함께 발생하는 것이 독립적인 경우에 비해 얼마나 더 자주 발생하는지를 측정
leverage = support - antecedent_support * consequents_support
leverage

-0.0006833403211715583

In [47]:
# Conviction = (1 - P(B)) / (1 - confidence(A->B)) : A 없이 B가 발생할 확률에 대한 A의 신뢰도의 비율로, A가 B에 얼마나 강한 영향을 미치는지를 나타냄
# Conviction 값이 높을수록 A가 발생하지 않을 때 B가 발생하지 않을 가능성이 높음을 의미하며, 이는 A가 B에 강한 영향을 미친다는 것을 나타냄. 
# 만약 B의 발생이 A와 완전히 독립적이라면, conviction 값은 1
conviction = (1 - consequents_support) / (1 - confidence)
conviction


0.9766556593020775

In [48]:
# Zhang's Metric = 두 아이템 집합 A와 B의 연관 규칙의 흥미도 측정하는 지표 
# 음수이면, A가 발생하면, B의 구매확률이 낮아짐.. (부정적 연관성)
# 0에 가까울수록 연관도가 낮고, 절대값을 취했을때 1에 가까워지면 A가 B에 강한 영향을 미친다고 볼수 있음.
zhangs_metric = (support - antecedent_support * consequents_support) / max(support * (1 - antecedent_support), antecedent_support * (consequents_support - support))
zhangs_metric


-0.13134290943680002

In [49]:
rules[rules.zhangs_metric > 0.001].sort_values(by='zhangs_metric', ascending=False)



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
29,(other vegetables),(frankfurter),0.122101,0.03776,0.005146,0.042146,1.11615,0.000536,1.004579,0.118536
28,(frankfurter),(other vegetables),0.03776,0.122101,0.005146,0.136283,1.11615,0.000536,1.01642,0.108146
83,(yogurt),(sausage),0.085879,0.060349,0.005748,0.066926,1.108986,0.000565,1.007049,0.107508
82,(sausage),(yogurt),0.060349,0.085879,0.005748,0.095238,1.108986,0.000565,1.010345,0.104587
79,(soda),(sausage),0.097106,0.060349,0.005948,0.061253,1.014975,8.8e-05,1.000963,0.016341
78,(sausage),(soda),0.060349,0.097106,0.005948,0.09856,1.014975,8.8e-05,1.001613,0.015702
25,(yogurt),(citrus fruit),0.085879,0.053131,0.004611,0.053696,1.010642,4.9e-05,1.000598,0.01152
24,(citrus fruit),(yogurt),0.053131,0.085879,0.004611,0.086792,1.010642,4.9e-05,1.001001,0.011121


#### 추천시스템 데모

In [51]:
def recommend_products(antecedent, rules_df, metric='confidence', top_n=5):
    """
    주어진 상품(antecedent)에 대해 연관 규칙을 기반으로 추천 상품 리스트를 반환하는 함수.
    
    Parameters:
    antecedent (str): 추천의 기준이 되는 상품.
    rules_df (DataFrame): 연관 규칙이 담긴 데이터프레임.
    metric (str): 정렬 기준 
    top_n (int): 반환할 추천 상품의 최대 개수.
    
    Returns:
    list의 개별 요소 n개: 추천 상품 리스트.
    """
    # 주어진 상품에 대한 연관 규칙 필터링
    filtered_rules = rules_df[rules_df['antecedents'].apply(lambda x: antecedent in x)]
    
    # 신뢰도(confidence)가 높은 순으로 정렬
    sorted_rules = filtered_rules.sort_values(by=metric, ascending=False)
    
    # 상위 N개의 결과에서 추천 상품(consequents) 추출
    recommendations = [""] * top_n
    cal_recommendations = sorted_rules['consequents'].head(top_n).apply(lambda x: list(x)[0]).tolist()
    for i, rec in enumerate(cal_recommendations):
        recommendations[i] = rec
    
    return recommendations[0], recommendations[1], recommendations[2], recommendations[3], recommendations[4]


In [39]:
recommend_products('yogurt', rules)

['whole milk', 'other vegetables', 'rolls/buns', 'soda', 'sausage']

In [40]:
recommend_products('yogurt', rules, metric='lift')



['sausage', 'citrus fruit', 'tropical fruit', 'rolls/buns', 'whole milk']

In [41]:
recommend_products('yogurt', rules, metric='zhangs_metric')

['sausage', 'citrus fruit', 'tropical fruit', 'rolls/buns', 'whole milk']

In [62]:
# 상품리스트
product_list = df.itemDescription.unique().tolist()
len(product_list)

167

In [59]:
# !pip install gradio

In [68]:
#https://www.gradio.app/
import gradio as gr

# Gradio 인터페이스 설정
interface = gr.Interface(
    fn=lambda product_name: recommend_products(product_name, rules),
    inputs=gr.Dropdown(choices=product_list, label="상품 선택"),
    outputs=[gr.Textbox(label="추천상품 TOP" + str(i)) for i in range(1, 6)],
    title="연관규칙 기반 추천시스템",
    description="상품을 선택하면 다른분들이 함께 구매하는 상품 5개를 추천합니다."
)

# ��터페이스 실행, 화면 높이 조정
interface.launch(height=800)




        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangol

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.





The `name` is not the first parameter anymore. The first parameter should be the `Request` instance.
Replace `TemplateResponse(name, {"request": request})` by `TemplateResponse(request, name)`.



In [67]:
# Gradio 서버를 멈추는 코드
interface.close()

Closing server running on port: 7860


#### 리뷰 데이터를 이용한 연관 단어 추출기 만들기

In [103]:
review_df = pd.read_csv('datasets/review/review.csv')

In [104]:
review_df

Unnamed: 0,review_no,accom_no,user_no,accom_nm,content,review_rate1,review_rate2,review_rate3
0,43f61fa6,9b0057ac,b9677a1e,세인트존스 호텔,지은지 얼마안되서 그런지 실내 인테리어도 좋았고 침대시트 등 모든것이 만족하였다. ...,100,100,100
1,694f3882,9b0057ac,956bc8bc,세인트존스 호텔,1. 시티뷰를 결제했는데 오션뷰를 주더라구요 한달 전에 예약한 거여서 몰랐는데 나중...,100,100,100
2,c6756abd,9b0057ac,41376307,세인트존스 호텔,객실도 넓고 좋았어요~~~,100,100,100
3,7b98fef9,9b0057ac,0b4c9081,세인트존스 호텔,일단 체크인 줄 굉장히 깁니다. 입실시간 최소 30분 전에 줄 서야 4시 겨우 넘어...,100,100,40
4,58f50b9e,9b0057ac,2fc2aed9,세인트존스 호텔,늦은휴가를위해 검색도중 인피니티풀이 핫하다는 세인트존스호텔이 인기가많길래 결정했습...,100,100,100
...,...,...,...,...,...,...,...,...
4981,48ae2f9a,9b0057ac,09cfaba6,세인트존스 호텔,층고도 높은데를 받아서 뷰도 너무 좋았습니디.숙소도 깔끔하고 직원들도 너무 친절했습니다.,100,100,100
4982,f7bca7bd,9b0057ac,80924d9a,세인트존스 호텔,룸 컨디션이 좋았고 깨끗하게 되어 있었습니다무료 업그레이드 받길 잘한것 같아요.. ...,100,100,100
4983,93a4c8dc,9b0057ac,c3ce7359,세인트존스 호텔,수영장물이 살면서 가본곳중 제일 더러웠어요.물안경 가져갔는데 물이 너무 뿌옇고 이물...,60,10,50
4984,59c17aa0,9b0057ac,8c3341a9,세인트존스 호텔,어머니가 이모랑 이용하신다 해서 예약해 드렸습니다.강릉역에 KTX로 가서 택시로 이...,100,80,100


In [109]:
import requests
import json

In [190]:
# 엘라스틱서치 토크나이저 함수
def extract_words_from_text(text):
    # Elasticsearch의 endpoint 설정
    url = 'http://localhost:9200/_analyze'
    
    # 요청할 데이터 준비: 사용자 정의 분석기 사용 시 'analyzer' 설정을 변경
    headers = {'Content-Type': 'application/json'}
    payload = {
        "analyzer": "nori",  # 'standard', 'nori' 등의 분석기 지정 가능
        "text": text
    }
    
    # POST 요청 수행
    response = requests.post(url, headers=headers, data=json.dumps(payload))
    
    # 응답 데이터 처리
    if response.status_code == 200:
        tokens = []
        for token in response.json()['tokens']:
            if token['token'].isdigit() :
                continue
            if len(token['token']) < 2 :
                continue
            if isinstance(token['token'], int):
                continue
            tokens.append(token['token'])
        return tokens

    else:
        print("Error:", response.status_code, response.text)
        return []


In [201]:
# 토크나이징
review_df['nouns'] = review_df.content.apply(extract_words_from_text)
review_df.head()

Unnamed: 0,review_no,accom_no,user_no,accom_nm,content,review_rate1,review_rate2,review_rate3,nouns
0,43f61fa6,9b0057ac,b9677a1e,세인트존스 호텔,지은지 얼마안되서 그런지 실내 인테리어도 좋았고 침대시트 등 모든것이 만족하였다. ...,100,100,100,"[얼마, 그렇, 실내, 인테리어, 침대, 시트, 만족, 위치, 산책, 직원, 부분,..."
1,694f3882,9b0057ac,956bc8bc,세인트존스 호텔,1. 시티뷰를 결제했는데 오션뷰를 주더라구요 한달 전에 예약한 거여서 몰랐는데 나중...,100,100,100,"[시티, 결제, 오션, 예약, 모르, 나중, 확인, 그것, 행운, 오션, 아니, 여..."
2,c6756abd,9b0057ac,41376307,세인트존스 호텔,객실도 넓고 좋았어요~~~,100,100,100,[객실]
3,7b98fef9,9b0057ac,0b4c9081,세인트존스 호텔,일단 체크인 줄 굉장히 깁니다. 입실시간 최소 30분 전에 줄 서야 4시 겨우 넘어...,100,100,40,"[체크, 입실, 시간, 최소, 체크인, 체크인, 감상, 위하, 테라스, 나가, 투숙..."
4,58f50b9e,9b0057ac,2fc2aed9,세인트존스 호텔,늦은휴가를위해 검색도중 인피니티풀이 핫하다는 세인트존스호텔이 인기가많길래 결정했습...,100,100,100,"[휴가, 위하, 검색, 도중, 인피니티, 세인트존스, 호텔, 인기, 결정, 사람, ..."


In [203]:
extract_words_from_text("일단 체크인 줄 굉장히 깁니다. 입실시간 최소 30분 전에 줄 서야 4시 겨우 넘어..")

['체크', '입실', '시간', '최소']

In [195]:
nouns_list = []

In [202]:
# 데이터 준비
dataset = review_df.nouns.tolist()
nouns_list = []
for i in dataset:
    nouns_list.extend(i)
nouns_list = list(set(nouns_list))
len(nouns_list)

47388

In [225]:
# 숫자 제거
review_df.nouns = review_df.nouns.apply(lambda x : [noun for noun in x if not noun.isdigit()])
nouns_list = [noun for noun in nouns_list if not noun.isdigit()]


In [229]:
# 모델 준비
dataset = review_df.nouns.tolist()
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
as_df = pd.DataFrame(te_ary, columns=te.columns_)


In [230]:
# 연관 규칙 생성
frequent_itemsets = apriori(as_df, min_support = 0.0045, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="conviction", min_threshold=0.001)

In [231]:
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(너무),(1층),0.284998,0.010028,0.004613,0.016186,1.614046,0.001755,1.006259,0.532081
1,(1층),(너무),0.010028,0.284998,0.004613,0.460000,1.614046,0.001755,1.324078,0.384293
2,(1층에),(너무),0.011432,0.284998,0.004813,0.421053,1.477388,0.001555,1.235003,0.326867
3,(너무),(1층에),0.284998,0.011432,0.004813,0.016890,1.477388,0.001555,1.005551,0.451928
4,(ㅎㅎ),(것),0.039711,0.059567,0.005014,0.126263,2.119682,0.002649,1.076334,0.550075
...,...,...,...,...,...,...,...,...,...,...
7857,"(잘, 너무)","(좀, 체크인)",0.059767,0.020457,0.004813,0.080537,3.936834,0.003591,1.065342,0.793409
7858,(좀),"(체크인, 잘, 너무)",0.110509,0.013037,0.004813,0.043557,3.341170,0.003373,1.031911,0.787758
7859,(체크인),"(좀, 잘, 너무)",0.096069,0.012234,0.004813,0.050104,4.095417,0.003638,1.039868,0.836153
7860,(너무),"(좀, 체크인, 잘)",0.284998,0.007220,0.004813,0.016890,2.339198,0.002756,1.009835,0.800701


In [233]:
def recommend_products(antecedent, rules_df, metric='confidence', top_n=10):
    """
    주어진 상품(antecedent)에 대해 연관 규칙을 기반으로 추천 상품 리스트를 반환하는 함수.
    
    Parameters:
    antecedent (str): 추천의 기준이 되는 상품.
    rules_df (DataFrame): 연관 규칙이 담긴 데이터프레임.
    metric (str): 정렬 기준 
    top_n (int): 반환할 추천 상품의 최대 개수.
    
    Returns:
    list의 개별 요소 n개: 추천 상품 리스트.
    """
    # 주어진 상품에 대한 연관 규칙 필터링
    filtered_rules = rules_df[rules_df['antecedents'].apply(lambda x: antecedent in x)]
    
    # 신뢰도(confidence)가 높은 순으로 정렬
    sorted_rules = filtered_rules.sort_values(by=metric, ascending=False)
    
    # 상위 N개의 결과에서 추천 상품(consequents) 추출
    recommendations = [""] * top_n
    cal_recommendations = sorted_rules['consequents'].head(top_n).apply(lambda x: list(x)[0]).tolist()
    for i, rec in enumerate(cal_recommendations):
        recommendations[i] = rec
    
    return recommendations[0], recommendations[1], recommendations[2], recommendations[3], recommendations[4], recommendations[5], recommendations[6], recommendations[7], recommendations[8], recommendations[9]


In [235]:
# Gradio 인터페이스 설정
interface = gr.Interface(
    fn=lambda product_name: recommend_products(product_name, rules, metric='lift', top_n=10),
    inputs=gr.Dropdown(choices=nouns_list, label="단어 선택"),
    outputs=[gr.Textbox(label="연관 단어 리스트" + str(i)) for i in range(1, 11)],
    title="리뷰 기반 연관 단어 추출",
    description="태깅용으로 활용 가능"
)

# ��터페이스 실행, 화면 높이 조정
interface.launch(height=1200)




        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        



        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangol

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.





The `name` is not the first parameter anymore. The first parameter should be the `Request` instance.
Replace `TemplateResponse(name, {"request": request})` by `TemplateResponse(request, name)`.



In [236]:
interface.close()

Closing server running on port: 7861
