# LDA 모델 맛보기

In [28]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [29]:
import re
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime
from tqdm import tqdm
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
# Enable logging for gensim - optional
import logging
from google.colab import drive
from sklearn.datasets import fetch_20newsgroups
import warnings
warnings.filterwarnings("ignore")
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## scikit learn 텍스트 데이터
- sklearn에서 제공하는 텍스트 데이터를 가지고 LDA 모델을 학습해봅니다
- 추천 데이터는 아니지만 LDA를 파악하기 위해서 간단한 실습을 수행합니다

In [30]:
# 학습 데이터 로드
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
len(documents)

11314

In [31]:
# 문서(document) 예시
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [32]:
# 텍스트 데이터 전처리
news_df = pd.DataFrame({'document' : documents})
news_df['clean_doc'] = news_df['document'].str.replace('[^a-zA-Z]', ' ')
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
news_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist will speak...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet with orange micros grappler syste...


In [33]:
# stopwords 제거를 위한 nltk 라이브러리

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
# 문서 tokenize 이후 stopwords 제거
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
tokoenized_doc = tokenized_doc.apply(lambda x: [word for word in x if word not in stop_words])
tokenized_doc[1]

['yeah',
 'expect',
 'people',
 'read',
 'actually',
 'accept',
 'hard',
 'atheism',
 'need',
 'little',
 'leap',
 'faith',
 'jimmy',
 'your',
 'logic',
 'runs',
 'steam',
 'sorry',
 'pity',
 'sorry',
 'that',
 'have',
 'these',
 'feelings',
 'denial',
 'about',
 'faith',
 'need',
 'well',
 'just',
 'pretend',
 'that',
 'will',
 'happily',
 'ever',
 'after',
 'anyway',
 'maybe',
 'start',
 'newsgroup',
 'atheist',
 'hard',
 'bummin',
 'much',
 'forget',
 'your',
 'flintstone',
 'chewables',
 'bake',
 'timmons']

In [35]:
# gensim에서 지원하는 dictionary
# document를 TF 형태로 변환
id2word = corpora.Dictionary(tokenized_doc)
corpus = [id2word.doc2bow(text) for text in tokenized_doc]
print(corpus[1])

[(0, 1), (2, 1), (20, 1), (60, 2), (66, 1), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 2), (85, 1), (86, 1), (87, 1), (88, 1), (89, 2), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 2), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)]


In [36]:
# 개별 단어는 id의 형태로 변환되어 모델에 사용됨
id2word[52]

'shame'

In [37]:
# 1번 문서의 예시
# id넘버, 단어, 빈도수 형태로 표현

[(id, id2word[id], freq) for id, freq in corpus[1]]

[(0, 'about', 1),
 (2, 'after', 1),
 (20, 'have', 1),
 (60, 'that', 2),
 (66, 'well', 1),
 (72, 'your', 2),
 (73, 'accept', 1),
 (74, 'actually', 1),
 (75, 'anyway', 1),
 (76, 'atheism', 1),
 (77, 'atheist', 1),
 (78, 'bake', 1),
 (79, 'bummin', 1),
 (80, 'chewables', 1),
 (81, 'denial', 1),
 (82, 'ever', 1),
 (83, 'expect', 1),
 (84, 'faith', 2),
 (85, 'feelings', 1),
 (86, 'flintstone', 1),
 (87, 'forget', 1),
 (88, 'happily', 1),
 (89, 'hard', 2),
 (90, 'jimmy', 1),
 (91, 'just', 1),
 (92, 'leap', 1),
 (93, 'little', 1),
 (94, 'logic', 1),
 (95, 'maybe', 1),
 (96, 'much', 1),
 (97, 'need', 2),
 (98, 'newsgroup', 1),
 (99, 'people', 1),
 (100, 'pity', 1),
 (101, 'pretend', 1),
 (102, 'read', 1),
 (103, 'runs', 1),
 (104, 'sorry', 2),
 (105, 'start', 1),
 (106, 'steam', 1),
 (107, 'these', 1),
 (108, 'timmons', 1),
 (109, 'will', 1),
 (110, 'yeah', 1)]

## Gensim LDA 학습
- gensim에서 제공하는 lda 라이브러리
- 모델에 학습하기 위해 필요한 하이퍼 파라미터는 [API 문서](https://radimrehurek.com/gensim/models/ldamodel.html)에서 확인합니다
- multicore를 지원하여 빠르게 학습하는 LdaMulticore 모델을 사용할 수도 있습니다

In [38]:
# lda 모델 학습

topic_num = 20 # 20개의 토픽
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics = topic_num,
                                           id2word=id2word,
                                           passes=10,
                                           per_word_topics=True)

In [39]:
# 특정 topic의 top 5 단어

topic = 0
topn = 5
for id, prob in ldamodel.get_topic_terms(topic, topn=topn):
    print(ldamodel.id2word[id], prob)

from 0.014931039
available 0.013880068
software 0.009088892
version 0.007804608
information 0.0073042917


In [40]:
# topic별 top 10 단어 출력

topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.015*"from" + 0.014*"available" + 0.009*"software" + 0.008*"version" + 0.007*"information"')
(1, '0.017*"play" + 0.015*"game" + 0.014*"period" + 0.014*"team" + 0.010*"season"')
(2, '0.019*"game" + 0.016*"year" + 0.016*"games" + 0.015*"team" + 0.009*"last"')
(3, '0.046*"that" + 0.018*"this" + 0.013*"have" + 0.009*"what" + 0.009*"with"')
(4, '0.031*"jesus" + 0.014*"church" + 0.012*"christ" + 0.010*"from" + 0.010*"bible"')
(5, '0.017*"this" + 0.014*"will" + 0.013*"that" + 0.010*"with" + 0.008*"encryption"')
(6, '0.006*"john" + 0.006*"lock" + 0.005*"financial" + 0.004*"intake" + 0.004*"bike"')
(7, '0.023*"were" + 0.020*"they" + 0.012*"from" + 0.010*"that" + 0.009*"their"')
(8, '0.019*"file" + 0.014*"this" + 0.014*"window" + 0.013*"output" + 0.013*"program"')
(9, '0.010*"health" + 0.008*"with" + 0.008*"medical" + 0.006*"than" + 0.006*"study"')
(10, '0.023*"with" + 0.017*"have" + 0.014*"this" + 0.010*"that" + 0.008*"drive"')
(11, '0.044*"that" + 0.020*"have" + 0.018*"they" + 0.018*"thi

In [41]:
# 특정 문서의 토픽별 분포
bow = corpus[0]
ldamodel.get_document_topics(bow)

[(3, 0.6052696), (7, 0.38426527)]

In [42]:
# 특정 문서의 토픽별 분포
ldamodel.get_document_topics(bow, minimum_probability=0)

[(0, 0.00058139535),
 (1, 0.00058139535),
 (2, 0.00058139535),
 (3, 0.6052813),
 (4, 0.00058139535),
 (5, 0.00058139535),
 (6, 0.00058139535),
 (7, 0.38425356),
 (8, 0.00058139535),
 (9, 0.00058139535),
 (10, 0.00058139535),
 (11, 0.00058139535),
 (12, 0.00058139535),
 (13, 0.00058139535),
 (14, 0.00058139535),
 (15, 0.00058139535),
 (16, 0.00058139535),
 (17, 0.00058139535),
 (18, 0.00058139535),
 (19, 0.00058139535)]

In [43]:
# 모든 문서에 대해서 토픽을 추출해봅시다

result = []

for i, topic_list in enumerate(ldamodel[corpus]):
    topics = topic_list[0]
    topics = sorted(topics, key=lambda x: (x[1]), reverse=True)
    if len(topics) > 0:
        topic_num, prop_topic = topics[0][0], topics[0][1]
        result.append([i, int(topic_num), round(prop_topic,4)])

topic_table = pd.DataFrame(result, columns = ['doc_id', 'top_topic', 'prop'])

In [44]:
topic_table

Unnamed: 0,doc_id,top_topic,prop
0,0,3,0.6052
1,1,3,0.6066
2,2,3,0.4994
3,3,11,0.3748
4,4,11,0.4761
...,...,...,...
11309,11309,3,0.3154
11310,11310,0,0.0500
11311,11311,11,0.5544
11312,11312,10,0.7446


In [45]:
# Perplexity 계산, Perplexity가 낮을수록 좋은 모델을 의미함
print('Perplexity: ', ldamodel.log_perplexity(corpus))

Perplexity:  -8.337468985506481


In [46]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, id2word)
vis

# 커머스 데이터로 LDA 추천 모델 만들기
- 사용자 = 문서, 단어 = 구매한 아이템.
- 각 사용자는 n개 토픽의 분포로 표현되고, 각 토픽은 아이템의 분포로 표현됩니다.
- 사용자가 특정 토픽에 대한 비율이 높다면, 그 사용자에게 해당 토픽의 비율이 높은 아이템을 추천해줍니다.

In [47]:
file_path = '/content/drive/MyDrive/recomm_study/recomm_code/Recommend_learningspoons/data/online_retail.csv'
parse_date = lambda x : datetime.strptime(x, '%Y-%m-%d %H:%M')
retail_df = pd.read_csv(file_path, encoding="utf-8", parse_dates = ['InvoiceDate'], date_parser = parse_date)
retail_df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [48]:
# 각 문서(row)의 StockCode : Description 형태의 딕셔너리 생성
stock_to_desc = {row['StockCode']: row['Description'] for _, row in retail_df.iterrows()}

In [49]:
# CustomerID 없는 데이터 및 Quantity 음수인 데이터 제외
# 데이터 전처리
cleaned_retail_df = retail_df[(retail_df['CustomerID'].notnull()) & (retail_df['Quantity'] > 0)]
cleaned_retail_df['CustomerID'] = cleaned_retail_df['CustomerID'].astype(int)
cleaned_retail_df['StockCode'] = cleaned_retail_df['StockCode'].astype(str)
cleaned_retail_df = cleaned_retail_df[['InvoiceNo', 'StockCode', 'Quantity', 'CustomerID', 'InvoiceDate']]
cleaned_retail_df['ym'] = cleaned_retail_df['InvoiceDate'].apply(lambda x: str(x)[:7])

In [50]:
# train_test_split : 2011-11까지의 데이터 train  /   이후의 데이터는 test
train = cleaned_retail_df[(cleaned_retail_df['ym'] >= '2011-09') & (cleaned_retail_df['ym'] <= '2011-11')]
test = cleaned_retail_df[cleaned_retail_df['ym'] == '2011-12']

In [51]:
# user별로 groupby해서 문서의 형태로 생성
# user별 구매한 item이 doc_list에 들어감
train_groupby = train.groupby(['CustomerID'])

doc_list = []
for user_id, user_df in train_groupby:
  doc = user_df['StockCode'].values.tolist()
  doc_list.append(doc)

print(doc_list[100])

['22993', '16045', '22398', '22608', '22610', '21558', '16008', '23379', '23392', '23559', '90087', '23388', '23389', '21559', '22352', '35964', '22579', '21015', '21014', '22336', '23483', '23540', '23542', 'POST', '22326', '21580', '22327', 'POST']


In [52]:
# gensim에서 지원하는 dictionary
id2word = corpora.Dictionary(doc_list)

# 문서를 TF의 형태로 변환
corpus = [id2word.doc2bow(doc) for doc in doc_list]

print(corpus[100])

[(49, 2), (67, 1), (139, 1), (148, 1), (329, 1), (342, 1), (343, 1), (354, 1), (433, 1), (434, 1), (501, 1), (741, 1), (757, 1), (774, 1), (800, 1), (1035, 1), (1081, 1), (1192, 1), (1213, 1), (1403, 1), (1404, 1), (1405, 1), (1406, 1), (1407, 1), (1408, 1), (1409, 1), (1410, 1)]


In [53]:
topic_num = 20 #20개의 토픽, k=20
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics = topic_num,
                                           id2word=id2word,
                                           passes=10)

In [54]:
# 20개의 토픽별 Top N 아이템 분포 확인

topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.022*"21669" + 0.017*"23032" + 0.017*"21668" + 0.016*"21670" + 0.014*"21672"')
(1, '0.044*"85099B" + 0.043*"23203" + 0.029*"23202" + 0.028*"23344" + 0.025*"23209"')
(2, '0.047*"21181" + 0.045*"21175" + 0.043*"21166" + 0.040*"85152" + 0.028*"85150"')
(3, '0.017*"POST" + 0.010*"22551" + 0.008*"22554" + 0.008*"23368" + 0.008*"21212"')
(4, '0.017*"22578" + 0.017*"22577" + 0.015*"22086" + 0.011*"22579" + 0.009*"22909"')
(5, '0.019*"22086" + 0.017*"22910" + 0.014*"23084" + 0.014*"21790" + 0.011*"23349"')
(6, '0.020*"21034" + 0.013*"23084" + 0.010*"84879" + 0.010*"23333" + 0.009*"23323"')
(7, '0.009*"23263" + 0.009*"23266" + 0.008*"23265" + 0.007*"23264" + 0.006*"21034"')
(8, '0.027*"23284" + 0.021*"22193" + 0.019*"21137" + 0.018*"22191" + 0.017*"23535"')
(9, '0.019*"20724" + 0.018*"20727" + 0.017*"22382" + 0.016*"22355" + 0.014*"22356"')
(10, '0.021*"82494L" + 0.019*"22470" + 0.017*"22469" + 0.017*"85123A" + 0.016*"23322"')
(11, '0.023*"23293" + 0.019*"23295" + 0.018*"23294" + 0.018*"2

In [55]:
# 20개의 토픽별 Top-5 아이템 분포 확인

for i in range(20):
    recommend = ldamodel.show_topic(topicid=i, topn=5)
    print(i, [stock_to_desc[item] for item, score in recommend])

0 ['BLUE STRIPE CERAMIC DRAWER KNOB', 'DRAWER KNOB CRACKLE GLAZE IVORY', 'RED STRIPE CERAMIC DRAWER KNOB', 'BLUE SPOT CERAMIC DRAWER KNOB', 'WHITE SPOT RED CERAMIC DRAWER KNOB']
1 ['JUMBO BAG RED RETROSPOT', 'JUMBO BAG VINTAGE DOILY ', 'JUMBO BAG VINTAGE LEAF', "JUMBO BAG 50'S CHRISTMAS ", 'LUNCH BAG VINTAGE DOILY ']
2 ['PLEASE ONE PERSON METAL SIGN', 'GIN AND TONIC DIET METAL SIGN', 'COOK WITH WINE METAL SIGN ', 'HAND OVER THE CHOCOLATE   SIGN ', 'LADIES & GENTLEMEN METAL SIGN']
3 ['POSTAGE', 'PLASTERS IN TIN SPACEBOY', 'PLASTERS IN TIN WOODLAND ANIMALS', 'SET 12 COLOUR PENCILS DOLLY GIRL ', 'PACK OF 72 RETROSPOT CAKE CASES']
4 ['WOODEN STAR CHRISTMAS SCANDINAVIAN', 'WOODEN HEART CHRISTMAS SCANDINAVIAN', "PAPER CHAIN KIT 50'S CHRISTMAS ", 'WOODEN TREE CHRISTMAS SCANDINAVIAN', 'SET OF 20 VINTAGE CHRISTMAS NAPKINS']
5 ["PAPER CHAIN KIT 50'S CHRISTMAS ", 'PAPER CHAIN KIT VINTAGE CHRISTMAS', 'RABBIT NIGHT LIGHT', 'VINTAGE SNAP CARDS', 'ROLL WRAP VINTAGE CHRISTMAS']
6 ['REX CASH+CARRY JUMB

In [56]:
# 유저의 토픽 분포를 구해봅시다

user_topic_dict = {}

for user_id, user_df in train_groupby:
    document = user_df['StockCode'].values.tolist()
    user_topic_dict[user_id] = ldamodel.get_document_topics(id2word.doc2bow(document), minimum_probability=0.0)

In [57]:
user_id = 12682
user_topic_dict[user_id]

[(0, 0.00035460995),
 (1, 0.31897137),
 (2, 0.00035460995),
 (3, 0.5177698),
 (4, 0.00035460995),
 (5, 0.00035460995),
 (6, 0.00035460995),
 (7, 0.00035460995),
 (8, 0.00035460995),
 (9, 0.00035460995),
 (10, 0.00035460995),
 (11, 0.098363824),
 (12, 0.012935239),
 (13, 0.00035460995),
 (14, 0.00035460995),
 (15, 0.00035460995),
 (16, 0.026720054),
 (17, 0.00035460995),
 (18, 0.020275105),
 (19, 0.00035460995)]

## LDA로 유저에게 추천

### 유저 토픽 가운데 확률이 높은 topic을 이용한 추천

In [59]:
# user의 토픽 분포를 확인하고 가장 확률 높은 토픽의 Top-N item을 추천
topn=20
user_id = 12682
user_topics = user_topic_dict[user_id]
user_topics = sorted(user_topics, key=lambda x: (x[1]), reverse=True)
user_topic = user_topics[0][0]
recomm = ldamodel.show_topic(topicid=user_topic, topn=topn)
recomm = [item for item, score in recomm]
recomm

['POST',
 '22551',
 '22554',
 '23368',
 '21212',
 '21731',
 '23084',
 '23367',
 '22326',
 '21080',
 '22367',
 '22138',
 '21213',
 '22629',
 '22556',
 '22630',
 '23389',
 '23480',
 '22144',
 '22417']

In [60]:
user_topic

3

In [61]:
# 실제 12682번 user가 test 데이터에서 구매한 item_list
relevant = test[test['CustomerID'] == user_id]['StockCode'].unique()
list(relevant)

['20750',
 '21931',
 '85099B',
 '22423',
 '21242',
 '21243',
 '21239',
 '21240',
 '23040',
 '22596',
 '22456',
 '48185',
 '21770',
 '21977',
 '21212',
 '84375',
 '23163',
 '84378',
 '23020',
 '22966',
 '23084',
 '22556',
 '22551',
 '22555',
 '47566',
 '23192',
 '22139',
 '22138',
 '22467',
 'POST']

In [62]:
def get_precision(relevant, recommend):
    
    _intersection = set(recommend).intersection(set(relevant))
    return len(_intersection) / len(recommend)

def get_recall(relevant, recommend):
    
    _intersection = set(recommend).intersection(set(relevant))
    return len(_intersection) / len(relevant)

In [63]:
# precision
print(get_precision(relevant, recomm))

# recall
print(get_recall(relevant, recomm))

0.3
0.2


### Top N 추천 성능 평가

In [64]:
# user별로 추천하여 전체 성능 측정
train_user_ids = train['CustomerID'].unique()
test_user_ids = test['CustomerID'].unique()
topn=20

default_recomm = list(train.groupby('StockCode')['Quantity'].count().sort_values(ascending=False)[:topn].index)   # user_id가 test 데이터에만 있는 경우 --> 베스트셀러(가장 많이 구매한) item 추천
precisions = []
recalls = []

for user_id in test_user_ids:

  if user_id in train_user_ids:
    user_topics = user_topic_dict[user_id]
    user_topics = sorted(user_topics, key=lambda x: (x[1]), reverse=True)
    user_topic = user_topics[0][0]
    recomm = [item for item, score in ldamodel.show_topic(topicid=user_topic, topn=topn)]
  
  else:
    recomm = default_recomm
  
  relevant = test[test['CustomerID'] == user_id]['StockCode'].unique()
  precisions.append(get_precision(relevant, recomm))
  recalls.append(get_recall(relevant, recomm))

print('precision@k:', np.mean(np.asarray(precisions)))
print('recall@k:', np.mean(np.asarray(recalls)))

precision@k: 0.10227642276422765
recall@k: 0.12022564979526072


In [65]:
len(train_user_ids)

2852