# LSA(잠재의미 분석)

- DTM이나 TF-IDF 행렬에 Truncated SVD를 사용하여 차원을 축소시키고, 단어들의 잠재 의미를 끌어낸다

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
print(f'샘플의 수 : {len(documents)}')

샘플의 수 : 11314


In [3]:
print(dataset.target_names) # 뉴스그룹 데이터의 카테고리(20개)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


### 텍스트 전처리

In [4]:
df = pd.DataFrame({'documents' : documents})
df['cleaned_doc'] = df['documents'].str.replace('[^a-zA-z]', ' ') # 정규식으로 특수문자 제거
df['cleaned_doc'] = df['cleaned_doc'].apply(lambda x : ' '.join([w for w in x.split() if len(x)>3])) # 3글자 넘는 텍스트만
df['cleaned_doc'] = df['cleaned_doc'].apply(lambda x : x.lower()) # 소문자로 변환

In [5]:
# 불용어
stop_words = stopwords.words('english')
tokenized_doc = df['cleaned_doc'].apply(lambda x : x.split())
tokenized_doc = tokenized_doc.apply(lambda x : [item for item in x if item not in stop_words])
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'faq', 'etc', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'jim', 'sorry', 'pity', 'jim', 'sorry', 'feelings', 'denial', 'faith', 'need', 'get', 'oh', 'well', 'pretend', 'end', 'happily', 'ever', 'anyway', 'maybe', 'start', 'new', 'newsgroup', 'alt', 'atheist', 'hard', 'bummin', 'much', 'bye', 'bye', 'big', 'jim', 'forget', 'flintstone', 'chewables', 'bake', 'timmons', 'iii']


In [6]:
# TF-IDF 행렬
detokenized_doc = [] # 역토큰화
for i in range(len(df)) :
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)
    
df['clean_doc'] = detokenized_doc

In [7]:
# 상위 1000개 단어 보존
vectorizer = TfidfVectorizer(stop_words='english', max_features = 1000,
                            max_df=0.5, smooth_idf=True)

X = vectorizer.fit_transform(df['clean_doc'])
print(f'TF-IDF 행렬 크기 : {X.shape}')

TF-IDF 행렬 크기 : (11314, 1000)


### 토픽 모델링

In [8]:
svd = TruncatedSVD(n_components=30, algorithm='randomized', n_iter=100,
                  random_state=0)
svd.fit(X)
len(svd.components_)

30

In [9]:
np.shape(svd.components_)

(30, 1000)

In [11]:
terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5) :
    for idx, topic in enumerate(components) :
        print('Topic %d:' %(idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1 : -1]])
get_topics(svd.components_, terms)

Topic 1: [('like', 0.19259), ('people', 0.18158), ('know', 0.18046), ('think', 0.16337), ('good', 0.13876)]
Topic 2: [('windows', 0.24861), ('thanks', 0.24842), ('card', 0.14924), ('drive', 0.14144), ('file', 0.12795)]
Topic 3: [('god', 0.55292), ('jesus', 0.19138), ('windows', 0.14159), ('thanks', 0.12546), ('bible', 0.1156)]
Topic 4: [('key', 0.26756), ('use', 0.19789), ('government', 0.19529), ('people', 0.18715), ('chip', 0.1511)]
Topic 5: [('drive', 0.42574), ('scsi', 0.17444), ('mb', 0.16456), ('card', 0.15601), ('disk', 0.13606)]
Topic 6: [('windows', 0.37433), ('file', 0.21786), ('window', 0.16185), ('files', 0.15976), ('game', 0.15579)]
Topic 7: [('edu', 0.41718), ('god', 0.21425), ('com', 0.14189), ('team', 0.1317), ('key', 0.12201)]
Topic 8: [('key', 0.39663), ('chip', 0.23618), ('god', 0.19907), ('thanks', 0.17538), ('clipper', 0.15697)]
Topic 9: [('car', 0.43925), ('like', 0.31015), ('good', 0.18644), ('bike', 0.15935), ('god', 0.14572)]
Topic 10: [('edu', 0.40728), ('like

# LDA(잠재 디리클레 할당)

- 문서 집합에서 존재하는 토픽을 찾는 알고리즘
- 단어가 특정 토픽에 존재할 확률, 문서에 특정 토픽이 존재할 확률을 결합확률로 추정하여 토픽을 추출

In [12]:
tokenized_doc[:5]

0    [well, sure, story, nad, seem, biased, disagre...
1    [yeah, expect, people, read, faq, etc, actuall...
2    [although, realize, principle, one, strongest,...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: cleaned_doc, dtype: object

In [13]:
from gensim import corpora
dic = corpora.Dictionary(tokenized_doc)
corpus = [dic.doc2bow(text) for text in tokenized_doc]
print(corpus[1])

[(59, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 3), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 2), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 2), (104, 1), (105, 1), (106, 1), (107, 1)]


In [14]:
import gensim
num_topics = 20
lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dic, passes=15)
topics = lda.print_topics(num_words=4)
for topic in topics :
    print(topic)

# --> 20개의 토픽, * 해당 토픽에 대한 각각의 기여도

(0, '0.012*"runs" + 0.011*"mon" + 0.011*"ball" + 0.010*"hit"')
(1, '0.587*"ax" + 0.057*"q" + 0.046*"f" + 0.043*"max"')
(2, '0.012*"georgia" + 0.006*"corn" + 0.005*"hci" + 0.005*"mom"')
(3, '0.015*"armenian" + 0.013*"armenians" + 0.012*"said" + 0.011*"turkish"')
(4, '0.064*"`" + 0.056*"w" + 0.035*"b" + 0.032*"u"')
(5, '0.178*"x" + 0.020*"file" + 0.014*"window" + 0.012*"output"')
(6, '0.018*"game" + 0.018*"team" + 0.013*"play" + 0.011*"year"')
(7, '0.024*"drive" + 0.021*"card" + 0.018*"db" + 0.018*"mb"')
(8, '0.013*"would" + 0.012*"one" + 0.010*"like" + 0.010*"get"')
(9, '0.018*"edu" + 0.010*"com" + 0.010*"available" + 0.007*"software"')
(10, '0.016*"ms" + 0.015*"myers" + 0.010*"de" + 0.008*"nist"')
(11, '0.011*"people" + 0.009*"would" + 0.009*"one" + 0.007*"god"')
(12, '0.012*"picture" + 0.012*"smokeless" + 0.010*"sleeve" + 0.010*"hanging"')
(13, '0.015*"new" + 0.014*"games" + 0.013*"san" + 0.011*"st"')
(14, '0.016*"space" + 0.008*"president" + 0.006*"nasa" + 0.006*"research"')
(15, '0.

### 시각화

In [15]:
import pyLDAvis.gensim_models # pip install pyLDAvis
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dic)
pyLDAvis.display(vis)

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))


In [16]:
# 문서별 토픽 분포
for i, topic_list in enumerate(lda[corpus]) :
    if i == 10 :
        break
    print(i+1, '번째 문서의 topic 비율 :', topic_list)

1 번째 문서의 topic 비율 : [(3, 0.12794237), (4, 0.052900616), (11, 0.77652854), (16, 0.031341556)]
2 번째 문서의 topic 비율 : [(8, 0.226915), (9, 0.11468083), (11, 0.6206118), (14, 0.022379648)]
3 번째 문서의 topic 비율 : [(8, 0.07900906), (10, 0.09321193), (11, 0.6465423), (16, 0.16891348)]
4 번째 문서의 topic 비율 : [(7, 0.030349782), (8, 0.40467688), (11, 0.20717981), (18, 0.3052814), (19, 0.036518432)]
5 번째 문서의 topic 비율 : [(5, 0.08343804), (8, 0.31293124), (13, 0.57933366)]
6 번째 문서의 topic 비율 : [(8, 0.34773692), (11, 0.5697628), (12, 0.045482088)]
7 번째 문서의 topic 비율 : [(4, 0.7523156), (7, 0.037792824), (8, 0.15827344), (9, 0.029461812)]
8 번째 문서의 topic 비율 : [(8, 0.42215464), (10, 0.06345883), (11, 0.3676844), (16, 0.119657554), (17, 0.016314458)]
9 번째 문서의 topic 비율 : [(8, 0.64378864), (11, 0.14148825), (17, 0.1944189)]
10 번째 문서의 topic 비율 : [(8, 0.91326475), (16, 0.07518156)]


  and should_run_async(code)


In [17]:
# dataframe으로 생성
def make_topictable_per_doc(lda, corpus) :
    topic_table = pd.DataFrame()
    
    for i, topic_list in enumerate(lda[corpus]) :
        doc = topic_list[0] if lda.per_word_topics else topic_list
        doc = sorted(doc, key=lambda x : (x[1]), reverse=True) # 비중 높은 순으로 정렬
        
        for j, (topic_num, prop_topic) in enumerate(doc) :
            if j == 0 :
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)

            else :
                break
                
    return topic_table

  and should_run_async(code)


In [18]:
df = make_topictable_per_doc(lda, corpus)
df.reset_index(inplace=True)
df.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
df[:10]

  and should_run_async(code)


Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,11.0,0.7766,"[(3, 0.12791376), (4, 0.052900184), (11, 0.776..."
1,1,11.0,0.6205,"[(8, 0.2269899), (9, 0.11467805), (11, 0.62053..."
2,2,11.0,0.6461,"[(8, 0.07945902), (10, 0.09321128), (11, 0.646..."
3,3,8.0,0.4046,"[(7, 0.030333536), (8, 0.40456775), (11, 0.207..."
4,4,13.0,0.5794,"[(5, 0.083447896), (8, 0.312897), (13, 0.57935..."
5,5,11.0,0.5698,"[(8, 0.34770256), (11, 0.5697972), (12, 0.0454..."
6,6,4.0,0.7523,"[(4, 0.75231475), (7, 0.037793044), (8, 0.1582..."
7,7,8.0,0.4221,"[(8, 0.42214468), (10, 0.06345887), (11, 0.367..."
8,8,8.0,0.6438,"[(8, 0.6437841), (11, 0.1414929), (17, 0.19441..."
9,9,8.0,0.9133,"[(8, 0.9132625), (16, 0.07518383)]"


---
# 1대1 문의 데이터에 적용해보기

In [19]:
import pymysql
import pandas as pd
conn = pymysql.connect(
                user='my_srv',  # 유저 이름
                passwd='wkrldi@duqhdi12',  # 패스워드
                host='125.141.223.156',  # 호스트
                db='m_yeoboya',  # 데이터베이스
                charset='utf8',  # 인코딩
                port=13306  # 포트 번호(''없이 사용)
            )
cursor = conn.cursor(pymysql.cursors.DictCursor)
sql = 'SELECT * FROM m_yeoboya_ai_dv.adm_inqry'
cursor.execute(sql)
result = cursor.fetchall()
cursor.close()
data = pd.DataFrame(result)
data.head(3)

  and should_run_async(code)


Unnamed: 0,auto_no,mem_no,mem_agent,inqry_slct,inqry_media,inqry_conts,atchd_file_nm,mem_hphone,chrgr_name,ans_conts,...,read_yn,etc_yn,ins_date,ans_date,del_date,gthr_yn,gift_item_code,gift_item_cnt,gift_yn,memo_cnt
0,1,7,Mozilla/5.0 (Linux; Android 4.4.2; IM-A910S Bu...,1,s,앙돼요,7_20150722163127.jpg|576|1024,,양대기,앙돼면 안하면 됨,...,y,n,2015-07-22 16:31:31,2015-07-22 16:32:55,0000-00-00 00:00:00,n,,0,n,0
1,2,7,Mozilla/5.0 (Linux; Android 4.4.2; IM-A910S Bu...,2,s,앙돼요2,,,양대기,앙돼요 답변2,...,y,n,2015-07-22 16:36:03,2015-07-22 16:36:19,0000-00-00 00:00:00,n,,0,n,0
2,23,182,Mozilla/5.0 (Linux; Android 5.0.1; SM-N916L Bu...,2,s,메시지함이 갑자기 텅비워 졌네요~~\n메시지함에 아무것도 나타 나지 않아요~~,,,이윤경,안녕하세요. [여보야] 관리자입니다. \r\n\r\n항상 저희 [여보야]를 이용해 ...,...,y,n,2015-08-26 21:21:38,2015-08-27 13:03:55,0000-00-00 00:00:00,n,,0,n,0


In [20]:
data.drop_duplicates(subset='inqry_conts', inplace=True)
data.dropna(subset=['ans_conts'], inplace=True)
data = data[data['ans_conts'].str.contains('여보야')]
data = data[['ans_conts','inqry_conts']]

  and should_run_async(code)


In [21]:
import re
data['inqry_conts'] =  data['inqry_conts'].apply(lambda x : re.sub('[^ A-Za-z0-9가-힣]', '', x))
data['ans_conts'] =  data['ans_conts'].apply(lambda x : re.sub('[^ A-Za-z0-9가-힣]', '', x))
data = data[data['inqry_conts']!='']

print(len(data))

  and should_run_async(code)


81647


In [22]:
# !pip install pandas==1.2.5
# !pip install tqdm==4.62.2

  and should_run_async(code)


In [23]:
# tokenizing
from tqdm import tqdm
tqdm.pandas()
data['inqry_tokens'] = data['inqry_conts'].progress_apply(nltk.word_tokenize)
data['ans_tokens'] = data['ans_conts'].progress_apply(nltk.word_tokenize)

  and should_run_async(code)
100%|██████████| 81647/81647 [00:06<00:00, 11792.79it/s]
100%|██████████| 81647/81647 [00:21<00:00, 3752.23it/s]


In [24]:
# 한국어 불용어 사전 정의
f = open('./data/한국어불용어100.txt', encoding='utf8') # https://bab2min.tistory.com/544
# print(f.readlines())
stopwords = f.readlines()
f.close()

  and should_run_async(code)


In [25]:
import re
stopwords = [re.sub('[^ 가-힣]', '', text) for text in stopwords ]
temp = ['도', '는', '다', '의', '가', '이', '은', '한', '에', '하', '고', '을', '를', '인', '듯', '과', '와', '네', '들', '듯', '지', '임', '게',
            '로', '요','안', '으로', '데', '하는', '하고', '인데', '야', '이고', '에게', '것', '거', '좀', '다', '왜', '더', '여기', '아마', 
            '합니다', '입니다','습니다', '너무','제가','듯', '전', '안', '때', '가요','이거','이건','무슨','있는데','하는데','는데','해서','라서','해야','너무']
for t in temp :
    if t not in stopwords :
        stopwords.append(t)
        
print(len(stopwords))

144


  and should_run_async(code)


In [26]:
# 전처리
data['inqry_tokens'] = data['inqry_tokens'].progress_apply(lambda x : [item for item in x if item not in stopwords and len(item) > 1 ])
data['ans_tokens'] = data['ans_tokens'].progress_apply(lambda x : [item for item in x if item not in stopwords and len(item) > 1])

  and should_run_async(code)
100%|██████████| 81647/81647 [00:01<00:00, 61552.91it/s]
100%|██████████| 81647/81647 [00:07<00:00, 10421.21it/s]


In [28]:
data.reset_index(drop=True, inplace=True)
data.head(3)

  and should_run_async(code)


Unnamed: 0,ans_conts,inqry_conts,inqry_tokens,ans_tokens
0,안녕하세요 여보야 관리자입니다 항상 저희 여보야를 이용해 주셔서 감사합니다먼저 불편...,메시지함이 갑자기 텅비워 졌네요메시지함에 아무것도 나타 나지 않아요,"[메시지함이, 갑자기, 텅비워, 졌네요메시지함에, 아무것도, 나타, 나지, 않아요]","[안녕하세요, 여보야, 관리자입니다, 항상, 저희, 여보야를, 이용해, 주셔서, 감..."
1,안녕하세요 여보야 관리자입니다 항상 저희 여보야를 이용해 주셔서 감사합니다차단회원이...,차단회원,[차단회원],"[안녕하세요, 여보야, 관리자입니다, 항상, 저희, 여보야를, 이용해, 주셔서, 감..."
2,안녕하세요 여보야 관리자입니다 항상 저희 여보야를 이용해 주셔서 감사합니다회원탈퇴를...,탈퇴능 어떻게하나요,"[탈퇴능, 어떻게하나요]","[안녕하세요, 여보야, 관리자입니다, 항상, 저희, 여보야를, 이용해, 주셔서, 감..."


In [None]:
from gensim import corpora
inqry_token = data['inqry_tokens']
dic = corpora.Dictionary(inqry_token)
corpus = [dic.doc2bow(text) for text in inqry_token]
print(corpus[1])

In [None]:
inqry_token[:5]

In [None]:
import gensim
num_topics = 10
lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dic, passes=15)
topics = lda.print_topics(num_words=7)
for topic in topics :
    print(topic)

In [None]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dic)
pyLDAvis.display(vis)

In [None]:
# 문서별 토픽 분포
for i, topic_list in enumerate(lda[corpus]) :
    if i == 10 :
        break
    print(i+1, '번째 문서의 topic 비율 :', topic_list)

In [None]:
# dataframe으로 생성
def make_topictable_per_doc(lda, corpus) :
    topic_table = pd.DataFrame()
    
    for i, topic_list in enumerate(lda[corpus]) :
        doc = topic_list[0] if lda.per_word_topics else topic_list
        doc = sorted(doc, key=lambda x : (x[1]), reverse=True) # 비중 높은 순으로 정렬
        
        for j, (topic_num, prop_topic) in enumerate(doc) :
            if j == 0 :
                topic_table = topic_table.append(pd.Series([int(topic_num), round(prop_topic,4), topic_list]), ignore_index=True)

            else :
                break
                
    return topic_table

In [None]:
df = make_topictable_per_doc(lda, corpus)
df.reset_index(inplace=True)
df.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
df[:10]

### sklearn으로 LDA 구현해보기

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

count_vector = CountVectorizer(max_df=0.95, max_features=1000,
                              min_df=2,
                              ngram_range=(1, 2))
trans_vector = count_vector.fit_transform(data['inqry_conts'])

lda_model = LatentDirichletAllocation(n_components=10, random_state=0)
lda_model.fit(trans_vector)

print(lda_model.components_.shape)
print(lda_model.components_)

In [None]:
def display_topic_words(model, feature_names, num_top_words) :
    for topic_idx, topic in enumerate(model.components_) :
        print(f'\n Topic #', topic_idx + 1)
        
        topic_word_idx = topic.argsort()[::-1]
        top_idx = topic_word_idx[:num_top_words]

        feature_concat = '+'.join([str(feature_names[i]) +'*'+str(round(topic[i],1)) for i in top_idx])
        print(feature_concat)

feature_names = count_vector.get_feature_names()
display_topic_words(lda_model, feature_names, 15)
    