## 텍스트 데이터 전처리

In [None]:
# Text preprocessing: clean, lower-case, tokenize, drop stop words, lemmatize
# and finally drop very short tokens. The result is `tokenized_doc`, one list
# of tokens per document.

# Remove special characters, punctuation and digits: every character that is
# not an ASCII letter becomes a space. regex=True must be explicit because
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the text untouched.
con_text['요약'] = con_text['요약'].str.replace("[^a-zA-Z]", " ", regex=True)
# Lower-case every cell (assumes every cell of con_text is a str - TODO confirm).
text = con_text.applymap(str.lower)
# NOTE(review): the cleaning above is applied to the '요약' column while all
# steps below read 'headline_text' - confirm both columns are intended.
# Tokenize each document into a list of words.
text['headline_text'] = text['headline_text'].apply(nltk.word_tokenize)
# Remove English stop words; a set gives O(1) membership tests per token.
stop_words = stopwords.words('english')
stop_word_set = set(stop_words)
text['headline_text'] = text['headline_text'].apply(
    lambda tokens: [word for word in tokens if word not in stop_word_set]
)
# Remove words that repeat across topics or are irrelevant to the subject
# (stop_words2 is defined elsewhere).
text['headline_text'] = text['headline_text'].apply(
    lambda tokens: [word for word in tokens if word not in stop_words2]
)
# Lemmatize every token as a verb, reusing a single lemmatizer instance
# instead of constructing one per word.
lemmatizer = WordNetLemmatizer()
text['headline_text'] = text['headline_text'].apply(
    lambda tokens: [lemmatizer.lemmatize(word, pos='v') for word in tokens]
)
# Drop tokens of length 2 or less.
tokenized_doc = text['headline_text'].apply(
    lambda tokens: [word for word in tokens if len(word) > 2]
)






## Perplexity, coherence

In [None]:
import gensim
import collections
import matplotlib.pyplot as plt
from gensim import corpora
from gensim.models import CoherenceModel

# Build the token <-> id dictionary and convert every tokenized document into
# its bag-of-words representation.
dictionary = corpora.Dictionary(tokenized_doc)
print(dictionary.token2id)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_doc]

# Train ONE LDA model per candidate topic count and score it with both
# metrics. Training a separate model for each metric doubles the runtime and,
# because LDA initialisation is random, makes the two curves describe
# different models.
x = range(2, 15)
perplexity_values = []
coherence_values = []
for i in x:
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=i, id2word=dictionary)
    # NOTE: log_perplexity returns gensim's per-word likelihood bound
    # (a log-scale value), not the perplexity itself.
    perplexity_values.append(ldamodel.log_perplexity(corpus))
    coherence_model_lda = CoherenceModel(
        model=ldamodel, texts=tokenized_doc, dictionary=dictionary, topn=10
    )
    coherence_lda = coherence_model_lda.get_coherence()
    coherence_values.append(coherence_lda)

# Plot the perplexity curve.
plt.plot(x, perplexity_values)
plt.xlabel("number of topics")
plt.ylabel("perplexity score")
plt.show()

# Plot the coherence curve.
plt.plot(x, coherence_values)
plt.xlabel("number of topics")
plt.ylabel("coherence score")
plt.show()

## Topic modeling




In [None]:
from gensim import corpora

# Rebuild the dictionary and bag-of-words corpus so this cell can be run
# without first running the perplexity/coherence cell.
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_doc]

# Fit an LDA model that groups the documents into 11 topics (k = 11).
NUM_TOPICS = 11
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=20)
for topic in topics:
    print(topic)

# Show the Intertopic Distance Map and the Top-30 Most Salient Terms.
# A bare `pip install ...` line is only valid through IPython automagic and
# reinstalls on every run; install pyLDAvis only when it is actually missing.
try:
    import pyLDAvis.gensim_models
except ImportError:
    import importlib
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "pyLDAvis"])
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()
    import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

## SNA

In [None]:
from collections import Counter
from itertools import combinations
import operator

import networkx as nx

# Minimum number of documents in which two terms must co-occur for the pair
# to become an edge of the network.
MIN_COOCCURRENCE = 5

# Count, for every unordered pair of distinct terms, the number of documents
# in which both appear. Sorting the unique tokens first means combinations()
# always yields the pair as (smaller, larger), so (a, b) and (b, a) share one
# key.
count = Counter()
for tokens in tokenized_doc:
    count.update(combinations(sorted(set(tokens)), 2))

# One row per term pair, most frequent pairs first. Built straight from the
# counter so no positional lookup on a tuple-keyed index is needed.
df2 = pd.DataFrame(
    [(term1, term2, freq) for (term1, term2), freq in count.items()],
    columns=['term1', 'term2', 'freq'],
)
df2 = df2.sort_values(by=['freq'], ascending=False, kind='stable')
df3 = df2.reset_index(drop=True)

# Only pairs that co-occur often enough take part in the network.
strong_pairs = df3[df3['freq'] >= MIN_COOCCURRENCE]
weighted_edges = [
    (term1, term2, int(freq))
    for term1, term2, freq in strong_pairs.itertuples(index=False, name=None)
]

# Closeness centrality of every term in the co-occurrence graph.
G_centrality = nx.Graph()
for term1, term2, freq in weighted_edges:
    G_centrality.add_edge(term1, term2, weight=freq)
cls = nx.closeness_centrality(G_centrality)
# Terms ordered from highest to lowest centrality.
sorted_cls = sorted(cls.items(), key=operator.itemgetter(1), reverse=True)

# Graph used for drawing the word network; each node stores its centrality
# as 'nodesize'.
G = nx.Graph()
for term, centrality in sorted_cls:
    G.add_node(term, nodesize=centrality)
G.add_weighted_edges_from(weighted_edges)

# Scale the centrality values into marker sizes.
sizes = [G.nodes[node]['nodesize'] * 2000 for node in G]
options = {
    'edge_color': '#FFDEA2',
    'width': 2,
    'with_labels': True,
    'font_weight': 'regular'
}
nx.draw(G, node_size=sizes, pos=nx.spring_layout(G, k=4.5, iterations=100), **options)
ax = plt.gca()
# Outline the node markers (the first collection drawn on the axes); skipped
# when nothing was drawn because no pair reached the threshold.
if ax.collections:
    ax.collections[0].set_edgecolor("#555555")
plt.show()