# 데이터 분석
## - 한글 형태소 분석
## - 워드클라우드
## - 의미망분석

# 한글 형태소 분석

In [None]:
!pip3 install jpype
!pip3 install konlpy

## 단문에 대한 형태소 분석

In [None]:
from collections import Counter

from konlpy.tag import Kkma

# Kkma morphological analyzer from KoNLPy.
kkma = Kkma()

# Run the analysis once and reuse the result for both the printout and the
# tally; invoking the analyzer a second time on the same sentence is wasted work.
pos = kkma.pos("한국어 형태소 분석은 재밌습니다")
print(pos)

# Count how many times each element of the analysis result occurs.
count = Counter(pos)
print(count)

## 파일에 대한 형태소 분석

In [None]:
from collections import Counter

from konlpy.tag import Kkma

kkma = Kkma()

# Read the whole document into memory as a single UTF-8 string.
with open('thirties.txt', encoding='utf-8') as file:
    text = file.read()

# Extract the nouns found in the document and show them.
nouns = kkma.nouns(text)
print(nouns)

# Frequency table of the extracted nouns, followed by the five most common.
count = Counter(nouns)
print(count)
top_five = count.most_common(5)
print(top_five)

## 발의안에 대한 형태소 분석

In [None]:
from collections import Counter

from konlpy.corpus import kobill    # Docs from pokr.kr/bill
from konlpy.tag import Twitter

# List the bill documents bundled with the corpus.
files_ko = kobill.fileids()         # Get file ids
print(files_ko)

# Read one bill; the context manager closes the corpus file handle as soon as
# its contents are in memory instead of leaving it open.
with kobill.open('1809891.txt') as bill_file:
    text = bill_file.read()
print(text)

# NOTE(review): recent KoNLPy releases rename the Twitter tagger to Okt and
# keep Twitter as a deprecated alias — confirm against the installed version.
t = Twitter()
nouns = t.nouns(text)

# Five most frequent nouns in the bill.
count = Counter(nouns)
print(count.most_common(5))

# 워드 클라우드

 #### 예제 출처: PinkWink(http://pinkwink.kr/1029, 방문일: 2018년 7월 10일)

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Read the source text inside a context manager so the file handle is closed
# deterministically rather than left to the garbage collector.
with open('thirties.txt', encoding='utf-8') as text_file:
    text = text_file.read()

# Build a word cloud from the raw text using WordCloud's default settings.
wordcloud = WordCloud().generate(text)

In [None]:
# Display the per-word values stored on the generated cloud.
wordcloud.words_

In [None]:
import nltk
from konlpy.corpus import kolaw
from konlpy.tag import Twitter
# Instantiate KoNLPy's Twitter tagger once so it can be reused.
# NOTE(review): newer KoNLPy releases expose this tagger as Okt and keep
# Twitter as a deprecated alias — confirm against the installed version.
t = Twitter()

In [None]:
# Read the constitution text from KoNLPy's bundled law corpus; the context
# manager closes the corpus file once its contents are in memory.
with kolaw.open('constitution.txt') as constitution_file:
    ko_con_text = constitution_file.read()
ko_con_text

In [None]:
# Extract the nouns from the constitution text, then display them.
tokens_ko = t.nouns(ko_con_text)
tokens_ko

In [None]:
# Single-syllable tokens to discard from the noun list.
stop_words = [
    '제', '월', '일', '조', '애', '수',
]

# Keep every token that is not a stop word, preserving the original order.
tokens_ko = [
    token
    for token in tokens_ko
    if token not in stop_words
]
tokens_ko

In [None]:
# Wrap the filtered tokens in an nltk.Text and display the five entries with
# the highest counts in its vocabulary.
ko = nltk.Text(tokens_ko)
ko.vocab().most_common(5)

## c.f. 한글폰트 설정

In [None]:
from matplotlib import font_manager, rc

# Ask matplotlib for the name of the font stored in the .ttf file, then make
# that font the default family for subsequent plots.
font_props = font_manager.FontProperties(fname="AppleGothic.ttf")
font_name = font_props.get_name()
rc('font', family=font_name)

In [None]:
# Up to 500 most frequent tokens, converted to a {token: count} mapping.
data = ko.vocab().most_common(500)
tmp_data = dict(data)

# Render settings: the same font file that is configured for matplotlib.
cloud_options = {
    'font_path': "AppleGothic.ttf",
    'relative_scaling': 0.2,
    'background_color': 'white',
}
wordcloud = WordCloud(**cloud_options).generate_from_frequencies(tmp_data)

# Show the cloud on a wide canvas without axes.
plt.figure(figsize=(16, 8))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
import numpy as np
from PIL import Image
from wordcloud import ImageColorGenerator

# Load the mask picture as a NumPy array.
korea_coloring = np.array(Image.open("korea_mask.jpg"))

# Color function built from the mask image's pixel data.
image_colors = ImageColorGenerator(korea_coloring)

#### 이미지 출처: https://www.pinterest.co.kr/pin/514395588671378068/

In [None]:
# Preview the mask image on a square canvas with the axes hidden.
plt.figure(figsize=(12, 12))
plt.imshow(
    korea_coloring,
    cmap=plt.cm.gray,
    interpolation='bilinear',
)
plt.axis("off")
plt.show()

In [None]:
# Word cloud built from the frequency mapping, with the mask array passed as
# the `mask` argument.
masked_options = {
    'font_path': "AppleGothic.ttf",
    'relative_scaling': 0.2,
    'mask': korea_coloring,
    'background_color': 'white',
    'min_font_size': 1,
    'max_font_size': 40,
}
wordcloud = WordCloud(**masked_options).generate_from_frequencies(tmp_data)

# Recolor the words with the image-derived color function before showing them.
plt.figure(figsize=(12, 12))
recolored = wordcloud.recolor(color_func=image_colors)
plt.imshow(recolored, interpolation='bilinear')
plt.axis("off")
plt.show()

## 의미망 분석

In [None]:
import networkx as nx

In [None]:
g = nx.Graph()

# NODES
# Nodes can be added one at a time or in bulk; any hashable value can be a node.
g.add_node(1)
g.add_node(2)
g.add_nodes_from([3, 4, 5, "six", "hello"])

# Attach an attribute to node 1 through the `nodes` view. The older
# `Graph.node` attribute was removed in NetworkX 2.4 and raises AttributeError.
g.nodes[1]['type'] = 'number'

In [None]:
# All node identifiers.
print(g.nodes())

# Attribute dictionaries of individual nodes, read through the `nodes` view
# (`Graph.node` no longer exists in NetworkX 2.4 and later).
print(g.nodes[1])
print(g.nodes[2])

# Nodes together with their attribute dictionaries, then the node count.
print(g.nodes(data=True))
print(len(g))

In [None]:
# EDGES
g.add_edge(1, 2)
g.add_edges_from([(2, 3), (3, 5), (5, 'six')])
g[1][2]['weight'] = 2.3

In [None]:
# Print the edges, first as plain node pairs and then with their attribute
# dictionaries included.
print(g.edges())
print(g.edges(data=True))

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
g2 = nx.Graph()

# Build the graph from the edge list in a single call.
paths = [(1, 2), (1, 3), (3, 4), (4, 5), (4, 1)]
g2.add_edges_from(paths)

# Compute one layout and reuse it so nodes, edges and labels line up.
pos = nx.spring_layout(g2)
nx.draw_networkx_nodes(g2, pos)
nx.draw_networkx_edges(g2, pos)

# Draw node labels on the graph.
nx.draw_networkx_labels(g2, pos)

# Label every edge with its position in `paths`.
edgeLabels = {pair: index for index, pair in enumerate(paths)}
nx.draw_networkx_edge_labels(g2, pos, edge_labels=edgeLabels)

# Hide the axes before saving so the file on disk matches the figure that is
# displayed; hiding them after savefig leaves the axes frame in the PNG.
plt.axis('off')
plt.savefig('network_eg01.png')
plt.show()

In [None]:
# 중심성
degree_cent = nx.degree_centrality(g2)
print(degree_cent)
bw_cent = nx.betweenness_centrality(g2)
print(bw_cent)
close_cent = nx.closeness_centrality(g2)
print(close_cent)

In [None]:
from operator import itemgetter

degree_cent = nx.degree_centrality(g2)

# Rank the (node, centrality) pairs from highest to lowest centrality and
# print the first three.
degree_sorted = list(degree_cent.items())
degree_sorted.sort(key=itemgetter(1), reverse=True)
print(degree_sorted[:3])

## 의미망 분석

In [None]:
import pandas as pd
from collections import Counter 
from konlpy.tag import Twitter

In [None]:
# Load the tab-separated tweet file; the first line is used as the header row
# and index_col=False stops pandas from using the first column as the index.
df = pd.read_csv(
    "korea_TwitterSearch_2018-07-11.csv",
    sep="\t",
    header=0,
    index_col=False,
)

In [None]:
# Display the (rows, columns) dimensions of the dataframe.
df.shape

    The dataframe df is a 93 x 3 table, which means there are 93 rows, or records, and 3 columns, or fields.

In [None]:
# Print pandas' summary of the dataframe.
df.info()

In [None]:
# Display the column labels.
df.columns

In [None]:
# Display the first rows of the dataframe.
df.head()

In [None]:
# Run the tagger's noun extractor on every tweet text and keep the result in
# a new 'tags' column.
t = Twitter()
df['tags'] = df['tweet_text'].apply(t.nouns)

In [None]:
# Display the dataframe, which now includes the 'tags' column.
df

In [None]:
counter = Counter()

# Count in how many tweets each word appears: converting a tweet's tags to a
# set removes duplicates, so a word repeated inside one tweet adds only 1.
for tags in df.tags:
    counter.update(set(tags))

In [None]:
# Display the per-word tweet counts.
counter