# Import

In [1]:
from collections import defaultdict
from collections import Counter
from pprint import pprint

from konlpy.tag import Mecab, Kkma
import pandas as pd

# Load

In [2]:
# NOTE(review): absolute, machine-specific location; adjust `directory` when running elsewhere.
directory = '/home/jake/Documents/'
file_2019 = '2019년_경제정책방향.txt'
file_2018 = '2018년_경제정책방향.txt'

# The documents contain Korean text, so decode explicitly instead of relying on the
# platform's default encoding (which is not UTF-8 on every system).
# NOTE(review): assumes both files are stored as UTF-8 — confirm.
with open(directory + file_2019, 'r', encoding='utf-8') as f:
    raw_2019 = f.read()

with open(directory + file_2018, 'r', encoding='utf-8') as f:
    raw_2018 = f.read()

In [3]:
"""
NNG 일반 명사
NNP 고유 명사
NNB 의존 명사
NNBC    단위를 나타내는 명사
NR  수사
NP  대명사

VV  동사

VA  형용사
VX  보조 용언
VCP 긍정 지정사
VCN 부정 지정사

MM  관형사
MAG 일반 부사
MAJ 접속 부사
"""
# POS tags for which tables are built, grouped by prefix for readability.
# The order of the flattened list is unchanged.
tags = [
    # nouns, numerals, pronouns
    'NNG', 'NNP', 'NNB', 'NNBC', 'NR', 'NP',
    # verbs, adjectives, auxiliary predicates, copulas
    'VV', 'VA', 'VX', 'VCP', 'VCN',
    # determiners and adverbs
    'MM', 'MAG', 'MAJ',
    # remaining tags (not described in the legend above)
    'IC',
    'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JC', 'JX',
    'EP', 'EF', 'EC', 'ETN', 'ETM',
    'XPN', 'XSN', 'XSV', 'XSA', 'XR',
    'SF', 'SE', 'SSO', 'SSC', 'SC', 'SY', 'SH', 'SL', 'SN',
]

# Globals

In [4]:
# Shared Mecab tagger instance (konlpy); its .pos() method is called on both raw texts.
mecab = Mecab()

# 2019 형태소 분석: counters_2019

In [5]:
# Tag the whole 2019 text; each item is used as a (surface form, POS tag) pair.
list_of_pos_2019 = mecab.pos(raw_2019)

# Bucket the surface forms under their POS tag.
dict_of_pos_2019 = defaultdict(list)
for surface, pos_tag in list_of_pos_2019:
    dict_of_pos_2019[pos_tag].append(surface)

# One frequency Counter per POS tag.
counters_2019 = {
    pos_tag: Counter(surfaces)
    for pos_tag, surfaces in dict_of_pos_2019.items()
}

# 2018 형태소 분석: counters_2018

In [6]:
# Tag the whole 2018 text; each item is used as a (surface form, POS tag) pair.
list_of_pos_2018 = mecab.pos(raw_2018)

# Bucket the surface forms under their POS tag.
dict_of_pos_2018 = defaultdict(list)
for surface, pos_tag in list_of_pos_2018:
    dict_of_pos_2018[pos_tag].append(surface)

# One frequency Counter per POS tag.
counters_2018 = {
    pos_tag: Counter(surfaces)
    for pos_tag, surfaces in dict_of_pos_2018.items()
}

# Functions

In [7]:
# pprint(counters_2019['NNG'].most_common(20))


def get_ranking_chart_table(tag, counter_2019, counter_2018):
    """Build a long-format word-frequency table covering both years.

    Args:
        tag: POS tag label; used only to name the word column (``'{tag}_word'``).
        counter_2019: ``collections.Counter`` of words for 2019.
        counter_2018: ``collections.Counter`` of words for 2018.

    Returns:
        DataFrame with columns ``'{tag}_word'``, ``'count'`` and ``'year'``
        (``'2019'`` or ``'2018'`` as strings). The 2019 rows come first, then the
        2018 rows, each ordered by descending count. The index is the 1-based
        rank within each year, so index labels repeat across the two years.
    """
    columns = [f'{tag}_word', 'count', 'year']

    def _year_frame(counter, year):
        # most_common() yields (word, count) pairs sorted by descending count.
        rows = [(word, count, year) for word, count in counter.most_common()]
        return pd.DataFrame(columns=columns, data=rows)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # ignore_index is deliberately left off so each year keeps its own 0-based
    # positions, which become per-year ranks after the shift below.
    df = pd.concat([_year_frame(counter_2019, '2019'), _year_frame(counter_2018, '2018')])

    df.index = df.index + 1
    return df


def get_word_cloud_table(tag, counter_2019, counter_2018):
    """Build a side-by-side ranking table of the two years for one POS tag.

    Args:
        tag: POS tag label; used only in the column names.
        counter_2019: ``collections.Counter`` of words for 2019.
        counter_2018: ``collections.Counter`` of words for 2018.

    Returns:
        DataFrame with columns ``2018_{tag}_word``, ``2018_{tag}_count``,
        ``2019_{tag}_word``, ``2019_{tag}_count``. Row *i* holds the i-th most
        common word of each year; the index is the 1-based rank and is named
        '순위'. When one year has fewer distinct words, its cells are NaN for
        the extra ranks.
    """
    df_2019 = pd.DataFrame(
        columns=[f'2019_{tag}_word', f'2019_{tag}_count'],
        data=counter_2019.most_common(),
    )
    df_2018 = pd.DataFrame(
        columns=[f'2018_{tag}_word', f'2018_{tag}_count'],
        data=counter_2018.most_common(),
    )

    # Align the two rankings by row position (rank), not by word; the outer
    # join keeps every rank that exists in either year.
    df = df_2018.merge(df_2019, left_index=True, right_index=True, how='outer')

    # Shift to 1-based ranks first, then name the index so the label is set
    # on the final index object.
    df.index = df.index + 1
    df.index.name = '순위'
    return df
    

# Creating Tables

In [8]:
def _counters_for(tag):
    """Return the (2019, 2018) counters for `tag`; a year lacking the tag gets an empty Counter."""
    return counters_2019.get(tag, Counter()), counters_2018.get(tag, Counter())


# One table of each kind per POS tag, keyed by the tag.
word_cloud_table = {
    tag: get_word_cloud_table(tag, *_counters_for(tag)) for tag in tags
}
ranking_chart_table = {
    tag: get_ranking_chart_table(tag, *_counters_for(tag)) for tag in tags
}
    

# ---

# Word Cloud

In [9]:
# Show the first 25 rows of the side-by-side ranking table for the NNP tag.
# NOTE(review): the export line that used to sit here referenced `comparison_table`,
# a name not defined in the visible notebook; an equivalent export would be
# word_cloud_table['NNP'].head(25).to_csv(<path>, sep='\t').

tag = 'NNP'

word_cloud_table[tag].head(25)

Unnamed: 0_level_0,2018_NNP_word,2018_NNP_count,2019_NNP_word,2019_NNP_count
순위,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,한국은행,23.0,한국은행,21
2,한국,21.0,중국,15
3,중국,14.0,미국,14
4,저출산,13.0,물류,14
5,거시경제,12.0,최저임금,11
6,일본,11.0,고용부,11
7,평창,11.0,바이오,10
8,고용부,10.0,저출산,9
9,스마트,10.0,남북,9
10,균형발전,10.0,한국,9


# ---

# Ranking Chart

## 일반명사

In [10]:
# Plan: chart the 2018 top-15 words together with the 2019 top-10 words.
tag = 'NNG'
x = 10  # how many top 2019 words to keep
y = 15  # how many top 2018 words to keep

top_x_2019 = tuple(zip(*counters_2019[tag].most_common(x)))[0]
top_y_2018 = tuple(zip(*counters_2018[tag].most_common(y)))[0]

print(top_y_2018)

table = ranking_chart_table[tag]
words = table[f'{tag}_word']

idx_2019 = table['year'] == '2019'
idx_top_x_2019 = words.isin(top_x_2019)
idx_2018 = table['year'] == '2018'
idx_top_y_2018 = words.isin(top_y_2018)

# Keep a row only when its word is in the top list of that row's own year.
ranking_chart = table.loc[(idx_2019 & idx_top_x_2019) | (idx_2018 & idx_top_y_2018)]
ranking_chart

# Optional export:
# ranking_chart.to_csv('nng_ranking_chart.tsv', sep='\t', index=False)

('지원', '확대', '기업', '경제', '혁신', '투자', '추진', '개선', '강화', '금융', '증가', '산업', '정책', '마련', '사업')


Unnamed: 0,NNG_word,count,year
1,지원,253,2019
2,확대,195,2019
3,투자,163,2019
4,기업,130,2019
5,추진,129,2019
6,경제,118,2019
7,사업,104,2019
8,산업,87,2019
9,전망,80,2019
10,강화,80,2019


## 고유명사

In [11]:
# Plan: chart the 2018 top-15 words together with the 2019 top-10 words.
tag = 'NNP'
x = 10
y = 15

# Automatic top-N lists; top_y_2018 is printed for reference only.
top_x_2019 = tuple(zip(*counters_2019[tag].most_common(x)))[0]
top_y_2018 = tuple(zip(*counters_2018[tag].most_common(y)))[0]

# Hand-picked word lists used for the actual filtering instead of the automatic
# top-N lists (presumably to skip tokens that are not really proper nouns — confirm).
nnp_2019 = ['한국은행', '중국', '미국', '고용부', '남북', '한국', '서울', '일본', '인도', '아세안',]
nnp_2018 = ['한국은행', '한국', '중국', '일본', '평창', '고용부', '미국', '공정위', '아세안', '서울',]

print(top_y_2018)

table = ranking_chart_table[tag]
words = table[f'{tag}_word']

idx_2019 = table['year'] == '2019'
idx_top_x_2019 = words.isin(nnp_2019)
idx_2018 = table['year'] == '2018'
idx_top_y_2018 = words.isin(nnp_2018)

# Keep a row only when its word is in the curated list of that row's own year.
ranking_chart = table.loc[(idx_2019 & idx_top_x_2019) | (idx_2018 & idx_top_y_2018)]
ranking_chart

# Optional export:
# ranking_chart.to_csv('nnp_ranking_chart.tsv', sep='\t', index=False)

('한국은행', '한국', '중국', '저출산', '거시경제', '일본', '평창', '고용부', '스마트', '균형발전', '로드맵', '올림픽', '미국', '최저임금', '팜')


Unnamed: 0,NNP_word,count,year
1,한국은행,21,2019
2,중국,15,2019
3,미국,14,2019
6,고용부,11,2019
9,남북,9,2019
10,한국,9,2019
11,서울,8,2019
12,일본,6,2019
14,인도,6,2019
15,아세안,6,2019


# ---

# Sentence Chart

In [12]:
tag = 'NNP'
x = 10

# Each line of the raw text is treated as one "sentence".
sentences_2019 = raw_2019.split('\n')
sentences_2018 = raw_2018.split('\n')

# Automatic top-N words, kept for reference next to the curated lists below.
top_10_2019 = tuple(zip(*counters_2019[tag].most_common(x)))[0]
top_10_2018 = tuple(zip(*counters_2018[tag].most_common(x)))[0]

# Hand-picked keywords per year.
nnp_2019 = ['한국은행', '중국', '미국', '고용부', '남북', '한국', '서울', '일본', '인도', '아세안',]
nnp_2018 = ['한국은행', '한국', '중국', '일본', '평창', '고용부', '미국', '공정위', '아세안', '서울',]


def _collect_key_sentences(year, keywords, sentences):
    """Return (year, keyword, stripped sentence) for every sentence containing a keyword.

    Matching is a plain substring test. Results are ordered by keyword first,
    then by sentence position. For the keyword '한국', sentences that contain
    '한국은행' are skipped so that mentions of '한국은행' are not counted as '한국'.
    """
    return [
        (year, keyword, sentence.strip())
        for keyword in keywords
        for sentence in sentences
        if keyword in sentence
        and not (keyword == '한국' and '한국은행' in sentence)
    ]


key_sentence_list_2019 = _collect_key_sentences('2019', nnp_2019, sentences_2019)
key_sentence_list_2018 = _collect_key_sentences('2018', nnp_2018, sentences_2018)

sentence_columns = ['연도', '단어', '문장']
df_key_sentence_2019 = pd.DataFrame(columns=sentence_columns, data=key_sentence_list_2019)
df_key_sentence_2018 = pd.DataFrame(columns=sentence_columns, data=key_sentence_list_2018)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# As before, each year's rows keep their own 0-based index.
df_key_sentence = pd.concat([df_key_sentence_2019, df_key_sentence_2018])

# Show every row; note this changes the display option for the rest of the session.
pd.set_option('display.max_rows', None)
df_key_sentence

# Optional export:
# df_key_sentence.to_csv('nnp_sentence.tsv', sep='\t', index=False)

Unnamed: 0,연도,단어,문장
0,2019,한국은행,"* 자료 : 한국은행, 통계청"
1,2019,한국은행,* 출처 : 한국은행 * 출처 : 한국은행
2,2019,한국은행,* 출처 : 한국은행 * 출처 : 한국은행
3,2019,한국은행,* 출처 : 한국은행 * 출처 : 통계청
4,2019,한국은행,* 출처 : 한국은행 * 출처 : 한국은행
5,2019,한국은행,* 출처 : 한국은행 * 출처 : 한국은행
6,2019,한국은행,* 출처 : 한국은행 * 출처 : 한국은행
7,2019,한국은행,"* 출처: 관세청, 한국은행 * 출처: 무역협회"
8,2019,한국은행,* 자료 : 한국은행 ** ‘18~’19년은 정부전망 * 자료 : DRAM Exch...
9,2019,한국은행,"* 출처 : 통계청, 한국은행, 고용노동부 * 자료 : 한국은행"


# -------------------------------------

# 고유명사 찾기

In [20]:
# Inspect the 14 most frequent NNP tokens of 2019 as (word, count) pairs.
counters_2019['NNP'].most_common(14)

[('한국은행', 21),
 ('중국', 15),
 ('미국', 14),
 ('물류', 14),
 ('최저임금', 11),
 ('고용부', 11),
 ('바이오', 10),
 ('저출산', 9),
 ('남북', 9),
 ('한국', 9),
 ('서울', 8),
 ('일본', 6),
 ('크루즈', 6),
 ('인도', 6)]

In [13]:
tag = 'NNP'
x = 30

# NOTE: despite the "top_10" names, with x = 30 each tuple holds up to 30 words.
top_10_2019 = list(zip(*counters_2019[tag].most_common(x)))[0]
top_10_2018 = list(zip(*counters_2018[tag].most_common(x)))[0]

# Keyword lists noted while reading these rankings:
#   2019: ['한국은행', '중국', '미국', '고용부', '남북', '한국', '서울', '일본', '인도', '아세안',]
#         listed separately in the note: '바이오', '크루즈'
#         (presumably candidates that were left out — confirm)
#   2018: ['한국은행', '한국', '중국', '일본', '평창', '고용부', '미국', '공정위', '아세안', '서울',]

# Print the 2019 ranking, a blank line, then the 2018 ranking.
pprint(top_10_2019)
print()
pprint(top_10_2018)

('한국은행',
 '중국',
 '미국',
 '물류',
 '최저임금',
 '고용부',
 '바이오',
 '저출산',
 '남북',
 '한국',
 '서울',
 '일본',
 '크루즈',
 '인도',
 '아세안',
 '조선',
 '기저',
 '균형발전',
 '정책금융',
 '공유',
 '거시경제',
 '미세먼지',
 '강화',
 '강원',
 '행안',
 '고위험',
 '임상시험',
 '드론',
 '스마트',
 '채무조정')

('한국은행',
 '한국',
 '중국',
 '저출산',
 '거시경제',
 '일본',
 '평창',
 '고용부',
 '스마트',
 '균형발전',
 '로드맵',
 '올림픽',
 '미국',
 '최저임금',
 '팜',
 '연기금',
 '조선',
 '공정위',
 '채무조정',
 '매칭',
 '아세안',
 '드론',
 '서울',
 '물류',
 '기업지배구조',
 '복지부',
 '공정',
 '샌드박스',
 '산은',
 '리츠')


# 2019년에 새로 등장한 단어: (among top 40)

## 공공, 구조, 대상, 둔화, 서비스, 조성

In [14]:
# Top-40 NNG words per year, held as sets for set arithmetic.
top40_set_2019 = set(tuple(zip(*counters_2019['NNG'].most_common(40)))[0])
top40_set_2018 = set(tuple(zip(*counters_2018['NNG'].most_common(40)))[0])

# Words in the 2019 top 40 that are not in the 2018 top 40.
top40_set_2019.difference(top40_set_2018)

{'공공', '구조', '대상', '둔화', '서비스', '조성'}

# 2018년에만 있었던 단어: (among top 40)

## 구축, 상승, 중심, 청년, 출처, 평가

In [15]:
# Words in the 2018 top 40 that are absent from the 2019 top 40.
top40_set_2018.difference(top40_set_2019)

{'구축', '상승', '중심', '청년', '출처', '평가'}

In [16]:
x = 10
top_x_set_2019 = set(tuple(zip(*counters_2019['NNG'].most_common(x)))[0])
top_x_set_2018 = set(tuple(zip(*counters_2018['NNG'].most_common(x)))[0])

# Printed in order: words shared by both years, 2019-only words, 2018-only words.
for words in (
    top_x_set_2019.intersection(top_x_set_2018),
    top_x_set_2019.difference(top_x_set_2018),
    top_x_set_2018.difference(top_x_set_2019),
):
    print(words)

{'강화', '투자', '경제', '지원', '기업', '추진', '확대'}
{'사업', '산업', '전망'}
{'혁신', '금융', '개선'}


In [17]:
# Display the set of top-x NNG words for 2019.
top_x_set_2019

{'강화', '경제', '기업', '사업', '산업', '전망', '지원', '추진', '투자', '확대'}