In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before code is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from tqdm import tqdm

from utils.elasticsearch_utils import ElasticSearchUtils

# Seed NumPy's global random generator with a fixed value.
np.random.seed(0)
# Turn off pandas' chained-assignment check (no warning is raised for it).
pd.options.mode.chained_assignment = None

  from tqdm.autonotebook import tqdm


In [3]:
def tokenizer(text):
    """Keep the surface forms whose tag starts with ``N``.

    ``text`` is a space-separated sequence of ``surface/TAG`` items in which
    several morphemes may be chained with ``+`` (e.g. ``a/NNG+b/JKS``).
    Chained morphemes are first split into separate items; every item whose
    tag begins with ``N`` then contributes its surface form to the result.
    (Presumably these are the noun tags of the tag set in use -- confirm.)

    Args:
        text (str): morpheme-tagged text. Only the space character separates
            items, so newlines must be replaced by the caller.

    Returns:
        str: the selected surface forms joined by single spaces; an empty
        string when no item qualifies.
    """
    # Replace the '+' that follows an upper-case tag with a space so that each
    # morpheme becomes its own item. The replacement must be a raw string:
    # '\g' inside a normal literal is an invalid escape sequence.
    text = re.sub(r'/([A-Z]+?)[+]', r'/\g<1> ', text)

    result = []
    for word in text.split(' '):
        try:
            # Split on the last '/' only, so a surface form may itself contain '/'.
            w, p = word.rsplit('/', maxsplit=1)
        except ValueError:
            # Item without any '/' (this includes empty items produced by
            # consecutive spaces): not a tagged morpheme, skip it.
            continue

        # startswith() is False for an empty tag, so no separate length check
        # is needed.
        if p.startswith('N'):
            result.append(w)

    return ' '.join(result)

In [4]:
def split_doc(doc_list):
    """Flatten the ``nlu_wrapper`` payload of every document into one record.

    Documents that have no ``nlu_wrapper`` key are dropped. For the remaining
    ones, every value of every item stored under every ``nlu_wrapper`` key is
    collected in iteration order.

    Args:
        doc_list: iterable of document dicts; kept documents must also provide
            ``document_id`` and ``date``.

    Returns:
        list[dict]: one dict per kept document with the keys ``document_id``,
        ``date``, ``morp`` (the collected values joined by newlines) and
        ``token`` (``tokenizer`` applied to that text with the newlines
        replaced by spaces).
    """
    records = []

    for doc in tqdm(doc_list):
        if 'nlu_wrapper' in doc:
            wrapper = doc['nlu_wrapper']

            # Gather all values: wrapper key -> list of items -> item values.
            pieces = [
                value
                for key in wrapper
                for item in wrapper[key]
                for value in item.values()
            ]
            morp = '\n'.join(pieces)

            records.append(dict(
                document_id=doc['document_id'],
                date=doc['date'],
                morp=morp,
                token=tokenizer(morp.replace('\n', ' ')),
            ))

    return records

In [5]:
def dump_docs(index):
    """Export one Elasticsearch index and cache it on disk as JSON lines.

    The documents of ``index`` are fetched with ``ElasticSearchUtils.export``,
    flattened by ``split_doc`` and written to ``data/<index>.json.bz2``, one
    JSON object per line.

    NOTE: despite the ``.bz2`` suffix the file is plain, uncompressed text.
    ``read_docs`` opens the same path as plain text, so the on-disk format is
    intentionally left as it is.

    The connection settings can be overridden through the ``ES_HOST`` and
    ``ES_HTTP_AUTH`` environment variables.

    Args:
        index (str): name of the index to export; also used in the file name.

    Returns:
        pandas.DataFrame: the flattened documents, missing values replaced
        by ``''``.
    """
    # SECURITY: the literal fallbacks only keep existing runs working. The
    # credential should be supplied through the environment and the literal
    # removed from the notebook.
    host_info = {
        'host': os.environ.get('ES_HOST', 'https://corpus.ncsoft.com:9200'),
        'http_auth': os.environ.get('ES_HTTP_AUTH', 'elastic:nlplab'),
    }

    es = ElasticSearchUtils(**host_info)

    # Restrict the fetched fields to what split_doc consumes.
    query = {
        '_source': [
            'document_id',
            'date',
            'nlu_wrapper.*.morp_str',
        ]
    }

    # The return value of export() is not used; presumably it appends the
    # fetched documents to `result` -- confirm against ElasticSearchUtils.
    doc_list = []
    es.export(index=index, query=query, result=doc_list)

    df = pd.DataFrame(split_doc(doc_list)).fillna('')

    # Make sure the output directory exists before opening the file.
    os.makedirs('data', exist_ok=True)

    with open('data/{}.json.bz2'.format(index), 'w') as fp:
        for _, row in tqdm(df.iterrows(), total=len(df)):
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            fp.write(json.dumps(dict(row), ensure_ascii=False) + '\n')

    return df

In [6]:
def read_docs(index):
    """Load the cached JSON-lines dump of one index into a DataFrame.

    Reads ``data/<index>.json.bz2`` as plain text (``dump_docs`` writes that
    path without compression, despite the suffix) and parses one JSON object
    per line. Lines containing only whitespace are ignored.

    Args:
        index (str): index name used to build the file name.

    Returns:
        pandas.DataFrame: one row per parsed line.

    Raises:
        FileNotFoundError: if the dump for ``index`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Read everything first so the file is closed before parsing starts and
    # the progress bar knows the total number of lines.
    with open('data/{}.json.bz2'.format(index), 'r') as fp:
        lines = fp.readlines()

    # Skip blank lines (e.g. a trailing empty line) instead of crashing in
    # json.loads.
    doc_list = [json.loads(line) for line in tqdm(lines) if line.strip()]

    return pd.DataFrame(doc_list)

In [None]:
# Indices handed to dump_docs by the export loop; the commented-out entries
# are left out of the export.
index_list = [
#     'corpus_process-naver-economy-2010',
#     'corpus_process-naver-economy-2011',
#     'corpus_process-naver-economy-2012',
#     'corpus_process-naver-economy-2013',
#     'corpus_process-naver-economy-2014',
#     'corpus_process-naver-economy-2015',
#     'corpus_process-naver-economy-2016',
#     'corpus_process-naver-economy-2017',
#     'corpus_process-naver-economy-2018',
    'corpus_process-naver-economy-2019',
]

In [None]:
# Export every listed index, walking the list from its last entry to its first.
for index in tqdm(index_list[::-1]):
    dump_docs(index=index)

In [7]:
# TF-IDF vectorizer over 1- to 3-grams; terms must occur in at least two
# documents (min_df=2) to enter the vocabulary.
vec = TfidfVectorizer(
    min_df=2,
    use_idf=True,
    ngram_range=(1, 3),
    sublinear_tf=True,    # applies 1 + log(tf) to the tf value so that it cannot grow without bound
)

In [8]:
# Indices whose cached dumps are read back by read_docs in the fitting loop.
index_list = [
    'corpus_process-naver-economy-2010',
    'corpus_process-naver-economy-2011',
    'corpus_process-naver-economy-2012',
    'corpus_process-naver-economy-2013',
    'corpus_process-naver-economy-2014',
    'corpus_process-naver-economy-2015',
    'corpus_process-naver-economy-2016',
    'corpus_process-naver-economy-2017',
    'corpus_process-naver-economy-2018',
    'corpus_process-naver-economy-2019',
]

In [9]:
# Fit the vectorizer on each index separately and store that index's IDF table.
for index in tqdm(index_list):
    df = read_docs(index=index)

    # fit() learns vocabulary and IDF from scratch, so each output file
    # reflects a single index only.
    vec.fit(df['token'].to_list())

    # get_feature_names() was removed in scikit-learn 1.2. Use its replacement
    # when the installed release has it and fall back on older releases.
    get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names

    # One "feature<TAB>idf" line per vocabulary entry (tab-separated despite
    # the .csv suffix).
    with open('data/{}.tfidf.csv'.format(index), 'w') as fp:
        for feature, idf in zip(get_names(), vec.idf_):
            fp.write('{feature}\t{idf}\n'.format(feature=feature, idf=idf))

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/465912 [00:00<?, ?it/s][A
  2%|▏         | 8563/465912 [00:00<00:05, 85625.39it/s][A
  4%|▍         | 20491/465912 [00:00<00:04, 93542.27it/s][A
  7%|▋         | 32390/465912 [00:00<00:04, 99954.10it/s][A
 10%|▉         | 44323/465912 [00:00<00:04, 105071.92it/s][A
 12%|█▏        | 56460/465912 [00:00<00:03, 109481.28it/s][A
 15%|█▍        | 68780/465912 [00:00<00:03, 113262.43it/s][A
 17%|█▋        | 80630/465912 [00:00<00:03, 114784.31it/s][A
 20%|█▉        | 92884/465912 [00:00<00:03, 117005.57it/s][A
 22%|██▏       | 104708/465912 [00:00<00:03, 117372.47it/s][A
 25%|██▍       | 116211/465912 [00:01<00:02, 116658.79it/s][A
 27%|██▋       | 127690/465912 [00:01<00:02, 115869.97it/s][A
 30%|██▉       | 139148/465912 [00:01<00:02, 114934.05it/s][A
 32%|███▏      | 150677/465912 [00:01<00:02, 115006.82it/s][A
 35%|███▍      | 162612/465912 [00:01<00:02, 116275.36it/s][A
 37%|███▋      | 174685/465912 [00:01<00:02, 

KeyboardInterrupt: 

In [None]:
# Map every vocabulary entry of the fitted vectorizer to its IDF weight.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
dict(zip(get_names(), vec.idf_))

In [None]:
# Table of IDF weights indexed by vocabulary entry. The column is named
# 'tfidf' although it holds vec.idf_.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
tfidf_df = pd.DataFrame(vec.idf_, index=get_names(), columns=['tfidf'])
tfidf_df

In [None]:
# Order the table by the 'tfidf' column, smallest value first.
tfidf_df = tfidf_df.sort_values('tfidf')

In [None]:
# NOTE(review): `index` is not set in this cell; it is whatever value the last
# executed `for index in ...` loop left behind, while the file name hard-codes
# "(2018~2019)" -- confirm that the two actually match.
tfidf_df.to_csv('data/{}-(2018~2019).csv'.format(index))

In [None]:
# https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/#.XeCBh3UzZhE