# 중복 문서 점검

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import bz2
import json
import pandas as pd
from tqdm import tqdm
from glob import glob
from collections import defaultdict
from IPython.display import display
from crawler.utils.es import ElasticSearchUtils

  from tqdm.autonotebook import tqdm


In [None]:
def read_merged_size(filename):
    """Summarise per-index document counts from a whitespace-separated dump.

    Every non-blank line of *filename* must hold exactly two fields: an index
    name and an integer count.  A name containing ``merged`` is stored in the
    ``backfill`` column under the name with ``-merged`` removed; any other
    name is stored in the ``corpus`` column.  Only indices whose name contains
    ``2020`` are kept in the resulting table.

    Args:
        filename: Path of the text file to read.

    Returns:
        A 3-tuple: the ``corpus`` total and the ``backfill`` total, each
        formatted with thousands separators, followed by whatever
        ``display(count_df)`` returns (the table is rendered as a side
        effect of that call).

    Raises:
        ValueError: If a line does not have exactly two fields or the count
            is not an integer.
    """
    data = defaultdict(dict)

    with open(filename, 'r', encoding='utf-8') as fp:
        for line in fp:
            # str.split() with no argument splits on any whitespace run and
            # ignores leading/trailing whitespace, so no regex is needed.
            fields = line.split()
            if not fields:
                continue

            index, count = fields
            count = int(count)

            if 'merged' in index:
                data[index.replace('-merged', '')]['backfill'] = count
            else:
                data[index]['corpus'] = count

    count_df = pd.DataFrame(data).T
    if not count_df.empty:
        # A plain substring test: unlike ``find(...) > 0`` it also keeps
        # names that *start* with "2020".
        count_df = count_df[count_df.index.str.contains('2020', regex=False)]

    def _total(column):
        # A column is absent when the file had no entry of that kind.
        return count_df[column].sum(axis=0) if column in count_df else 0

    return f"{_total('corpus'):,}", f"{_total('backfill'):,}", display(count_df)

# Show the corpus/backfill totals and the per-index table for the merged-size dump.
read_merged_size(filename='../data/es_dump/merged-size.txt')

In [3]:
# Shared Elasticsearch helper used by the cells below.
# NOTE(review): the host and what looks like a base64-encoded credential are
# hard-coded here -- consider loading them from the environment or a secrets
# store, and rotating the credential since it is committed with the notebook.
es = ElasticSearchUtils(host='https://corpus.ncsoft.com:9200', encoded_auth='ZWxhc3RpYzpubHBsYWI=')

In [4]:
def read_ids(path: str) -> list:
    """Load tab-separated rows from every bz2 file matching *path*.

    Args:
        path: Glob pattern selecting the ``.bz2`` files to read.

    Returns:
        A list with one entry per non-blank line across all matched files;
        each entry is the list of tab-separated fields of that line.  Trailing
        whitespace (including trailing tabs) is stripped before splitting, so
        a line may yield fewer fields than its neighbours.
    """
    result = []
    for filename in tqdm(glob(path)):
        with bz2.open(filename, 'r') as fp:
            for raw in fp:
                # Decode once per line; a line is blank exactly when nothing
                # is left after stripping its trailing whitespace.
                line = raw.decode('utf-8').rstrip()
                if line:
                    result.append(line.split('\t'))

    return result

In [5]:
def get_doc_id(url: str) -> "str | None":
    """Build a document id of the form ``<oid>-<aid>`` from *url*.

    Args:
        url: URL expected to carry numeric ``oid=`` and ``aid=`` parameters.

    Returns:
        ``'<oid>-<aid>'`` when both parameters are present, otherwise ``None``.
    """
    oid = re.search(r'oid=(\d+)', url)
    aid = re.search(r'aid=(\d+)', url)

    # Both parts are required: with only one of them present there is no
    # complete id to build (and dereferencing the missing match would fail).
    if oid is None or aid is None:
        return None

    return f'{oid.group(1)}-{aid.group(1)}'

In [None]:
# Load the dumped rows from every doc-id file matching the 2012 naver crawler pattern.
ids = read_ids(path='../data/es_dump/corpus/doc_ids/crawler-naver-*-2012.ids.tsv.bz2')

 91%|█████████ | 10/11 [00:30<00:02,  2.46s/it]

In [None]:
# One row per dumped line, labelled as (index, id, title, url).
df = pd.DataFrame(ids, columns=['index', 'id', 'title', 'url'])

# Cell output: the row count plus a rendered preview of the first rows.
len(df), display(df.head())

# corpus 문서 아이디가 잘못된 것

In [None]:
# Rows whose id is not exactly 14 characters long are treated as malformed.
# NOTE(review): presumably 14 is the length of a well-formed '<oid>-<aid>' id -- confirm.
error_df = df[ df['id'].str.len() != 14 ]

# Cell output: the number of malformed rows plus a rendered preview.
len(error_df), display(error_df.head())

# 문서 아이디 변경

In [None]:
# Pair every malformed row that has a URL with the id derived from that URL.
doc_id_list = [
    (get_doc_id(url=record['url']), record)
    for _, record in error_df.iterrows()
    if record['url'] is not None
]

In [None]:
# Rows whose current id equals one of the URL-derived ids, i.e. the target id
# is already present in the dump (presumably an existing duplicate -- verify).
df[ df['id'].isin([x[0] for x in doc_id_list]) ]

In [61]:
def change_doc_id(id_list: list, index: str) -> None:
    """Re-key documents of *index* so their ``_id`` is derived from their URL.

    The documents named by *id_list* are fetched through ``es.get_by_ids``.
    For each one, a bulk ``delete`` of the old id and an upserting ``update``
    under the id returned by ``get_doc_id`` are queued, and the whole batch is
    sent in a single bulk request.  The ``_index``, ``_id`` and
    ``document_id`` keys are removed from the body before it is re-indexed.

    Documents without a usable URL-derived id are skipped and left untouched,
    so they are never deleted without a replacement.  Failures are logged
    rather than raised, keeping the call best-effort.

    Args:
        id_list: Current ids of the documents to re-key.
        index: Name of the index holding the documents.
    """
    import logging

    logger = logging.getLogger(__name__)

    doc_list = []
    es.get_by_ids(index=index, id_list=id_list, result=doc_list)

    bulk = []
    for doc in doc_list:
        prev_id = doc['document_id']
        for k in ('_index', '_id', 'document_id'):
            doc.pop(k, None)

        url = doc.get('url')
        doc_id = get_doc_id(url=url) if url else None
        if doc_id is None:
            # Queuing the delete without a valid new id would lose the document.
            logger.warning('skip %s/%s: no id can be derived from url %r', index, prev_id, url)
            continue

        bulk += [{
            'delete': {
                '_id': prev_id,
                '_index': index,
            }
        }, {
            'update': {
                '_id': doc_id,
                '_index': index,
            }
        }, {
            'doc': doc,
            'doc_as_upsert': True,
        }]

    if not bulk:
        return

    try:
        response = es.conn.bulk(
            index=index,
            body=bulk,
            refresh=True,
            params={'request_timeout': 620},
        )
    except Exception:
        # Still best-effort, but the failure is now visible instead of swallowed.
        logger.exception('bulk re-key failed for index %s (%d actions)', index, len(bulk))
        return

    # NOTE(review): assumes es.conn is an Elasticsearch client whose bulk
    # response carries an 'errors' flag for per-item failures -- confirm.
    try:
        has_errors = bool(response['errors'])
    except (KeyError, TypeError):
        has_errors = False

    if has_errors:
        logger.error('bulk re-key for index %s reported item-level errors', index)

    return

In [62]:
# Group the old ids by index; a group is flushed through change_doc_id as soon
# as it holds more than 500 ids, and whatever remains stays in `buf`.
buf = defaultdict(list)
for doc_id, row in tqdm(doc_id_list):
    index = row['index']
    pending = buf[index]
    pending.append(row['id'])

    if len(pending) <= 500:
        continue

    change_doc_id(id_list=pending, index=index)
    del buf[index]

100%|██████████| 5/5 [00:00<00:00, 48998.88it/s]


In [63]:
# Flush the groups still left in the buffer.
for index, id_list in buf.items():
    change_doc_id(index=index, id_list=id_list)