# 중복 문서 점검

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import bz2
import json
import pandas as pd
from tqdm import tqdm
from glob import glob
from collections import defaultdict
from IPython.display import display
from crawler.utils.es import ElasticSearchUtils

  from tqdm.autonotebook import tqdm


In [None]:
def read_merged_size(filename):
    data = defaultdict(dict)
    
    with open(filename, 'r') as fp:
        for l in fp:
            if l.strip() == '':
                continue

            index, count = re.sub('\s+', '\t', l.strip()).split('\t')
            count = int(count)

            if 'merged' in index:            
                data[index.replace('-merged', '')]['backfill'] = count
            else:
                data[index]['corpus'] = count

    count_df = pd.DataFrame(data).T
    count_df = count_df[ count_df.index.str.find('2020') > 0 ]

    return f"{count_df['corpus'].sum(axis=0):,}", f"{count_df['backfill'].sum(axis=0):,}", display(count_df)

read_merged_size(filename='../data/es_dump/merged-size.txt')

In [3]:
es = ElasticSearchUtils(host='https://corpus.ncsoft.com:9200', encoded_auth='ZWxhc3RpYzpubHBsYWI=')

In [4]:
def read_ids(path: str) -> list:
    result = []
    for filename in tqdm(glob(path)):
        with bz2.open(filename, 'r') as fp:
            result += [x.decode('utf-8').rstrip().split('\t') for x in fp if x.decode('utf-8').strip() != '']
            
    return result

In [5]:
def get_doc_id(url: str) -> str or None:
    oid = re.search('oid=(\d+)', url)
    aid = re.search('aid=(\d+)', url)
    
    if oid is None and aid is None:
        return None
        
    return f'{oid.groups()[0]}-{aid.groups()[0]}'

In [9]:
ids = read_ids(path='../data/es_dump/corpus/doc_ids/crawler-naver-*-2019.ids.tsv.bz2')

ids[:3]

100%|██████████| 19/19 [01:17<00:00,  4.08s/it]


[['crawler-naver-weather-2019',
  '003-0009578030',
  '강원 산간 밤부터 최고 30㎝ 눈…대설특보 발령 예상',
  'http://news.naver.com/main/read.nhn?mode=LS2D&mid=sec&sid1=103&sid2=248&oid=003&aid=0009578030'],
 ['crawler-naver-weather-2019',
  '421-0004327964',
  "'최대 30cm 눈' 강원 산지 대설예비특보…찬 북동풍 유입 원인",
  'http://news.naver.com/main/read.nhn?mode=LS2D&mid=sec&sid1=103&sid2=248&oid=421&aid=0004327964'],
 ['crawler-naver-weather-2019',
  '422-0000403077',
  '[날씨트리] 내일 기온 다소 내려가…동해안 많은 눈·비',
  'http://news.naver.com/main/read.nhn?mode=LS2D&mid=sec&sid1=103&sid2=248&oid=422&aid=0000403077']]

In [10]:
df = pd.DataFrame(ids, columns=['index', 'id', 'title', 'url'])

len(df), display(df.head())

Unnamed: 0,index,id,title,url
0,crawler-naver-weather-2019,003-0009578030,강원 산간 밤부터 최고 30㎝ 눈…대설특보 발령 예상,http://news.naver.com/main/read.nhn?mode=LS2D&...
1,crawler-naver-weather-2019,421-0004327964,'최대 30cm 눈' 강원 산지 대설예비특보…찬 북동풍 유입 원인,http://news.naver.com/main/read.nhn?mode=LS2D&...
2,crawler-naver-weather-2019,422-0000403077,[날씨트리] 내일 기온 다소 내려가…동해안 많은 눈·비,http://news.naver.com/main/read.nhn?mode=LS2D&...
3,crawler-naver-weather-2019,052-0001370493,[날씨] 낮 동안 맑고 쌀쌀...강원 산간 '대설예비특보',http://news.naver.com/main/read.nhn?mode=LS2D&...
4,crawler-naver-weather-2019,422-0000403124,[날씨톡톡] 한낮에도 쌀쌀…동해안 오후부터 비·눈,http://news.naver.com/main/read.nhn?mode=LS2D&...


(10929873, None)

In [11]:
doc_count = df.groupby(by='id').size().to_frame()

display(doc_count[ doc_count[0] > 2 ]), display(doc_count[ doc_count[0] > 1 ])

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
001-0010555244,3
001-0010555245,3
001-0010555247,3
001-0010555248,3
001-0010555372,3
...,...
629-0000008395,4
629-0000008440,4
629-0000008518,4
629-0000008574,4


Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
001-0010551768,2
001-0010551812,2
001-0010551862,2
001-0010552077,2
001-0010552081,2
...,...
629-0000008575,4
629-0000008576,2
629-0000008577,2
629-0000008578,2


(None, None)

In [12]:
freq = df.groupby(by='url').size().to_frame()

display(freq[freq[0] > 2]), display(freq[freq[0] > 1])

Unnamed: 0_level_0,0
url,Unnamed: 1_level_1
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555439,3
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555572,3
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555714,3
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555996,3
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555997,3
...,...
https://sports.news.naver.com/kbaseball/news/read.nhn?oid=529&aid=0000039687,3
https://sports.news.naver.com/kbaseball/news/read.nhn?oid=529&aid=0000039691,3
https://sports.news.naver.com/kbaseball/news/read.nhn?oid=529&aid=0000039692,3
https://sports.news.naver.com/kbaseball/news/read.nhn?oid=529&aid=0000039702,3


Unnamed: 0_level_0,0
url,Unnamed: 1_level_1
https://entertain.naver.com/read?oid=382&aid=0000720804,2
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555244,2
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555245,2
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555247,2
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0010555248,2
...,...
https://sports.news.naver.com/wbaseball/news/read.nhn?oid=023&aid=0003494173,2
https://sports.news.naver.com/wbaseball/news/read.nhn?oid=468&aid=0000604375,2
https://sports.news.naver.com/wfootball/news/read.nhn?oid=091&aid=0007053376,2
https://sports.news.naver.com/wfootball/news/read.nhn?oid=109&aid=0003949291,2


(None, None)

In [None]:
url = 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=254&oid=001&aid=0012111805'

df[ df['url'] == url ]

In [None]:
len('001-0012111805')

# corpus 문서 아이디가 잘못된 것

In [13]:
# error_df = df[ df['id'].str.find('2021') == 0 ]
error_df = df[ df['id'].str.len() != 14 ]

len(error_df), display(error_df.head())

Unnamed: 0,index,id,title,url
5435,crawler-naver-opinion-2019,20190304_233315.862944,[기자수첩]밴드 해체와 1인 가구,https://news.naver.com/main/read.nhn?mode=LSD&...
5436,crawler-naver-opinion-2019,20190304_191051.209450,[기자수첩]밴드 해체와 1인 가구,https://news.naver.com/main/read.nhn?mode=LSD&...
5536,crawler-naver-opinion-2019,20190320_085108.625709,"[천병혁의 야구세상] 감독의 팀 운영과 선수의 반발, 문제는 프로의식",https://news.naver.com/main/read.nhn?mode=LSD&...
5552,crawler-naver-opinion-2019,20190321_002700.982629,"[천병혁의 야구세상] 감독의 팀 운영과 선수의 반발, 문제는 프로의식",https://news.naver.com/main/read.nhn?mode=LSD&...
5686,crawler-naver-opinion-2019,20190212_213241.010000,이에리사휴먼스포츠 'ODA를 통한 스포츠외교 발전 방향' 세미나,https://news.naver.com/main/read.nhn?mode=LSD&...


(1574626, None)

# 문서 아이디 변경

In [14]:
doc_id_list = []
for no, row in error_df.iterrows():
    doc_id_list.append((get_doc_id(url=row['url']), row))

In [15]:
df[ df['id'].isin([x[0] for x in doc_id_list]) ]

Unnamed: 0,index,id,title,url
8031,crawler-naver-opinion-2019,088-0000572966,통합신공항으로 대구경북 비상의 날개를 달자,https://news.naver.com/main/read.nhn?mode=LSD&...
8063,crawler-naver-opinion-2019,001-0010581471,[연합시론] 체육계 잇단 '미투'…우리 모두의 책임이다,https://news.naver.com/main/read.nhn?mode=LSD&...
9310,crawler-naver-opinion-2019,001-0010733705,"[천병혁의 야구세상] 김경문 ""선동열 감독 생각하면 아직 마음 아파""",https://news.naver.com/main/read.nhn?mode=LSD&...
10966,crawler-naver-opinion-2019,001-0010606816,[\\xec\\xb2\\x9c\\xeb\\xb3\\x91\\xed\\x98\\x81...,https://news.naver.com/main/read.nhn?mode=LSD&...
28965,crawler-naver-opinion-2019,003-0009110562,\\xed\\x94\\x84\\xeb\\xa1\\x9c\\xec\\x95\\xbc\...,https://news.naver.com/main/read.nhn?mode=LSD&...
...,...,...,...,...
10902048,crawler-naver-it-2019,001-0010605300,KBS 아이돌 토크쇼 웹예능 '어썸 라이브',https://news.naver.com/main/read.nhn?mode=LS2D...
10902058,crawler-naver-it-2019,003-0009407125,'KT 갤럭시노트10 런칭 파티 경품 1등을 축하하며',https://news.naver.com/main/read.nhn?mode=LS2D...
10902068,crawler-naver-it-2019,001-0010998053,"삼성 반도체 영화 '메모리즈', 개봉 일주일만에 3천만뷰 돌파",https://news.naver.com/main/read.nhn?mode=LS2D...
10902078,crawler-naver-it-2019,003-0009291570,"'베리굿' 조현, 코스프레 선정성 가타부타···시끌시끌",https://news.naver.com/main/read.nhn?mode=LS2D...


In [16]:
len(doc_id_list), doc_id_list[:3]

(1574626,
 [('003-0009091871',
   index                           crawler-naver-opinion-2019
   id                                  20190304_233315.862944
   title                                   [기자수첩]밴드 해체와 1인 가구
   url      https://news.naver.com/main/read.nhn?mode=LSD&...
   Name: 5435, dtype: object),
  ('003-0009091871',
   index                           crawler-naver-opinion-2019
   id                                  20190304_191051.209450
   title                                   [기자수첩]밴드 해체와 1인 가구
   url      https://news.naver.com/main/read.nhn?mode=LSD&...
   Name: 5436, dtype: object),
  ('001-0010705060',
   index                           crawler-naver-opinion-2019
   id                                  20190320_085108.625709
   title               [천병혁의 야구세상] 감독의 팀 운영과 선수의 반발, 문제는 프로의식
   url      https://news.naver.com/main/read.nhn?mode=LSD&...
   Name: 5536, dtype: object)])

In [17]:
def change_doc_id(id_list: list, index: str) -> None:
    doc_list = []
    es.get_by_ids(index=index, id_list=id_list, result=doc_list)

    bulk = []
    for doc in doc_list:
        prev_id = doc['document_id']
        for k in '_index,_id,document_id'.split(','):
            if k not in doc:
                continue
            del doc[k]

        doc_id = get_doc_id(url=doc['url'])
        bulk += [{
            'delete': {
                '_id': prev_id,
                '_index': index,
            }
        }, {
            'update': {
                '_id': doc_id,
                '_index': index,
            }
        }, {
            'doc': doc,
            'doc_as_upsert': True,
        }]
        
    if len(bulk) == 0:
        return
        
    es.conn.bulk(
        index=index,
        body=bulk,
        refresh=True,
        params={'request_timeout': 620},
    )
    return

In [20]:
buf = defaultdict(list)
for doc_id, row in tqdm(doc_id_list):
    index = row['index']
    buf[index].append(row['id'])

    if len(buf[index]) > 500:
        change_doc_id(id_list=buf[index], index=index)
        del buf[index]

 52%|█████▏    | 820855/1574626 [00:43<00:49, 15282.93it/s]POST https://corpus.ncsoft.com:9200/crawler-naver-international-2019/_bulk?refresh=true [status:413 request:6.148s]
 52%|█████▏    | 821355/1574626 [00:59<00:54, 13718.66it/s]


TransportError: TransportError(413, '')

In [None]:
for index, id_list in buf.items():
    change_doc_id(id_list=id_list, index=index)