# 중복 문서 점검

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import bz2
import json
import logging
import os
import re
from collections import defaultdict
from glob import glob
from typing import Optional

import pandas as pd
from IPython.display import display
from tqdm import tqdm

from crawler.utils.es import ElasticSearchUtils

  from tqdm.autonotebook import tqdm


In [None]:
def read_merged_size(filename, year='2020'):
    """Summarise per-index document counts from a whitespace-separated size dump.

    Every non-blank line of ``filename`` must contain exactly two fields: an
    index name and an integer document count. Index names containing
    ``'merged'`` are recorded in the ``backfill`` column of the index name with
    ``'-merged'`` removed; all other names are recorded in the ``corpus``
    column. Only indices whose name contains ``year`` are kept.

    Args:
        filename: Path of the size dump to read.
        year: Substring an index name must contain to be included.

    Returns:
        A 3-tuple ``(corpus_total, backfill_total, None)``. The totals are
        strings with thousands separators; the last element is the return value
        of ``display(count_df)``, which renders the filtered table as a side
        effect.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields or the
            count is not an integer.
    """
    data = defaultdict(dict)

    with open(filename, 'r') as fp:
        for line_no, line in enumerate(fp, start=1):
            # str.split() with no argument collapses any run of whitespace.
            fields = line.split()
            if not fields:
                continue

            if len(fields) != 2:
                raise ValueError(
                    f'{filename}:{line_no}: expected "<index> <count>", got {line.rstrip()!r}'
                )

            index, count = fields[0], int(fields[1])

            if 'merged' in index:
                data[index.replace('-merged', '')]['backfill'] = count
            else:
                data[index]['corpus'] = count

    # reindex guarantees both columns exist even when the dump only contains
    # one kind of index, so the sums below cannot raise KeyError.
    count_df = pd.DataFrame(data).T.reindex(columns=['corpus', 'backfill'])

    # Plain substring test: unlike ``find(...) > 0`` it also keeps names that
    # start with the year.
    keep = count_df.index.astype(str).str.contains(year, regex=False)
    count_df = count_df[keep]

    # Missing counts are NaN, which turns the column into floats; the totals
    # are document counts, so render them as integers.
    corpus_total = int(count_df['corpus'].sum())
    backfill_total = int(count_df['backfill'].sum())

    return f"{corpus_total:,}", f"{backfill_total:,}", display(count_df)

# Summarise the index sizes recorded in the dump file; the returned tuple is shown as the cell output.
read_merged_size(filename='../data/es_dump/merged-size.txt')

In [3]:
# Connection settings are read from the environment so that credentials are
# not stored in the notebook. ES_ENCODED_AUTH must be set before this cell runs
# (a missing value raises KeyError here instead of failing later on a request).
# NOTE(review): the credential that used to be hard-coded in this cell has been
# committed in the notebook history and should be rotated.
es = ElasticSearchUtils(
    host=os.environ.get('ES_HOST', 'https://corpus.ncsoft.com:9200'),
    encoded_auth=os.environ['ES_ENCODED_AUTH'],
)

In [4]:
def read_ids(path: str) -> list:
    """Collect tab-separated rows from every bz2 file matching ``path``.

    Each file is opened in binary mode and iterated line by line. A line is
    decoded as UTF-8, stripped of trailing whitespace and, unless it is then
    empty, split on tabs into a list of fields.

    Note that the trailing strip also removes trailing tabs, so a line whose
    last fields are empty yields a shorter row.

    Args:
        path: Glob pattern selecting the ``.bz2`` files to read.

    Returns:
        A list of rows (lists of strings), in the order the files are returned
        by ``glob`` and the lines appear in each file.
    """
    rows = []
    for filename in tqdm(glob(path)):
        with bz2.open(filename, 'r') as fp:
            for raw in fp:
                text = raw.decode('utf-8').rstrip()
                if text == '':
                    continue
                rows.append(text.split('\t'))

    return rows

In [5]:
def get_doc_id(url: str) -> Optional[str]:
    """Build the ``'<oid>-<aid>'`` document id from the digits in a URL.

    The first ``oid=<digits>`` and the first ``aid=<digits>`` occurrence in
    ``url`` are used.

    Args:
        url: Article URL expected to carry ``oid`` and ``aid`` parameters.

    Returns:
        ``'<oid>-<aid>'`` when both values are present, otherwise ``None``
        (including when ``url`` is not a string, e.g. ``None``).
    """
    if not isinstance(url, str):
        return None

    oid = re.search(r'oid=(\d+)', url)
    aid = re.search(r'aid=(\d+)', url)

    # Both parts are required: with only one of them present the id cannot be
    # built, and dereferencing the missing match would raise AttributeError.
    if oid is None or aid is None:
        return None

    return f'{oid.group(1)}-{aid.group(1)}'

In [24]:
# Read every bz2-compressed TSV id dump matching the 2018 Naver glob into a list of rows.
ids = read_ids(path='../data/es_dump/corpus/doc_ids/crawler-naver-*-2018.ids.tsv.bz2')

# Preview the first three rows.
ids[:3]

100%|██████████| 19/19 [01:09<00:00,  3.64s/it]


[['crawler-naver-economy-reply-2018',
  '015-0004058465',
  '[집코노미] "쪽박 차게 생겼어요"…전세금 돌려줄 돈 없어 속 타는 갭투자자',
  'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=260&oid=015&aid=0004058465'],
 ['crawler-naver-economy-reply-2018',
  '025-0002869153',
  '금융위장·금감원장 회동···"靑 갈등봉합 지시? 아니다"',
  'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=260&oid=025&aid=0002869153'],
 ['crawler-naver-economy-reply-2018',
  '277-0004371608',
  '서울 아파트값 4주 연속↓…강남권 이어 영등포·동작·양천·노원도',
  'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=260&oid=277&aid=0004371608']]

In [25]:
# One row per dumped line; the four fields are named index, id, title and url.
df = pd.DataFrame(ids, columns=['index', 'id', 'title', 'url'])

# display() renders the preview and returns None, hence the trailing None in the output tuple.
len(df), display(df.head())

Unnamed: 0,index,id,title,url
0,crawler-naver-economy-reply-2018,015-0004058465,"[집코노미] ""쪽박 차게 생겼어요""…전세금 돌려줄 돈 없어 속 타는 갭투자자",https://news.naver.com/main/read.nhn?mode=LS2D...
1,crawler-naver-economy-reply-2018,025-0002869153,"금융위장·금감원장 회동···""靑 갈등봉합 지시? 아니다""",https://news.naver.com/main/read.nhn?mode=LS2D...
2,crawler-naver-economy-reply-2018,277-0004371608,서울 아파트값 4주 연속↓…강남권 이어 영등포·동작·양천·노원도,https://news.naver.com/main/read.nhn?mode=LS2D...
3,crawler-naver-economy-reply-2018,029-0002496634,수도권 추첨 물량 75% 무주택자에 우선 공급,https://news.naver.com/main/read.nhn?mode=LS2D...
4,crawler-naver-economy-reply-2018,014-0004140612,‘비산자이아이파크’ 월곶~판교선 수혜단지로 주목,https://news.naver.com/main/read.nhn?mode=LS2D...


(9131130, None)

In [26]:
# Number of rows sharing each document id across all loaded rows.
doc_count = df.groupby(by='id').size().to_frame()

# Column 0 holds the group size: show ids seen more than twice, then ids seen more than once.
display(doc_count[ doc_count[0] > 2 ]), display(doc_count[ doc_count[0] > 1 ])

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
001-0009782792,4
001-0009782825,3
001-0009783407,3
001-0009783455,3
001-0009783508,3
...,...
586-0000004431,4
586-0000004434,3
586-0000004435,3
586-0000004442,3


Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
001-0009781752,2
001-0009782332,2
001-0009782415,2
001-0009782792,4
001-0009782825,3
...,...
607-0000000062,2
607-0000000109,2
607-0000000118,2
607-0000000122,2


(None, None)

In [27]:
# Number of rows sharing each exact URL string.
freq = df.groupby(by='url').size().to_frame()

# Column 0 holds the group size: show URLs seen more than twice, then URLs seen more than once.
display(freq[freq[0] > 2]), display(freq[freq[0] > 1])

Unnamed: 0_level_0,0
url,Unnamed: 1_level_1
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009783547,4
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009797026,3
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009804903,4
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009812560,3
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009823674,3
...,...
https://sports.news.naver.com/kbaseball/news/read.nhn?oid=468&aid=0000385287,4
https://sports.news.naver.com/kbaseball/news/read.nhn?oid=468&aid=0000385296,4
https://sports.news.naver.com/kbaseball/news/read.nhn?oid=468&aid=0000399871,3
https://sports.news.naver.com/kbaseball/news/read.nhn?oid=468&aid=0000442661,3


Unnamed: 0_level_0,0
url,Unnamed: 1_level_1
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009783547,4
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009784098,2
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009784119,2
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009784128,2
http://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0009784130,2
...,...
https://sports.news.naver.com/volleyball/news/read.nhn?oid=530&aid=0000002652,2
https://sports.news.naver.com/volleyball/news/read.nhn?oid=536&aid=0000000103,2
https://sports.news.naver.com/wfootball/news/read.nhn?oid=091&aid=0006289931,2
https://sports.news.naver.com/wfootball/news/read.nhn?oid=091&aid=0006321573,2


(None, None)

In [None]:
# Spot check: list every row that carries this exact article URL.
url = 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=254&oid=001&aid=0012111805'

df[ df['url'] == url ]

In [None]:
# Length of a sample '<oid>-<aid>' id (14 characters).
len('001-0012111805')

# corpus 문서 아이디가 잘못된 것

In [28]:
# Earlier variant of the filter, kept for reference:
# error_df = df[ df['id'].str.find('2021') == 0 ]
# Treat every id that is not exactly 14 characters long as malformed.
error_df = df[ df['id'].str.len() != 14 ]

# display() renders the preview and returns None, hence the trailing None in the output tuple.
len(error_df), display(error_df.head())

Unnamed: 0,index,id,title,url
733955,crawler-naver-politics-2018,2019-09-01T03:30:46.006002+09:00,인사하는 선동열,https://news.naver.com/main/read.nhn?mode=LS2D...
733956,crawler-naver-politics-2018,2019-09-01T03:31:16.425226+09:00,선서하는 선동열,https://news.naver.com/main/read.nhn?mode=LS2D...
736754,crawler-naver-politics-2018,2019-09-01T03:34:50.698673+09:00,답변하는 선동열,https://news.naver.com/main/read.nhn?mode=LS2D...
1146317,crawler-naver-politics-2018,2019-08-31T15:40:45.098423+09:00,[금주 뉴시스 포토④]국정감사에 참석한 선동열,https://news.naver.com/main/read.nhn?mode=LS2D...
1158805,crawler-naver-politics-2018,2019-09-01T00:24:25.602042+09:00,"홈쇼핑 송출 수수료 5년간 35% 증가…""소비자 부담도 커져”",https://news.naver.com/main/read.nhn?mode=LS2D...


(1702620, None)

# 문서 아이디 변경

In [None]:
# Pair every malformed-id row that has a URL with the id derived from that URL.
# The derived id may be None when get_doc_id cannot build one.
doc_id_list = [
    (get_doc_id(url=row['url']), row)
    for _, row in error_df.iterrows()
    if row['url'] is not None
]

In [15]:
# Rows whose current id equals an id derived for a malformed-id row
# (presumably the same article stored a second time under its proper id).
df[ df['id'].isin([x[0] for x in doc_id_list]) ]

Unnamed: 0,index,id,title,url
8031,crawler-naver-opinion-2019,088-0000572966,통합신공항으로 대구경북 비상의 날개를 달자,https://news.naver.com/main/read.nhn?mode=LSD&...
8063,crawler-naver-opinion-2019,001-0010581471,[연합시론] 체육계 잇단 '미투'…우리 모두의 책임이다,https://news.naver.com/main/read.nhn?mode=LSD&...
9310,crawler-naver-opinion-2019,001-0010733705,"[천병혁의 야구세상] 김경문 ""선동열 감독 생각하면 아직 마음 아파""",https://news.naver.com/main/read.nhn?mode=LSD&...
10966,crawler-naver-opinion-2019,001-0010606816,[\\xec\\xb2\\x9c\\xeb\\xb3\\x91\\xed\\x98\\x81...,https://news.naver.com/main/read.nhn?mode=LSD&...
28965,crawler-naver-opinion-2019,003-0009110562,\\xed\\x94\\x84\\xeb\\xa1\\x9c\\xec\\x95\\xbc\...,https://news.naver.com/main/read.nhn?mode=LSD&...
...,...,...,...,...
10902048,crawler-naver-it-2019,001-0010605300,KBS 아이돌 토크쇼 웹예능 '어썸 라이브',https://news.naver.com/main/read.nhn?mode=LS2D...
10902058,crawler-naver-it-2019,003-0009407125,'KT 갤럭시노트10 런칭 파티 경품 1등을 축하하며',https://news.naver.com/main/read.nhn?mode=LS2D...
10902068,crawler-naver-it-2019,001-0010998053,"삼성 반도체 영화 '메모리즈', 개봉 일주일만에 3천만뷰 돌파",https://news.naver.com/main/read.nhn?mode=LS2D...
10902078,crawler-naver-it-2019,003-0009291570,"'베리굿' 조현, 코스프레 선정성 가타부타···시끌시끌",https://news.naver.com/main/read.nhn?mode=LS2D...


In [16]:
# Number of (derived_id, row) pairs collected, and a preview of the first three.
len(doc_id_list), doc_id_list[:3]

(1574626,
 [('003-0009091871',
   index                           crawler-naver-opinion-2019
   id                                  20190304_233315.862944
   title                                   [기자수첩]밴드 해체와 1인 가구
   url      https://news.naver.com/main/read.nhn?mode=LSD&...
   Name: 5435, dtype: object),
  ('003-0009091871',
   index                           crawler-naver-opinion-2019
   id                                  20190304_191051.209450
   title                                   [기자수첩]밴드 해체와 1인 가구
   url      https://news.naver.com/main/read.nhn?mode=LSD&...
   Name: 5436, dtype: object),
  ('001-0010705060',
   index                           crawler-naver-opinion-2019
   id                                  20190320_085108.625709
   title               [천병혁의 야구세상] 감독의 팀 운영과 선수의 반발, 문제는 프로의식
   url      https://news.naver.com/main/read.nhn?mode=LSD&...
   Name: 5536, dtype: object)])

In [21]:
def change_doc_id(id_list: list, index: str, batch_size: int = 200) -> None:
    """Re-key documents of ``index`` from their current id to the URL-derived id.

    The documents are fetched with ``es.get_by_ids``. For each one, the new id
    is derived from its ``url`` with ``get_doc_id`` and three bulk lines are
    queued: delete the old id, upsert under the new id, and the document
    payload (the document without ``_index``, ``_id`` and ``document_id``).

    Documents for which no new id can be derived, or which have no
    ``document_id``, are skipped with a warning instead of being deleted.

    The bulk lines are sent in chunks of ``batch_size`` documents so that a
    single request does not grow large enough to be rejected with HTTP 413.
    Request failures and per-item failures are logged; processing continues
    with the next chunk.

    Args:
        id_list: Current ids of the documents to re-key.
        index: Name of the index that holds the documents.
        batch_size: Maximum number of documents per bulk request.

    Returns:
        None.
    """
    logger = logging.getLogger(__name__)
    stripped_keys = ('_index', '_id', 'document_id')

    doc_list = []
    es.get_by_ids(index=index, id_list=id_list, result=doc_list)

    # One entry per document, holding its three bulk lines, so that a chunk
    # boundary can never split the lines belonging to one document.
    actions = []
    for doc in doc_list:
        prev_id = doc.get('document_id')
        url = doc.get('url')

        # A malformed URL must not abort the whole batch.
        try:
            doc_id = get_doc_id(url=url) if url else None
        except (AttributeError, TypeError):
            doc_id = None

        if prev_id is None or doc_id is None:
            # Deleting without a valid target id would lose the document.
            logger.warning(
                'skip re-key in %s: document_id=%r url=%r', index, prev_id, url
            )
            continue

        payload = {k: v for k, v in doc.items() if k not in stripped_keys}
        actions.append([
            {
                'delete': {
                    '_id': prev_id,
                    '_index': index,
                }
            },
            {
                'update': {
                    '_id': doc_id,
                    '_index': index,
                }
            },
            {
                'doc': payload,
                'doc_as_upsert': True,
            },
        ])

    step = max(1, int(batch_size))
    for start in range(0, len(actions), step):
        chunk = actions[start:start + step]
        bulk = [line for action in chunk for line in action]

        try:
            response = es.conn.bulk(
                index=index,
                body=bulk,
                refresh=True,
                params={'request_timeout': 620},
            )
        except Exception as e:
            # Best effort: report the failure and keep going with the next chunk.
            logger.warning(
                'bulk re-key failed for %s (%d documents): %s', index, len(chunk), e
            )
            continue

        # NOTE(review): assumes es.conn.bulk returns the Elasticsearch bulk
        # response dict with an 'errors' flag -- confirm against the client.
        if isinstance(response, dict) and response.get('errors'):
            logger.warning(
                'bulk re-key for %s reported item errors (%d documents in request)',
                index, len(chunk),
            )

    return

In [22]:
# Group the malformed ids by index and re-key a group as soon as it holds more
# than 500 ids; groups that never exceed the threshold stay in buf.
buf = defaultdict(list)
for _, row in tqdm(doc_id_list):
    index = row['index']
    pending = buf[index]
    pending.append(row['id'])

    if len(pending) <= 500:
        continue

    change_doc_id(id_list=pending, index=index)
    del buf[index]

 52%|█████▏    | 820763/1574626 [00:48<00:43, 17483.55it/s]POST https://corpus.ncsoft.com:9200/crawler-naver-international-2019/_bulk?refresh=true [status:413 request:6.307s]
100%|██████████| 1574626/1574626 [2:28:37<00:00, 176.59it/s]  


In [23]:
# Re-key the ids still left in buf, one change_doc_id call per index.
for index, id_list in buf.items():
    change_doc_id(id_list=id_list, index=index)