# 중복 문서 점검

In [6]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so edits to project code are picked up.
%load_ext autoreload
%autoreload 2

In [7]:
import bz2
import json
import os
import re
from collections import defaultdict
from glob import glob
from typing import Optional

import pandas as pd
from IPython.display import display
from tqdm import tqdm

from crawler.utils.es import ElasticSearchUtils

In [None]:
# Elasticsearch helper used by the re-keying cells below.
# The auth token is read from the ES_ENCODED_AUTH environment variable so the
# secret is not stored in the notebook; ES_HOST may override the default host.
# NOTE(review): the token that used to be hard-coded here should be rotated.
es = ElasticSearchUtils(
    host=os.environ.get('ES_HOST', 'https://corpus.ncsoft.com:9200'),
    encoded_auth=os.environ['ES_ENCODED_AUTH'],
)

In [8]:
def read_ids(path: str) -> list:
    """Read tab-separated rows from every bz2 file matching a glob pattern.

    Args:
        path: Glob pattern selecting the bz2-compressed, UTF-8 encoded TSV files.

    Returns:
        One list of column strings per non-blank line, in the order the files
        are returned by ``glob`` and the lines appear inside each file.
    """
    result = []
    for filename in tqdm(glob(path)):
        with bz2.open(filename, 'r') as fp:
            for raw in fp:
                # Decode each line only once, then reuse it for the blank-line
                # check and for the split.
                line = raw.decode('utf-8')
                if line.strip() == '':
                    continue
                # rstrip() removes the newline and any other trailing whitespace
                # before the line is split into columns.
                result.append(line.rstrip().split('\t'))

    return result

In [9]:
def get_doc_id(url: str) -> Optional[str]:
    """Build a ``<oid>-<aid>`` document id from the parameters of a URL.

    Args:
        url: URL expected to contain numeric ``oid=`` and ``aid=`` parameters.

    Returns:
        The digits of ``oid`` and ``aid`` joined by a dash, or ``None`` when
        either parameter is missing from the URL.
    """
    oid = re.search(r'oid=(\d+)', url)
    aid = re.search(r'aid=(\d+)', url)

    # Both parts are needed to build the id; bail out if either one is absent
    # instead of failing on a missing match object.
    if oid is None or aid is None:
        return None

    return f'{oid.group(1)}-{aid.group(1)}'

In [5]:
# Raw rows read from the id dumps of the two sources, keyed by source name.
ids = dict(
    backfill=read_ids(path='../data/es_dump/backfill/crawler-naver-*-2020.*.ids.tsv.bz2'),
    corpus=read_ids(path='../data/es_dump/corpus/crawler-naver-*-2020.*.ids.tsv.bz2'),
)

# Peek at the first three rows of each source.
tuple(ids[name][:3] for name in ('backfill', 'corpus'))

100%|██████████| 9/9 [00:54<00:00,  6.03s/it]
100%|██████████| 9/9 [01:04<00:00,  7.17s/it]


([['crawler-naver-society-2020',
   '001-0012087169',
   "교육박물관, 내년 3월까지 '대구문화재 톺아보기' 기획전",
   'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=256&oid=001&aid=0012087169'],
  ['crawler-naver-society-2020',
   '421-0005056094',
   '부안군, 취약계층 1421세대에 에너지 홈닥터 사업 추진',
   'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=256&oid=421&aid=0005056094'],
  ['crawler-naver-society-2020',
   '014-0004548991',
   '대구신세계 산타원정대, 4개 종합병원에 선물 전달',
   'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=256&oid=014&aid=0004548991']],
 [['crawler-naver-society-2020',
   '2020-01-24T17:43:10.766633+09:00',
   '결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들',
   'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=256&oid=001&aid=0011344370'],
  ['crawler-naver-society-2020',
   '2020-01-24T17:42:38.831473+09:00',
   '결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들',
   'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=256&oid=001&aid=0011344373'],
  ['craw

In [6]:
columns = ['index', 'id', 'title', 'url']

# One DataFrame per source, built from the raw rows with named columns.
df = {
    'backfill': pd.DataFrame(ids['backfill'], columns=columns),
    'corpus': pd.DataFrame(ids['corpus'], columns=columns)
}

# display() returns None, so it is called on its own instead of being placed
# inside the result tuple (which used to end in "None, None").
display(df['backfill'].head(), df['corpus'].head())

# Row counts of the two sources.
len(df['backfill']), len(df['corpus'])

Unnamed: 0,index,id,title,url
0,crawler-naver-society-2020,001-0012087169,"교육박물관, 내년 3월까지 '대구문화재 톺아보기' 기획전",https://news.naver.com/main/read.nhn?mode=LS2D...
1,crawler-naver-society-2020,421-0005056094,"부안군, 취약계층 1421세대에 에너지 홈닥터 사업 추진",https://news.naver.com/main/read.nhn?mode=LS2D...
2,crawler-naver-society-2020,014-0004548991,"대구신세계 산타원정대, 4개 종합병원에 선물 전달",https://news.naver.com/main/read.nhn?mode=LS2D...
3,crawler-naver-society-2020,008-0004515856,집에서 몰래 대마초 재배…얼마나 벌었길래,https://news.naver.com/main/read.nhn?mode=LS2D...
4,crawler-naver-society-2020,079-0003444869,"충남도교육청, 제 32회 충남교육상 수상자 8명 선정 시상",https://news.naver.com/main/read.nhn?mode=LS2D...


Unnamed: 0,index,id,title,url
0,crawler-naver-society-2020,2020-01-24T17:43:10.766633+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1,crawler-naver-society-2020,2020-01-24T17:42:38.831473+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
2,crawler-naver-society-2020,029-0002583301,"[속보]검찰, 최강욱 비서관 업무방해 혐의 불구속 기소",https://news.naver.com/main/read.nhn?mode=LS2D...
3,crawler-naver-society-2020,2020-01-24T17:43:21.342272+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
4,crawler-naver-society-2020,2020-01-24T17:47:44.031344+09:00,문성민의 서브,https://news.naver.com/main/read.nhn?mode=LS2D...


(7593493, 8618401, None, None)

In [7]:
# Number of rows per document id for each source; to_frame() stores the count
# in a column named 0.
doc_count = {
    name: frame.groupby(by='id').size().to_frame()
    for name, frame in df.items()
}

# Backfill ids that occur more than once.
doc_count['backfill'].loc[lambda counts: counts[0] > 1]

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
001-0011308126,2
001-0011308128,2
001-0011308198,2
001-0011308231,2
001-0011308232,2
...,...
469-0000567812,2
469-0000567835,2
586-0000014727,2
586-0000016742,2


In [8]:
# All backfill rows stored under one specific id.
df['backfill'].loc[lambda frame: frame['id'] == '001-0011308126']

Unnamed: 0,index,id,title,url
2556569,crawler-naver-society-2020,001-0011308126,"정경두 ""안보 상황 녹록지 않아…힘을 통한 평화 뒷받침할 것""",https://news.naver.com/main/read.nhn?mode=LS2D...
4010032,crawler-naver-politics-2020,001-0011308126,"정경두 ""안보 상황 녹록지 않아…힘을 통한 평화 뒷받침할 것""",https://news.naver.com/main/read.nhn?mode=LS2D...


In [9]:
# URL values of the corpus rows stored under a timestamp-shaped id.
df['corpus'].loc[df['corpus']['id'] == '2020-01-24T17:43:10.766633+09:00', 'url'].values

array(['https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=256&oid=001&aid=0011344370'],
      dtype=object)

In [10]:
# Corpus rows stored under an oid-aid style id.
df['corpus'].loc[lambda frame: frame['id'] == '001-0011344370']

Unnamed: 0,index,id,title,url


In [11]:
# All corpus rows that point at one article URL.
url = 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=256&oid=001&aid=0011344370'
df['corpus'].loc[df['corpus']['url'].eq(url)]

Unnamed: 0,index,id,title,url
0,crawler-naver-society-2020,2020-01-24T17:43:10.766633+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
183,crawler-naver-society-2020,2020-01-25T11:50:41.256649+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
438,crawler-naver-society-2020,2020-01-25T18:27:26.526313+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1199,crawler-naver-society-2020,2020-01-26T09:18:24.246056+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1466,crawler-naver-society-2020,2020-01-26T15:03:46.020621+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1596,crawler-naver-society-2020,2020-01-26T04:58:35.459265+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
8700,crawler-naver-society-2020,2020-01-23T16:26:36.492574+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
9223,crawler-naver-society-2020,2020-01-22T09:57:48.246415+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
9528,crawler-naver-society-2020,2020-01-26T00:09:30.896730+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
9978,crawler-naver-society-2020,2020-01-26T21:36:23.224705+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...


In [12]:
# Number of corpus rows per URL (count stored in column 0), then only the URLs
# that appear more than once.
freq = df['corpus'].groupby(by='url').size().to_frame()
freq.loc[freq[0].gt(1)]

Unnamed: 0_level_0,0
url,Unnamed: 1_level_1
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011308233,1
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011308287,1
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011309042,1
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011309044,1
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011309045,1
...,...
https://sports.news.naver.com/wfootball/news/read.nhn?oid=640&aid=0000006708,1
https://sports.news.naver.com/wfootball/news/read.nhn?oid=640&aid=0000006998,1
https://sports.news.naver.com/wfootball/news/read.nhn?oid=640&aid=0000007100,1
https://sports.news.naver.com/wfootball/news/read.nhn?oid=640&aid=0000007140,1


In [14]:
# All corpus rows carrying one specific title.
df['corpus'].loc[lambda frame: frame['title'] == '결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들']

Unnamed: 0,index,id,title,url
0,crawler-naver-society-2020,2020-01-24T17:43:10.766633+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1,crawler-naver-society-2020,2020-01-24T17:42:38.831473+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
3,crawler-naver-society-2020,2020-01-24T17:43:21.342272+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
17,crawler-naver-society-2020,2020-01-24T17:43:31.966230+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
23,crawler-naver-society-2020,2020-01-24T17:43:00.158291+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
...,...,...,...,...
1719787,crawler-naver-society-2020,2020-11-23T04:12:03.308412+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1719837,crawler-naver-society-2020,2020-11-23T04:11:30.101082+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1719839,crawler-naver-society-2020,2020-11-23T04:11:52.300355+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1719894,crawler-naver-society-2020,2020-11-23T04:11:18.997006+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...


# corpus 문서 아이디가 잘못된 것

In [15]:
# Corpus rows whose id contains the substring '2020-'.
error_df = df['corpus'].loc[lambda frame: frame['id'].str.find('2020-').ge(0)]
error_df.head()

Unnamed: 0,index,id,title,url
0,crawler-naver-society-2020,2020-01-24T17:43:10.766633+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
1,crawler-naver-society-2020,2020-01-24T17:42:38.831473+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
3,crawler-naver-society-2020,2020-01-24T17:43:21.342272+09:00,"결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들",https://news.naver.com/main/read.nhn?mode=LS2D...
4,crawler-naver-society-2020,2020-01-24T17:47:44.031344+09:00,문성민의 서브,https://news.naver.com/main/read.nhn?mode=LS2D...
5,crawler-naver-society-2020,2020-01-24T17:47:12.787994+09:00,다우디 '공격',https://news.naver.com/main/read.nhn?mode=LS2D...


# 문서 아이디 변경

In [16]:
# Pair every row of error_df with the id that get_doc_id derives from its URL.
doc_id_list = [
    (get_doc_id(url=row['url']), row)
    for _, row in error_df.iterrows()
]

In [17]:
# Corpus rows whose id equals one of the ids derived from the URLs.
df['corpus'].loc[df['corpus']['id'].isin([derived_id for derived_id, _ in doc_id_list])]

Unnamed: 0,index,id,title,url
143509,crawler-naver-society-2020,001-0011853147,집합금지 명령에도 대면 예배 순복음대전우리교회서 11명 확진(종합2보),https://news.naver.com/main/read.nhn?mode=LS2D...
306994,crawler-naver-society-2020,001-0011599424,[영화 속 그곳] 찬실이는 복도 많지,https://news.naver.com/main/read.nhn?mode=LS2D...
513193,crawler-naver-society-2020,214-0001038073,[권순표의 작심마이크] 코로나19 블루 극복…최정원이 추천하는 뮤지컬은?,https://news.naver.com/main/read.nhn?mode=LS2D...
539032,crawler-naver-society-2020,001-0011743582,"백두대간 고갯길, 하늘재 길",https://news.naver.com/main/read.nhn?mode=LS2D...
539058,crawler-naver-society-2020,001-0011743575,"순한 고갯길, 하늘재길",https://news.naver.com/main/read.nhn?mode=LS2D...
...,...,...,...,...
8608584,crawler-naver-living-2020,001-0012018680,울산국제영화제 프레 페스티벌 공식 트레일러 2편 공개,https://news.naver.com/main/read.nhn?mode=LS2D...
8611582,crawler-naver-living-2020,001-0012111805,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
8615285,crawler-naver-living-2020,001-0012101238,"""여경래·진화가 소개합니다"" 목포MBC '바다한상 시즌2'",https://news.naver.com/main/read.nhn?mode=LS2D...
8616890,crawler-naver-living-2020,001-0012109556,넷플릭스 '인간수업' 진한새 작가와 '글리치' 만든다,https://news.naver.com/main/read.nhn?mode=LS2D...


In [18]:
# Number of (derived id, row) pairs and a sample of the first three.
len(doc_id_list), doc_id_list[:3]

(1249417,
 [('001-0011344370',
   index                           crawler-naver-society-2020
   id                        2020-01-24T17:43:10.766633+09:00
   title                              결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들
   url      https://news.naver.com/main/read.nhn?mode=LS2D...
   Name: 0, dtype: object),
  ('001-0011344373',
   index                           crawler-naver-society-2020
   id                        2020-01-24T17:42:38.831473+09:00
   title                              결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들
   url      https://news.naver.com/main/read.nhn?mode=LS2D...
   Name: 1, dtype: object),
  ('001-0011344369',
   index                           crawler-naver-society-2020
   id                        2020-01-24T17:43:21.342272+09:00
   title                              결혼 발표한 엑소 첸, 탈퇴 촉구하는 팬들
   url      https://news.naver.com/main/read.nhn?mode=LS2D...
   Name: 3, dtype: object)])

In [21]:
def change_doc_id(id_list: list, index: str) -> None:
    """Re-key documents of one index under the id derived from their URL.

    The documents named in ``id_list`` are fetched through ``es.get_by_ids``.
    For each one, a bulk ``delete`` of its previous id and an ``update`` with
    ``doc_as_upsert`` under the id returned by ``get_doc_id`` are queued, and
    all queued actions are sent in a single bulk request.

    Documents for which ``get_doc_id`` returns ``None`` are skipped, so a
    document is never deleted without a replacement being written.

    Args:
        id_list: Current ids of the documents to fetch and re-key.
        index: Name of the index that holds the documents.
    """
    doc_list = []
    es.get_by_ids(index=index, id_list=id_list, result=doc_list)

    bulk = []
    for doc in doc_list:
        prev_id = doc['document_id']

        doc_id = get_doc_id(url=doc['url'])
        if doc_id is None:
            # No id can be derived from the URL: leave the document where it is
            # rather than deleting it and queueing an update without an id.
            continue

        # Drop the id/index fields so they are not written back into the body.
        for k in ('_index', '_id', 'document_id'):
            doc.pop(k, None)

        bulk += [{
            'delete': {
                '_id': prev_id,
                '_index': index,
            }
        }, {
            'update': {
                '_id': doc_id,
                '_index': index,
            }
        }, {
            'doc': doc,
            'doc_as_upsert': True,
        }]

    if not bulk:
        return

    # NOTE(review): the bulk response is not inspected, so per-item failures
    # go unnoticed — consider checking it.
    es.conn.bulk(
        index=index,
        body=bulk,
        refresh=True,
        params={'request_timeout': 620},
    )


# Collect the current ids per index and hand a group to change_doc_id as soon
# as it holds more than 100 ids; groups that stay smaller remain in buf.
buf = defaultdict(list)
for doc_id, row in tqdm(doc_id_list):
    index = row['index']
    buf[index].append(row['id'])

    if len(buf[index]) <= 100:
        continue

    change_doc_id(id_list=buf[index], index=index)
    del buf[index]

100%|██████████| 1249417/1249417 [11:26:57<00:00, 30.31it/s]  


In [22]:
# Submit the partial groups still held in buf, then empty it so that running
# this cell again does not submit the same ids a second time.
for index, id_list in buf.items():
    change_doc_id(id_list=id_list, index=index)
buf.clear()