# 중복 문서 점검

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import bz2
import json
import os
import re
from collections import defaultdict
from glob import glob
from typing import Optional

import pandas as pd
from IPython.display import display
from tqdm import tqdm

from crawler.utils.es import ElasticSearchUtils

  from tqdm.autonotebook import tqdm


In [37]:
# Per-index document counts parsed from the size summary file:
# {index name: {'corpus': count, 'backfill': count}}.
data = defaultdict(dict)

filename = '../data/es_dump/merged-size.txt'
with open(filename, 'r', encoding='utf-8') as fp:
    for line in fp:
        line = line.strip()
        if line == '':
            continue

        # Each non-blank line is "<index name><whitespace><document count>".
        index, count = re.split(r'\s+', line)
        count = int(count)

        # Counts of "...-merged" indices are filed under the base index name
        # so both counts of an index end up on the same row.
        if 'merged' in index:
            data[index.replace('-merged', '')]['backfill'] = count
        else:
            data[index]['corpus'] = count

count_df = pd.DataFrame(data).T
# Keep only the rows whose index name mentions 2020, wherever it appears
# (the previous `str.find(...) > 0` test dropped names *starting* with it).
count_df = count_df[count_df.index.str.contains('2020', regex=False)]

display(count_df)

# Totals per column, formatted with thousands separators.
f"{count_df['corpus'].sum(axis=0):,}", f"{count_df['backfill'].sum(axis=0):,}"

Unnamed: 0,corpus,backfill
crawler-naver-economy-2020,1447481,1543492
crawler-naver-international-2020,310060,309111
crawler-naver-it-2020,277188,277574
crawler-naver-living-2020,414143,408734
crawler-naver-opinion-2020,74296,73498
crawler-naver-politics-2020,1022806,1028159
crawler-naver-society-2020,2468697,2558538
crawler-naver-sports-2020,954830,962104
crawler-naver-tv-2020,436877,432283


('7,406,378', '7,593,493', None)

In [3]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook source or its history.
#   ES_HOST          optional, defaults to the corpus cluster URL below
#   ES_ENCODED_AUTH  required, the value handed to `encoded_auth`
# A missing ES_ENCODED_AUTH raises KeyError right here.
# NOTE(review): the credential that used to be hard-coded on this line should
# be treated as exposed and rotated.
es = ElasticSearchUtils(
    host=os.environ.get('ES_HOST', 'https://corpus.ncsoft.com:9200'),
    encoded_auth=os.environ['ES_ENCODED_AUTH'],
)

In [4]:
def read_ids(path: str) -> list:
    """Read every bz2-compressed TSV file matching ``path`` into a list of rows.

    Args:
        path: Glob pattern selecting the ``.bz2`` files to read.

    Returns:
        One list of tab-separated fields per non-blank line, in the order the
        files are yielded by ``glob`` and the lines appear inside each file.
        Trailing whitespace is stripped from a line before it is split.
    """
    result = []
    for filename in tqdm(glob(path)):
        with bz2.open(filename, 'r') as fp:
            for raw in fp:
                # Decode each line exactly once; the file is opened in binary
                # mode, so `raw` is a bytes object.
                line = raw.decode('utf-8')
                if line.strip() == '':
                    continue

                result.append(line.rstrip().split('\t'))

    return result

In [5]:
def get_doc_id(url: str) -> Optional[str]:
    """Build the ``<oid>-<aid>`` document id from the query string of a URL.

    Args:
        url: Article URL expected to carry ``oid=<digits>`` and
            ``aid=<digits>`` parameters.

    Returns:
        ``'<oid>-<aid>'`` when both parameters are present, otherwise ``None``.
    """
    oid = re.search(r'oid=(\d+)', url)
    aid = re.search(r'aid=(\d+)', url)

    # Both parts are needed to form an id; with only one of them present the
    # previous `and` check fell through and crashed on the missing match.
    if oid is None or aid is None:
        return None

    return f'{oid.group(1)}-{aid.group(1)}'

In [38]:
# Load the rows of every id dump matching the 2020 crawler-naver file pattern.
ids = read_ids(path='../data/es_dump/corpus/crawler-naver-*-2020.*.ids.tsv.bz2')

# Preview the first three rows.
ids[:3]

100%|██████████| 9/9 [00:55<00:00,  6.11s/it]


[['crawler-naver-society-2020',
  '029-0002583301',
  '[속보]검찰, 최강욱 비서관 업무방해 혐의 불구속 기소',
  'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=257&oid=029&aid=0002583301'],
 ['crawler-naver-society-2020',
  '082-0000976426',
  '설 연휴 ‘우한 폐렴’ 확산 최대 고비… 손 씻기·기침예절 지켜야',
  'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=257&oid=082&aid=0000976426'],
 ['crawler-naver-society-2020',
  '029-0002583337',
  '[포토] 설 귀성 열차 오르는 가족',
  'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=257&oid=029&aid=0002583337']]

In [39]:
# One row per dumped document; the column names follow the TSV field order.
df = pd.DataFrame(ids, columns=['index', 'id', 'title', 'url'])

# Show the preview first, then let the row count be the cell's result
# (putting display() inside a tuple printed a stray None).
display(df.head())
len(df)

Unnamed: 0,index,id,title,url
0,crawler-naver-society-2020,029-0002583301,"[속보]검찰, 최강욱 비서관 업무방해 혐의 불구속 기소",https://news.naver.com/main/read.nhn?mode=LS2D...
1,crawler-naver-society-2020,082-0000976426,설 연휴 ‘우한 폐렴’ 확산 최대 고비… 손 씻기·기침예절 지켜야,https://news.naver.com/main/read.nhn?mode=LS2D...
2,crawler-naver-society-2020,029-0002583337,[포토] 설 귀성 열차 오르는 가족,https://news.naver.com/main/read.nhn?mode=LS2D...
3,crawler-naver-society-2020,029-0002583338,[포토] 다녀오겠습니다,https://news.naver.com/main/read.nhn?mode=LS2D...
4,crawler-naver-society-2020,029-0002583303,"김경율 ""조국 의혹 목소리 계속 내는 새 시민단체 만든다""",https://news.naver.com/main/read.nhn?mode=LS2D...


(7406378, None)

In [42]:
# Number of rows sharing each document id (the count lands in column 0).
doc_count = df.groupby(by='id').size().to_frame()

# Ids seen more than twice, then ids seen more than once. A single display()
# call renders both frames without the "(None, None)" tuple output.
display(doc_count[doc_count[0] > 2], doc_count[doc_count[0] > 1])

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
001-0011308253,3
001-0011308276,3
001-0011308393,3
001-0011308396,3
001-0011308403,3
...,...
469-0000563433,3
469-0000564332,3
469-0000565646,3
469-0000566149,3


Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
001-0011307519,2
001-0011308126,2
001-0011308128,2
001-0011308198,2
001-0011308231,2
...,...
629-0000046997,2
629-0000047098,2
629-0000047103,2
629-0000052746,2


(None, None)

In [43]:
# Number of rows sharing each URL (the count lands in column 0).
freq = df.groupby(by='url').size().to_frame()

# URLs seen more than twice, then URLs seen more than once. A single display()
# call renders both frames without the "(None, None)" tuple output.
display(freq[freq[0] > 2], freq[freq[0] > 1])

Unnamed: 0_level_0,0
url,Unnamed: 1_level_1
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=014&aid=0004367064,3
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=254&oid=001&aid=0012111805,413
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=254&oid=003&aid=0010271158,356
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=105&sid2=226&oid=001&aid=0012112595,237
https://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=115&oid=215&aid=0000926403,3


Unnamed: 0_level_0,0
url,Unnamed: 1_level_1
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011309168,2
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011309201,2
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011309291,2
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011309412,2
https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=100&sid2=264&oid=001&aid=0011309556,2
...,...
https://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=115&oid=215&aid=0000926403,3
https://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=115&oid=215&aid=0000926409,2
https://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=115&oid=437&aid=0000255755,2
https://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=115&oid=437&aid=0000255756,2


(None, None)

In [44]:
# Inspect every row recorded for one of the most frequently repeated URLs.
url = 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=102&sid2=254&oid=001&aid=0012111805'

df.loc[df['url'].eq(url)]

Unnamed: 0,index,id,title,url
2430182,crawler-naver-society-2020,20210120T051427780499,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
2430185,crawler-naver-society-2020,20210120T064610287030,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
2430186,crawler-naver-society-2020,20210120T072656392803,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
2430187,crawler-naver-society-2020,20210120T075729488392,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
2430189,crawler-naver-society-2020,20210120T090852290155,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
...,...,...,...,...
2452185,crawler-naver-society-2020,2021-01-01T23:11:47.797962+09:00,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
2452186,crawler-naver-society-2020,2021-01-01T23:30:30.201582+09:00,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
2452188,crawler-naver-society-2020,2021-01-01T23:49:22.610914+09:00,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...
2452298,crawler-naver-society-2020,2021-01-01T22:53:05.905953+09:00,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...


In [53]:
# Length of a regular '<oid>-<aid>' document id, for reference.
len('001-0012111805')

14

# corpus 문서 아이디가 잘못된 것

In [58]:
# Rows whose id does not have the length of a regular '<oid>-<aid>' id.
# The reference length is derived from a sample id instead of a bare number.
error_df = df[df['id'].str.len() != len('001-0012111805')]

# Show the preview first, then let the row count be the cell's result
# (putting display() inside a tuple printed a stray None).
display(error_df.head())
len(error_df)

Unnamed: 0,index,id,title,url
2427042,crawler-naver-society-2020,20210113T153527991803,"SBS 노조 ""윤석민 회장만 이로울뿐""…지주회사 체제 출범 규탄",https://news.naver.com/main/read.nhn?mode=LS2D...
2427806,crawler-naver-society-2020,20210113T153526189527,코로나19 증상 목사가 예배 강행 순복음대전우리교회 12명 확진(종합3보),https://news.naver.com/main/read.nhn?mode=LS2D...
2427974,crawler-naver-society-2020,20210113T153526376452,"KBS 노사, 검언유착 오보 사태 속 수신료 인상 논의",https://news.naver.com/main/read.nhn?mode=LS2D...
2430179,crawler-naver-society-2020,20210120T032020688635,"아리랑TV, 101개국 1억3200만 가구 송출...'국제방송원' 설립되나",https://news.naver.com/main/read.nhn?mode=LS2D...
2430180,crawler-naver-society-2020,20210120T041110288654,"아리랑TV, 101개국 1억3200만 가구 송출...'국제방송원' 설립되나",https://news.naver.com/main/read.nhn?mode=LS2D...


(1063, None)

# 문서 아이디 변경

In [48]:
# Pair every malformed row with the document id recomputed from its URL.
doc_id_list = [
    (get_doc_id(url=row['url']), row)
    for _no, row in error_df.iterrows()
]

In [49]:
# Rows that are already stored under one of the recomputed ids.
df.loc[df['id'].isin([doc_id for doc_id, _row in doc_id_list])]

Unnamed: 0,index,id,title,url
2462718,crawler-naver-society-2020,001-0011853686,코로나19 증상 목사가 예배 강행 순복음대전우리교회 12명 확진(종합3보),https://news.naver.com/main/read.nhn?mode=LS2D...
2468521,crawler-naver-society-2020,003-0009998419,"KBS 노사, 검언유착 오보 사태 속 수신료 인상 논의",https://news.naver.com/main/read.nhn?mode=LS2D...
2468525,crawler-naver-society-2020,003-0010053129,"SBS 노조 ""윤석민 회장만 이로울뿐""…지주회사 체제 출범 규탄",https://news.naver.com/main/read.nhn?mode=LS2D...
2468526,crawler-naver-society-2020,003-0009945329,"KBS ""4년내 직원 1000명 감축, 수신료 올린다""(종합)",https://news.naver.com/main/read.nhn?mode=LS2D...
2468685,crawler-naver-society-2020,001-0012113491,무등산국립공원 사회적 거리두기 안내판,https://news.naver.com/main/read.nhn?mode=LS2D...
...,...,...,...,...
6640067,crawler-naver-sports-2020,055-0000864825,"배혜윤 27점 11리바운드…삼성생명, BNK 누르고 2연패 탈출",https://sports.news.naver.com/general/news/rea...
6640166,crawler-naver-sports-2020,055-0000864826,"'러셀 34점' 한국전력, 대한항공에 짜릿한 승리",https://sports.news.naver.com/volleyball/news/...
6917936,crawler-naver-it-2020,001-0012112595,올해 미국 OTT 가입자 50% 넘게 증가,https://news.naver.com/main/read.nhn?mode=LS2D...
7399296,crawler-naver-living-2020,001-0012111805,코로나19 시대 속 세계 곳곳에 윤동주의 시가 던지는 위로,https://news.naver.com/main/read.nhn?mode=LS2D...


In [50]:
# Number of (recomputed id, original row) pairs and a preview of the first three.
len(doc_id_list), doc_id_list[:3]

(1063,
 [('003-0010053129',
   index                           crawler-naver-society-2020
   id                                   20210113T153527991803
   title                  SBS 노조 "윤석민 회장만 이로울뿐"…지주회사 체제 출범 규탄
   url      https://news.naver.com/main/read.nhn?mode=LS2D...
   Name: 2427042, dtype: object),
  ('001-0011853686',
   index                           crawler-naver-society-2020
   id                                   20210113T153526189527
   title            코로나19 증상 목사가 예배 강행 순복음대전우리교회 12명 확진(종합3보)
   url      https://news.naver.com/main/read.nhn?mode=LS2D...
   Name: 2427806, dtype: object),
  ('003-0009998419',
   index                           crawler-naver-society-2020
   id                                   20210113T153526376452
   title                       KBS 노사, 검언유착 오보 사태 속 수신료 인상 논의
   url      https://news.naver.com/main/read.nhn?mode=LS2D...
   Name: 2427974, dtype: object)])

In [51]:
def change_doc_id(id_list: list, index: str) -> None:
    """Re-key documents of ``index`` under the id derived from their URL.

    The documents named in ``id_list`` are fetched through ``es.get_by_ids``.
    For each of them a delete of the old id and an upsert of the same body
    under ``get_doc_id(url)`` are queued, and everything is sent in a single
    bulk request.

    A document is left untouched when no id can be derived from its URL
    (deleting it would lose it, since the upsert would have no valid ``_id``)
    or when it is already stored under the derived id.

    Args:
        id_list: Current ids of the documents to re-key.
        index: Name of the index holding the documents.
    """
    doc_list = []
    es.get_by_ids(index=index, id_list=id_list, result=doc_list)

    # Bookkeeping fields that must not be written back as part of the body.
    meta_keys = ('_index', '_id', 'document_id')

    bulk = []
    for doc in doc_list:
        prev_id = doc['document_id']
        doc_id = get_doc_id(url=doc['url'])

        # Never delete a document that cannot be re-created under a new id,
        # and skip the round trip when the id is already the right one.
        if doc_id is None or doc_id == prev_id:
            continue

        body = {key: value for key, value in doc.items() if key not in meta_keys}
        bulk += [{
            'delete': {
                '_id': prev_id,
                '_index': index,
            }
        }, {
            'update': {
                '_id': doc_id,
                '_index': index,
            }
        }, {
            'doc': body,
            'doc_as_upsert': True,
        }]

    if len(bulk) == 0:
        return

    es.conn.bulk(
        index=index,
        body=bulk,
        refresh=True,
        params={'request_timeout': 620},
    )
    return


# Collect the malformed ids per index and re-key them in batches: a batch is
# sent as soon as it holds more than 100 ids, and smaller remainders stay in
# `buf` afterwards.
buf = defaultdict(list)
for doc_id, row in tqdm(doc_id_list):
    index = row['index']
    buf[index].append(row['id'])

    # Batch for this index is not full yet.
    if len(buf[index]) <= 100:
        continue

    change_doc_id(id_list=buf[index], index=index)
    del buf[index]

100%|██████████| 1063/1063 [00:21<00:00, 49.49it/s]


In [52]:
# Re-key whatever is still held in `buf`, one call per index.
for index, id_list in buf.items():
    change_doc_id(id_list=id_list, index=index)