In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
import pandas as pd
from time import sleep
from collections import defaultdict

warnings.filterwarnings(action='ignore')

from tqdm import tqdm
from crawler.utils.elasticsearch_utils import ElasticSearchUtils

%matplotlib inline

In [3]:
es = {
    'corpus': ElasticSearchUtils(**{
        'host': 'https://corpus.ncsoft.com:9200',
        'http_auth': 'ZWxhc3RpYzpubHBsYWI=',
        'encoded_auth': True
    }),
    'backfill': ElasticSearchUtils(**{
        'host': 'https://crawler-es.cloud.ncsoft.com:9200',
        'http_auth': 'ZWxhc3RpYzpzZWFyY2hUMjAyMA==',
        'encoded_auth': True
    })
}


In [4]:
index = 'crawler-naver-economy-2021'
date_range = '2021-01-01~2021-03-20'

In [5]:
query = {
    'track_total_hits': True,
    '_source': [''],
    **es['backfill'].get_date_range_query(date_range=date_range)
}

doc_list = {
    'corpus': [],
    'backfill': [],
}

es['corpus'].dump_index(index=index, query=query, result=doc_list['corpus'])
es['backfill'].dump_index(index=index, query=query, result=doc_list['backfill'])

crawler-naver-economy-2021:   0%|          | 0.00/240k [00:00<?, ?it/s]

crawler-naver-economy-2021:   0%|          | 0.00/331k [00:00<?, ?it/s]

In [7]:
ids = {
    'corpus': set(x['_id'] for x in doc_list['corpus']),
    'backfill': set(x['_id'] for x in doc_list['backfill']),
}

missing = {
    'corpus': list(ids['backfill'].difference(ids['corpus'])),
    'backfill': list(ids['corpus'].difference(ids['backfill'])), 
}

summary = {
    'count': {col: f'{len(x):,}' for col, x in doc_list.items()},
    'missing': {col: f'{len(x):,}' for col, x in missing.items()}
}

pd.DataFrame(summary)

Unnamed: 0,count,missing
corpus,240197,94813
backfill,331419,3591


In [8]:
missing['backfill']

['082-0001076306',
 '015-0004510314',
 '032-0003052762',
 '082-0001076226',
 '011-0003878230',
 '119-0002477439',
 '011-0003885296',
 '374-0000236350',
 '016-0001809307',
 '421-0005233446',
 '025-0003086327',
 '374-0000231058',
 '014-0004604260',
 '008-0004559986',
 '003-0010401321',
 '032-0003053180',
 '001-0012271025',
 '469-0000589556',
 '421-0005232983',
 '374-0000230419',
 '015-0004482008',
 '014-0004563673',
 '2021-01-11T21:07:56.484513+09:00',
 '215-0000931364',
 '081-0003172334',
 '417-0000671110',
 '018-0004880388',
 '018-0004879994',
 '015-0004486370',
 '057-0001550760',
 '081-0003169703',
 '009-0004734086',
 '20210120T213211714723',
 '421-0005232508',
 '081-0003172352',
 '421-0005234306',
 '015-0004516032',
 '417-0000671528',
 '001-0012270907',
 '001-0012271340',
 '032-0003062102',
 '123-0002242534',
 '018-0004880383',
 '277-0004868458',
 '448-0000316755',
 '023-0003602808',
 '215-0000945026',
 '009-0004766009',
 '001-0012270910',
 '629-0000072841',
 '20210120T201554687236',

In [9]:
doc_id = missing['backfill'][0]

[
    doc_id, 
    doc_id in ids['backfill'],
    es['corpus'].conn.get(index=index, id=doc_id, _source=['title'])['_source'],
    es['backfill'].conn.get(index=index, id=doc_id, _source=['title'])['_source']
]

['082-0001076306',
 False,
 {'title': '미샤 가맹점주협, 상생합의 이끈 전재수 의원에 감사 편지'},
 {'title': '미샤 가맹점주협, 상생합의 이끈 전재수 의원에 감사 편지'}]

In [11]:
es['corpus'].conn.get(index=index, id='20210120T113356505436', _source=['title', 'url'])['_source']

{'title': '여자핸드볼 SK, 광명시와 연고지 협약…임오경 의원 지역구(종합)',
 'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=263&oid=001&aid=0012151497'}

In [12]:
query = {
    '_source': ['title', 'url'],
    "query": {
        "bool": {
            "must": [{
                "match_phrase": {
                    "title": "여자핸드볼 SK, 광명시와 연고지 협약…임오경 의원 지역구(종합)"
                }
            }]
        }
    }
}

[
    es['corpus'].conn.search(index=index, body=query)['hits']['hits'][0]['_source'],
    es['backfill'].conn.search(index=index, body=query)['hits']['hits'][0]['_source'],
]

[{'title': '여자핸드볼 SK, 광명시와 연고지 협약…임오경 의원 지역구(종합)',
  'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=263&oid=001&aid=0012151497'},
 {'title': '여자핸드볼 SK, 광명시와 연고지 협약…임오경 의원 지역구(종합)',
  'url': 'https://news.naver.com/main/read.nhn?mode=LS2D&mid=shm&sid1=101&sid2=263&oid=001&aid=0012151497'}]