In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before each cell runs,
# so edits to imported project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
from collections import defaultdict
from time import sleep

import pandas as pd

# Suppress every warning from here on: no category or module filter is given,
# so the 'ignore' action applies to all of them.
warnings.filterwarnings(action='ignore')

from tqdm import tqdm
from crawler.utils.elasticsearch_utils import ElasticSearchUtils

%matplotlib inline

In [3]:
# Elasticsearch helpers keyed by cluster role: 'corpus' is the copy source and
# 'backfill' is the copy target in the cells below.
#
# Credentials are read from the environment so they are never stored in the notebook.
# Set CORPUS_ES_HTTP_AUTH and BACKFILL_ES_HTTP_AUTH before starting the kernel; a missing
# variable raises KeyError naming the variable.
# NOTE(review): with encoded_auth=True the helper presumably expects a base64-encoded
# 'user:password' string — confirm against ElasticSearchUtils.
es = {
    'corpus': ElasticSearchUtils(
        host='https://corpus.ncsoft.com:9200',
        http_auth=os.environ['CORPUS_ES_HTTP_AUTH'],
        encoded_auth=True,
    ),
    'backfill': ElasticSearchUtils(
        host='https://crawler-es.cloud.ncsoft.com:9200',
        http_auth=os.environ['BACKFILL_ES_HTTP_AUTH'],
        encoded_auth=True,
    ),
}


In [4]:
# Index name passed as `index=` to the Elasticsearch calls in the cells below.
index = 'crawler-naver-economy-2021'
# Date window handed to get_date_range_query(); presumably 'start~end' — TODO confirm the
# exact format and whether both ends are inclusive.
date_range = '2021-01-01~2021-03-20'

In [24]:
# The helper returns a full query body; only its `bool.must` clause (the date filter)
# is reused in the two queries below.
dt_query = es['backfill'].get_date_range_query(date_range=date_range)
date_filter = dt_query['query']['bool']['must']

# corpus:   documents inside the date window that already have a `raw` field.
# backfill: documents inside the date window that still lack a `contents` field.
# `_source` is restricted to an empty field name, presumably so that only hit metadata
# (`_index`, `_id`) comes back — TODO confirm.
query = {
    'corpus': {
        'track_total_hits': True,
        '_source': [''],
        'query': {
            'bool': {
                'must': [
                    {'exists': {'field': 'raw'}},
                    {**date_filter},
                ],
            },
        },
    },
    'backfill': {
        'track_total_hits': True,
        '_source': [''],
        'query': {
            'bool': {
                'must': [date_filter],
                'must_not': [
                    {'exists': {'field': 'contents'}},
                ],
            },
        },
    },
}

# One result list per cluster; dump_index() receives the list through `result=`.
doc_list = {name: [] for name in query}

for name, hits in doc_list.items():
    es[name].dump_index(index=index, query=query[name], result=hits)

crawler-naver-economy-2021:   0%|          | 0.00/127k [00:00<?, ?it/s]

crawler-naver-economy-2021:   0%|          | 0.00/272k [00:00<?, ?it/s]

In [25]:
# Reduce each cluster's hits to a set of (index, id) keys so they can be compared.
ids = {
    name: {(doc['_index'], doc['_id']) for doc in doc_list[name]}
    for name in ('corpus', 'backfill')
}

# Keys returned by both queries.
ids['common'] = list(ids['backfill'] & ids['corpus'])

# missing[name] lists the keys the *other* cluster returned but `name` did not.
missing = {
    'corpus': list(ids['backfill'] - ids['corpus']),
    'backfill': list(ids['corpus'] - ids['backfill']),
}

# Thousands-separated counts, one row per cluster.
summary = {
    'count': {name: format(len(docs), ',') for name, docs in doc_list.items()},
    'missing': {name: format(len(keys), ',') for name, keys in missing.items()},
}

print(len(ids['common']))

pd.DataFrame(summary)

64796


Unnamed: 0,count,missing
corpus,126635,207643
backfill,272439,61839


In [28]:
# Group a small sample (first 10) of the shared (index, id) keys by index name for a quick
# visual check. `defaultdict` comes from the import cell at the top of the notebook.
idx = defaultdict(list)
for sample_index, sample_id in ids['common'][:10]:
    idx[sample_index].append(sample_id)

idx

defaultdict(list,
            {'crawler-naver-economy-2021': ['001-0012242520',
              '018-0004875074',
              '003-0010374709',
              '018-0004860400',
              '366-0000682262',
              '417-0000670171',
              '082-0001072456',
              '417-0000669461',
              '022-0003556437',
              '021-0002462838']})

# 공통 아이디 복사: corpus -> backfill

* 필드 매핑: corpus의 `raw`, `content` → backfill의 `raw`, `contents`

In [23]:
# Copy `raw` and `content` from corpus into backfill (stored there as `raw` and `contents`)
# for every key in ids['common'], `size` documents per round trip.
size = 100
failed = []  # per-document results that the bulk API reported as errors

for i in tqdm(range(0, len(ids['common']), size)):
    id_list = ids['common'][i:i + size]

    # Entries of ids['common'] are (index, id) pairs; only the id part is sent.
    docs = []
    es['corpus'].get_by_ids(index=index,
                            id_list=[x[1] for x in id_list],
                            result=docs,
                            source=['raw', 'content'])

    bulk = []
    for x in docs:
        # Only copy documents that carry both fields.
        if 'raw' not in x or 'content' not in x:
            continue

        bulk.append({'update': {'_id': x['_id'], '_index': index}})
        bulk.append({
            'doc': {'raw': x['raw'], 'contents': x['content']},
            # Partial update only: never create a document that backfill does not have.
            'doc_as_upsert': False,
        })

    # An empty bulk body is rejected, so skip batches where nothing qualified.
    if not bulk:
        continue

    # NOTE(review): refresh=True forces an index refresh after every batch; if nothing
    # reads the index mid-run, one refresh after the loop is likely cheaper.
    resp = es['backfill'].conn.bulk(index=index, body=bulk, refresh=True)

    # The bulk call itself succeeds even when individual actions fail, so the per-item
    # results have to be inspected to notice failed updates.
    if resp['errors']:
        failed += [
            item['update'] for item in resp['items']
            if 'error' in item.get('update', {})
        ]

print(f'failed updates: {len(failed):,}')




  0%|          | 0/2367 [00:00<?, ?it/s][A[A

  0%|          | 1/2367 [00:01<1:00:54,  1.54s/it][A[A

  0%|          | 2/2367 [00:03<59:18,  1.50s/it]  [A[A

  0%|          | 3/2367 [00:04<58:49,  1.49s/it][A[A

  0%|          | 4/2367 [00:06<59:32,  1.51s/it][A[A

  0%|          | 5/2367 [00:07<1:03:46,  1.62s/it][A[A

  0%|          | 6/2367 [00:09<1:09:37,  1.77s/it][A[A

  0%|          | 7/2367 [00:11<1:10:05,  1.78s/it][A[A

  0%|          | 8/2367 [00:13<1:12:34,  1.85s/it][A[A

  0%|          | 9/2367 [00:15<1:13:40,  1.87s/it][A[A

  0%|          | 10/2367 [00:17<1:13:42,  1.88s/it][A[A

  0%|          | 11/2367 [00:19<1:13:30,  1.87s/it][A[A

  1%|          | 12/2367 [00:21<1:14:13,  1.89s/it][A[A

  1%|          | 13/2367 [00:23<1:18:44,  2.01s/it][A[A

  1%|          | 14/2367 [00:25<1:16:04,  1.94s/it][A[A

  1%|          | 15/2367 [00:27<1:13:05,  1.86s/it][A[A

  1%|          | 16/2367 [00:28<1:13:33,  1.88s/it][A[A

  1%|          | 17

  6%|▌         | 138/2367 [04:22<1:11:29,  1.92s/it][A[A

  6%|▌         | 139/2367 [04:24<1:12:03,  1.94s/it][A[A

  6%|▌         | 140/2367 [04:26<1:12:28,  1.95s/it][A[A

  6%|▌         | 141/2367 [04:28<1:10:42,  1.91s/it][A[A

  6%|▌         | 142/2367 [04:30<1:10:05,  1.89s/it][A[A

  6%|▌         | 143/2367 [04:32<1:12:47,  1.96s/it][A[A

  6%|▌         | 144/2367 [04:33<1:09:39,  1.88s/it][A[A

  6%|▌         | 145/2367 [04:35<1:06:36,  1.80s/it][A[A

  6%|▌         | 146/2367 [04:37<1:08:48,  1.86s/it][A[A

  6%|▌         | 147/2367 [04:39<1:08:09,  1.84s/it][A[A

  6%|▋         | 148/2367 [04:41<1:09:09,  1.87s/it][A[A

  6%|▋         | 149/2367 [04:43<1:10:15,  1.90s/it][A[A

  6%|▋         | 150/2367 [04:45<1:12:09,  1.95s/it][A[A

  6%|▋         | 151/2367 [04:47<1:13:24,  1.99s/it][A[A

  6%|▋         | 152/2367 [04:49<1:10:00,  1.90s/it][A[A

  6%|▋         | 153/2367 [04:51<1:11:12,  1.93s/it][A[A

  7%|▋         | 154/2367 [04:53<1:13:20

 12%|█▏        | 274/2367 [08:43<1:03:04,  1.81s/it][A[A

 12%|█▏        | 275/2367 [08:45<1:03:13,  1.81s/it][A[A

 12%|█▏        | 276/2367 [08:47<1:02:09,  1.78s/it][A[A

 12%|█▏        | 277/2367 [08:49<1:05:33,  1.88s/it][A[A

 12%|█▏        | 278/2367 [08:51<1:05:39,  1.89s/it][A[A

 12%|█▏        | 279/2367 [08:53<1:07:04,  1.93s/it][A[A

 12%|█▏        | 280/2367 [08:55<1:08:38,  1.97s/it][A[A

 12%|█▏        | 281/2367 [08:57<1:07:13,  1.93s/it][A[A

 12%|█▏        | 282/2367 [08:59<1:03:28,  1.83s/it][A[A

 12%|█▏        | 283/2367 [09:01<1:07:08,  1.93s/it][A[A

 12%|█▏        | 284/2367 [09:03<1:06:54,  1.93s/it][A[A

 12%|█▏        | 285/2367 [09:05<1:08:10,  1.96s/it][A[A

 12%|█▏        | 286/2367 [09:07<1:08:18,  1.97s/it][A[A

 12%|█▏        | 287/2367 [09:08<1:06:49,  1.93s/it][A[A

 12%|█▏        | 288/2367 [09:11<1:09:24,  2.00s/it][A[A

 12%|█▏        | 289/2367 [09:13<1:08:55,  1.99s/it][A[A

 12%|█▏        | 290/2367 [09:15<1:07:41

 17%|█▋        | 410/2367 [13:03<1:06:06,  2.03s/it][A[A

 17%|█▋        | 411/2367 [13:05<1:07:18,  2.06s/it][A[A

 17%|█▋        | 412/2367 [13:07<1:03:13,  1.94s/it][A[A

 17%|█▋        | 413/2367 [13:09<1:04:03,  1.97s/it][A[A

 17%|█▋        | 414/2367 [13:11<1:02:44,  1.93s/it][A[A

 18%|█▊        | 415/2367 [13:12<58:00,  1.78s/it]  [A[A

 18%|█▊        | 416/2367 [13:14<1:01:01,  1.88s/it][A[A

 18%|█▊        | 417/2367 [13:16<1:03:04,  1.94s/it][A[A

 18%|█▊        | 418/2367 [13:18<1:03:23,  1.95s/it][A[A

 18%|█▊        | 419/2367 [13:20<58:07,  1.79s/it]  [A[A

 18%|█▊        | 420/2367 [13:21<57:48,  1.78s/it][A[A

 18%|█▊        | 421/2367 [13:23<58:30,  1.80s/it][A[A

 18%|█▊        | 422/2367 [13:25<1:01:30,  1.90s/it][A[A

 18%|█▊        | 423/2367 [13:27<1:00:04,  1.85s/it][A[A

 18%|█▊        | 424/2367 [13:29<1:00:51,  1.88s/it][A[A

 18%|█▊        | 425/2367 [13:31<59:12,  1.83s/it]  [A[A

 18%|█▊        | 426/2367 [13:33<1:00:44,  1

KeyboardInterrupt: 

# 차이 나는 문서 확인 

In [None]:
# Inspect the first key that the corpus query returned but the backfill query did not.
# Entries of `missing` are (index, id) pairs, so unpack before using the id in a lookup;
# passing the whole pair as `id=` would not address the intended document.
doc_key = missing['backfill'][0]
doc_index, doc_id = doc_key

[
    doc_key,
    doc_key in ids['backfill'],
    es['corpus'].conn.get(index=doc_index, id=doc_id, _source=['title'])['_source'],
    es['backfill'].conn.get(index=doc_index, id=doc_id, _source=['title'])['_source'],
]

In [None]:
# Fetch one corpus document by a hand-picked id and show only its title and URL.
lookup_id = '20210120T113356505436'
corpus_doc = es['corpus'].conn.get(index=index, id=lookup_id, _source=['title', 'url'])
corpus_doc['_source']

In [None]:
# Search both clusters for the same article by exact title phrase and compare title/URL.
query = {
    '_source': ['title', 'url'],
    'query': {
        'bool': {
            'must': [{
                'match_phrase': {
                    'title': '여자핸드볼 SK, 광명시와 연고지 협약…임오경 의원 지역구(종합)'
                }
            }]
        }
    }
}

title_hits = {
    name: es[name].conn.search(index=index, body=query)['hits']['hits']
    for name in ('corpus', 'backfill')
}

# Show the first hit per cluster in [corpus, backfill] order. A cluster without a match
# yields None instead of raising IndexError on hits[0].
[
    title_hits[name][0]['_source'] if title_hits[name] else None
    for name in ('corpus', 'backfill')
]