# 뉴스 파일(JSON)을 BigQuery에 저장

In [14]:
import os
import sys
import json
import requests
import pickle
import gzip
import pandas as pd
import pandas_gbq as gbq
from pathlib import Path
from google.oauth2 import service_account
from google.cloud import bigquery
from IPython.core.debugger import set_trace
from tqdm.auto import tqdm
tqdm.pandas()

## Configuration

In [15]:
# GCP project id; passed as project_id to pandas_gbq and used as the prefix
# of fully qualified table names in the queries.
proj = 'global-news-crawl'
# Target tables, written as "<dataset>.<table>".
table_downloaded = 'news_dataset.downloaded'
table_trashed = 'news_dataset.trashed'
# Service-account credentials loaded from a JSON key file given by a relative
# path, so the file must be reachable from the working directory.
credentials = service_account.Credentials.from_service_account_file('global-news-crawl-c48d7cd9aa81.json')

## Bigquery에서 기존 뉴스 ID 리스트 받기

In [16]:
%%time
# Fetch the id of every article already stored in the "downloaded" table.
# (The original author left `use_bqstorage_api=True` commented out on the
# read_gbq call; it is still not passed here.)
qry_0 = 'SELECT id FROM `{}`'.format('.'.join([proj, table_downloaded]))
ids_downloaded = gbq.read_gbq(
    qry_0,
    project_id=proj,
    credentials=credentials,
)

Wall time: 20.4 s


In [17]:
%%time
# Fetch the id of every article already stored in the "trashed" table.
qry_1 = 'SELECT id FROM `{}`'.format('.'.join([proj, table_trashed]))
ids_trashed = gbq.read_gbq(
    qry_1,
    project_id=proj,
    credentials=credentials,
)

Wall time: 5.99 s


In [18]:
# Collapse each id column into a set for duplicate checks and set algebra.
ids_downloaded_set = set(ids_downloaded['id'])
ids_trashed_set = set(ids_trashed['id'])

In [19]:
# Sanity checks on what is already in BigQuery: no id may appear twice within
# a table, and no id may appear in both tables.
assert len(ids_downloaded_set) == len(ids_downloaded), 'duplicated in downloaded'
assert len(ids_trashed_set) == len(ids_trashed), 'duplicated in trashed'
assert ids_downloaded_set.isdisjoint(ids_trashed_set), 'ids overlapping'

In [20]:
# Every id already present in BigQuery, whichever table holds it.
newsids = ids_downloaded_set.union(ids_trashed_set)
len(newsids)

323763

In [21]:
# Display (downloaded count, trashed count).
tuple(len(ids) for ids in (ids_downloaded_set, ids_trashed_set))

(301135, 22628)

## 파일에서 뉴스정보 추출
나중에는 필요 없는 과정이다. 다운로드받는 즉시 BigQuery로 전송할 것이기 때문이다.

In [22]:
def extract_contents(newsids=None, where=None, n=1000):
    """Load news JSON files that are not yet known into a DataFrame.

    Recursively scans ``where`` for ``*.json`` files. The file stem is used
    as the news id, and files whose id is already in ``newsids`` are skipped.
    A non-string ``authors`` value (normally a list) is flattened into one
    comma-separated string; a value that is already a string is kept as is.
    Files that cannot be read, are not valid JSON, or do not hold a JSON
    object are reported on stdout and skipped.

    Args:
        newsids: Container of ids to skip. ``None`` means nothing is skipped.
        where: Root directory to scan (anything ``pathlib.Path`` accepts).
        n: Stop as soon as exactly this many files have been loaded.

    Returns:
        A DataFrame with one row per loaded file: an ``id`` column followed
        by one column per JSON key.
    """
    # Treat the default None as "no known ids" instead of crashing on `in`.
    known_ids = newsids if newsids is not None else ()
    records = {}
    loaded = 0

    for file in Path(where).glob('**/*.json'):
        news_id = file.stem
        if news_id in known_ids:
            continue

        try:
            # NOTE(review): read_text() uses the platform default encoding;
            # confirm how the crawler wrote these files before pinning one.
            js = json.loads(file.read_text())
            if not isinstance(js, dict):
                # A bare list/number/string cannot become a row of columns.
                raise ValueError('top-level JSON value is not an object')

            authors = js.get('authors')
            # Joining a plain string would split it into single characters,
            # so only flatten real collections of names.
            if authors is not None and not isinstance(authors, str):
                js['authors'] = ', '.join(authors)
        except (OSError, ValueError, TypeError) as err:
            # Best effort: report the offending file and keep scanning.
            # KeyboardInterrupt/SystemExit are deliberately not caught, so a
            # long scan can still be interrupted.
            print('{}: {!r}'.format(file, err))
            continue

        records[news_id] = js
        loaded += 1
        print('\r{}'.format(loaded), end='')  # in-place progress counter
        if loaded == n:
            break

    df = pd.DataFrame.from_dict(records, orient='index')
    df.index.name = 'id'
    return df.reset_index()

In [23]:
# Collect local files whose ids are not yet in BigQuery, capped per folder.
df_downloaded = extract_contents(
    newsids=newsids,
    where='newsdata/downloaded',
    n=50000,
)
# extract_contents prints its counter without a trailing newline; emit line
# breaks so the second counter starts on its own line.
print('\n')
df_trashed = extract_contents(
    newsids=newsids,
    where='newsdata/trashed',
    n=30000,
)

163

78

## downloaded와 trashed 간에 겹치는 게 없는지 확인
기존의 뉴스 파일을 BigQuery에 전송하는 과정에서, 이 둘 간에 ID가 겹치는 사례가 종종 있었다.
(2019.10.18)

In [24]:
# Ids found in both newly extracted frames; there must be none before upload.
intersect = set(df_trashed['id']).intersection(df_downloaded['id'])
assert not intersect, 'ids overlapping'

## Bigquery에 업로드

In [25]:
# Append each new frame to its table, downloaded first and then trashed,
# using identical upload settings for both.
for _frame, _table in (
    (df_downloaded, table_downloaded),
    (df_trashed, table_trashed),
):
    gbq.to_gbq(
        _frame,
        _table,
        project_id=proj,
        if_exists='append',
        chunksize=1000,
        credentials=credentials,
    )

1it [00:06,  6.27s/it]
1it [00:04,  4.02s/it]
