In [331]:
import os
import sys
import json
import requests
import pickle
import gzip
import pandas as pd
import pandas_gbq as gbq
from pathlib import Path
from google.oauth2 import service_account
from google.cloud import bigquery
from IPython.core.debugger import set_trace
from tqdm.auto import tqdm
tqdm.pandas()

In [342]:
# BigQuery project and the two destination tables, written as dataset.table.
proj = 'global-news-crawl'
table_downloaded = 'news_dataset.downloaded'
table_trashed = 'news_dataset.trashed'

# Pickle file that backs the NewsIDs set.
newsids_pkl = 'newsids.pickle'

# Service-account credentials, read from a key file given by a relative path.
credentials = service_account.Credentials.from_service_account_file(
    'global-news-crawl-c48d7cd9aa81.json'
)

In [343]:
class NewsIDs:
    """A set of news ids that can be persisted to a pickle file.

    Parameters
    ----------
    fpath : str or path-like
        Location of the pickle file that backs the set.
    init : bool, default False
        When True, start from an empty set and do not read ``fpath``.
        When False, load the set from ``fpath``; the file must already
        exist, otherwise ``open`` raises.
    """

    def __init__(self, fpath, init=False):
        self.fpath = fpath

        if init:
            self.ids = set()
        else:
            # NOTE(review): pickle.load can execute arbitrary code while
            # deserializing. Only point this at files written by save().
            with open(fpath, 'rb') as f:
                self.ids = pickle.load(f)

    def has(self, id):
        """Return True when ``id`` is already in the set."""
        return id in self.ids

    @property
    def size(self):
        """Number of ids currently held in memory."""
        return len(self.ids)

    def push(self, *ids_to_add):
        """Add every positional argument to the in-memory set (not saved)."""
        self.ids |= set(ids_to_add)

    def save(self):
        """Write the set to ``fpath`` without ever leaving it half-written.

        The data is pickled into a sibling ``<fpath>.tmp`` file first and
        only moved over ``fpath`` once the dump has succeeded. Opening
        ``fpath`` directly in ``'wb'`` mode would truncate the previous
        snapshot before the dump runs, so any failure would lose it.
        """
        tmp_path = '{}.tmp'.format(self.fpath)
        try:
            with open(tmp_path, 'wb') as f:
                pickle.dump(self.ids, f)
            os.replace(tmp_path, self.fpath)
        finally:
            # After a successful replace the temp file is gone; this only
            # cleans up when the dump (or the replace) failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def push_and_save(self, *ids_to_add):
        """Add the given ids and immediately persist the updated set."""
        self.push(*ids_to_add)
        self.save()

In [393]:
# Load the saved id set from disk, then display how many ids it holds.
newsids = NewsIDs(newsids_pkl, init=False)
newsids.size

120000

In [394]:
# Open a BigQuery client and fetch both destination tables.
bq_client = bigquery.Client(credentials=credentials, project=proj)
_table_downloaded, _table_trashed = (
    bq_client.get_table(table_name)
    for table_name in (table_downloaded, table_trashed)
)

# The local id set must hold exactly as many ids as both tables have rows.
assert newsids.size == (_table_downloaded.num_rows + _table_trashed.num_rows), 'size of newsids mismatched'

In [395]:
def extract_contents(newsids=None, where=None, n=1000):
    """Read up to ``n`` unseen ``*.json`` files under ``where`` into a DataFrame.

    Parameters
    ----------
    newsids : object with a ``has(id)`` method, or None
        Files whose stem is reported by ``newsids.has`` are skipped.
        ``None`` disables the filter, so every file is a candidate.
    where : str or path-like
        Directory that is searched recursively for ``*.json`` files.
        Required despite the ``None`` default.
    n : int, default 1000
        Stop after this many files have been loaded successfully.

    Returns
    -------
    pandas.DataFrame
        One row per loaded file. The file stem is in the ``id`` column and
        the remaining columns come from the keys of the JSON objects.

    Notes
    -----
    Files that cannot be read or parsed, or that do not contain a JSON
    object, are reported on stdout and skipped. A list-valued ``authors``
    field is flattened to a comma-separated string.
    """
    records = {}
    loaded = 0

    for file in Path(where).glob('**/*.json'):
        news_id = file.stem

        if newsids is not None and newsids.has(news_id):
            continue

        try:
            js = json.loads(file.read_text())
            if not isinstance(js, dict):
                raise ValueError('top-level JSON value is not an object')

            # Joining a plain string would put ', ' between its characters,
            # so only non-string values are flattened.
            if 'authors' in js and not isinstance(js['authors'], str):
                js['authors'] = ', '.join(js['authors'])
        except Exception as err:
            # Best effort: report the file and the reason, then move on.
            # KeyboardInterrupt is deliberately not caught.
            print('\nskipped {}: {!r}'.format(file, err))
            continue

        records[news_id] = js
        loaded += 1
        print('\r{}'.format(loaded), end='')
        if loaded == n:
            break

    df = pd.DataFrame.from_dict(records, orient='index')
    df.index.name = 'id'
    return df.reset_index()

In [None]:
# Gather up to 10000 unseen articles from each local folder.
df_downloaded = extract_contents(where='newsdata/downloaded', newsids=newsids, n=10000)
# The progress counter ends without a newline, so separate the two runs.
print('\n')
df_trashed = extract_contents(where='newsdata/trashed', newsids=newsids, n=10000)

8441

In [389]:
# Bare reference to df_trashed; the trailing ';' suppresses the notebook's output display.
df_trashed;

In [390]:
# An id must not appear in both the downloaded and the trashed batch.
intersect = set(df_downloaded.id).intersection(df_trashed.id)
assert not intersect, 'ids overlapping'

In [391]:
# Record the ids of both batches, persist them, then display the new size.
# NOTE(review): the ids are saved here, before the BigQuery upload cell runs.
# If that upload fails, extract_contents will treat these ids as already seen
# on the next run. Consider uploading first and saving afterwards.
newsids.push_and_save(*df_downloaded.id, *df_trashed.id)
newsids.size

120000

In [392]:
# Append each batch to its BigQuery table in chunks of 1000 rows.
gbq.to_gbq(
    df_downloaded,
    table_downloaded,
    project_id=proj,
    credentials=credentials,
    if_exists='append',
    chunksize=1000,
)
gbq.to_gbq(
    df_trashed,
    table_trashed,
    project_id=proj,
    credentials=credentials,
    if_exists='append',
    chunksize=1000,
)

10it [01:21,  8.14s/it]
10it [00:45,  4.58s/it]
