# _Clean & Store in BigQuery_

In [1]:
import utils
import pandas as pd

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [19]:
%%time
# Fetch the cleaned tweet dump for 2020-05-19 with the project-local `utils` helper;
# the recorded output below shows it being written under playground_data/.
utils.download_json('2020-05-19')

Blob dailies/2020-05-19/2020-05-19_clean-dataset.json downloaded to /home/jupyter/covid_disinfo_detect/experiments/playground_data/2020-05-19_clean-dataset.json.
CPU times: user 25.3 s, sys: 7.79 s, total: 33.1 s
Wall time: 45.5 s


In [24]:
def load_data(date, chunksize=50000, data_dir='playground_data'):
    """Read one day's cleaned tweet dump, keeping only the columns of interest.

    The file ``{data_dir}/{date}_clean-dataset.json`` is parsed as
    line-delimited JSON in chunks of ``chunksize`` records. Each chunk is
    trimmed to the columns of interest *before* the chunks are concatenated,
    so the unused columns of the dump are never held in memory for the
    whole file at once.

    Parameters
    ----------
    date : str
        Date prefix of the file name, e.g. ``'2020-05-19'``.
    chunksize : int, default 50000
        Number of JSON lines parsed per chunk.
    data_dir : str or path-like, default 'playground_data'
        Directory that contains the dump.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``created_at``, ``id_str``, ``user``,
        ``lang`` and ``full_text``.

    Raises
    ------
    KeyError
        If any chunk lacks one of the columns of interest.
    """
    cols_interest = [
        'created_at',
        'id_str',
        'user',
        'lang',
        'full_text'
    ]

    reader = pd.read_json(
        f'{data_dir}/{date}_clean-dataset.json',
        lines=True,
        chunksize=chunksize,
        # Force the ID columns to str so pandas does not coerce them to numbers.
        dtype={
            'id_str': str,
            'in_reply_to_status_id_str': str,
            'quoted_status_id_str': str
        }
    )

    try:
        # Project every chunk down to the wanted columns before concatenating.
        df = pd.concat(chunk[cols_interest] for chunk in reader)
    finally:
        # Release the file handle even if a chunk fails part-way through.
        reader.close()

    print('Loaded data...\n')
    return df


def clean_data(df):
    """Flatten the nested ``user`` field into a ``user_id_str`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``created_at``, ``id_str``, ``user``,
        ``lang`` and ``full_text``; every ``user`` value must support
        ``user['id_str']``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``created_at``, ``id_str``,
        ``user_id_str``, ``lang`` and ``full_text``. The input frame is
        left unmodified.
    """
    def _user_id(user):
        # Pull the author's ID out of the nested user object as a string.
        return str(user['id_str'])

    output_columns = ['created_at', 'id_str', 'user_id_str', 'lang', 'full_text']

    # `assign` works on a copy, so the caller's frame is never touched;
    # selecting `output_columns` leaves the raw `user` column behind.
    with_user_id = df.assign(user_id_str=df['user'].map(_user_id))
    dfclean = with_user_id[output_columns]

    print('Cleaned data...\n')
    return dfclean


def data_setup(date):
    """Load the dump for ``date`` and return it in cleaned form.

    Parameters
    ----------
    date : str
        Date identifying the dump; passed straight to ``load_data``.

    Returns
    -------
    pandas.DataFrame
        The result of ``clean_data`` applied to the loaded frame.
    """
    raw_tweets = load_data(date)
    return clean_data(raw_tweets)


def load_bigquery(df, date, dataset_id='twitter_dailies', location='US'):
    """Upload a dataframe to a BigQuery table named after ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to upload.
    date : str
        Date of the data, e.g. ``'2020-05-19'``. Dashes are replaced with
        underscores to form the table name (``2020_05_19``).
    dataset_id : str, default 'twitter_dailies'
        BigQuery dataset that receives the table.
    location : str, default 'US'
        Location used for both the client and the load job.

    Returns
    -------
    None
        The number of loaded rows and the table path are printed.
    """
    # Imported here so the function also works on a fresh kernel: no visible
    # cell of this notebook imports `bigquery`.
    from google.cloud import bigquery

    client = bigquery.Client(location=location)
    dataset = client.get_dataset(dataset_id)
    bq_date = date.replace('-', '_')    # need to slightly change date string
    table_ref = dataset.table(bq_date)

    # NOTE(review): no write disposition is configured, so re-running for the
    # same date presumably appends duplicate rows -- confirm before re-loading.
    job = client.load_table_from_dataframe(df, table_ref, location=location)
    job.result()    # wait for the load job to finish before reporting
    print(f'Loaded {job.output_rows} rows from dataframe to {table_ref.path}\n')
    
    
def data_bq_wrapper(date=None):
    """Load, clean and upload one day's data to BigQuery.

    Parameters
    ----------
    date : str, optional
        Date of the data to store, e.g. ``'2020-05-19'``. When omitted the
        user is prompted for it, which keeps the original interactive
        behaviour; passing it explicitly lets the notebook run unattended.

    Returns
    -------
    None
        Progress and the final confirmation are printed.
    """
    if date is None:
        date = input('What is the date of the data that you would like to store in BigQuery?\n')

    # The date ends up in a file path and a table name, so stray whitespace
    # from typing or pasting would break both.
    date = date.strip()

    df = data_setup(date)
    load_bigquery(df, date)
    print(f'Data from {date} successfully stored in BigQuery.\n')

In [25]:
%%time
# Interactive entry point: asks for a date, then loads, cleans and uploads
# that day's data to BigQuery (see the recorded output below for timings).
data_bq_wrapper()

What is the date of the data that you would like to store in BigQuery?
 2020-05-19


Loaded data...

Cleaned data...

Loaded 1009123 rows from dataframe to /projects/covid-disinfo-detect/datasets/twitter_dailies/tables/2020_05_19

Data from 2020-05-19 successfully stored in BigQuery.

CPU times: user 4min 13s, sys: 8.08 s, total: 4min 21s
Wall time: 5min 25s
