In [0]:
from tqdm import tqdm
import pandas as pd
import json
import numpy as np

from google.cloud import bigquery
from google.colab import auth
# Sign the Colab user in. Presumably this provides the credentials that the
# BigQuery client created later in this notebook relies on — TODO confirm.
auth.authenticate_user()

In [0]:
import logging

# Notebook-wide logger; DEBUG is the lowest standard level, so messages of
# every standard severity pass the logger's own threshold.
logger = logging.getLogger('indentation_gbq')
logger.setLevel(logging.DEBUG)
# getLogger() hands back the same logger object for the same name, so running
# this cell again would attach a second StreamHandler and print every message
# twice. Only attach a handler when none is installed yet.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [0]:
# Google Cloud project that the BigQuery client below is created for.
project_id = "wikidetox-viz"

# Connect to Google's BigQuery.
# `client` is the module-level handle that get_random_comments_of_indent()
# uses to submit its query.
client = bigquery.Client(project=project_id)
#bigquery.Create(project=project_id)

In [0]:
def get_random_comments_of_indent(indent=0, rand=.5, limit=1000):
    """Fetch a random subset of scored comments at one indentation level.

    The query selects every column of the scored table for rows whose
    `cleaned_content` is longer than one character and whose `indentation`
    equals `indent`. Each such row is kept only when `RAND()` falls below
    `rand`, and the result is capped at `limit` rows.

    Args:
        indent: value the `indentation` column must equal.
        rand: threshold compared against `RAND()` for every candidate row.
        limit: value substituted into the query's LIMIT clause.

    Returns:
        Whatever `result()` of the submitted query job returns (the caller
        iterates over it to read the rows).
    """
    # Shuffling with ORDER BY plus a seed would make the sample repeatable, but
    # that is not possible here; see
    # https://www.oreilly.com/learning/repeatable-sampling-of-data-sets-in-bigquery-for-machine-learning
    sql_template = """
        SELECT *
        FROM `en_20180501_wikiconvviz_data.en_20180501_scored`
        WHERE length(cleaned_content)>1 and indentation={indent} and RAND()<{prob}
        LIMIT {limit}
    """
    sql = sql_template.format(indent=indent, prob=rand, limit=limit)
    # Submit the query through the notebook's module-level client and wait for the rows.
    return client.query(sql).result()


In [0]:
def wrapper(r):
    """Decode the useful attributes of a record into a plain dict.

    Only the five attributes listed below are copied. The other record
    attributes (timestamp, page_title, content, replyTo_id, user_id,
    user_text, type, rev_id) are intentionally left out.

    Args:
        r: a record exposing the kept fields as attributes.

    Returns:
        A dict mapping each kept field name to the record's value for it,
        in the order the fields are listed.
    """
    kept_fields = (
        "indentation",
        "RockV6_1_TOXICITY",
        "id",
        "cleaned_content",
        "conversation_id",
    )
    return {field: getattr(r, field) for field in kept_fields}



In [74]:
# Draw one random sample of comments for each indentation level 0 through 10,
# keep only the attributes selected by `wrapper`, and print the mean toxicity
# score of every sample as it arrives.
table = []  # flat list of record dicts across all indentation levels
for i in tqdm(range(0, 11)):
    sample = get_random_comments_of_indent(indent=i)
    wrapped = [wrapper(r) for r in sample]
    table.extend(wrapped)
    out = pd.DataFrame(wrapped)  # this indentation level's sample only
    # An empty sample yields a frame without columns, so the attribute lookup
    # would raise; report NaN for that level instead.
    mean_toxicity = out.RockV6_1_TOXICITY.mean() if wrapped else float("nan")
    print(i, mean_toxicity)

# Build the combined frame once from the accumulated records. Concatenating
# inside the loop re-copies all previously collected rows on every iteration.
output_df = pd.DataFrame(table)
#output_df.to_gbq("en_20180501_wikiconvviz_data.samples_by_indentation", project_id=project_id)

  9%|▉         | 1/11 [00:05<00:56,  5.70s/it]

0 0.12050414917099996


 18%|█▊        | 2/11 [00:10<00:48,  5.41s/it]

1 0.12101686392999984


 27%|██▋       | 3/11 [00:17<00:47,  5.94s/it]

2 0.12416464208400006


 36%|███▋      | 4/11 [00:25<00:45,  6.47s/it]

3 0.13909506942499997


 45%|████▌     | 5/11 [00:37<00:48,  8.12s/it]

4 0.13043349743300006


 55%|█████▍    | 6/11 [00:55<00:55, 11.11s/it]

5 0.13749819441000014


 64%|██████▎   | 7/11 [01:22<01:03, 15.85s/it]

6 0.1419057613789999


 73%|███████▎  | 8/11 [01:56<01:04, 21.35s/it]

7 0.14141019827299986


 82%|████████▏ | 9/11 [02:27<00:48, 24.37s/it]

8 0.146473807178


 91%|█████████ | 10/11 [02:59<00:26, 26.52s/it]

9 0.1473632520311001


100%|██████████| 11/11 [03:30<00:00, 28.03s/it]

10 0.15560498813600002





In [0]:
#!pip install ndjson
import ndjson

# Write the collected records to disk via ndjson. `table` is the list of record
# dicts filled by the sampling loop; the name `output` used here before is never
# defined in this notebook, so the cell failed with NameError on a fresh run.
with open("samples_by_indentation.json", "w") as o:
    o.write(ndjson.dumps(table))

In [0]:
from google.colab import files
# Hand the file written by the previous cell to Colab's download helper.
files.download("samples_by_indentation.json")

In [0]:
# The following are failed attempts to directly create a BigQuery (GBQ) table
#output_df["cleaned_content"] = output_df["cleaned_content"].apply(lambda x: x.replace(u'\r', u' ') if isinstance(x, str) or isinstance(x, unicode) else x)
#output_df.to_gbq("en_20180501_wikiconvviz_data.samples_by_indentation", project_id=project_id, if_exists="replace")