In [0]:
from tqdm import tqdm
import pandas as pd
import json
import numpy as np

from google.cloud import bigquery
from google.colab import auth
auth.authenticate_user()

In [0]:
import logging

# Notebook-wide logger. `logging.getLogger` returns the same object for the same
# name on every call, so re-running this cell used to stack an extra
# StreamHandler each time and duplicate every log line. Attach one only if the
# logger has no handler yet.
logger = logging.getLogger('indentation_gbq')
logger.setLevel(logging.DEBUG)
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [0]:
# Project identifier handed to the BigQuery client below.
project_id = "wikidetox-viz"

# Connect to Google's BigQuery
# `client` is a notebook-level global; get_random_comments_of_indent() runs its
# queries through it.
client = bigquery.Client(project=project_id)
#bigquery.Create(project=project_id)

In [0]:
def get_random_comments_of_indent(indent=0, rand=.5, limit=1000):
    """Query a random sample of scored comments at one indentation level.

    Rows are selected from `en_20180501_wikiconvviz_data.en_20180501_scored`
    where `cleaned_content` is longer than one character, `indentation` equals
    `indent`, and `RAND() < rand` holds; at most `limit` rows are requested.

    Args:
        indent: Indentation level to match (interpolated into the SQL text).
        rand: Threshold compared against `RAND()` for each row.
        limit: Value placed in the `LIMIT` clause.

    Returns:
        Whatever `client.query(...).result()` returns for the notebook-level
        `client`.
    """
    # One could use ORDER BY to shuffle and use a seed, but that is not possible
    # here; see
    # https://www.oreilly.com/learning/repeatable-sampling-of-data-sets-in-bigquery-for-machine-learning
    sql_template = """
        SELECT *
        FROM `en_20180501_wikiconvviz_data.en_20180501_scored`
        WHERE length(cleaned_content)>1 and indentation={indent} and RAND()<{prob}
        LIMIT {limit}
    """
    sql = sql_template.format(limit=limit, indent=indent, prob=rand)
    return client.query(sql).result()


In [0]:
def wrapper(r):
    """Decode the useful attributes of a record into a plain dict.

    Only the attributes listed in `kept_fields` are read from `r`; the dict
    keys are the attribute names, in that order.
    """
    # Attributes deliberately left out: timestamp, page_title, content,
    # replyTo_id, user_id, user_text, type, rev_id.
    kept_fields = (
        "indentation",
        "RockV6_1_TOXICITY",
        "id",
        "cleaned_content",
        "conversation_id",
    )
    return {field: getattr(r, field) for field in kept_fields}



In [6]:
# One sampling query per indentation level 0..10; tqdm shows progress.
calls = [
    get_random_comments_of_indent(indent=level)
    for level in tqdm(range(11))
]

# Flatten the rows of every query result into a single list of dicts.
table = []
for call in calls:
    table.extend(wrapper(row) for row in call)

100%|██████████| 11/11 [03:17<00:00, 27.67s/it]


In [11]:
table_pd = pd.DataFrame(table)

# Print the mean RockV6_1_TOXICITY score for each indentation level 0..10.
for i in range(11):
    at_level = table_pd.indentation == i
    print(i, table_pd.loc[at_level, "RockV6_1_TOXICITY"].mean())

0 0.119359464393
1 0.11693414398200007
2 0.12904383405900005
3 0.12842927973800003
4 0.13536119019700002
5 0.13885123807400002
6 0.13960105472500003
7 0.14846751882199985
8 0.1386643318040001
9 0.13977263703500006
10 0.15901782167099995


In [0]:
#!pip install ndjson
import ndjson

# Serialise the sampled rows with ndjson and write them to disk.
# This cell previously dumped `output`, a name that is never defined in this
# notebook (NameError on a fresh kernel). `table` is the list of row dicts
# built from the query results, so that is what gets exported.
with open("samples_by_indentation.json", "w") as o:
    o.write(ndjson.dumps(table))

In [0]:
from google.colab import files
# Hand the exported "samples_by_indentation.json" to Colab's download helper
# (presumably triggers a browser download — confirm against the Colab docs).
files.download("samples_by_indentation.json")

In [0]:
# The following are failed attempts to directly create a BigQuery table
#output_df["cleaned_content"] = output_df["cleaned_content"].apply(lambda x: x.replace(u'\r', u' ') if isinstance(x, str) or isinstance(x, unicode) else x)
#output_df.to_gbq("en_20180501_wikiconvviz_data.samples_by_indentation", project_id=project_id, if_exists="replace")
#sch = [{"name":"indentation","type":"INTEGER"},{"name":"RockV6_1_TOXICITY", "type":"FLOAT"}, {"name":"id", "type":"STRING"}, {"name":"conversation_id", "type":"STRING"}, {"name":"cleaned_content", "type":"STRING"}]
#table_pd.to_gbq("en_20180501_wikiconvviz_data.samples_by_indentation", project_id=project_id, if_exists="replace", table_schema=sch)