<a href="https://colab.research.google.com/github/ipavlopoulos/wikidetox/blob/ipavlopoulos-research-context/sampling_by_indentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.cloud import bigquery
from tqdm import tqdm
import pandas as pd
import json
import numpy as np

from google.cloud import bigquery
from google.colab import auth
# Sign the current Colab user in. Presumably this supplies the credentials the
# BigQuery client relies on later in the notebook -- TODO confirm.
auth.authenticate_user()

In [0]:
# Google Cloud project the BigQuery client is bound to; the same id is passed
# again when the sampled table is uploaded with `to_gbq`.
project_id = "wikidetox-viz"

# Connect to Google's BigQuery
# `client` is a module-level handle that the query helper reads directly.
client = bigquery.Client(project=project_id)

In [0]:
def get_random_comments_of_indent(indent=0, rand=.5, limit=1000, superfast=True):
    """Randomly sample scored comments at one indentation level from BigQuery.

    Args:
        indent: value the `indentation` column must equal.
        rand: keep-probability; a matching row is sampled when RAND() < rand.
        limit: maximum number of rows kept in the sample.
        superfast: if True, aggregate inside BigQuery and return only the
            sample's row count and average `RockV6_1_TOXICITY`; if False,
            return the sampled rows themselves.

    Returns:
        Whatever `client.query(query).result()` yields for the built query.
    """
    # one could use order by to shuffle and use seed (here this is not possible; https://www.oreilly.com/learning/repeatable-sampling-of-data-sets-in-bigquery-for-machine-learning)
    # The sampling sub-query is built once so that both modes filter with the
    # same predicate (RAND() < rand) and therefore honour `rand` identically.
    sample_query = """
        SELECT *
        FROM `en_20180501_wikiconvviz_data.en_20180501_scored`
        WHERE indentation={indent} and RAND()<{prob}
        LIMIT {limit}
    """.format(limit=limit, indent=indent, prob=rand)

    if superfast:
        # Only the aggregate leaves BigQuery; the sampled rows stay server-side.
        query = """
        SELECT count(*), avg(sample.RockV6_1_TOXICITY)
        FROM (
          {sample_query}
          ) sample
        """.format(sample_query=sample_query)
    else:
        query = sample_query

    query_job = client.query(query)
    return query_job.result()


In [0]:
# Record attributes copied into each output row, in output column order.
_RECORD_FIELDS = (
    "timestamp",
    "indentation",
    "RockV6_1_TOXICITY",
    "id",
    "page_title",
    "content",
    "cleaned_content",
    "replyTo_id",
    "user_id",
    "user_text",
    "type",
    "conversation_id",
    "rev_id",
)


def wrapper(r):
    """Decode the useful attributes of record `r` into a plain dict.

    Each name in `_RECORD_FIELDS` is read from `r` by attribute access and
    stored under the same key, preserving the listed order.
    """
    return {field: getattr(r, field) for field in _RECORD_FIELDS}

# Draw one random sample per indentation level (0 first, kept in `starters`,
# then 1..10) and keep every per-level frame for the final upload.
sample = get_random_comments_of_indent(indent=0, superfast=False)
starters = pd.DataFrame([wrapper(r) for r in sample])
samples_by_indent = [starters]
for i in tqdm(range(1, 11)):
    sample = get_random_comments_of_indent(indent=i, superfast=False)
    out = pd.DataFrame([wrapper(r) for r in sample])
    if out.empty:
        # A frame built from zero rows has no columns, so the column access
        # below would raise; report the level and move on.
        print(i, "no comments sampled")
        continue
    out['indentation'] = i
    samples_by_indent.append(out)
    print(i, out.RockV6_1_TOXICITY.mean())

# Concatenate once after the loop so the uploaded table holds every
# indentation level (including the starters), not just the last sample drawn.
table = pd.concat(samples_by_indent, axis=0, ignore_index=True)
table.to_gbq("en_20180501_wikiconvviz_data.samples_by_indentation", project_id=project_id)