In [0]:
# Data handling (pandas) and loop progress bars (tqdm).
import pandas as pd
from tqdm import tqdm

# BigQuery client library, plus the auth helper shipped with Google Colab.
from google.cloud import bigquery
from google.colab import auth
# Run Colab's user authentication before any BigQuery client is created.
# NOTE(review): presumably this is what supplies the credentials used by
# bigquery.Client further down; confirm when running outside Colab.
auth.authenticate_user()

In [0]:
import logging

# Notebook-wide logger; the logger itself lets DEBUG and above through.
logger = logging.getLogger('preselecting_wcomments_wrt_tox')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack one more StreamHandler each time and every
# record would be printed once per handler. Attach a handler only once.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

In [0]:
# Connect to Google's BigQuery
# `client` is the module-level handle through which query() submits its jobs;
# it is created with "wikidetox-viz" as its project.
project_id = "wikidetox-viz"
client = bigquery.Client(project=project_id)

In [0]:
def query(low_tox=0, high_tox=1, rand=.5, limit=1000, type="ADDITION", max_txt=1000):
    """Fetch a random subset of rows from `wikiconv_v2.en_20180701_external`.

    A row is selected when all of the following hold:
      * length(cleaned_content) is greater than 1 and less than `max_txt`,
      * `replyTo_id` is not the string "null",
      * the `type` column equals `type`,
      * `toxicity` lies strictly between `low_tox` and `high_tox`,
      * a per-row `RAND()` draw is below `rand`.
    At most `limit` rows are returned.

    NOTE(review): both toxicity bounds are exclusive, so a row whose toxicity
    equals a bound exactly is never selected; confirm this is intended when
    the function is called with adjacent bins.

    Args:
        low_tox: Exclusive lower bound on `toxicity`.
        high_tox: Exclusive upper bound on `toxicity`.
        rand: Threshold compared against `RAND()` for every row.
        limit: Maximum number of rows to return.
        type: Value the `type` column must equal.
        max_txt: Exclusive upper bound on length(cleaned_content).

    Returns:
        The query result converted with `to_dataframe()`.
    """
    # Values are passed as named query parameters rather than spliced into the
    # SQL text, so quotes inside `type` cannot break the statement and every
    # value reaches BigQuery with an explicit type.
    sql = """
    SELECT *
        FROM `wikiconv_v2.en_20180701_external`
        WHERE length(cleaned_content)>1 AND length(cleaned_content)<@max_txt AND replyTo_id!="null" AND type=@typ AND toxicity>@low AND toxicity<@high AND RAND()<@ran
        LIMIT @lim
    """
    job_config = bigquery.QueryJobConfig()
    job_config.query_parameters = [
        bigquery.ScalarQueryParameter("low", "FLOAT64", float(low_tox)),
        bigquery.ScalarQueryParameter("high", "FLOAT64", float(high_tox)),
        bigquery.ScalarQueryParameter("ran", "FLOAT64", float(rand)),
        bigquery.ScalarQueryParameter("lim", "INT64", int(limit)),
        bigquery.ScalarQueryParameter("typ", "STRING", str(type)),
        bigquery.ScalarQueryParameter("max_txt", "INT64", int(max_txt)),
    ]
    query_job = client.query(sql, job_config=job_config)
    # result() blocks until the job has finished.
    return query_job.result().to_dataframe()


In [0]:
def sample(bins=10):
    """Build a shuffled sample: one unconstrained draw plus one draw per toxicity bin.

    The toxicity range [0, 1] is cut into `bins` equal-width intervals and
    query() is called once for each of them, after one initial call with
    default arguments. Every row gets a `toxic_zone` column: -1 for the
    initial draw, otherwise the index of the bin it was drawn for.

    Args:
        bins: Number of equal-width toxicity intervals to draw from.

    Returns:
        A DataFrame holding all draws, with rows in random order and a fresh
        0..n-1 index.
    """
    # Unconstrained draw (default arguments of query()), labelled -1.
    baseline = query()
    baseline["toxic_zone"] = -1  # scalar is broadcast to every row
    parts = [baseline]

    width = float(bins)
    for i in tqdm(range(bins)):
        preselected = query(low_tox=i / width, high_tox=(i + 1) / width)
        preselected["toxic_zone"] = i
        parts.append(preselected)

    # Concatenate once at the end instead of re-copying the growing frame on
    # every iteration.
    table = pd.concat(parts, axis=0)
    # Shuffle the rows and discard the per-draw indexes.
    return table.sample(frac=1).reset_index(drop=True)


In [15]:
# Draw the sample, then write it to disk twice: once in full and once
# truncated to its first 100 rows.
table = sample()
first_hundred = table.iloc[:100]
table.to_csv("sample.11-toxic-zones.addition.csv", index=False)
first_hundred.to_csv("sample.100posts.csv", index=False)


  0%|          | 0/10 [00:00<?, ?it/s][A
 10%|█         | 1/10 [00:02<00:25,  2.88s/it][A
 20%|██        | 2/10 [00:06<00:25,  3.22s/it][A
 30%|███       | 3/10 [00:12<00:27,  4.00s/it][A
 40%|████      | 4/10 [00:21<00:32,  5.41s/it][A
 50%|█████     | 5/10 [00:38<00:44,  8.93s/it][A
 60%|██████    | 6/10 [00:57<00:48, 12.00s/it][A
 70%|███████   | 7/10 [01:18<00:43, 14.57s/it][A
 80%|████████  | 8/10 [01:38<00:32, 16.16s/it][A
 90%|█████████ | 9/10 [01:57<00:17, 17.17s/it][A
100%|██████████| 10/10 [02:17<00:00, 17.86s/it][A
[A

In [26]:
# Sanity-check the sample: 11000 non-null zone labels are expected in total,
# then show how many rows each zone contributed.
zone_labels = table["toxic_zone"]
assert zone_labels.count() == 11000
print(zone_labels.value_counts())

 7    1000
-1    1000
 6    1000
 5    1000
 4    1000
 3    1000
 2    1000
 9    1000
 1    1000
 8    1000
 0    1000
Name: toxic_zone, dtype: int64
