<a href="https://colab.research.google.com/github/ipavlopoulos/wikidetox/blob/ipavlopoulos-research-context/research/context/rho_indentation_constructiveness_toxicity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from tqdm import tqdm
import pandas as pd
import json
import numpy as np

from google.cloud import bigquery
from google.colab import auth
# Authenticate the current Colab user. Presumably the BigQuery client created
# further down relies on these credentials -- TODO confirm.
auth.authenticate_user()

In [0]:
import logging

# Notebook-wide logger; DEBUG level so that no message is filtered out by the
# logger itself.
logger = logging.getLogger('indentation_constructiveness_gbq')
logger.setLevel(logging.DEBUG)
# logging.getLogger() returns the same logger object for the same name, so
# re-running this cell would otherwise attach one more StreamHandler each time
# and every message would be emitted repeatedly. Attach the handler only once.
if not any(isinstance(h, logging.StreamHandler) for h in logger.handlers):
  logger.addHandler(logging.StreamHandler())

In [0]:
# Google Cloud project the BigQuery client below is created for.
project_id = "wikidetox-viz"

# Connect to Google's BigQuery; this client is the one used by the query
# helper defined later in the notebook.
client = bigquery.Client(project=project_id)

In [0]:
def get_comments_scored_for_constructiveness_n_toxicity():
  """Run the toxicity/constructiveness join on BigQuery and return its rows.

  The query joins `sampled_by_indentation` (t1) with
  `constructive_scored_sampled_by_indentation` (t2) on `id` and selects, per
  matching comment: both ids (`id1`, `id2`), the toxicity score (`tox`), the
  constructiveness score (`constr`), the indentation (`indent`) and the
  cleaned text (`txt`).

  Returns:
    The object produced by `client.query(...).result()`; its rows expose the
    aliased columns listed above as attributes.
  """
  sql = """
    SELECT t1.id as id1, t2.id as id2, t1.RockV6_1_TOXICITY as tox, t2.constructiveness as constr, t1.indentation as indent, t1.cleaned_content as txt
    FROM `en_20180501_wikiconvviz_data.sampled_by_indentation` as t1
    JOIN `en_20180501_wikiconvviz_data.constructive_scored_sampled_by_indentation` as t2
    ON t1.id = t2.id
  """
  # Submit through the notebook-level `client` and hand back the job's result.
  return client.query(sql).result()

In [0]:
def wrapper(r):
  """Convert one query result row into a plain dict with descriptive keys.

  Args:
    r: Any object exposing the attributes `tox`, `indent`, `constr` and `txt`
      (the column aliases used by the BigQuery query in this notebook).

  Returns:
    A dict with the keys "toxicity", "indentation", "constructiveness" and
    "txt", holding the corresponding attribute values of `r`.
  """
  return {
      "toxicity": r.tox,
      "indentation": r.indent,
      "constructiveness": r.constr,
      "txt": r.txt,
  }

In [0]:
# Materialize the query rows as a list of plain dicts (one per result row).
# The helper defined above is named `..._constructiveness_n_toxicity`; calling
# it without the `_n_toxicity` suffix raises NameError on a fresh kernel.
table = [wrapper(r) for r in get_comments_scored_for_constructiveness_n_toxicity()]

In [0]:
# One row per scored comment, grouped by the `indentation` column.
# as_index=False keeps `indentation` as a regular column of the aggregates.
table_pd = pd.DataFrame(table)
table_group = table_pd.groupby('indentation', as_index=False)

# Aggregate both score columns in a single pass instead of calling
# `table_group.toxicity.mean()` twice (once only to read the group key).
# Selecting the numeric columns explicitly also keeps the text column `txt`
# out of the mean.
group_means = table_group[['toxicity', 'constructiveness']].mean()
indent = group_means['indentation']  # group key, shared by both score series
tox = group_means['toxicity']
constr = group_means['constructiveness']

In [0]:
from scipy.stats import pearsonr, spearmanr, kendalltau

In [0]:
# Correlation between indentation and (mean) toxicity, reported with three
# different coefficients: Pearson, Spearman and Kendall.
for corr_fn in (pearsonr, spearmanr, kendalltau):
  print(corr_fn(indent, tox))

In [89]:
# Correlation between indentation and (mean) constructiveness, reported with
# three different coefficients: Pearson, Spearman and Kendall.
for corr_fn in (pearsonr, spearmanr, kendalltau):
  print(corr_fn(indent, constr))

(0.8946090673788041, 0.00020098866179414686)
SpearmanrResult(correlation=0.9363636363636365, pvalue=2.208207644917754e-05)
KendalltauResult(correlation=0.8545454545454545, pvalue=0.0002532556429228073)


In [90]:
# Sanity check: correlate the two per-indentation score series with each other.
# In the recorded run the correlations are lower and the p-values higher
# (i.e. less significant) than for indentation vs. constructiveness.
for corr_fn in (pearsonr, spearmanr, kendalltau):
  print(corr_fn(tox, constr))

(0.6953071090793349, 0.017529553783087574)
SpearmanrResult(correlation=0.8090909090909091, pvalue=0.002558580199713915)
KendalltauResult(correlation=0.6000000000000001, pvalue=0.01019787677624025)


In [91]:
# Per-indentation summary statistics (count, mean, std, quartiles, ...) of the
# constructiveness score.
table_group.describe()["constructiveness"]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,1000.0,0.454828,0.291335,0.004132,0.21126,0.437431,0.690714,0.999234
1,1000.0,0.414909,0.332039,0.002115,0.107372,0.325429,0.737006,0.998413
2,1000.0,0.443107,0.342376,0.003989,0.105806,0.377415,0.77941,0.999227
3,1000.0,0.446959,0.343304,0.002864,0.108946,0.391698,0.794435,0.998917
4,1000.0,0.456843,0.341892,0.004099,0.125816,0.409719,0.799723,0.998955
5,1000.0,0.465584,0.349871,0.004209,0.122161,0.403672,0.824543,0.997362
6,1000.0,0.466329,0.345636,0.006566,0.12407,0.424553,0.823085,0.997545
7,1000.0,0.488071,0.350374,0.002941,0.131367,0.470991,0.844855,0.998895
8,1000.0,0.477406,0.340754,0.001178,0.130981,0.454997,0.8194,0.999732
9,1000.0,0.494487,0.34725,0.003194,0.137019,0.484232,0.848046,0.998072
