In [0]:
from tqdm import tqdm
import pandas as pd
import json
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
import textwrap
from google.cloud import bigquery
from google.colab import auth
# Colab sign-in step (google.colab.auth), run before the BigQuery client is created.
# NOTE(review): presumably this supplies the credentials that client picks up — confirm.
auth.authenticate_user()

In [0]:
import logging

# Notebook-wide logger that writes DEBUG and above to the cell output.
# logging.getLogger returns the same object for the same name, so re-running
# this cell used to stack one more StreamHandler each time and every message
# was then emitted once per execution. Only attach a handler if none exists.
logger = logging.getLogger('indentation_constructiveness_gbq')
logger.setLevel(logging.DEBUG)
if not logger.handlers:
  logger.addHandler(logging.StreamHandler())

In [0]:
# GCP project that the BigQuery client is bound to.
project_id = "wikidetox-viz"

# BigQuery client for that project; the query helper in this notebook uses it.
client = bigquery.Client(project=project_id)

In [0]:
def get_comments_scored_for_constructiveness_n_toxicity(bq_client=None):
  """Fetch the sampled comments that have both a toxicity and a constructiveness score.

  Joins `sampled_by_indentation` (t1) with
  `constructive_scored_sampled_by_indentation` (t2) on comment id and selects,
  per row: `id1`, `id2`, `tox` (t1.RockV6_1_TOXICITY), `constr`
  (t2.constructiveness), `indent` (t1.indentation) and `txt`
  (t1.cleaned_content).

  Args:
    bq_client: optional object exposing `query(sql)` whose return value has a
      `result()` method. When omitted, the notebook-level `client` is used,
      which is the original behaviour.

  Returns:
    Whatever `query(...).result()` returns for the chosen client; the call
    blocks until the query job has finished.
  """
  sql = """
    SELECT t1.id as id1, t2.id as id2, t1.RockV6_1_TOXICITY as tox, t2.constructiveness as constr, t1.indentation as indent, t1.cleaned_content as txt
    FROM `en_20180501_wikiconvviz_data.sampled_by_indentation` as t1
    JOIN `en_20180501_wikiconvviz_data.constructive_scored_sampled_by_indentation` as t2
    ON t1.id = t2.id
  """
  # Resolve the global lazily so an injected client never touches `client`.
  active_client = client if bq_client is None else bq_client
  return active_client.query(sql).result()

In [0]:
def wrapper(r):
  """Turn one query result row into a plain dict with descriptive keys.

  Reads the `tox`, `indent`, `constr` and `txt` attributes of `r`.
  """
  return {
    "toxicity": r.tox,
    "indentation": r.indent,
    "constructiveness": r.constr,
    "txt": r.txt,
  }

In [0]:
# Run the query and materialise every returned row as a dict (see `wrapper`).
table = list(map(wrapper, get_comments_scored_for_constructiveness_n_toxicity()))

In [0]:
# One row per comment, then per-indentation-level mean scores.
table_pd = pd.DataFrame(table)
table_group = table_pd.groupby('indentation', as_index=False)

# as_index=False keeps `indentation` as a regular column next to the mean,
# so the indentation levels can be read off the same frame as the toxicity means.
mean_toxicity_by_indent = table_group.toxicity.mean()
tox = mean_toxicity_by_indent['toxicity']
indent = mean_toxicity_by_indent['indentation']
constr = table_group.constructiveness.mean()['constructiveness']

In [0]:
# Correlation between indentation level and mean toxicity at that level
# (one data point per indentation level, not per comment).
for corr_fn in (pearsonr, spearmanr, kendalltau):
  coefficient = corr_fn(indent, tox)[0]  # element 0 is the statistic
  print(corr_fn.__name__, coefficient)

pearsonr 0.8579422317177552
spearmanr 0.8272727272727273
kendalltau 0.6727272727272727


In [0]:
# Correlation between indentation level and mean constructiveness at that level
# (one data point per indentation level, not per comment).
for corr_fn in (pearsonr, spearmanr, kendalltau):
  coefficient = corr_fn(indent, constr)[0]  # element 0 is the statistic
  print(corr_fn.__name__, coefficient)

pearsonr 0.8946090673788041
spearmanr 0.9363636363636365
kendalltau 0.8545454545454545


We now study the relationship between toxicity and constructiveness.

In [0]:
# Comments DEEP in a thread (indentation > 5): toxicity of the highly constructive
# ones and constructiveness of the highly toxic ones (score > 0.8 in both cases).
# NOTE(review): despite the `low_ind_` prefix, these two series hold the
# HIGH-indentation sample; the names are kept because later cells reference them.
# NOTE(review): indentation == 5 falls in neither this subset nor the `< 5` one —
# confirm that gap is intended.
deep_mask = table_pd['indentation'] > 5
table_deep = table_pd[deep_mask]
low_ind_toxicity_of_constructive_comments = table_deep.loc[table_deep['constructiveness'] > 0.8, 'toxicity']
low_ind_constructiveness_of_toxic_comments = table_deep.loc[table_deep['toxicity'] > 0.8, 'constructiveness']

In [0]:
# Comments NOT deep in a thread (indentation < 5): toxicity of the highly constructive
# ones and constructiveness of the highly toxic ones (score > 0.8 in both cases).
# NOTE(review): despite the `high_ind_` prefix, these two series hold the
# LOW-indentation sample; the names are kept because later cells reference them.
# NOTE(review): indentation == 5 falls in neither this subset nor the `> 5` one —
# confirm that gap is intended.
shallow_mask = table_pd['indentation'] < 5
table_not_deep = table_pd[shallow_mask]
high_ind_toxicity_of_constructive_comments = table_not_deep.loc[table_not_deep['constructiveness'] > 0.8, 'toxicity']
high_ind_constructiveness_of_toxic_comments = table_not_deep.loc[table_not_deep['toxicity'] > 0.8, 'constructiveness']

In [0]:
# Two-sample t-tests between the deep (indentation > 5) and shallow (indentation < 5) subsets.
# The `low_ind_*` series come from the deep subset and are passed first, so a positive
# statistic means the deep-thread mean is the larger one.
# Finding: highly constructive comments are significantly more toxic deeper in threads;
# the constructiveness of highly toxic comments shows no significant difference.
from scipy import stats
print(
  "Constructive comments' toxicity across low/high indentation:",
  stats.ttest_ind(
    low_ind_toxicity_of_constructive_comments,
    high_ind_toxicity_of_constructive_comments,
  ),
)
print(
  "Toxic comments' constructiveness across low/high indentation:",
  stats.ttest_ind(
    low_ind_constructiveness_of_toxic_comments,
    high_ind_constructiveness_of_toxic_comments,
  ),
)

Constructive comments' toxicity across low/high indentation: Ttest_indResult(statistic=3.724546078291254, pvalue=0.00019993294461080643)
Toxic comments' constructiveness across low/high indentation: Ttest_indResult(statistic=0.688114416337236, pvalue=0.49771650581089655)


In [0]:
# Rank correlation between constructiveness and toxicity over all sampled comments.
# The association is very weak (rho ~ 0.08) even though it is statistically
# significant; a tiny p-value here does not mean a strong relationship.
print(spearmanr(table_pd['constructiveness'], table_pd['toxicity']))

SpearmanrResult(correlation=0.08377099357551539, pvalue=1.3627011455772379e-18)


In [0]:
# Display one randomly chosen highly constructive comment (score > 0.8) from the
# deep subset (indentation > 5). No seed is set, so each run may show a different text.
txts = table_deep.loc[table_deep['constructiveness'] > 0.8, 'txt'].tolist()
random_position = np.random.randint(len(txts))
textwrap.wrap(txts[random_position])

["I've noticed that as well. IMO the organisms are more important than",
 'the families. We should keep only the most important ones and delete',
 "the rest. I'll take a closer look on the list tomorrow and post the",
 "articles which I think should be removed. I don't think we should set",
 'a limit to how many articles to remove. Looking at the list they are',
 "many articles which are not vital. I don't see a problem with removing",
 'more articles, and adding some later. We should focus on quality and',
 'not quantity']

In [0]:
# Display one randomly chosen highly toxic comment (score > 0.8) from the
# shallow subset (indentation < 5). No seed is set, so each run may show a different text.
txts = table_not_deep.loc[table_not_deep['toxicity'] > 0.8, 'txt'].tolist()
random_position = np.random.randint(len(txts))
textwrap.wrap(txts[random_position])

['im a retard in wikipedia']