In [1]:
import pandas as pd
import numpy as np
from collections import OrderedDict, Counter
import math

In [2]:
# Load the crowd toxicity annotations CSV into a dataframe
tdf = pd.read_csv(
    filepath_or_buffer='../CF_output/Batch2_1000/f1270155_toxicity.csv',
)

In [3]:
# Keep only the rows that are not gold-standard test questions
tdf = tdf.loc[tdf['_golden'] == False]

In [4]:
# Summary of the annotation effort: distinct workers and distinct comments
annotation_info = {
    'nannotators': tdf['_worker_id'].unique().shape[0],
    'ncomments': len(tdf['comment_counter'].unique()),
}
print(annotation_info)

{'nannotators': 55, 'ncomments': 1001}


In [5]:
# Show every column of the annotation dataframe so the relevant ones can be picked below
tdf.columns

Index(['_unit_id', '_created_at', '_golden', '_id', '_missed', '_started_at',
       '_tainted', '_channel', '_trust', '_worker_id', '_country', '_region',
       '_city', '_ip', 'agree', 'crowd_comments', 'crowd_discard',
       'crowd_toxicity_level', 'expert_has_content', 'other_toxic_chars',
       'toxicity_characteristics', 'orig__golden', 'agree_gold',
       'article_author', 'article_id', 'article_published_date',
       'article_text', 'article_title', 'article_url', 'comment_author',
       'comment_counter', 'comment_text', 'crowd_comments_gold',
       'crowd_discard_gold', 'crowd_toxicity_level_gold',
       'crowd_toxicity_level_gold_reason', 'expert_has_content_gold',
       'other_toxic_chars_gold', 'toxicity_characteristics_gold'],
      dtype='object')

In [6]:
# Column groups, split by how each kind of column is aggregated later on
unit_id_col = ['_unit_id']
meta_cols = [
    'article_author',
    'article_id',
    'article_published_date',
    'article_text',
    'article_title',
    'article_url',
    'comment_author',
    'comment_counter',
    'comment_text',
]
binary_attributes = ['agree']
nominal_attributes = ['toxicity_characteristics']
rating_attributes = ['crowd_toxicity_level']
text_attributes = [
    'crowd_comments',
    'other_toxic_chars',
    'crowd_discard',
    'expert_has_content',
]

In [7]:
# Convert the textual answers of the binary attributes to numbers
# ('noopinion' becomes NaN), then join them back onto the remaining columns.
attrs = tdf[binary_attributes].replace(
    to_replace=['yes', 'no', 'partially', 'not_sure', 'noopinion'],
    value=[1, 0, 0.5, 0.5, np.nan],
)
other_cols = (
    unit_id_col
    + meta_cols
    + nominal_attributes
    + rating_attributes
    + text_attributes
)
tdf = tdf[other_cols].join(attrs)

In [8]:
# aggregation method for characteristics
def list_and_sort(vals):
    """Count whitespace-separated tokens across ``vals`` and list them by frequency.

    Parameters
    ----------
    vals : iterable
        Values to tally. Each string is split on whitespace and every
        resulting token is counted. Non-string entries (e.g. the float NaN
        pandas uses for an empty cell) are skipped instead of raising.

    Returns
    -------
    str
        One ``token:count`` entry per line, most frequent first. Tokens with
        equal counts keep the order in which they were first seen. An empty
        string is returned when there is nothing to count.
    """
    cnt = Counter()
    for v in vals:
        # A missing answer is not a string and has no .split(); ignore it.
        if not isinstance(v, str):
            continue
        # split() without arguments already discards surrounding whitespace,
        # so the tokens need no further stripping.
        cnt.update(v.split())
    return '\n'.join(key + ':' + str(freq) for (key, freq) in cnt.most_common())

In [9]:
# aggregation method for text attributes
def _is_missing(val):
    """Return True when ``val`` is None or a NaN number, False otherwise."""
    if val is None:
        return True
    if isinstance(val, str):
        return False
    try:
        return math.isnan(val)
    except TypeError:
        # Not a real number (e.g. a list or other object): treat it as a present value.
        return False


def concatenate(vals):
    """Join the values in ``vals`` into a single newline-separated string.

    Parameters
    ----------
    vals : iterable
        Values to join. Strings are kept as they are and other values are
        converted with ``str()``. Missing values (None or NaN) contribute an
        empty string, so the position of each value in the result is preserved.

    Returns
    -------
    str
        The values joined with ``"\\n"``; an empty string for empty input.
    """
    parts = []
    for val in vals:
        parts.append('' if _is_missing(val) else str(val))
    return "\n".join(parts)

In [10]:
# Aggregation method for each class of attributes (column name -> aggregator)
binary_dict = dict.fromkeys(binary_attributes, 'mean')
meta_dict = dict.fromkeys(meta_cols, 'first')
nominal_dict = dict.fromkeys(nominal_attributes, list_and_sort)
rating_dict = dict.fromkeys(rating_attributes, 'mean')
text_dict = dict.fromkeys(text_attributes, concatenate)

In [11]:
# Merge the per-class mappings into one column -> aggregator dictionary
agg_dict = dict(binary_dict)
agg_dict.update(meta_dict)
agg_dict.update(nominal_dict)
agg_dict.update(rating_dict)
agg_dict.update(text_dict)

In [18]:
# Collapse the per-worker rows into a single row per unit, using the
# aggregator chosen for each column in agg_dict
aggregated_df = tdf.groupby(by='_unit_id').aggregate(agg_dict)
print(aggregated_df.columns)

Index(['agree', 'article_author', 'article_id', 'article_published_date',
       'article_text', 'article_title', 'article_url', 'comment_author',
       'comment_counter', 'comment_text', 'toxicity_characteristics',
       'crowd_toxicity_level', 'crowd_comments', 'other_toxic_chars',
       'crowd_discard', 'expert_has_content'],
      dtype='object')


In [20]:
# Give the 'expert_has_content' column the name used in the output file
aggregated_df = aggregated_df.rename(columns={'expert_has_content': 'has_content'})

In [21]:
# Inspect the renamed column: each entry holds the newline-joined answers for
# that unit, with an empty string wherever an answer was missing
aggregated_df['has_content']

_unit_id
1741535932               \n\nyes\nyes\n
1741535933                     \n\n\n\n
1741535934                     \n\n\n\n
1741535935            \nno\nno\nyes\nno
1741535936                  \n\n\n\nyes
1741535937                     \n\n\n\n
1741535938       yes\nno\nyes\nyes\nyes
1741535939                     \n\n\n\n
1741535940               yes\n\n\n\nyes
1741535941                     \n\n\n\n
1741535942                  \n\n\n\nyes
1741535943                     \n\n\n\n
1741535944         no\nyes\nno\nno\nyes
1741535945                     \n\n\n\n
1741535946                     \n\n\n\n
1741535947                     \n\n\n\n
1741535948                     \n\n\n\n
1741535949               no\n\nno\nno\n
1741535950                     \n\n\n\n
1741535951                     \n\n\n\n
1741535952            yes\n\n\nyes\nyes
1741535953                     \n\n\n\n
1741535954                     \n\n\n\n
1741535955                     \n\n\n\n
1741535956                \n\ny

In [22]:
# Columns written to the aggregated CSV, in output order
cols = [
    'article_id',
    'article_author',
    'article_published_date',
    'article_text',
    'article_title',
    'article_url',
    'comment_author',
    'comment_counter',
    'comment_text',
    'agree',
    'crowd_toxicity_level',
    'has_content',
    'crowd_discard',
    'toxicity_characteristics',
    'other_toxic_chars',
    'crowd_comments',
]

In [23]:
# Save the aggregated annotations: only the columns listed in `cols`,
# without writing the _unit_id index
aggregated_df.to_csv(
    '../CF_output/Batch2_1000/f1270155_toxicity_aggregated.csv',
    columns=cols,
    index=False,
)

In [18]:
# Returns a sorted dataframe of comments on an attribute
def sort_comments(df, field, ascending = False, num = 10):
    """Return the first ``num`` comments of ``df`` after sorting by ``field``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'comment_text' column and the ``field`` column.
    field : str
        Name of the column to sort by.
    ascending : bool, default False
        Sort direction; the default puts the largest values first.
    num : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most ``num`` rows with the columns 'comment_text' and ``field``.
    """
    # Honour `num` rather than a fixed row count so callers can choose the size.
    ranked = df.sort_values(by=field, ascending=ascending)
    return ranked.head(num)[['comment_text', field]]

In [20]:
# Comments with the highest mean crowd toxicity rating
sort_comments(df=aggregated_df, field='crowd_toxicity_level', ascending=False)

Unnamed: 0_level_0,comment_text,crowd_toxicity_level
_unit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1741536176,"Stupid, stupid, stupid.",3.8
1741536180,Are you ossholes focking batshot ?Get off the ...,3.6
1741536874,F(*k you Pierre Poutine!,3.6
1741536603,Yakabuski greatly minimizes and often ignores ...,3.6
1741535980,"Just more bluster from our cowardly, bullying ...",3.6
1741536859,Another globe editorial pretending to be upset...,3.4
1741536161,Pathetic... the globe stands side by side with...,3.2
1741536835,Such tough talk Merkle.........you 'could' be ...,3.2
1741536181,This has to be the most ridiculous editorial e...,3.2
1741536087,The Tories? You gotta be f#@%ing kidding. Who ...,3.2
