In [1]:
import pandas as pd
import numpy as np
from collections import OrderedDict, Counter
import math

In [2]:
# Load the raw constructiveness annotations CSV into a dataframe
cdf = pd.read_csv(
    filepath_or_buffer='../CF_output/Batch2_1000/f1269916_constructiveness.csv',
)

In [3]:
# Drop test questions: keep only the rows whose `_golden` value is False
cdf = cdf.loc[cdf['_golden'].eq(False)]

In [4]:
# Summary of the annotation job: distinct worker ids and distinct comment counters
annotation_info = {
    'nannotators': len(cdf['_worker_id'].unique()),
    'ncomments': len(cdf['comment_counter'].unique()),
}
print(annotation_info)

{'nannotators': 45, 'ncomments': 1000}


In [6]:
# Relevant columns, grouped by kind
unit_id_col = ['_unit_id']

# Article / comment metadata columns
meta_cols = [
    'article_author',
    'article_id',
    'article_published_date',
    'article_text',
    'article_title',
    'article_url',
    'comment_author',
    'comment_counter',
    'comment_text',
]

# Annotated attributes, split by answer type
binary_attributes = ['agree', 'constructive']
nominal_attributes = [
    'constructive_characteristics',
    'non_constructive_characteristics',
]
text_attributes = [
    'crowd_comments',
    'other_con_chars',
    'other_noncon_chars',
]

In [8]:
# Map the textual answers in the binary attribute columns to numbers:
# yes -> 1, no -> 0, partially / not_sure -> 0.5, noopinion -> NaN
other_cols = unit_id_col + meta_cols + nominal_attributes + text_attributes
attrs = cdf[binary_attributes].replace(
    to_replace=['yes', 'no', 'partially', 'not_sure', 'noopinion'],
    value=[1, 0, 0.5, 0.5, np.nan],
)
# Re-attach the converted columns to the untouched ones (joined on the index)
cdf = cdf.loc[:, other_cols].join(attrs)

In [9]:
# aggregation method for characteristics
def list_and_sort(vals):
    """Count whitespace-separated labels across annotations and rank them.

    Parameters
    ----------
    vals : iterable
        One entry per annotation. Each string entry holds zero or more
        whitespace-separated labels. Non-string entries (for example the
        float NaN of an empty cell) carry no labels and are skipped.

    Returns
    -------
    str
        One ``label:count`` item per line, most frequent first, in the
        order produced by ``Counter.most_common()``. Empty string when no
        labels were found.
    """
    cnt = Counter()
    for v in vals:
        # A missing cell is a float NaN, which has no .split(); skipping it
        # avoids an AttributeError in the middle of the aggregation.
        if isinstance(v, str):
            # split() with no argument already strips surrounding whitespace
            cnt.update(v.split())
    return '\n'.join(key + ':' + str(freq) for key, freq in cnt.most_common())

In [32]:
# aggregation method for text attributes
def concatenate(vals):
    """Join free-text answers into one newline-separated string.

    Parameters
    ----------
    vals : iterable
        One entry per annotation. Strings are kept as they are; ``None`` and
        NaN become an empty line; any other value is converted with ``str``.

    Returns
    -------
    str
        The entries joined with ``"\\n"``, one line per input entry (missing
        entries yield empty lines). Empty string for empty input.
    """
    parts = []
    for val in vals:
        if isinstance(val, str):
            parts.append(str(val))
            continue
        try:
            missing = val is None or math.isnan(val)
        except (TypeError, OverflowError):
            # Not convertible to a float, so it cannot be NaN: keep its text
            missing = False
        parts.append('' if missing else str(val))

    return "\n".join(parts)

In [33]:
# Aggregation method for each class of attributes
binary_dict = dict.fromkeys(binary_attributes, 'mean')
meta_dict = dict.fromkeys(meta_cols, 'first')
nominal_dict = dict.fromkeys(nominal_attributes, list_and_sort)
text_dict = dict.fromkeys(text_attributes, concatenate)

In [34]:
# Merge the per-class mappings into a single column -> aggregator mapping
# (a later mapping wins if a column name is repeated)
agg_dict = {
    column: aggregator
    for mapping in (binary_dict, meta_dict, nominal_dict, text_dict)
    for column, aggregator in mapping.items()
}

In [40]:
# Aggregate the results for all workers on a particular comment:
# rows are grouped by '_unit_id' and every column is reduced with the
# aggregator registered for it in agg_dict.
aggregated_df = cdf.groupby('_unit_id').agg(agg_dict)
# Pass the aggregated 'constructive' values through pd.to_numeric
# NOTE(review): presumably needed because the column came back as object dtype - confirm
aggregated_df['constructive'] = aggregated_df['constructive'].apply(pd.to_numeric)
print(aggregated_df.columns)

Index(['agree', 'constructive', 'article_author', 'article_id',
       'article_published_date', 'article_text', 'article_title',
       'article_url', 'comment_author', 'comment_counter', 'comment_text',
       'constructive_characteristics', 'non_constructive_characteristics',
       'crowd_comments', 'other_con_chars', 'other_noncon_chars'],
      dtype='object')


In [41]:
# Columns to write out, in output order
cols = [
    'article_id',
    'article_author',
    'article_published_date',
    'article_text',
    'article_title',
    'article_url',
    'comment_author',
    'comment_counter',
    'comment_text',
    'agree',
    'constructive',
    'constructive_characteristics',
    'non_constructive_characteristics',
    'other_con_chars',
    'other_noncon_chars',
    'crowd_comments',
]

In [42]:
# Write the aggregated csv: only the columns listed in `cols`, without the index
aggregated_df.to_csv(
    path_or_buf='../CF_output/Batch2_1000/f1269916_constructiveness_aggregated.csv',
    columns=cols,
    index=False,
)

In [45]:
# Returns a sorted dataframe of comments on constructiveness score
def sort_comments(df, field, ascending = False, num = 10):
    """Return the first `num` comments after sorting by a score column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'comment_text' column and the column named by `field`.
    field : str
        Name of the column to sort on.
    ascending : bool, default False
        Sort direction; the default puts the highest values first.
    num : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most `num` rows with the columns ['comment_text', field].
    """
    # Honour `num` rather than a hard-coded 10 so callers can change the size
    ranked = df.sort_values(by=field, ascending=ascending)
    return ranked.head(num)[['comment_text', field]]

In [47]:
# Show the comments with the lowest aggregated 'constructive' score
sort_comments(df=aggregated_df, field='constructive', ascending=True)

Unnamed: 0_level_0,comment_text,constructive
_unit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1740685938,The laws are worse than the drug.,0.0
1740686033,"Slow down there Sarah, not quite sure if you'r...",0.0
1740685581,Globe and Mail endorses conservatives. Quelle ...,0.0
1740686028,"yes, Avi. Destroying our economy will certainl...",0.0
1740686021,"'''Avi Lewis is a journalist, filmmaker,.........",0.0
1740686019,Dozens of people wrote it--and millions oppose...,0.0
1740686014,No interest in hanging out with you.,0.0
1740686012,"Discriminatory quotas are the problem, not the...",0.0
1740686010,Get serious girl!,0.0
1740686009,"It's reassuring to know, they were also the mo...",0.0
