In [17]:
import pandas as pd
import numpy as np
from collections import OrderedDict, Counter
import math

In [18]:
# Read the constructiveness annotations CSV into `cdf`.
# Alternative input file, currently disabled:
#cdf = pd.read_csv('../../CF_output/constructiveness/batch3_f1271448.csv')
# The path is relative, so it resolves against the kernel's working directory.
cdf = pd.read_csv('../../samples/f1269916_constructiveness.csv')

In [19]:
# Drop the test questions: keep only rows whose `_golden` flag is False
cdf = cdf.loc[cdf['_golden'] == False]

In [20]:
#cdf.rename(columns = {'internal_gold':'internal_gold_constructiveness'}, inplace = True)

In [21]:
# Display the column names of the annotations frame
cdf.columns

Index(['_unit_id', '_created_at', '_golden', '_id', '_missed', '_started_at',
       '_tainted', '_channel', '_trust', '_worker_id', '_country', '_region',
       '_city', '_ip', 'agree', 'constructive', 'constructive_characteristics',
       'crowd_comments', 'non_constructive_characteristics', 'other_con_chars',
       'other_noncon_chars', 'orig__golden', 'agree_gold', 'article_author',
       'article_id', 'article_published_date', 'article_text', 'article_title',
       'article_url', 'comment_author', 'comment_counter', 'comment_text',
       'constructive_characteristics_gold', 'constructive_gold',
       'constructive_gold_reason', 'crowd_comments_gold',
       'non_constructive_characteristics_gold', 'other_con_chars_gold',
       'other_noncon_chars_gold'],
      dtype='object')

In [22]:
# Annotation info: number of distinct worker ids and distinct comment counters
annotation_info = {
    'nannotators': len(cdf['_worker_id'].unique()),
    'ncomments': len(cdf['comment_counter'].unique()),
}
print(annotation_info)

{'nannotators': 45, 'ncomments': 1000}


In [23]:
# Relevant columns, grouped by the kind of value they hold

# Key identifying the annotated unit
unit_id_col = ['_unit_id']

# Article / comment metadata carried along with each annotation
meta_cols = [
    'article_author',
    'article_id',
    'article_published_date',
    'article_text',
    'article_title',
    'article_url',
    'comment_author',
    'comment_counter',
    'comment_text',
    'constructive_gold',
]

# Answers that get mapped to numbers
binary_attributes = ['agree', 'constructive']

# Multi-label characteristic selections
nominal_attributes = [
    'constructive_characteristics',
    'non_constructive_characteristics',
]

# Free-text fields
text_attributes = [
    'crowd_comments',
    'other_con_chars',
    'other_noncon_chars',
]

In [24]:
# Swap the textual answers for numeric scores, then rebuild the frame from
# the untouched columns plus the converted answer columns.
other_cols = unit_id_col + meta_cols + nominal_attributes + text_attributes
attrs = cdf[binary_attributes].replace(
    to_replace=['yes', 'no', 'partially', 'not_sure', 'noopinion'],
    value=[1, 0, 0.5, 0.5, np.nan],
)
cdf = cdf[other_cols].join(attrs)

In [25]:
# aggregation method for characteristics
def list_and_sort(vals):
    """Tally whitespace-separated labels across annotations, most frequent first.

    Parameters
    ----------
    vals : iterable
        One entry per annotation. A string entry holds zero or more labels
        separated by whitespace. Non-string entries (such as the float NaN
        that stands in for an empty cell) carry no labels and are skipped.

    Returns
    -------
    str
        One ``label:count`` item per line, ordered by descending count, with
        ties kept in first-seen order. Empty string when there are no labels.
    """
    cnt = Counter()
    for v in vals:
        # A missing cell is a float NaN, which has no .split(); skip it
        # rather than letting one empty answer abort the whole aggregation.
        if isinstance(v, str):
            # split() with no separator already discards surrounding whitespace,
            # so the tokens need no further stripping.
            cnt.update(v.split())
    return '\n'.join('{}:{}'.format(key, freq) for key, freq in cnt.most_common())

In [26]:
# aggregation method for text attributes
def concatenate(vals):
    """Join the values into one newline-separated string.

    A value that is not a plain ``str`` and is NaN contributes an empty
    line; every other value contributes its ``str()`` form.
    """
    pieces = [
        # Strings short-circuit here, so math.isnan only ever sees non-strings.
        str(item) if type(item) is str or not math.isnan(item) else ''
        for item in vals
    ]
    return "\n".join(pieces)

In [27]:
# Aggregation method for each class of attributes
binary_dict = dict.fromkeys(binary_attributes, 'mean')
meta_dict = dict.fromkeys(meta_cols, 'first')
nominal_dict = dict.fromkeys(nominal_attributes, list_and_sort)
text_dict = dict.fromkeys(text_attributes, concatenate)

In [28]:
# Merge the per-class rules into one mapping; if a column name repeats,
# the rule from the later dict wins.
agg_dict = dict(binary_dict)
agg_dict.update(meta_dict)
agg_dict.update(nominal_dict)
agg_dict.update(text_dict)

In [29]:
# Aggregate the results for all workers on a particular comment:
# one output row per `_unit_id`, each column reduced by its rule in `agg_dict`.
aggregated_df = cdf.groupby('_unit_id').agg(agg_dict)
# Coerce the whole column in a single vectorized call instead of invoking
# pd.to_numeric once per element through Series.apply.
aggregated_df['constructive'] = pd.to_numeric(aggregated_df['constructive'])

In [30]:
# Tag the generic column names with the experiment they came from
aggregated_df.rename(
    columns={
        'agree': 'agree_constructiveness_expt',
        'crowd_comments': 'crowd_comments_constructiveness_expt',
    },
    inplace=True,
)

In [31]:
# Relevant columns, listed in the order they are selected
cols = [
    'article_id',
    'article_author',
    'article_published_date',
    'article_text',
    'article_title',
    'article_url',
    'comment_author',
    'comment_counter',
    'comment_text',
    'agree_constructiveness_expt',
    'constructive',
    'constructive_gold',
    'constructive_characteristics',
    'non_constructive_characteristics',
    'other_con_chars',
    'other_noncon_chars',
    'crowd_comments_constructiveness_expt',
]

In [32]:
# Write the aggregated csv.
# index=False leaves the frame's index out of the file.
aggregated_df.to_csv(
    '../../samples/f1269916_constructiveness_aggregated.csv',
    columns=cols,
    index=False,
)

In [22]:
# Returns a sorted dataframe of comments on constructiveness score
def sort_comments(df, field, ascending = False, num = 10):
    """Return the first ``num`` comments after ordering ``df`` by ``field``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``comment_text`` column and the ``field`` column.
    field : str
        Name of the column to sort on.
    ascending : bool, default False
        Sort direction; the default puts the highest values first.
    num : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        At most ``num`` rows with the columns ``comment_text`` and ``field``.
    """
    ranked = df.sort_values(by=field, ascending=ascending)
    # The row limit comes from the caller's `num`, not a fixed constant.
    return ranked.head(num)[['comment_text', field]]

In [23]:
# Comments with the lowest aggregated `constructive` score come first
sort_comments(df=aggregated_df, field='constructive', ascending=True)

Unnamed: 0_level_0,comment_text,constructive
_unit_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1748029213,My New Years resolution is to ignore this poli...,0.0
1748029266,Are you saying you endorse the decision but no...,0.0
1748029643,Biggest cabinet and biggest deficit in history...,0.0
1748029263,"'Vic Toews, the Public Safety minister, said S...",0.0
1748029261,Babysitters just aren't safe in this country. ...,0.0
1748029258,this government isn't interested in debate or ...,0.0
1748029648,And I will just hold my breath haha,0.0
1748029256,Why does stephen harper keep on hurting Canada...,0.0
1748029653,ENOUGH IS ENOUGH - CONS! The more you rap on t...,0.0
1748029654,Junior... just will never be ready! PEriod!,0.0
