# Finding Politeness

In [1]:
import collections
import glob
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None, 'display.max_rows', None, )

dataset_dir = "../../final_dataset/"

#SUBSETS = "train dev test".split()
SUBSETS = ["train"]


datasets = collections.defaultdict(list)

for subset in SUBSETS:
    for filename in glob.glob(dataset_dir + subset + "/*"):
        with open(filename, 'r') as f:
            datasets[subset].append(json.load(f))
            
all_pairs = sum(datasets.values(), [])

def total_and_average_len(list_of_lists):
    big_list = sum(list_of_lists, [])
    return len(big_list), len(big_list)/len(list_of_lists)

def count_dataset(pairs, subset):
    # TODO: Add double-annotated and adjudicated
    review_total, review_average = total_and_average_len([pair["review_sentences"] for pair in pairs])
    rebuttal_total, rebuttal_average = total_and_average_len([pair["rebuttal_sentences"] for pair in pairs])
    return {
        "subset":subset,
        "pairs": len(pairs),
        "forums": len(set(pair["metadata"]["forum_id"] for pair in pairs)),
        "adjudicated": len([pair for pair in pairs if pair["metadata"]["annotator"] == "anno0"]),
        "review_sentences": review_total,
        "rebuttal_sentences": rebuttal_total,
        "review_avg_sentences": review_average,
        "rebuttal_avg_sentences": rebuttal_average,
        
    }
# Distribution of examples over sets
df_dicts = [count_dataset(pairs, subset) for subset, pairs in datasets.items()]
df = pd.DataFrame.from_dict(df_dicts)

In [22]:
from tools.convokit_politeness import get_convokit_politeness_labels

pair = all_pairs[1]

all_sentences = {
    'metadata': {'review_id': 'xyz'},
    'review_sentences': [],
    'rebuttal_sentences': []
}

for pair in all_pairs:
    all_sentences['review_sentences'] += pair['review_sentences']
    all_sentences['rebuttal_sentences'] += pair['rebuttal_sentences']

get_convokit_politeness_labels(pair)['meta.politeness_markers'][0]
# pair_df = get_convokit_politeness_labels(all_sentences)
# pair_df.head()

{'politeness_markers_==Please==': [],
 'politeness_markers_==Please_start==': [],
 'politeness_markers_==HASHEDGE==': [],
 'politeness_markers_==Indirect_(btw)==': [],
 'politeness_markers_==Hedges==': [],
 'politeness_markers_==Factuality==': [],
 'politeness_markers_==Deference==': [],
 'politeness_markers_==Gratitude==': [],
 'politeness_markers_==Apologizing==': [],
 'politeness_markers_==1st_person_pl.==': [[('we', 1, 19)]],
 'politeness_markers_==1st_person==': [[('i', 1, 10)]],
 'politeness_markers_==1st_person_start==': [],
 'politeness_markers_==2nd_person==': [],
 'politeness_markers_==2nd_person_start==': [],
 'politeness_markers_==Indirect_(greeting)==': [],
 'politeness_markers_==Direct_question==': [],
 'politeness_markers_==Direct_start==': [],
 'politeness_markers_==HASPOSITIVE==': [],
 'politeness_markers_==HASNEGATIVE==': [[('problem', 1, 7)],
  [('problem', 1, 25)],
  [('problem', 1, 27)]],
 'politeness_markers_==SUBJUNCTIVE==': [],
 'politeness_markers_==INDICATIVE=

In [23]:
len(all_pairs)

251

In [4]:
rs = pd.json_normalize(all_pairs[1], record_path=['review_sentences'])
rs.head()

Unnamed: 0,review_id,sentence_index,text,coarse,fine,asp,pol
0,B1ez1LvJcB,0,"This paper presents a method for adapting a model that has been trained to perform one task, so that it can perform a new task (potentially without using any new training data at all—i.e., zero-shot learning).",arg_structuring,arg-structuring_summary,none,none
1,B1ez1LvJcB,1,In some ways the presented work is a form of meta-learning or *meta-mapping* as the authors refer to it.,arg_structuring,arg-structuring_summary,none,none
2,B1ez1LvJcB,2,The premise of the paper is very interesting and the overall problem is definitely of high interest and high potential impact.,arg_evaluative,none,asp_motivation-impact,pol_positive
3,B1ez1LvJcB,3,I believe that the presentation of the proposed method can be significantly improved.,arg_request,arg-request_edit,asp_clarity,pol_negative
4,B1ez1LvJcB,4,The method description was a bit confusing and unclear to me.,arg_evaluative,none,asp_clarity,pol_negative


In [None]:
pair = all_pairs[9]

df = get_convokit_politeness_labels(pair)

# len(all_pairs[1]['review_sentences'])
df