In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
import ast
import csv
import collections
from itertools import chain, cycle
import math
import matplotlib.pyplot as plt
import os
import pandas as pd
import pickle 
import numpy as np
import random
import scipy.stats as ss
import spacy
from sklearn import preprocessing
import re


from sklearn.feature_extraction.text import CountVectorizer

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 35
pd.options.display.max_colwidth = 800

np.random.seed(1234)
random.seed(1234)

from matplotlib import rc
#rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
rc('font',**{'family':'serif','serif':['Times'],'size':12})
#rcParams['font.size'] = 12
#rc('text', usetex=True)

In [3]:
import stanza
# Fetch the default English Stanza models.
stanza.download('en')
#parser = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
# Shared pipeline used by every tokenizing / sentence-splitting helper in this notebook.
# The 'tokenize': 'spacy' entry selects spaCy as the tokenizer backend.
parser = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy','mwt':'mwt','pos':'pos','lemma':'lemma'},)
def split_tokenize_text(text, lemmatize=False):
    """Parse ``text`` with the shared ``parser`` and return one token list per sentence.

    Each inner list holds the words' lemmas when ``lemmatize`` is truthy,
    otherwise their surface text.
    """
    attribute = 'lemma' if lemmatize else 'text'
    return [[getattr(word, attribute) for word in sentence.words]
            for sentence in parser(text).sentences]

def tokenize_text(text, lemmatize=True):
    """Parse ``text`` with the shared ``parser`` and return a flat list of tokens.

    Tokens are lemmas when ``lemmatize`` is truthy (the default), otherwise surface text.
    """
    attribute = 'lemma' if lemmatize else 'text'
    tokens = []
    for sentence in parser(text).sentences:
        tokens.extend(getattr(word, attribute) for word in sentence.words)
    return tokens

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 120kB [00:00, 16.1MB/s]                    
2020-08-02 21:30:08 INFO: Downloading default packages for language: en (English)...
2020-08-02 21:30:09 INFO: File exists: /Users/elisa/stanza_resources/en/default.zip.
2020-08-02 21:30:13 INFO: Finished downloading models and saved to /Users/elisa/stanza_resources.
2020-08-02 21:30:13 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | spacy     |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-08-02 21:30:13 INFO: Use device: cpu
2020-08-02 21:30:13 INFO: Loading: tokenize
2020-08-02 21:30:13 INFO: Using spaCy as tokenizer
2020-08-02 21:30:13 INFO: Loading: pos
2020-08-02 21:30:14 INFO: Loading: lemma
2020-08-02 21:30:14 INFO: Loading: depparse
2020-08-02 21:30:15 INFO: Loading: ner
2020-08-02 21:30:16 INFO: 

In [4]:
# read file
# dtype=object keeps every column as read (no numeric type inference); the
# tuple-valued columns are parsed back from their string form in a later cell.
gold_file = '../../data/gold/gold_full_data/annotated_questions_responses_gold.csv'
gold_df = pd.read_csv(gold_file, index_col=False, dtype=object)

In [5]:
# The CSV stores these columns as the string repr of Python literals;
# parse each cell back into the original object with ast.literal_eval.
columns = [
    'gold_label_set', 'gold_act_set', 'gold_acts', 'gold_intents', 'gold_intent_set',
    'gold_coarse_labels', 'gold_fine_labels',
    'gold_workers', 'gold_sentiments', 'gold_explanations',
    'gold_q_intents', 'gold_q_sentiments',
    'gold_work_times', 'gold_assignment_ids', 'gold_hit_ids',
    'coarse_labels', 'fine_labels',
    'workers', 'sentiments', 'explanations',
]
for column in columns:
    gold_df[column] = gold_df[column].map(ast.literal_eval)

In [6]:
def expand_row(row):
    """Expand one row into a DataFrame with one row per gold annotation.

    The twelve per-annotator ``gold_*`` sequences become singular-named columns.
    Every other value in ``row`` is copied onto the result unless its column is
    one of the known plural/tokenized columns or the value is a tuple.
    """
    singular_to_plural = {
        'gold_coarse_label': 'gold_coarse_labels',
        'gold_act': 'gold_acts',
        'gold_intent': 'gold_intents',
        'gold_fine_label': 'gold_fine_labels',
        'gold_worker': 'gold_workers',
        'gold_sentiment': 'gold_sentiments',
        'gold_explanation': 'gold_explanations',
        'gold_q_intent': 'gold_q_intents',
        'gold_q_sentiment': 'gold_q_sentiments',
        'gold_work_time': 'gold_work_times',
        'gold_assignment_id': 'gold_assignment_ids',
        'gold_hit_id': 'gold_hit_ids',
    }
    # Columns that are never copied across as-is.
    skipped_columns = {
        'gold_coarse_labels', 'gold_acts', 'gold_intents',
        'gold_fine_labels', 'gold_workers', 'gold_sentiments',
        'gold_explanations', 'gold_q_intents', 'gold_q_sentiments',
        'gold_work_times', 'gold_assignment_ids', 'gold_hit_ids',
        'coarse_labels', 'fine_labels', 'workers', 'sentiments',
        'explanations', 'r_text_tokenized', 'q_text_tokenized',
        'gold_explanation_tokenized',
    }

    expanded = pd.DataFrame(
        {singular: row[plural] for singular, plural in singular_to_plural.items()}
    )
    for column_name in row.index:
        if column_name in skipped_columns:
            continue
        value = row[column_name]
        if isinstance(value, tuple):
            continue
        # Scalars are broadcast to every row; a list is assigned element by element.
        expanded[column_name] = value
    return expanded

In [7]:
def get_document_length(list_of_lists):
    """Return the total number of items across all inner iterables."""
    return sum(1 for _ in chain.from_iterable(list_of_lists))

In [8]:
# For the response text and then the question text, add three columns:
#   <col>_tokenized  sentence-split token lists (no lemmatization)
#   <col>_num_sents  number of sentences
#   <col>_len        total number of tokens
for text_column in ('r_text', 'q_text'):
    tokenized_column = text_column + '_tokenized'
    gold_df[tokenized_column] = gold_df.apply(
        lambda x: split_tokenize_text(x[text_column], lemmatize=False), axis=1)
    gold_df[text_column + '_num_sents'] = gold_df[tokenized_column].str.len()
    gold_df[text_column + '_len'] = gold_df.apply(
        lambda x: get_document_length(x[tokenized_column]), axis=1)

#### Add question type

In [9]:
from importlib import reload
#reload(qc)

In [10]:
import sys
# NOTE(review): hard-coded absolute path to one user's checkout; this cell only
# works on that machine. Consider deriving the path from the repository root.
sys.path.insert(1, '/Users/elisa/Documents/CompLing/congressional_hearing/code/analyses/')
import question_classifier as qc
# Classify every question text; the input is a list of (row index, q_text) pairs.
question_type_tuples = qc.get_question_types(list((gold_df['q_text']).items()))

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 120kB [00:00, 4.70MB/s]                    
2020-08-02 21:46:04 INFO: Downloading default packages for language: en (English)...
2020-08-02 21:46:05 INFO: File exists: /Users/elisa/stanza_resources/en/default.zip.
2020-08-02 21:46:09 INFO: Finished downloading models and saved to /Users/elisa/stanza_resources.
2020-08-02 21:46:09 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | spacy     |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-08-02 21:46:09 INFO: Use device: cpu
2020-08-02 21:46:09 INFO: Loading: tokenize
2020-08-02 21:46:09 INFO: Using spaCy as tokenizer
2020-08-02 21:46:09 INFO: Loading: pos
2020-08-02 21:46:10 INFO: Loading: lemma
2020-08-02 21:46:10 INFO: Loading: depparse
2020-08-02 21:46:11 INFO: Loading: ner
2020-08-02 21:46:12 INFO: 

In [11]:
# Turn the classifier output into a Series named 'question_type'
# (presumably keyed by gold_df row index — verify against qc.get_question_types).
qt_series = pd.Series(dict(question_type_tuples),name='question_type')

In [12]:
# Attach the question type as a new column, aligned on the index.
gold_df = pd.concat([gold_df, qt_series],axis=1)

In [13]:
def get_first_question_and_rest(text):
    """Return the first sentence ending in "?" plus every sentence after it.

    Sentences are space-stripped and joined with single spaces; the result is
    an empty string when ``text`` contains no question.
    """
    sentences = parser(text).sentences
    for position, sentence in enumerate(sentences):
        if sentence.words[-1].text == "?":
            return ' '.join(s.text.strip(' ') for s in sentences[position:])
    return ''

def get_last_question_and_rest(text):
    """Return the last sentence ending in "?" plus every sentence after it.

    Sentences are space-stripped and joined with single spaces; the result is
    an empty string when ``text`` contains no question.
    """
    sentences = parser(text).sentences
    for offset, sentence in enumerate(reversed(sentences)):
        if sentence.words[-1].text != "?":
            continue
        # `offset` counts how many sentences follow the question.
        trailing = sentences[-offset:] if offset > 0 else []
        return ' '.join(s.text.strip(' ') for s in [sentence] + list(trailing))
    return ''

# Both helpers run the full parser on q_text again for every row.
gold_df['q_text_first_question_and_rest'] = gold_df.apply(lambda x: get_first_question_and_rest(x['q_text']), axis=1)
gold_df['q_text_last_question_and_rest'] = gold_df.apply(lambda x: get_last_question_and_rest(x['q_text']), axis=1)

In [14]:
def get_last_question(text):
    """Return the last sentence of ``text`` that ends in "?", space-stripped.

    Returns ``None`` when no sentence ends with a question mark.
    """
    question_texts = (
        sentence.text.strip(' ')
        for sentence in reversed(parser(text).sentences)
        if sentence.words[-1].text == "?"
    )
    return next(question_texts, None)
# Rows whose q_text has no sentence ending in "?" get None here.
gold_df['q_text_last_question'] = gold_df.apply(lambda x: get_last_question(x['q_text']), axis=1)

In [15]:
# Length in characters (string length), not tokens.
gold_df['q_text_last_question_len'] = gold_df['q_text_last_question'].str.len()

In [16]:
def intent_to_binary(intents):
    """Map each intent label to 0 or 1 and return the codes as a list.

    'direct', 'correct' and 'sincere' map to 0; 'overans', 'dodge' and 'lying'
    map to 1. An unknown label raises ``KeyError``.
    """
    binary_by_intent = {'direct': 0, 'overans': 1, 'dodge': 1, 'correct': 0,
                        'lying': 1, 'sincere': 0}
    return [binary_by_intent[intent] for intent in intents]

# One list of 0/1 codes per row, parallel to that row's gold_intents.
gold_df['gold_intents_binary'] = gold_df.apply(lambda x: intent_to_binary(x['gold_intents']), axis=1)

In [17]:
# Collapse the seven fine-grained sentiment labels into three coarse ones.
fine_to_coarse_dict = {
    fine_label: coarse_label
    for coarse_label, fine_labels in (
        ('negative', ('veryNegative', 'negative', 'somewhatNegative')),
        ('neutral', ('neutral',)),
        ('positive', ('veryPositive', 'positive', 'somewhatPositive')),
    )
    for fine_label in fine_labels
}

def map_fine_to_coarse(row, column_name, fine_to_coarse_dict):
    """Translate the labels in ``row[column_name]`` through ``fine_to_coarse_dict``.

    Returns the translated labels as a tuple in the original order; a label
    missing from the mapping raises ``KeyError``.
    """
    return tuple(fine_to_coarse_dict[label] for label in row[column_name])
# Coarse (negative / neutral / positive) versions of the response and question sentiment tuples.
gold_df['gold_sentiments_coarse'] = gold_df.apply(map_fine_to_coarse, args=('gold_sentiments', fine_to_coarse_dict,), axis=1)
gold_df['gold_q_sentiments_coarse'] = gold_df.apply(map_fine_to_coarse, args=('gold_q_sentiments', fine_to_coarse_dict,), axis=1)                                                                            

In [18]:
def map_to_num(row, column, mapping_dict):
    """Encode the labels in ``row[column]`` via ``mapping_dict`` as a space-separated string.

    A label missing from ``mapping_dict`` raises ``KeyError``.
    """
    encoded = [mapping_dict[label] for label in row[column]]
    return ' '.join(map(str, encoded))

def get_count(row, column_name, label_names):
    """Count each label of ``label_names`` in ``row[column_name]``.

    Returns the counts, in ``label_names`` order, as a space-separated string.
    """
    observed = row[column_name]
    return ' '.join(str(observed.count(label)) for label in label_names)

In [19]:
# Numeric codes for the three coarse sentiment classes.
coarse_sent_to_num_dict = {"negative": -1, "neutral": 0, "positive": 1}

# Space-separated per-annotator numeric codes for response and question sentiment.
gold_df['gold_sentiments_coarse_num'] = gold_df.apply(map_to_num, args=('gold_sentiments_coarse', coarse_sent_to_num_dict,), axis=1)
gold_df['gold_q_sentiments_coarse_num'] = gold_df.apply(map_to_num, args=('gold_q_sentiments_coarse', coarse_sent_to_num_dict,), axis=1)



coarse_sent_labels = ['negative', 'neutral', 'positive']
mlb_coarse_sent = preprocessing.MultiLabelBinarizer(classes=coarse_sent_labels) # set ordering

# *_binary: one 0/1 indicator per class (in coarse_sent_labels order), space-separated.
# *_count: how many annotations fall in each class, space-separated.
gold_df['gold_sentiments_coarse_binary'] = [' '.join(map(str, label_set)) for label_set in mlb_coarse_sent.fit_transform(gold_df['gold_sentiments_coarse'].values)]      
gold_df['gold_sentiments_coarse_count'] = gold_df.apply(get_count, args=('gold_sentiments_coarse', coarse_sent_labels,), axis=1)

gold_df['gold_q_sentiments_coarse_binary'] = [' '.join(map(str, label_set)) for label_set in mlb_coarse_sent.fit_transform(gold_df['gold_q_sentiments_coarse'].values)]      
gold_df['gold_q_sentiments_coarse_count'] = gold_df.apply(get_count, args=('gold_q_sentiments_coarse', coarse_sent_labels,), axis=1)

In [20]:
# The six response labels, in the fixed order used for the binary indicator string.
class_labels = ["answer", "answer_overans-sway", "shift-dodge", "shift-correct", 
                "cant-answer-lying", "cant-answer-sincere"]
mlb = preprocessing.MultiLabelBinarizer(classes=class_labels) # set ordering

def digitize_index(qa_index):
    """Return ``qa_index`` with every non-digit character removed."""
    non_digit = re.compile(r'\D')
    return non_digit.sub('', qa_index)

# Indicator string with no separator (one character per class in class_labels order);
# note the sentiment/intent *_binary columns elsewhere are space-separated instead.
gold_df['gold_labels_binary'] = [''.join(map(str, label_set)) for label_set in mlb.fit_transform(gold_df['gold_label_set'].values)]
# Digits-only version of the question/answer identifier.
gold_df['qa_index_digits'] = gold_df.apply(lambda x: digitize_index(x['qa_index']), axis=1)

In [21]:
# hit_order was read as a string (dtype=object); convert it to the smallest fitting integer dtype.
gold_df['hit_order'] = pd.to_numeric(gold_df['hit_order'], downcast='integer')

## Add other columns for speaker info

In [22]:
def get_last_name(speaker):
    """Return the part of ``speaker`` after its last space (the whole string if there is none)."""
    return speaker.rpartition(' ')[2]
# Last space-separated token of the questioner's name.
gold_df['q_speaker_last_name'] = gold_df.apply(lambda x: get_last_name(x['q_speaker']),axis=1)

In [23]:
def get_all_questions(text):
    """Return every sentence of ``text`` that ends in "?", space-stripped and joined by spaces.

    The result is an empty string when there are no such sentences.
    """
    return ' '.join(
        sentence.text.strip(' ')
        for sentence in parser(text).sentences
        if sentence.words[-1].text == "?"
    )
# All question sentences of q_text, in order, as one string.
gold_df['q_text_all_questions'] = gold_df.apply(lambda x: get_all_questions(x['q_text']), axis=1)

In [24]:
def get_last_n_sents(text, n):
    """Return the last ``n`` sentences of ``text``, space-stripped and joined by spaces.

    If ``text`` has fewer than ``n`` sentences, all of them are returned.
    ``n <= 0`` yields an empty string. Previously ``n == 0`` returned every
    sentence (because ``seq[-0:]`` is the whole sequence) and a negative ``n``
    dropped leading sentences instead.
    """
    if n <= 0:
        return ''
    doc = parser(text)
    # A negative slice start larger than the list is clamped by Python,
    # so no explicit length check is needed.
    return ' '.join(sent.text.strip(' ') for sent in doc.sentences[-n:])
# Last two sentences of the question text.
gold_df['q_text_last_2_sents'] = gold_df.apply(lambda x: get_last_n_sents(x['q_text'], 2), axis=1)

In [25]:
# Last three sentences of the question text.
gold_df['q_text_last_3_sents'] = gold_df.apply(lambda x: get_last_n_sents(x['q_text'], 3), axis=1)

In [26]:
# Question-intent classes and their numeric codes.
qintent_labels = ["attack", "neutral", "favor"]
qintent_to_num_dict = {"attack": -1, "neutral": 0, "favor": 1}

# Space-separated per-annotator numeric codes.
gold_df['gold_q_intents_num'] = gold_df.apply(map_to_num, args=('gold_q_intents', qintent_to_num_dict,), axis=1)

# One 0/1 indicator per class (in qintent_labels order), space-separated.
mlb_qintent = preprocessing.MultiLabelBinarizer(classes=qintent_labels) # set ordering
gold_df['gold_q_intents_binary'] = [' '.join(map(str, label_set)) for label_set in mlb_qintent.fit_transform(gold_df['gold_q_intents'].values)]

# Number of annotations per class, space-separated.
gold_df['gold_q_intents_count'] = gold_df.apply(get_count, args=('gold_q_intents', qintent_labels,), axis=1)

In [27]:
# Integer codes for the question-type tags; a tag missing from the dict maps to NaN.
qtype_to_num_dict = {"YN":1, "OR":2, "DC":3, "WH":4, "TG":5}
gold_df['question_type_num'] = gold_df['question_type'].map(qtype_to_num_dict)

In [28]:
# Seven-point numeric scale for the fine-grained sentiment labels.
sent_to_num_dict = {"veryNegative": -3, "negative": -2, "somewhatNegative": -1, 
                   "neutral": 0,
                   'somewhatPositive': 1, 'positive': 2, 'veryPositive': 3}

# Space-separated per-annotator numeric codes for response and question sentiment.
gold_df['gold_sentiments_num'] = gold_df.apply(map_to_num, args=('gold_sentiments', sent_to_num_dict,), axis=1)
gold_df['gold_q_sentiments_num'] = gold_df.apply(map_to_num, args=('gold_q_sentiments', sent_to_num_dict,), axis=1)



sent_labels = ['veryNegative', 'negative', 'somewhatNegative', 'neutral',
               'somewhatPositive', 'positive', 'veryPositive']
mlb_sent = preprocessing.MultiLabelBinarizer(classes=sent_labels) # set ordering

# *_binary: one 0/1 indicator per label (in sent_labels order), space-separated.
# *_count: how many annotations carry each label, space-separated.
gold_df['gold_sentiments_binary'] = [' '.join(map(str, label_set)) for label_set in mlb_sent.fit_transform(gold_df['gold_sentiments'].values)]      
gold_df['gold_sentiments_count'] = gold_df.apply(get_count, args=('gold_sentiments', sent_labels,), axis=1)

gold_df['gold_q_sentiments_binary'] = [' '.join(map(str, label_set)) for label_set in mlb_sent.fit_transform(gold_df['gold_q_sentiments'].values)]      
gold_df['gold_q_sentiments_count'] = gold_df.apply(get_count, args=('gold_q_sentiments', sent_labels,), axis=1)

In [29]:
def get_speaker_role(speaker_detail):
    """Return what follows the first comma in the text after the last colon.

    Any further commas in that remainder are replaced by single spaces, and
    surrounding whitespace is kept as-is. Returns an empty string when there is
    no comma after the last colon.
    """
    after_colon = speaker_detail.rpartition(':')[2]
    remainder = after_colon.partition(',')[2]
    return remainder.replace(',', ' ')

In [30]:
# Everything after the speaker's name in the detail string (comma-separated parts joined by spaces).
gold_df['q_speaker_role'] = gold_df.apply(lambda x: get_speaker_role(x['q_speaker_detail']),axis=1)
gold_df['r_speaker_role'] = gold_df.apply(lambda x: get_speaker_role(x['r_speaker_detail']),axis=1)

In [31]:
# Distinct questioner detail strings. Not referenced again in the visible cells;
# presumably used to enumerate the keys needed in the party mapping below.
politicians = gold_df['q_speaker_detail'].unique()
# Party affiliation keyed by the exact `q_speaker_detail` string. The same
# person can appear under several keys (different casing or title suffix), and
# a detail string that is not listed here maps to NaN.
politicians_to_parties = {
    'politician: SCOTT PERRY, Pennsylvania': 'Republican',
    'politician: BRAD SHERMAN, California': 'Democratic',
    'politician: JIM JORDAN, Ohio': 'Republican',
    'politician: DARRELL E. ISSA, California, Chairman': 'Republican',
    'politician: JEB HENSARLING, Texas, Chairman': 'Republican',
    'politician: ELIJAH E. CUMMINGS, Maryland, Chairman': 'Democratic',
    'politician: FRANK PALLONE, Jr., New Jersey': 'Democratic',
    'politician: JOE BARTON, Texas': 'Republican',
    'politician: Debbie Wasserman Schultz, Florida': 'Democratic',
    'politician: Jim Jordan, Ohio, Ranking Minority': 'Republican',
    'politician: Mark E. Green, Tennessee': 'Republican',
    'politician: JOHN CONYERS, Jr., Michigan': 'Democratic',
    'politician: BOB GOODLATTE, Virginia, Chairman': 'Republican',
    'politician: F. JAMES SENSENBRENNER, Jr.,': 'Republican',
    'politician: DARRELL E. ISSA, California': 'Republican',
    'politician: ZOE LOFGREN, California': 'Democratic',
    'politician: LAMAR S. SMITH, Texas': 'Republican',
    'politician: JERROLD NADLER, New York': 'Democratic',
    'politician: ELIJAH E. CUMMINGS, Maryland,': 'Democratic',
    'politician: TED S. YOHO, Florida': 'Republican',
    'politician: THEODORE E. DEUTCH, Florida': 'Democratic',
    'politician: ADAM KINZINGER, Illinois': 'Republican',
    'politician: MAXINE WATERS, California, Ranking': 'Democratic',
    'politician: BLAINE LUETKEMEYER, Missouri': 'Republican',
    'politician: CAROLYN B. MALONEY, New York': 'Democratic',
    'politician: FRANK D. LUCAS, Oklahoma': 'Republican',
    'politician: DAVID SCOTT, Georgia': 'Democratic',
    'politician: BOB GIBBS, Ohio': 'Republican',
    'politician: JIM COSTA, California': 'Democratic',
    'politician: STEVE CHABOT, Ohio': 'Republican',
    'politician: STEVE KING, Iowa': 'Republican',
    'politician: JOHN F. TIERNEY, Massachusetts': 'Democratic',
    'politician: MICHAEL R. TURNER, Ohio': 'Republican',
    'politician: JOHN L. MICA, Florida': 'Republican',
    'politician: ELEANOR HOLMES NORTON, District of': 'Democratic',
    'politician: ANDY BARR, Kentucky': 'Republican',
    'politician: JOHN SHIMKUS, Illinois': 'Republican',
    'politician: RON DeSANTIS, Florida': 'Republican',
    'politician: PAUL TONKO, New York': 'Democratic',
    'politician: RANDY NEUGEBAUER, Texas': 'Republican',
    'politician: GREG WALDEN, Oregon': 'Republican',
    'politician: SCOTT H. PETERS, California': 'Democratic',
    'politician: RAUL RUIZ, California': 'Democratic',
    'politician: DAVID B. McKINLEY, West Virginia': 'Republican',
    'politician: Chip Roy, Texas': 'Republican',
    'politician: Kelly Armstrong, North Dakota': 'Republican',
    'politician: Gerald E. Connolly, Virginia': 'Democratic',
    'politician: Jamie Raskin, Maryland': 'Democratic',
    'politician: Ranking Minority Member': 'Republican',
    'politician: K. MICHAEL CONAWAY, Texas, Chairman': 'Republican',
    'politician: COLLIN C. PETERSON, Minnesota,': 'Democratic',
    'politician: MARCIA L. FUDGE, Ohio': 'Democratic',
    'politician: AUSTIN SCOTT, Georgia': 'Republican',
    'politician: BILL HUIZENGA, Michigan': 'Republican',
    'politician: NYDIA M. VELAZQUEZ, New York': 'Democratic',
    'politician: WILLIAM KEATING, Massachusetts': 'Democratic',
    'politician: PATRICK T. McHENRY, North Carolina,': 'Republican',
    'politician: SCOTT GARRETT, New Jersey': 'Republican',
    'politician: MATT CARTWRIGHT, Pennsylvania': 'Democratic',
    'politician: MICK MULVANEY, South Carolina': 'Republican',
    'politician: GWEN MOORE, Wisconsin': 'Democratic',
    'politician: JERROLD NADLER, New York, Chairman': 'Democratic',
    'politician: ANNA G. ESHOO, California': 'Democratic',
    'politician: MICHAEL C. BURGESS, Texas': 'Republican',
    'politician: ELIOT L. ENGEL, New York': 'Democratic',
    'politician: GENE GREEN, Texas': 'Democratic',
    'politician: ELIOT L. ENGEL, New York, Chairman': 'Democratic',
    'politician: MICHAEL T. McCAUL, Texas, Ranking': 'Republican',
    'politician: ALBIO SIRES, New Jersey': 'Democratic',
    'politician: STEVAN PEARCE, New Mexico': 'Republican',
    'politician: TAMMY DUCKWORTH, Illinois': 'Democratic',
    'politician: GREGORY W. MEEKS, New York': 'Democratic',
    'politician: SHEILA JACKSON LEE, Texas': 'Democratic',
    'politician: DOUG COLLINS, Georgia, Ranking': 'Republican',
    'politician: BOBBY L. RUSH, Illinois': 'Democratic',
    'politician: FRED UPTON, Michigan': 'Republican',
    'politician: JAMES P. McGOVERN, Massachusetts': 'Democratic',
    'politician: DWIGHT EVANS, Pennsylvania': 'Democratic',
    'politician: LISA BLUNT ROCHESTER, Delaware': 'Democratic',
    'politician: CHERI BUSTOS, Illinois': 'Democratic',
    'politician: TRENT KELLY, Mississippi': 'Republican',
    'politician: SCOTT DesJARLAIS, Tennessee': 'Republican',
    "politician: ERIC A. ``RICK'' CRAWFORD, Arkansas": 'Republican',
    'politician: DOUG LaMALFA, California': 'Republican',
    'politician: RALPH LEE ABRAHAM, Louisiana': 'Republican',
    'politician: JIMMY PANETTA, California': 'Democratic',
    'politician: DARREN SOTO, Florida': 'Democratic',
    'politician: AL LAWSON, Jr., Florida': 'Democratic',
    'politician: JAMES COMER, Kentucky': 'Republican',
    'politician: GERALD E. CONNOLLY, Virginia': 'Democratic',
    'politician: J. RANDY FORBES, Virginia': 'Republican',
    'politician: STEVE COHEN, Tennessee': 'Democratic',
    "politician: HENRY C. ``HANK'' JOHNSON, Jr.,": 'Democratic',
    'politician: JASON CHAFFETZ, Utah': 'Republican',
    'politician: WM. LACY CLAY, Missouri': 'Democratic',
    'politician: ROBERT PITTENGER, North Carolina': 'Republican',
    'politician: MICHAEL E. CAPUANO, Massachusetts': 'Democratic',
    'politician: TOM EMMER, Minnesota': 'Republican',
    'politician: EDWARD R. ROYCE, California': 'Republican',
}

In [32]:
# Exact-string lookup; detail strings absent from politicians_to_parties become NaN.
gold_df['q_speaker_party'] = gold_df['q_speaker_detail'].map(politicians_to_parties)

## Add entropy

In [33]:
import scipy.stats
from collections import Counter

def get_entropy(labels):
    """Return the base-2 Shannon entropy of the label frequencies in ``labels``."""
    label_counts = Counter(labels)
    return scipy.stats.entropy(list(label_counts.values()), base=2)

# Per-row entropy of the gold coarse labels (0 when all labels in the row are identical).
gold_df['entropy'] = gold_df.apply(lambda x: get_entropy(x['gold_coarse_labels']), axis=1)

In [34]:
# normalize the entropy
# Min-max scaling to [0, 1]. If every row had the same entropy the denominator
# would be 0 and the result would be NaN.
gold_df['entropy_norm'] = (gold_df['entropy']-gold_df['entropy'].min())/(gold_df['entropy'].max()-gold_df['entropy'].min())

In [35]:
# map to 4 buckets
# Keys are the normalized entropy values observed in this dataset.
entropy_norm_to_buckets={0:0, 
                         0.5445684476282008:0.5, 0.579380164285695: 0.5,
                         0.6126016192893443: 0.6, 0.6216097450797567: 0.6, 0.6309297535714575: 0.6,
                         0.9821410328348753: 1, 1: 1}
# Look values up after rounding both sides to 6 decimals. An exact float-equality
# lookup silently yields NaN whenever the computed entropy differs from the
# hard-coded key in its last bits (e.g. a different scipy/numpy build).
# Values with no matching key still map to NaN, as before.
_entropy_norm_bucket_lookup = {round(float(key), 6): float(bucket)
                               for key, bucket in entropy_norm_to_buckets.items()}
gold_df['entropy_norm_buckets'] = gold_df['entropy_norm'].map(
    lambda value: _entropy_norm_bucket_lookup.get(round(float(value), 6), float('nan'))
).astype(float)

In [36]:
# Sanity check: show the distinct bucket values (a NaN here means an unmapped entropy value).
gold_df['entropy_norm_buckets'].unique()

array([0.6, 0.5, 1. , 0. ])

In [37]:
# Distinct raw entropy values in ascending order; a value's position is its category id.
entropy_categories = sorted(gold_df['entropy'].unique())

In [38]:
def get_entropy_binarized(entropy):
    """One-hot encode ``entropy`` by its position in ``entropy_categories``.

    Returns a string of '0'/'1' characters, one per category, with no separator.
    Raises ``ValueError`` if ``entropy`` is not in ``entropy_categories``.
    """
    position = entropy_categories.index(entropy)
    return ''.join('1' if slot == position else '0'
                   for slot in range(len(entropy_categories)))

In [39]:
# One-hot string of the row's entropy category.
gold_df['entropy_binarized'] = gold_df.apply(lambda x: get_entropy_binarized(x['entropy']), axis=1)

In [40]:
def get_entropy_ordinal(entropy):
    """Return the 0-based rank of ``entropy`` within the sorted ``entropy_categories``."""
    return entropy_categories.index(entropy)
# Rank of the row's entropy among all distinct entropy values.
gold_df['entropy_ordinal'] = gold_df.apply(lambda x: get_entropy_ordinal(x['entropy']), axis=1)

In [41]:
# Distinct bucket values in ascending order.
entropy_norm_categories = sorted(gold_df['entropy_norm_buckets'].unique())

def get_entropy_ordinal_buckets(entropy):
    """Return the 0-based rank of a bucket value within the sorted ``entropy_norm_categories``."""
    return entropy_norm_categories.index(entropy)
# Rank of the row's entropy bucket among the distinct bucket values.
gold_df['entropy_ordinal_buckets'] = gold_df.apply(lambda x: get_entropy_ordinal_buckets(x['entropy_norm_buckets']), axis=1)

## Expand dataset

In [42]:
# Build one small DataFrame per gold_df row (one row per gold annotation),
# then stack them into a single frame with a fresh 0..N-1 index.
expanded_dfs = gold_df.apply(expand_row, axis=1)
expanded_dfs = expanded_dfs.tolist()
expanded_df = pd.concat(expanded_dfs, ignore_index=True, sort=False)

In [43]:
# Tokenized explanation, its sentence count, and its total token count.
expanded_df['gold_explanation_tokenized'] = expanded_df.apply(lambda x: split_tokenize_text(x['gold_explanation'], lemmatize=False), axis=1)
expanded_df['gold_explanation_num_sents'] = expanded_df['gold_explanation_tokenized'].str.len()
expanded_df['gold_explanation_len'] = expanded_df.apply(lambda x: get_document_length(x['gold_explanation_tokenized']), axis=1)

In [44]:
# Re-declares the same fine-to-coarse mapping that an earlier cell already defined.
fine_to_coarse_dict = {'veryNegative': 'negative', 'negative': 'negative', 'somewhatNegative': 'negative',
                                   'neutral': 'neutral',
                                   'veryPositive': 'positive', 'positive': 'positive', 'somewhatPositive': 'positive'}
# Per-annotation (scalar) coarse sentiment for the response and the question.
expanded_df['gold_sentiment_coarse'] = expanded_df['gold_sentiment'].map(fine_to_coarse_dict)
expanded_df['gold_q_sentiment_coarse'] = expanded_df['gold_q_sentiment'].map(fine_to_coarse_dict)

In [45]:
# Re-declares the seven-point sentiment scale with the same key/value pairs as the earlier definition.
sent_to_num_dict = {'veryNegative': -3, 'negative': -2, 'somewhatNegative': -1,
                                   'neutral': 0,
                                   'veryPositive': 3, 'positive': 2, 'somewhatPositive': 1}
# Per-annotation numeric sentiment for the response and the question.
expanded_df['gold_sentiment_num'] = expanded_df['gold_sentiment'].map(sent_to_num_dict)
expanded_df['gold_q_sentiment_num'] = expanded_df['gold_q_sentiment'].map(sent_to_num_dict)

# Same three-point coding as coarse_sent_to_num_dict, under a second name.
sent_coarse_to_num_dict = {'negative': -1, 'neutral': 0, 'positive': 1}
expanded_df['gold_sentiment_coarse_num'] = expanded_df['gold_sentiment_coarse'].map(sent_coarse_to_num_dict)
expanded_df['gold_q_sentiment_coarse_num'] = expanded_df['gold_q_sentiment_coarse'].map(sent_coarse_to_num_dict)

## Add politeness features

In [46]:
# source:
# https://github.com/facebookresearch/intentions-perceptions/blob/master/survey_results_analysis.ipynb

In [47]:
from convokit import User, Utterance, Corpus, TextParser, PolitenessStrategies, FightingWords

In [48]:
# ConvoKit requires a user for every utterance; this analysis is purely
# linguistic, so one dummy user is shared by all of them.
placeholder_user = User(name="User")
# ConvoKit (convokit.cornell.edu) extracts the politeness features, so the text
# must first be wrapped in ConvoKit utterances.
# Utterance ids are "<prefix><expanded_df row index>", with one prefix per text:
#   Q = full question, L = last question sentence, R = response, E = annotator explanation.
# Each utterance is its own root; R is marked as a reply to Q and E as a reply to R.
utts = []
for row in expanded_df.itertuples():
    row_id = str(row.Index)
    utterance_specs = (
        ('Q', row.q_text, None),
        ('L', row.q_text_last_question, None),
        ('R', row.r_text, 'Q' + row_id),
        ('E', row.gold_explanation, 'R' + row_id),
    )
    for id_prefix, utterance_text, parent_id in utterance_specs:
        utts.append(Utterance(
            text=utterance_text,
            id=id_prefix + row_id,
            root=id_prefix + row_id,
            reply_to=parent_id,
            user=placeholder_user,
            timestamp=0
        ))
corpus = Corpus(utterances=utts)



In [49]:
# extract politeness strategies using ConvoKit
# The feature extractor below reads the result from utt.meta['politeness_strategies'].
corpus = TextParser().transform(corpus)
ps = PolitenessStrategies()
corpus_polite = ps.transform(corpus)

In [50]:
# Other features will be extracted manually based on dependency parses and NER, both conducted using SpaCy.
spacy_nlp = spacy.load('en')
utt_ids = corpus.get_utterance_ids()
# NOTE: this rebinds `utts` from the list of Utterance objects built earlier to a list of plain texts.
utts = [corpus.get_utterance(i).text for i in utt_ids]
# Parse every text and index the parses by utterance id.
parses = dict(zip(utt_ids, spacy_nlp.pipe(utts, n_threads=-1)))
# Store each parse on its utterance under meta['spacy'].
for utterance in corpus.iter_utterances():
    utterance.meta['spacy'] = parses[utterance.id]

In [51]:
# feature vector generation

# manually-constructed lexicon of question words/phrases, derived from features discovered in Liu and Jansen (2015)
questions_lexicon = [
    'who', 'what', 'should', 'good',
    'who is going', 'where', 'what time', 'want',
]

def linguistic_feature_extractor(utt):
    """Build the feature dict for one utterance.

    Six binary features are computed from the spaCy tokens in
    ``utt.meta['spacy']`` and from ``utt.text``; then every entry of
    ``utt.meta['politeness_strategies']`` is merged in unfiltered.
    """
    named_entity_types = {'NORP', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'LAW',
                          'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY'}
    # Politeness strategies tied to prior work on subjectivity detection.
    # Currently unused: the filter on this set is disabled, so all strategies are kept.
    kept_strats = {
        'feature_politeness_==Hedges==',
        'feature_politeness_==Factuality==',
        'feature_politeness_==Please==',
        'feature_politeness_==1st_person==',
        'feature_politeness_==2nd_person==',
        'feature_politeness_==HASPOSITIVE==',
        'feature_politeness_==HASNEGATIVE==',
        'feature_politeness_==INDICATIVE==',  # Elisa added!
    }

    tokens = list(utt.meta['spacy'])
    lowered_text = utt.text.lower()
    feats = {
        'has_adjective': int(any(tok.pos_ == 'ADJ' for tok in tokens)),
        'has_cardinal_num': int(any(tok.tag_ == 'CD' for tok in tokens)),
        # "will" is deliberately not counted as a modal
        'has_modal': int(any(tok.tag_ == 'MD' and tok.text.lower() != 'will' for tok in tokens)),
        # "not" is deliberately not counted as an adverb
        'has_adverb': int(any(tok.pos_ == 'ADV' and tok.text.lower() != 'not' for tok in tokens)),
        'has_named_entity': int(any(tok.ent_type_ in named_entity_types for tok in tokens)),
        # substring match against the lowercased utterance text
        'question_lexicon': int(any(entry in lowered_text for entry in questions_lexicon)),
    }
    feats.update(utt.meta['politeness_strategies'])
    return feats

In [52]:
# Feature dicts keyed by the expanded_df row index (as a string), one store per utterance kind.
question_ling_feats = {}
question_last_ling_feats = {}
response_ling_feats = {}
# NOTE(review): unlike its siblings this one is a defaultdict(list), so looking
# up a missing id returns [] instead of raising KeyError.
explanation_ling_feats = collections.defaultdict(list)

feats_store_by_prefix = {
    'Q': question_ling_feats,
    'L': question_last_ling_feats,
    'R': response_ling_feats,
    'E': explanation_ling_feats,
}
for utt in corpus.iter_utterances():
    feats = linguistic_feature_extractor(utt)
    # The root id is "<prefix><row index>"; strip the one-letter prefix.
    true_id = utt.root[1:]
    feats_store = feats_store_by_prefix.get(utt.root[:1])
    if feats_store is not None:
        feats_store[true_id] = feats



In [53]:
def add_ling_features(row):
    """Collect the four feature dicts for ``row`` into one prefixed Series.

    Features are looked up by ``str(row.name)`` and prefixed with ``q_``
    (question), ``l_`` (last question), ``r_`` (response) and ``e_`` (explanation).
    """
    index = str(row.name)
    prefixed_sources = (
        ('q_', question_ling_feats),
        ('l_', question_last_ling_feats),
        ('r_', response_ling_feats),
        ('e_', explanation_ling_feats),
    )
    merged_dict = {}
    for prefix, feats_by_id in prefixed_sources:
        merged_dict.update(update_dict_with_prefix(feats_by_id[index], prefix))
    return pd.Series(merged_dict)

def update_dict_with_prefix(old_dict, prefix):
    """Return a new dict whose keys are ``prefix + key`` for every key of ``old_dict``.

    Values are carried over unchanged, key order is preserved, and
    ``old_dict`` itself is not modified.
    """
    return {prefix + key: value for key, value in old_dict.items()}

In [54]:
# Build one row of prefixed linguistic features for every row of `expanded_df`.
ling_df = expanded_df.apply(add_ling_features, axis=1)
ling_columns = ling_df.columns


In [55]:
# Place the linguistic feature columns next to the original columns (aligned on the row index).
expanded_ling_df = pd.concat(objs=[expanded_df, ling_df], axis='columns')

In [56]:
# Attach `gold_coarse_labels` from `gold_df`, joining on `qa_index` (default inner join).
expanded_ling_df = expanded_ling_df.merge(
    gold_df[['gold_coarse_labels', 'qa_index']],
    on=['qa_index'],
)

In [57]:
# Each value of `gold_coarse_label` is mapped to an (act, intent) pair;
# the two lookup dicts below are derived from this single table.
_label_to_act_and_intent = {
    'answer': ('answer', 'direct'),
    'answer_overans-sway': ('answer', 'overans'),
    'shift-dodge': ('shift', 'dodge'),
    'shift-correct': ('shift', 'correct'),
    'cant-answer-lying': ('cant-answer', 'lying'),
    'cant-answer-sincere': ('cant-answer', 'sincere'),
}
label_to_act = {name: pair[0] for name, pair in _label_to_act_and_intent.items()}
label_to_intent = {name: pair[1] for name, pair in _label_to_act_and_intent.items()}

# Labels missing from the tables map to NaN.
expanded_ling_df['gold_act'] = expanded_ling_df['gold_coarse_label'].map(label_to_act)
expanded_ling_df['gold_intent'] = expanded_ling_df['gold_coarse_label'].map(label_to_intent)



In [58]:
# Persist the expanded frame with all feature columns as a comma-separated file (index omitted).
expanded_ling_df.to_csv(
    '../../data/gold/gold_full_data/expanded_with_features_annotated_questions_responses_gold.csv',
    sep=',',
    index=False,
)

In [59]:
def add_feature(row, feature_name):
    """Look up `feature_name` for this row's `qa_index` in `expanded_ling_df`.

    For feature names containing 'tokenized', the value of the first matching
    row is returned as-is. For every other feature the distinct values across
    the matching rows are collected: a single distinct value is returned
    directly, several are returned as a tuple.

    Raises IndexError when no row of `expanded_ling_df` has this `qa_index`.
    """
    same_pair = expanded_ling_df['qa_index'] == row['qa_index']
    matches = expanded_ling_df.loc[same_pair, feature_name]
    if 'tokenized' in feature_name:
        return matches.values[0]
    distinct = matches.unique()
    return tuple(distinct) if len(distinct) > 1 else distinct[0]

In [60]:
# Work out which columns of `expanded_ling_df` still have to be copied onto `gold_df`.
ling_columns = set(expanded_ling_df.columns)
gold_columns = set(gold_df.columns)
# Walk the columns in dataframe order instead of iterating a set difference:
# the iteration order of a set of strings changes between interpreter sessions
# (string hash randomisation), which made the order in which columns are added
# to `gold_df` -- and therefore the column order of the exported file -- vary
# from run to run. `dict.fromkeys` drops duplicate names while keeping order.
# Columns starting with 'gold_' are skipped unless their name contains 'tokenized'.
new_columns = [
    col
    for col in dict.fromkeys(expanded_ling_df.columns)
    if col not in gold_columns
    and (not col.startswith('gold_') or 'tokenized' in col)
]


In [61]:
# Copy every new feature column onto `gold_df`, computing one value per row with `add_feature`.
for column in new_columns:
    gold_df[column] = gold_df.apply(lambda row: add_feature(row, column), axis=1)

## Get some statistics on the dataset

In [62]:
# Column totals over `gold_df`, printed one per line as "<caption> <sum>".
_totals_to_report = (
    ('Q total sents:', 'q_text_num_sents'),
    ('Q total tokens:', 'q_text_len'),
    ('R total sents:', 'r_text_num_sents'),
    ('R total tokens:', 'r_text_len'),
)
for _caption, _total_column in _totals_to_report:
    print(_caption, gold_df[_total_column].sum())

Q total sents: 4096
Q total tokens: 82582
R total sents: 2634
R total tokens: 48831


In [63]:
# Mean number of distinct `q_speaker` values per `hearing_id`.
speakers = (
    expanded_ling_df
    .groupby(['hearing_id'])['q_speaker']
    .unique()
    .values
)
num_speakers = list(map(len, speakers))
np.mean(num_speakers)

8.15

In [64]:
# Number of distinct `gold_worker` values for each `hearing_id`.
workers = (
    expanded_ling_df
    .groupby(['hearing_id'])['gold_worker']
    .unique()
)
num_workers = list(map(len, workers))
num_workers

[22, 18, 19, 17, 13, 13, 28, 19, 9, 24, 16, 20, 23, 29, 15, 18, 19, 11, 18, 17]

### Create label powerset

In [65]:
# Count how often each distinct `gold_label_set` value occurs.
counts=gold_df.sort_values(by=['qa_index'])['gold_label_set'].value_counts()
# Order the label sets by descending count, breaking ties by the label-set value
# itself (np.lexsort treats its LAST key as the primary sort key).
label_sets = counts.iloc[np.lexsort((counts.index, -counts.values))].index.values
# Number the ordered label sets '0', '1', ... as strings, so '0' is the most frequent set.
label_nums = list(map(str, list(range(len(label_sets)))))
label_set_dict=dict(zip(label_sets,label_nums))

# One-hot encoder whose category list is fixed to the string ids above.
# NOTE(review): `sparse=False` is the older keyword for requesting dense output;
# newer scikit-learn releases call it `sparse_output` -- confirm against the installed version.
set_labels = label_nums
enc_set = preprocessing.OneHotEncoder(sparse=False,categories=[set_labels])

# Map each row's index label to the id of that row's label set.
index_to_powerset = {}
for row in gold_df.sort_values(by=['qa_index']).itertuples():
    label = label_set_dict[row.gold_label_set]
    index_to_powerset[row.Index]=label
# Turn the mapping into a one-column frame and join it back onto `gold_df` by index.
# NOTE(review): this rebinds `gold_df`; re-running the cell without rebuilding `gold_df`
# would merge a second `gold_label_powerset` column -- confirm before re-executing.
powerset_df = pd.DataFrame.from_dict(index_to_powerset,orient='index',columns=['gold_label_powerset'])
gold_df = gold_df.merge(powerset_df, left_index=True, right_index=True)
gold_df['gold_label_powerset'] = gold_df['gold_label_powerset'].astype(str)
# One-hot encode the ids and store each encoded row as a string of 0/1 digits.
gold_df['gold_label_powerset_binary'] = [''.join(map(str, map(int,label_set))) for label_set in enc_set.fit_transform(gold_df['gold_label_powerset'].values.reshape(-1, 1))]

In [66]:
# Export the label-set -> powerset-id mapping as a CSV with a header row (the label
# sets) followed by a single data row (their ids).
# `newline=''` is required for files handed to the csv module: without it, platforms
# that translate '\n' on write produce '\r\r\n' line endings (blank lines between rows).
# NOTE(review): the path is absolute and machine-specific; other cells write to the
# relative '../../data/gold/gold_full_data/' directory -- confirm and unify.
with open('/Users/elisa/Documents/CompLing/congressional_hearing/data/gold/gold_full_data/labels_to_powersets.csv', 'w', newline='') as f:
    w = csv.DictWriter(f, fieldnames=list(label_set_dict))
    w.writeheader()
    w.writerow(label_set_dict)

In [67]:
# Write the gold frame, ordered by `qa_index`, as a tab-separated file (index included).
gold_sorted_by_qa_index = gold_df.sort_values(by=['qa_index'])
gold_sorted_by_qa_index.to_csv(
    '/Users/elisa/Documents/CompLing/congressional_hearing/data/gold/gold_full_data/with_features_annotated_questions_responses_gold.tsv',
    sep='\t',
)