# Conversations gone awry, Wikipedia version

In [15]:
from convokit import Corpus, download
import spacy
import pickle
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))
reddit_corpus = Corpus(filename=download('conversations-gone-awry-cmv-corpus'))

# Load liwc_dic
with open('liwc_dic.pkl', 'rb') as handle:
    liwc_dic = pickle.load(handle)

Dataset already exists at C:\Users\ewais\.convokit\downloads\conversations-gone-awry-corpus
Dataset already exists at C:\Users\ewais\.convokit\downloads\conversations-gone-awry-cmv-corpus


In [16]:
conv = corpus.get_conversation('146743638.12652.12652')
print(conv)

Conversation('id': '146743638.12652.12652', 'utterances': ['146743638.12652.12652', '146743638.12667.12652', '146842219.12874.12874', '146860774.13072.13072'], 'meta': {'page_title': 'User talk:2005', 'page_id': 1003212, 'pair_id': '143890867.11926.11926', 'conversation_has_personal_attack': False, 'verified': True, 'pair_verified': True, 'annotation_year': '2018', 'split': 'train'})


In [17]:
corpus.get_utterance('146860774.13072.13072').text

"I see what you saying I just read his pokerstars profile, It struck me when I saw the change because I remember him being called Bill when I watched the last season of high stakes poker, But you seem to have many more years experience in the Poker/Gambling world then I do(I'm still a bit of a newbie), so I wanted to check with you first. BTW as far as the WPT, I was thinking nine that made up final and 6 for the tv table, I read this article that say the WPT Final table is made up of 10 players, with the final six that make it on TV,  I just want to be sure that they are correct, when I update the players infobox stats, thanks ▪◦▪ "

In [18]:
corpus.print_summary_stats()

Number of Speakers: 8069
Number of Utterances: 30021
Number of Conversations: 4188


In [19]:
utt = corpus.random_utterance()
print(utt.speaker.id)

SitNGo


In [20]:
convo = corpus.random_conversation()
print(convo)
for utt in convo.get_utterance_ids():
    print(corpus.get_utterance[utt])

Conversation('id': '100646530.28762.28762', 'utterances': ['100646530.28800.28762', '100646530.28762.28762', '100783978.28875.28875', '103509566.29831.29831', '103517246.30257.30257', '103518966.30257.30257'], 'meta': {'page_title': 'Talk:1996 United States campaign finance controversy', 'page_id': 3079042, 'pair_id': '86036261.21021.21021', 'conversation_has_personal_attack': True, 'verified': False, 'pair_verified': False, 'annotation_year': '2019', 'split': 'test'})


TypeError: 'method' object is not subscriptable

In [21]:
paths = convo.get_longest_paths()
for path in paths:
    print([utt.id for utt in path])
    print([utt.get_speaker().id for utt in path])
    print([corpus.get_utterance(utt.id).retrieve_meta('comment_has_personal_attack') for utt in path])

['100646530.28762.28762', '100646530.28800.28762', '100783978.28875.28875', '103509566.29831.29831', '103517246.30257.30257']
['Mastgrr', 'Mastgrr', 'Jayzel68', 'Derex', 'Will Beback']
[False, False, False, False, False]
['100646530.28762.28762', '100646530.28800.28762', '100783978.28875.28875', '103509566.29831.29831', '103518966.30257.30257']
['Mastgrr', 'Mastgrr', 'Jayzel68', 'Derex', 'Jayzel68']
[False, False, False, False, True]


## Utterance features
- **id**: index of the utterance
- **conversation_id**: id of the first utterance in the converstaion this utterance belongs to
- **reply-to**: index of the utterance to which this utterance replies to (None if not a reply)
- **speaker**: the speaker who authored the utterance
- **timestamp**: timestamp of utterance
- **text**: textual content of the utterance
- **meta**: metadata for each utterance
    - **is_section_header**: whether the utterance is a conversation "title" or "subject" (if true, the utterance should be ignored)
    - **comment_has_personal_attack**: whether this comment was judged by 3 crowdsourced annotators to contain a personal comment_has_personal_attack
    - **parsed**: SpaCy parsed version of the utterance text
        - **rt**: ??
        - **toks**: List of parsed tokens
            - **tok**: the token (word, punctuation, etc.)
            - **tag**: Detailed part of speech tag
            - **dep**: syntactic dependency, i.e. the relation between the tokens
            - **up**: list related to dn, not sure how
            - **dn**: list related to up, not sure how

## Conversation features

- **id**: id of the conversation
- **utterances**: ids of utterances in the conversation (in order I presume)
- **meta**: conversation metadata
    - **page_title**: title of page under which conversation is occurring
    - **page_id**: unique numerical id of the talk page
    - **pair_id**: the id of the conversation that this comment's conversation is paired with
    - **conversation_has_personal_attack**: whether any comment in this comment's conversation contains a personal attack
    - **verified**: whether the personal attack label has been verified by an internal annotator
    - **pair_verified**: whether the personal attack label has been double checked by the internal annotator
    - **annotation_year**: self explanatory
    - **split**: (train, test, or val) whether this conversation was used as train, test, or val in "Trouble on the Horizon"


First, we want to find the conversations that are easy to analyze, i.e. have a structure like (a -> b -> a -> b -> ...). detect_interlocution should reveal those conversations

In [30]:
# We want to consider conversations with a call-reply structure between two speakers, having at least five utterances
def detect_interlocution(conv, min_utts, print=False):
    '''
    Finds whether the conversation has a call-reply structure between two speakers with at least min_utts utterances

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
        > min_utts - the minimum number of utterances that constitute a valid conversation
        > print - whether or not to print why the conversation was rejected
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > bool representing whether or not the conversation's longest path has the aforementioned structure
    '''
    # At the moment, only considering first longest path if there are multiple # TODO: Add functionality to examine all paths
    try:
        longest_path = conv.get_longest_paths()[0]
    except ValueError as v:
        if print:
            print(v)
            print('skipping...')
        return False
    
    if len(longest_path) < min_utts:
        if print:
            print('Less than {} utterances in conversation\nskipping...'.format(min_utts))
        return False

    speakers = []

    for utt in longest_path:
        speakers.append(utt.get_speaker().id)
    
    if len(set(speakers)) > 2:
        if print:
            print('More than 2 speakers in conversation\nskipping...')
        return False

    return True

def get_valid_conv_ids(corpus, exclude_last=True):
    # Determine number of valid conversations
    num_valid = 0
    valid_conv_ids = []  # Will hold IDs of all valid converations
    for conv in corpus.iter_conversations():
        if detect_interlocution(conv, 5):
            valid_conv_ids.append(conv.id)
            num_valid += 1
    
    # Remove invalid
    invalids = []
    for j in range(len(valid_conv_ids)):
        speakers, utts = get_speaker_utt_lists(corpus.get_conversation(valid_conv_ids[j]))
        try:
            pairs = get_pairs(speakers, utts)
        except:
            print('Error at index {}, removing'.format(j))
            num_valid -= 1
            invalids.append(j)
            

        # Remove the last utterance which contains the personal attack (or lack thereof)
        if exclude_last:
            last_utt = utts[-1]
            for i in range(len(pairs['a_b'])):
                if last_utt in pairs['a_b'][i]:
                    del pairs['a_b'][i]
                
            for i in range(len(pairs['b_a'])):
                if last_utt in pairs['b_a'][i]:
                    del pairs['b_a'][i]

        # Get markers from speaker a to b
        # Note the order of a_b switched to b_a here. This is to be consistent with
        # the notation of C(b,a) indicating the coordination of b to a
        elicit_b_a = initialize_dict()
        baseline_b_a = initialize_dict()
        for a_b in pairs['a_b']:
            u_a  = corpus.get_utterance(a_b[0])
            u_b = corpus.get_utterance(a_b[1])
            m_u_a = get_style_markers(u_a)
            m_u_b = get_style_markers(u_b)
            for k in m_u_a:
                if m_u_a[k]:
                    if m_u_a[k] == m_u_b[k]:  # If b responded to a with same style marker
                        elicit_b_a[k] += 1
                baseline_b_a[k] += m_u_b[k] # b's response contains m regardless of a's prompt
        
        # Get markers from speaker b to a
        elicit_a_b = initialize_dict()
        baseline_a_b = initialize_dict()
        for b_a in pairs['b_a']:
            u_b  = corpus.get_utterance(b_a[0])
            u_a = corpus.get_utterance(b_a[1])
            m_u_a = get_style_markers(u_a)
            m_u_b = get_style_markers(u_b)
            for k in m_u_b:  
                if m_u_b[k]:
                    if m_u_b[k] == m_u_a[k]:  # If a responded to b with same style marker
                        elicit_a_b[k] += 1
                baseline_a_b[k] += m_u_a[k] # If a's response contains m regardless of b's prompt
        
        
        # Convert to probabilities, preserving raw baselines for LSM calculation
        raw_b_a = baseline_b_a.copy()
        raw_a_b = baseline_a_b.copy()
        num_response_b_a = len(pairs['a_b'])  # Number of responses from b to a
        num_response_a_b = len(pairs['b_a'])  # Number of responses from a to b
        # Sometimes there aren't any responses from a to b or from b to a, continue if this is the case
        if not num_response_a_b or not num_response_b_a:
            print('Not enough responses at index {}'.format(j))
            invalids.append(j)
            num_valid -= 1

    for i, inv in enumerate(invalids):
        del valid_conv_ids[inv-i]

    print('({}/{}) {:.1f}% conversations valid'.format(num_valid, len(corpus.get_conversation_ids()), num_valid*100/len(corpus.get_conversation_ids())))
    return valid_conv_ids


valid_conv_ids = get_valid_conv_ids(corpus)
r_valid_conv_ids = get_valid_conv_ids(reddit_corpus)

Not enough responses at index 15
Not enough responses at index 32
Not enough responses at index 94
Not enough responses at index 95
Not enough responses at index 119
Not enough responses at index 155
Not enough responses at index 157
Not enough responses at index 159
Not enough responses at index 164
Not enough responses at index 174
Not enough responses at index 181
Not enough responses at index 184
Not enough responses at index 192
Not enough responses at index 197
Error at index 210, removing
Error at index 214, removing
Error at index 219, removing
Not enough responses at index 224
Not enough responses at index 234
Not enough responses at index 241
Not enough responses at index 244
Not enough responses at index 251
Not enough responses at index 253
Not enough responses at index 255
Not enough responses at index 257
Not enough responses at index 282
Not enough responses at index 286
Not enough responses at index 287
Not enough responses at index 294
Not enough responses at index 300

How to get a single conversation from list of valid IDs

In [33]:
with open('r_valid_conv_ids.pkl', 'wb') as f:
    pickle.dump(r_valid_conv_ids, f)
    f.close()

In [7]:
corpus.get_conversation(valid_conv_ids[4])

Conversation({'obj_type': 'conversation', 'meta': {'page_title': 'User talk:AngryParsley', 'page_id': 1282978, 'pair_id': '12941035.584.584', 'conversation_has_personal_attack': False, 'verified': True, 'pair_verified': True, 'annotation_year': '2018', 'split': 'train'}, 'vectors': [], 'tree': <convokit.model.utteranceNode.UtteranceNode object at 0x000001919C8E8910>, 'owner': <convokit.model.corpus.Corpus object at 0x000001919BF3E970>, 'id': '12451425.436.436'})

Now that we have a list of valid conversations, let's find a way to isolate the utterances in a way that we can easily analyze for style accommodation

In [23]:
def get_speaker_utt_lists(conv):
    '''
    Generates lists of speaker IDs corresponding to utterances in conv, and gets utterances

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > speakers - list of speakers corresponding to each utterance
        > utts - list of utterances from conversation
    '''
    # Recall, we use the first index longest path in our list of valid conversations
    longest_path = conv.get_longest_paths()[0]

    speakers = []
    utts = []

    for utt in longest_path:
        utts.append(utt.id)
        speakers.append(utt.get_speaker().id)

    return speakers, utts

def get_pairs(speakers, utts):
    '''
    Generates a dictionary of pairs of utterances, each pair representing a back and forth interaction

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > speakers - list of speakers corresponding to each utterance
        > utts - list of utterances from conversation
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > pairs - dictionary with the following structure
            a : ID of speaker a
            b : ID of speaker b
            a_b : [(tuple of 2 utterance IDs, first being from speaker a and second from speaker b), (...), ...]
            b_a : [(tuple of 2 utterance IDs, first being from speaker b and second from speaker a), (...), ...]
    '''
    # TODO: Account for instances where a speaker speaks multiple times in a row. Combine those into a list of utteranes within the tuples
    pairs = {
        'a' : list(set(speakers))[0],
        'b' : list(set(speakers))[1],
        'a_b' : [],
        'b_a' : []
    }

    # We'll say speaker a is the first speaker, and speaker b is the second.False
    speaker_shift = [1 if speakers[i] != speakers[i-1] else 0 for i in range(1, len(speakers))]
    speaker_shift.insert(0,0) # Prepend 0 (first utterance isn't a response)

    for i in range(1, len(speakers)):
       if speakers[i] == pairs['b'] and speakers[i-1] == pairs['a']:
           pairs['a_b'].append((utts[i-1], utts[i]))
       elif speakers[i] == pairs['a'] and speakers[i-1] == pairs['b']:
           pairs['b_a'].append((utts[i-1], utts[i]))

    return pairs

speakers, utts = get_speaker_utt_lists(corpus.get_conversation(valid_conv_ids[3]))
get_pairs(speakers, utts)

NameError: name 'valid_conv_ids' is not defined

To measure style accommodation we have to measure the style markers in each utterance. That's what this following function is for.

see https://spacy.io/models/en

In [9]:
spacy.explain('IN')

'conjunction, subordinating or preposition'

## spaCy tags


| spaCy tag 	| our tag 	| intended definition 	| actual                                   	|
|-----------	|---------	|---------------------	|------------------------------------------	|
| PRP       	| ppron   	| personal pronoun    	| personal pronoun                         	|
|           	| ipron   	| impersonal pronoun  	|                                          	|
|           	| article 	| article             	|                                          	|
| CC        	| conj    	| conjunction         	| coordinating conjunction                 	|
| IN        	| prep    	| preposition         	| subordinating or preposition conjunction 	|
| MD        	| auxverb 	| auxiliary verb      	| modal auxiliary verb                     	|
| RB        	| adverb  	| common adverb       	| adverb                                   	|
|           	| negate  	| negation            	|                                          	|
|           	| quant   	| quantifier          	|                                          	|


In [25]:
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

def get_style_markers(utt):
    '''
    Returns a dictionary containing the number of style markers in an utterance

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > utt - a single utterance
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > m - dictionary with the following key value pairs
            ppron : # personal pronouns
            ipron : # impersonal pronouns
            article : # articles
            conj : # conjunctions
            prep : # prepositions
            auxverb : # auxiliary verbs
            adverb : # common adverbs
            negate : # negations
            quant : # quantifiers
    '''
    m = {
        'ppron' : 0,
        'ipron' : 0,
        'article' : 0,
        'conj' : 0,
        'prep' : 0,
        'auxverb' : 0,
        'adverb' : 0,
        'negate' : 0,
        'quant' : 0
    }

    # Tokenize text
    text = utt.text.lower().split()

    # Analyze
    for word in text:
        if word in liwc_dic.keys():
            m[liwc_dic[word]] += 1
    
    # Convert to boolean
    for k in m.keys():
        if m[k]:
            m[k] = 1

    return m

speakers, utts = get_speaker_utt_lists(corpus.get_conversation(valid_conv_ids[3]))
print(set(speakers))
get_style_markers(corpus.get_utterance(utts[1]))

NameError: name 'valid_conv_ids' is not defined

In [29]:
def initialize_dict():
    return {
        'ppron' : 0,
        'ipron' : 0,
        'article' : 0,
        'conj' : 0,
        'prep' : 0,
        'auxverb' : 0,
        'adverb' : 0,
        'negate' : 0,
        'quant' : 0
    }

def wiki_measure_coordination(conv, corpus, exclude_last, print_output=False):
    '''
    Assumes the converation will only have two speakers

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
        > print_output - whether to print medial variables
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > C - dictionary with following key value pairs
            convID : ID of conversation
            a : ID of speaker a
            b : ID of speaker b
            num_response_b_a : number of responses from b to a
            num_response_a_b : number of responses from a to b
            C_b_a : dictrionary of asymmetric accomodation from speaker b to speaker a
            C_a_b : dictionary of asymmetric accomodation from speaker a to speaker b
            LSM : dictionary of symmetric accomodation between both speakers
            mean_C_b_a : average accomodation from b towards a across valid markers
            mean_C_a_b : average accomodation from a towards b across valid markers
            mean_LSM : average of symmetric accommodation
            valid_markers : list of valid markers
    '''
    # ~~~~~~~~~~~ VARIABLES ~~~~~~~~~~~
    #     > pairs - dictionary containing interlocution information
    #     > raw_b_a - number of style markers used in all responses from b to a
    #     > raw_a_b - number of style markers used in all responses from a to b
    #     > baseline_b_a - probability of style markers in b's response to a
    #     > baseline_a_b - probability of style markers in a's response to b
    #     > elicit_b_a - probability of style markers in b's response to a given a exhibited the same marker
    #     > elicit_a_b - probability of style markers in a's response to a given b exhibited the same marker
    
    speakers, utts = get_speaker_utt_lists(conv)
    try:
        pairs = get_pairs(speakers, utts)
    except IndexError as err:
        print(speakers)
        print(utts)
        print(err)
        print('Error!')
        return None
    personal_attack = corpus.get_utterance(utts[-1]).retrieve_meta('comment_has_personal_attack')

    # Remove the last utterance which contains the personal attack (or lack thereof)
    if exclude_last:
        last_utt = utts[-1]
        for i in range(len(pairs['a_b'])):
            if last_utt in pairs['a_b'][i]:
                del pairs['a_b'][i]
            
        for i in range(len(pairs['b_a'])):
            if last_utt in pairs['b_a'][i]:
                del pairs['b_a'][i]

    # Get markers from speaker a to b
    # Note the order of a_b switched to b_a here. This is to be consistent with
    # the notation of C(b,a) indicating the coordination of b to a
    elicit_b_a = initialize_dict()
    baseline_b_a = initialize_dict()
    for a_b in pairs['a_b']:
        u_a  = corpus.get_utterance(a_b[0])
        u_b = corpus.get_utterance(a_b[1])
        m_u_a = get_style_markers(u_a)
        m_u_b = get_style_markers(u_b)
        for k in m_u_a:
            if m_u_a[k]:
                if m_u_a[k] == m_u_b[k]:  # If b responded to a with same style marker
                    elicit_b_a[k] += 1
            baseline_b_a[k] += m_u_b[k] # b's response contains m regardless of a's prompt
    
    # Get markers from speaker b to a
    elicit_a_b = initialize_dict()
    baseline_a_b = initialize_dict()
    for b_a in pairs['b_a']:
        u_b  = corpus.get_utterance(b_a[0])
        u_a = corpus.get_utterance(b_a[1])
        m_u_a = get_style_markers(u_a)
        m_u_b = get_style_markers(u_b)
        for k in m_u_b:  
            if m_u_b[k]:
                if m_u_b[k] == m_u_a[k]:  # If a responded to b with same style marker
                    elicit_a_b[k] += 1
            baseline_a_b[k] += m_u_a[k] # If a's response contains m regardless of b's prompt
    
    
    # Convert to probabilities, preserving raw baselines for LSM calculation
    raw_b_a = baseline_b_a.copy()
    raw_a_b = baseline_a_b.copy()
    num_response_b_a = len(pairs['a_b'])  # Number of responses from b to a
    num_response_a_b = len(pairs['b_a'])  # Number of responses from a to b
    # Sometimes there aren't any responses from a to b or from b to a, continue if this is the case
    if not num_response_a_b or not num_response_b_a:
        print('Only one speaker in conversation, skipping')
        return None
    for k in elicit_a_b.keys():  # Could be any dictionary, they all have the same keys
        elicit_b_a[k] = elicit_b_a[k] / num_response_b_a 
        baseline_b_a[k] = baseline_b_a[k] / num_response_b_a
        elicit_a_b[k] = elicit_a_b[k] / num_response_a_b
        baseline_a_b[k] = baseline_a_b[k] / num_response_a_b

    # Determine asymmetric and symmetric accomodation
    C_b_a = initialize_dict() # Accomodation of b towards a
    C_a_b = initialize_dict() # Accomodation of a towards b
    LSM = initialize_dict()
    for k in C_b_a.keys():
        if baseline_b_a[k] and baseline_a_b[k]:  # If a and b both exhibited marker m at some point
            C_b_a[k] = baseline_b_a[k] - elicit_b_a[k]
            C_a_b[k] = baseline_a_b[k] - elicit_a_b[k]
            LSM[k] = 1 - abs(raw_a_b[k] - raw_b_a[k]) / (raw_a_b[k] + raw_b_a[k] + 0.0001)
        else:                                    # Else, the metric is undefined for marker m
            C_b_a[k] = None  # Set to None if there is no data
            C_a_b[k] = None
            LSM[k] = None

    # Get averages across asymmetric measure
    valid_markers = []
    mean_C_b_a = 0
    mean_C_a_b = 0
    mean_LSM = 0
    for k in C_b_a.keys():
        if C_b_a[k] is not None:
            mean_C_b_a += C_b_a[k]
            mean_C_a_b += C_a_b[k]
            mean_LSM += LSM[k]
            valid_markers.append(k)
    if valid_markers:
        mean_C_b_a /= len(valid_markers)
        mean_C_a_b /= len(valid_markers)
        mean_LSM /= len(valid_markers)

    # Construct dictionary to return
    C = {
        'convID' : conv.id,
        'a' : pairs['a'],
        'b' : pairs['b'],
        'num_response_b_a' : len(pairs['b_a']),
        'num_response_a_b' : len(pairs['a_b']),
        'C_b_a' : C_b_a,
        'C_a_b' : C_a_b,
        'LSM' : LSM,
        'mean_C_b_a' : mean_C_b_a,
        'mean_C_a_b' : mean_C_a_b,
        'mean_LSM' : mean_LSM,
        'valid_markers' : valid_markers,
        'corpus' : 'wikipedia',
        'personal_attack' : personal_attack
    } 

    if print_output:
        print('pairs: ', pairs)
        print('\nraw_b_a: ', raw_b_a)
        print('raw_a_b: ', raw_a_b)
        print('\nelicit_b_a: ', elicit_b_a)
        print('elicit_a_b: ', elicit_a_b)
        print('\nbaseline_b_a: ', baseline_b_a)
        print('baseline_a_b: ', baseline_a_b)
        print('\nC_b_a: ', C_b_a)
        print('C_a_b: ', C_a_b)
        print('\nLSM: ', LSM)
        print('\nmean_C_b_a: ', mean_C_b_a)
        print('mean_C_a_b: ', mean_C_a_b)
        
    return C

# for i in range(len(valid_conv_ids)):
#     conv = corpus.get_conversation(valid_conv_ids[i])
#     C = wiki_measure_coordination(conv, corpus = corpus, exclude_last = True, print_output=False)

conv = corpus.get_conversation(valid_conv_ids[10])
C = wiki_measure_coordination(conv, corpus = corpus, exclude_last=True, print_output=False)

print_coordination(C)

NameError: name 'valid_conv_ids' is not defined

In [26]:
def reddit_measure_coordination(conv, corpus, print_output=False):
    '''
    Assumes the converation will only have two speakers

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
        > print_output - whether to print medial variables
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > C - dictionary with following key value pairs
            convID : ID of conversation
            a : ID of speaker a
            b : ID of speaker b
            num_response_b_a : number of responses from b to a
            num_response_a_b : number of responses from a to b
            C_b_a : dictrionary of asymmetric accomodation from speaker b to speaker a
            C_a_b : dictionary of asymmetric accomodation from speaker a to speaker b
            LSM : dictionary of symmetric accomodation between both speakers
            mean_C_b_a : average accomodation from b towards a across valid markers
            mean_C_a_b : average accomodation from a towards b across valid markers
            mean_LSM : average of symmetric accommodation
            valid_markers : list of valid markers
    '''
    # ~~~~~~~~~~~ VARIABLES ~~~~~~~~~~~
    #     > pairs - dictionary containing interlocution information
    #     > raw_b_a - number of style markers used in all responses from b to a
    #     > raw_a_b - number of style markers used in all responses from a to b
    #     > baseline_b_a - probability of style markers in b's response to a
    #     > baseline_a_b - probability of style markers in a's response to b
    #     > elicit_b_a - probability of style markers in b's response to a given a exhibited the same marker
    #     > elicit_a_b - probability of style markers in a's response to a given b exhibited the same marker
    
    speakers, utts = get_speaker_utt_lists(conv)
    try:
        pairs = get_pairs(speakers, utts)
    except IndexError as err:
        print(speakers)
        print(utts)
        print(err)
        print('Error!')
        return None

    # Get markers from speaker a to b
    # Note the order of a_b switched to b_a here. This is to be consistent with
    # the notation of C(b,a) indicating the coordination of b to a
    elicit_b_a = initialize_dict()
    baseline_b_a = initialize_dict()
    for a_b in pairs['a_b']:
        u_a  = corpus.get_utterance(a_b[0])
        u_b = corpus.get_utterance(a_b[1])
        m_u_a = get_style_markers(u_a)
        m_u_b = get_style_markers(u_b)
        for k in m_u_a:
            if m_u_a[k]:
                if m_u_a[k] == m_u_b[k]:  # If b responded to a with same style marker
                    elicit_b_a[k] += 1
            baseline_b_a[k] += m_u_b[k] # b's response contains m regardless of a's prompt
    
    # Get markers from speaker b to a
    elicit_a_b = initialize_dict()
    baseline_a_b = initialize_dict()
    for b_a in pairs['b_a']:
        u_b  = corpus.get_utterance(b_a[0])
        u_a = corpus.get_utterance(b_a[1])
        m_u_a = get_style_markers(u_a)
        m_u_b = get_style_markers(u_b)
        for k in m_u_b:  
            if m_u_b[k]:
                if m_u_b[k] == m_u_a[k]:  # If a responded to b with same style marker
                    elicit_a_b[k] += 1
            baseline_a_b[k] += m_u_a[k] # If a's response contains m regardless of b's prompt
    
    
    # Convert to probabilities, preserving raw baselines for LSM calculation
    raw_b_a = baseline_b_a.copy()
    raw_a_b = baseline_a_b.copy()
    num_response_b_a = len(pairs['a_b'])  # Number of responses from b to a
    num_response_a_b = len(pairs['b_a'])  # Number of responses from a to b
    # Sometimes there aren't any responses from a to b or from b to a, continue if this is the case
    if not num_response_a_b or not num_response_b_a:
        print('Only one speaker in conversation, skipping')
        return None
    for k in elicit_a_b.keys():  # Could be any dictionary, they all have the same keys
        elicit_b_a[k] = elicit_b_a[k] / num_response_b_a 
        baseline_b_a[k] = baseline_b_a[k] / num_response_b_a
        elicit_a_b[k] = elicit_a_b[k] / num_response_a_b
        baseline_a_b[k] = baseline_a_b[k] / num_response_a_b

    # Determine asymmetric and symmetric accomodation
    C_b_a = initialize_dict() # Accomodation of b towards a
    C_a_b = initialize_dict() # Accomodation of a towards b
    LSM = initialize_dict()
    for k in C_b_a.keys():
        if baseline_b_a[k] and baseline_a_b[k]:  # If a and b both exhibited marker m at some point
            C_b_a[k] = baseline_b_a[k] - elicit_b_a[k]
            C_a_b[k] = baseline_a_b[k] - elicit_a_b[k]
            LSM[k] = 1 - abs(raw_a_b[k] - raw_b_a[k]) / (raw_a_b[k] + raw_b_a[k] + 0.0001)
        else:                                    # Else, the metric is undefined for marker m
            C_b_a[k] = None  # Set to None if there is no data
            C_a_b[k] = None
            LSM[k] = None

    # Get averages across asymmetric measure
    valid_markers = []
    mean_C_b_a = 0
    mean_C_a_b = 0
    mean_LSM = 0
    for k in C_b_a.keys():
        if C_b_a[k] is not None:
            mean_C_b_a += C_b_a[k]
            mean_C_a_b += C_a_b[k]
            mean_LSM += LSM[k]
            valid_markers.append(k)
    if valid_markers:
        mean_C_b_a /= len(valid_markers)
        mean_C_a_b /= len(valid_markers)
        mean_LSM /= len(valid_markers)

    # Construct dictionary to return
    C = {
        'convID' : conv.id,
        'a' : pairs['a'],
        'b' : pairs['b'],
        'num_response_b_a' : len(pairs['b_a']),
        'num_response_a_b' : len(pairs['a_b']),
        'C_b_a' : C_b_a,
        'C_a_b' : C_a_b,
        'LSM' : LSM,
        'mean_C_b_a' : mean_C_b_a,
        'mean_C_a_b' : mean_C_a_b,
        'mean_LSM' : mean_LSM,
        'valid_markers' : valid_markers,
        'corpus' : 'reddit',
        'personal_attack' : conv.meta['has_removed_comment']
    } 
    
    if print_output:
        print('pairs: ', pairs)
        print('\nraw_b_a: ', raw_b_a)
        print('raw_a_b: ', raw_a_b)
        print('\nelicit_b_a: ', elicit_b_a)
        print('elicit_a_b: ', elicit_a_b)
        print('\nbaseline_b_a: ', baseline_b_a)
        print('baseline_a_b: ', baseline_a_b)
        print('\nC_b_a: ', C_b_a)
        print('C_a_b: ', C_a_b)
        print('\nLSM: ', LSM)
        print('\nmean_C_b_a: ', mean_C_b_a)
        print('mean_C_a_b: ', mean_C_a_b)
        
    return C

for i in range(len(r_valid_conv_ids)):
    conv = reddit_corpus.get_conversation(r_valid_conv_ids[i])
    C = reddit_measure_coordination(conv, corpus = reddit_corpus, print_output=False)


NameError: name 'r_valid_conv_ids' is not defined

In [27]:
def print_coordination(C):
    '''
    Prints a coordination dictionary (output from measure_coordination) legibly
    '''
    for k in C.keys():
        if isinstance(C[k], dict):
            print('\n~~ {} ~~'.format(k))
            for m in C[k].keys():
                if C[k][m] is not None:
                    print('     {} : {:.2f}'.format(m, C[k][m]))
                else:
                    print('     {} : None'.format(m))
            if k == 'LSM':
                print('\n')
        else:
            print('{} : {}'.format(k, C[k]))

print_coordination(C)

NameError: name 'C' is not defined