# Conversations gone awry, Wikipedia version

In [39]:
corpus = Corpus(filename=download("conversations-gone-awry-cmv-corpus"))

Dataset already exists at /home/gaoag/.convokit/downloads/conversations-gone-awry-cmv-corpus


In [40]:
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

Dataset already exists at /home/gaoag/.convokit/downloads/conversations-gone-awry-corpus


In [15]:
from convokit import Corpus, download
import spacy
import pickle
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

# Load liwc_dic
with open('liwc_dic.pkl', 'rb') as handle:
    liwc_dic = pickle.load(handle)

Dataset already exists at /home/gaoag/.convokit/downloads/conversations-gone-awry-corpus


In [2]:
corpus.print_summary_stats()

Number of Speakers: 8069
Number of Utterances: 30021
Number of Conversations: 4188


In [3]:
utt = corpus.random_utterance()
print(utt.speaker.id)

Dposse


In [4]:
convo = corpus.random_conversation()
print(convo)

Conversation('id': '103071967.14294.14294', 'utterances': ['103071967.14294.14294', '103071967.14308.14294', '103085389.14715.14715', '103087243.14786.14786'], 'meta': {'page_title': 'Template talk:Summary of casualties of the Iraq War', 'page_id': 2877154, 'pair_id': '25213805.0.0', 'conversation_has_personal_attack': True, 'verified': True, 'pair_verified': True, 'annotation_year': '2018', 'split': 'val'})


In [5]:
paths = convo.get_longest_paths()
for path in paths:
    print([utt.id for utt in path])
    print([utt.get_speaker().id for utt in path])

['103071967.14294.14294', '103071967.14308.14294', '103085389.14715.14715', '103087243.14786.14786']
['Seigfried4220', 'Seigfried4220', 'Timeshifter', 'Seigfried4220']


## Utterance features
- **id**: index of the utterance
- **conversation_id**: id of the first utterance in the converstaion this utterance belongs to
- **reply-to**: index of the utterance to which this utterance replies to (None if not a reply)
- **speaker**: the speaker who authored the utterance
- **timestamp**: timestamp of utterance
- **text**: textual content of the utterance
- **meta**: metadata for each utterance
    - **is_section_header**: whether the utterance is a conversation "title" or "subject" (if true, the utterance should be ignored)
    - **comment_has_personal_attack**: whether this comment was judged by 3 crowdsourced annotators to contain a personal comment_has_personal_attack
    - **parsed**: SpaCy parsed version of the utterance text
        - **rt**: ??
        - **toks**: List of parsed tokens
            - **tok**: the token (word, punctuation, etc.)
            - **tag**: Detailed part of speech tag
            - **dep**: syntactic dependency, i.e. the relation between the tokens
            - **up**: list related to dn, not sure how
            - **dn**: list related to up, not sure how

## Conversation features

- **id**: id of the conversation
- **utterances**: ids of utterances in the conversation (in order I presume)
- **meta**: conversation metadata
    - **page_title**: title of page under which conversation is occurring
    - **page_id**: unique numerical id of the talk page
    - **pair_id**: the id of the conversation that this comment's conversation is paired with
    - **conversation_has_personal_attack**: whether any comment in this comment's conversation contains a personal attack
    - **verified**: whether the personal attack label has been verified by an internal annotator
    - **pair_verified**: whether the personal attack label has been double checked by the internal annotator
    - **annotation_year**: self explanatory
    - **split**: (train, test, or val) whether this conversation was used as train, test, or val in "Trouble on the Horizon"


First, we want to find the conversations that are easy to analyze, i.e. have a structure like (a -> b -> a -> b -> ...). detect_interlocution should reveal those conversations

In [6]:
# We want to consider conversations with a call-reply structure between two speakers, having at least five utterances
def detect_interlocution(conv, min_utts, print=False):
    '''
    Finds whether the conversation has a call-reply structure between two speakers with at least min_utts utterances

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
        > min_utts - the minimum number of utterances that constitute a valid conversation
        > print - whether or not to print why the conversation was rejected
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > bool representing whether or not the conversation's longest path has the aforementioned structure
    '''
    # At the moment, only considering first longest path if there are multiple # TODO: Add functionality to examine all paths
    try:
        longest_path = conv.get_longest_paths()[0]
    except ValueError as v:
        if print:
            print(v)
            print('skipping...')
        return False
    
    if len(longest_path) < min_utts:
        if print:
            print('Less than {} utterances in conversation\nskipping...'.format(min_utts))
        return False

    speakers = []

    for utt in longest_path:
        speakers.append(utt.get_speaker().id)
    
    if len(set(speakers)) > 2:
        if print:
            print('More than 2 speakers in conversation\nskipping...')
        return False

    return True

# Determine number of valid conversations
num_valid = 0
valid_conv_ids = []  # Will hold IDs of all valid converations
for conv in corpus.iter_conversations():
    if detect_interlocution(conv, 5):
        valid_conv_ids.append(conv.id)
        num_valid += 1

print('({}/{}) {:.1f}% conversations valid'.format(num_valid, len(corpus.get_conversation_ids()), num_valid*100/len(corpus.get_conversation_ids())))


(634/4188) 15.1% conversations valid


How to get a single conversation from list of valid IDs

In [7]:
corpus.get_conversation(valid_conv_ids[4])

Conversation({'obj_type': 'conversation', 'meta': {'page_title': 'User talk:AngryParsley', 'page_id': 1282978, 'pair_id': '12941035.584.584', 'conversation_has_personal_attack': False, 'verified': True, 'pair_verified': True, 'annotation_year': '2018', 'split': 'train'}, 'vectors': [], 'tree': <convokit.model.utteranceNode.UtteranceNode object at 0x7f56aa75a940>, 'owner': <convokit.model.corpus.Corpus object at 0x7f570d3d7370>, 'id': '12451425.436.436'})

Now that we have a list of valid conversations, let's find a way to isolate the utterances in a way that we can easily analyze for style accommodation

In [8]:
def get_speaker_utt_lists(conv):
    '''
    Generates lists of speaker IDs corresponding to utterances in conv, and gets utterances

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > speakers - list of speakers corresponding to each utterance
        > utts - list of utterances from conversation
    '''
    # Recall, we use the first index longest path in our list of valid conversations
    longest_path = conv.get_longest_paths()[0]

    speakers = []
    utts = []

    for utt in longest_path:
        utts.append(utt.id)
        speakers.append(utt.get_speaker().id)

    return speakers, utts

def get_pairs(speakers, utts):
    '''
    Generates a dictionary of pairs of utterances, each pair representing a back and forth interaction

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > speakers - list of speakers corresponding to each utterance
        > utts - list of utterances from conversation
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > pairs - dictionary with the following structure
            a : ID of speaker a
            b : ID of speaker b
            a_b : [(tuple of 2 utterance IDs, first being from speaker a and second from speaker b), (...), ...]
            b_a : [(tuple of 2 utterance IDs, first being from speaker b and second from speaker a), (...), ...]
    '''
    # TODO: Account for instances where a speaker speaks multiple times in a row. Combine those into a list of utteranes within the tuples
    pairs = {
        'a' : list(set(speakers))[0],
        'b' : list(set(speakers))[1],
        'a_b' : [],
        'b_a' : []
    }

    # We'll say speaker a is the first speaker, and speaker b is the second.False
    speaker_shift = [1 if speakers[i] != speakers[i-1] else 0 for i in range(1, len(speakers))]
    speaker_shift.insert(0,0) # Prepend 0 (first utterance isn't a response)

    for i in range(1, len(speakers)):
        if speakers[i] == pairs['b'] and speakers[i-1] == pairs['a']:
            pairs['a_b'].append((utts[i-1], utts[i]))
        elif speakers[i] == pairs['a'] and speakers[i-1] == pairs['b']:
            pairs['b_a'].append((utts[i-1], utts[i]))

    return pairs

speakers, utts = get_speaker_utt_lists(corpus.get_conversation(valid_conv_ids[3]))
get_pairs(speakers, utts)

{'a': 'Mike Garcia',
 'b': '66.36.136.123',
 'a_b': [('15832939.3151.3151', '15833000.3256.3256')],
 'b_a': [('15832773.3035.3019', '15832939.3151.3151'),
  ('15833000.3256.3256', '15833036.3275.3275')]}

To measure style accommodation we have to measure the style markers in each utterance. That's what this following function is for.

see https://spacy.io/models/en

In [37]:
conv = corpus.get_conversation(valid_conv_ids[3])

In [38]:
conv

Conversation({'obj_type': 'conversation', 'meta': {'page_title': 'Talk:Mezmerize (album)', 'page_id': 1219116, 'pair_id': '15835089.3408.3408', 'conversation_has_personal_attack': True, 'verified': True, 'pair_verified': True, 'annotation_year': '2018', 'split': 'test'}, 'vectors': [], 'tree': <convokit.model.utteranceNode.UtteranceNode object at 0x7f563eeb24f0>, 'owner': <convokit.model.corpus.Corpus object at 0x7f565789bb80>, 'id': '15832773.3019.3019'})

In [9]:
spacy.explain('IN')

'conjunction, subordinating or preposition'

## spaCy and ARK tags


| ARK tag | spaCy tag 	| our tag 	| intended definition 	| actual                                   	|
|-------- |-----------	|---------	|---------------------	|------------------------------------------	|
| O       | PRP       	| ppron   	| personal pronoun    	| personal pronoun                         	|
|         |           	| ipron   	| impersonal pronoun  	|                                          	|
|         |           	| article 	| article             	|                                          	|
| &       | CC        	| conj    	| conjunction         	| coordinating conjunction                 	|
| P       | IN        	| prep    	| preposition         	| subordinating or preposition conjunction 	|
| V       | MD        	| auxverb 	| auxiliary verb      	| modal auxiliary verb                     	|
| R       | RB        	| adverb  	| common adverb       	| adverb                                   	|
|         |           	| negate  	| negation            	|                                          	|
|         |           	| quant   	| quantifier          	|                                          	|
|N        |             |           | noun                  |                                           |  
|^        |             |           | proper noun           |                                           |
|S        |             |           | nominal+possessive    |                                           |
|Z        |             |           | proper noun+posessive |                                           |
|L        |             |           | nominal+verbal        |                                           |
|M        |             |           | proper noun+verbal    |                                           |
|A        |             |           | adjective             |                                           |
|!        |             |           | interjection          | use                                       |
|D        |             |           | determiner            |                                           |
|T        |             |           | verb particle         |                                           |
|X        |             |           | existential "there"   |                                           |
|Y        |             |           | x+verbal              |                                           |
|#        |             |           | hashtag               |                                           |
|@        |             |           | @ mention             |                                           |
|~        |             |           | discourse marker      | use (can't figure it out though)          |
|U        |             |           | URL or email          |                                           |
|E        |             |           | emoticon              | use                                       |
|$        |             |           | numeral               |                                           |
|,        |             |           | punctuation           | use                                       |
|G        |             |           | garbage/other         |                                           |



In [18]:
from convokit import Corpus, download

In [3]:
corpus = Corpus(filename=download("reddit-corpus-small"))

Dataset already exists at /home/gaoag/.convokit/downloads/reddit-corpus-small


In [4]:
convo = corpus.get_conversation('9c0sn1')

In [10]:
import re

In [28]:
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
import subprocess
nlp = English()
tokenizer = Tokenizer(nlp.vocab)

emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

def get_style_markers(utt):
    '''
    Returns a dictionary containing the number of style markers in an utterance

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > utt - a single utterance
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > m - dictionary with the following key value pairs
            ppron : # personal pronouns
            ipron : # impersonal pronouns
            article : # articles
            conj : # conjunctions
            prep : # prepositions
            auxverb : # auxiliary verbs
            adverb : # common adverbs
            negate : # negations
            quant : # quantifiers
    '''
    m = {
        'ppron' : 0,
        'ipron' : 0,
        'article' : 0,
        'conj' : 0,
        'prep' : 0,
        'auxverb' : 0,
        'adverb' : 0,
        'negate' : 0,
        'quant' : 0
    }

    # Tokenize text
    text = utt.text.lower().split()

    # Analyze using the LIWC keys
    for word in text:
        if word in liwc_dic.keys():
            m[liwc_dic[word]] += 1
    
    # Convert to boolean
    for k in m.keys():
        if m[k]:
            m[k] = 1

    return m

def get_style_markers_extended(utt):
    '''
    Returns a dictionary containing the number of style markers in an utterance

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > utt - a single utterance
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > m - dictionary with the following key value pairs - (outlined in the table above)
        should probably select for ones that are somewhat indicative of style - G and @, for instance, probably don't matter
            
    '''
    m = {
#         'N' : 0,
        'O' : 0,
#         '^' : 0,
#         'S' : 0,
#         'Z' : 0,
        'V' : 0,
#         'L' : 0,
#         'M' : 0,
#         'A' : 0,
        'R' : 0,
        '!' : 0,
#         'D' : 0,
        'P' : 0,
        '&' : 0,
#         'T' : 0,
#         'X' : 0,
#         'Y' : 0,
#         '#' : 0,
#         '@' : 0,
#         '~' : 0,
#         'U' : 0,
        'E' : 0,
#         '$' : 0,
        ',' : 0,
#         'G' : 0,
        'ipron' : 0,
        'article' : 0,
        'negate' : 0,
        'quant' : 0
    }
    
    text_demojied_with_marker = emoji_pattern.sub('wkkemojification', utt.text)
    
   
    # save utt text to file in the appropriate format
    temp_file_obj = open("tempinput.txt", 'w')
    temp_file_obj.write(text_demojied_with_marker)
    temp_file_obj.close()
    
    
    
    # subprocess the file to the java jar; pipe output to a script that returns a wordXPOS dict 
    proc = subprocess.Popen(['../ark-tweet-nlp/runTagger.sh', './tempinput.txt'], stdout=subprocess.PIPE)
#     output = subprocess.check_output(['~/nlp/ark-tweet-nlp/scripts/'], stdin = runtagger_call.stdout)
    while True:
        line = proc.stdout.readline()
        if not line:
            break
        tokens, tags, conf, orig = [l.decode("utf-8") for l in line.split(b'\t')]
        for tok, tag in zip(tokens, tags):
            if tag in m.keys():
                m[tag] += 1
            if tok in liwc_dic.keys() and liwc_dic[tok] in m.keys():
                m[liwc_dic[tok]] += 1
                
    # Convert to boolean
    for k in m.keys():
        if m[k]:
            m[k] = 1

    return m




In [29]:
import csv
import json

In [32]:
def initialize_dict():
    return {
        'ppron' : 0,
        'ipron' : 0,
        'article' : 0,
        'conj' : 0,
        'prep' : 0,
        'auxverb' : 0,
        'adverb' : 0,
        'negate' : 0,
        'quant' : 0
    }


def initialize_dict_extended():
    return  {
#         'N' : 0,
        'O' : 0,
#         '^' : 0,
#         'S' : 0,
#         'Z' : 0,
        'V' : 0,
#         'L' : 0,
#         'M' : 0,
#         'A' : 0,
        'R' : 0,
        '!' : 0,
#         'D' : 0,
        'P' : 0,
        '&' : 0,
#         'T' : 0,
#         'X' : 0,
#         'Y' : 0,
#         '#' : 0,
#         '@' : 0,
#         '~' : 0,
#         'U' : 0,
        'E' : 0,
#         '$' : 0,
        ',' : 0,
#         'G' : 0,
        'ipron' : 0,
        'article' : 0,
        'negate' : 0,
        'quant' : 0
    }
    
    
def measure_coordination(conv, print_output=False, extended=False):
    '''
    Assumes the converation will only have two speakers

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
        > print_output - whether to print medial variables
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > C - dictionary with following key value pairs
            convID : ID of conversation
            a : ID of speaker a
            b : ID of speaker b
            num_response_b_a : number of responses from b to a
            num_response_a_b : number of responses from a to b
            C_b_a : dictrionary of asymmetric accomodation from speaker b to speaker a
            C_a_b : dictionary of asymmetric accomodation from speaker a to speaker b
            LSM : dictionary of symmetric accomodation between both speakers
            mean_C_b_a : average accomodation from b towards a across valid markers
            mean_C_a_b : average accomodation from a towards b across valid markers
            mean_LSM : average of symmetric accommodation
            valid_markers : list of valid markers
    '''
    # ~~~~~~~~~~~ VARIABLES ~~~~~~~~~~~
    #     > pairs - dictionary containing interlocution information
    #     > raw_b_a - number of style markers used in all responses from b to a
    #     > raw_a_b - number of style markers used in all responses from a to b
    #     > baseline_b_a - probability of style markers in b's response to a
    #     > baseline_a_b - probability of style markers in a's response to b
    #     > elicit_b_a - probability of style markers in b's response to a given a exhibited the same marker
    #     > elicit_a_b - probability of style markers in a's response to a given b exhibited the same marker
    
    speakers, utts = get_speaker_utt_lists(conv)
    pairs = get_pairs(speakers, utts)

    # Get markers from speaker a to b
    # Note the order of a_b switched to b_a here. This is to be consistent with
    # the notation of C(b,a) indicating the coordination of b to a
    if extended:
        elicit_b_a = initialize_dict_extended()
        baseline_b_a = initialize_dict_extended()
    else:
        elicit_b_a = initialize_dict()
        baseline_b_a = initialize_dict()
    for a_b in pairs['a_b']:
        u_a  = corpus.get_utterance(a_b[0])
        u_b = corpus.get_utterance(a_b[1])
        if extended:
            m_u_a = get_style_markers_extended(u_a)
            m_u_b = get_style_markers_extended(u_b)
        else:
            m_u_a = get_style_markers(u_a)
            m_u_b = get_style_markers(u_b)
        for k in m_u_a:
            if m_u_a[k]:
                if m_u_a[k] == m_u_b[k]:  # If b responded to a with same style marker
                    elicit_b_a[k] += 1
            baseline_b_a[k] += m_u_b[k] # b's response contains m regardless of a's prompt
    
    # Get markers from speaker b to a
    
    if extended:
        elicit_a_b = initialize_dict_extended()
        baseline_a_b = initialize_dict_extended()
    else:
        elicit_a_b = initialize_dict()
        baseline_a_b = initialize_dict()
        
    for b_a in pairs['b_a']:
        u_b  = corpus.get_utterance(b_a[0])
        u_a = corpus.get_utterance(b_a[1])
        if extended:
            m_u_a = get_style_markers_extended(u_a)
            m_u_b = get_style_markers_extended(u_b)
        else:
            m_u_a = get_style_markers(u_a)
            m_u_b = get_style_markers(u_b)
        for k in m_u_b:  
            if m_u_b[k]:
                if m_u_b[k] == m_u_a[k]:  # If a responded to b with same style marker
                    elicit_a_b[k] += 1
            baseline_a_b[k] += m_u_a[k] # If a's response contains m regardless of b's prompt
    
    
    # Convert to probabilities, preserving raw baselines for LSM calculation
    raw_b_a = baseline_b_a.copy()
    raw_a_b = baseline_a_b.copy()
    num_response_b_a = len(pairs['a_b'])  # Number of responses from b to a
    num_response_a_b = len(pairs['b_a'])  # Number of responses from a to b
    for k in elicit_a_b.keys():  # Could be any dictionary, they all have the same keys
        elicit_b_a[k] = elicit_b_a[k] / num_response_b_a 
        baseline_b_a[k] = baseline_b_a[k] / num_response_b_a
        elicit_a_b[k] = elicit_a_b[k] / num_response_a_b
        baseline_a_b[k] = baseline_a_b[k] / num_response_a_b

    # Determine asymmetric and symmetric accomodation
    if extended:
        C_b_a = initialize_dict_extended()
        C_a_b = initialize_dict_extended()
        LSM = initialize_dict_extended()
    else:
        C_b_a = initialize_dict() # Accomodation of b towards a
        C_a_b = initialize_dict() # Accomodation of a towards b
        LSM = initialize_dict()
    for k in C_b_a.keys():
        if baseline_b_a[k] and baseline_a_b[k]:  # If a and b both exhibited marker m at some point
            C_b_a[k] = baseline_b_a[k] - elicit_b_a[k]
            C_a_b[k] = baseline_a_b[k] - elicit_a_b[k]
            LSM[k] = 1 - abs(raw_a_b[k] - raw_b_a[k]) / (raw_a_b[k] + raw_b_a[k] + 0.0001)
        else:                                    # Else, the metric is undefined for marker m
            C_b_a[k] = None  # Set to None if there is no data
            C_a_b[k] = None
            LSM[k] = None

    # Get averages across asymmetric measure
    valid_markers = []
    mean_C_b_a = 0
    mean_C_a_b = 0
    mean_LSM = 0
    for k in C_b_a.keys():
        if C_b_a[k] is not None:
            mean_C_b_a += C_b_a[k]
            mean_C_a_b += C_a_b[k]
            mean_LSM += LSM[k]
            valid_markers.append(k)
    if valid_markers:
        mean_C_b_a /= len(valid_markers)
        mean_C_a_b /= len(valid_markers)
        mean_LSM /= len(valid_markers)

    # Construct dictionary to return
    C = {
        'convID' : conv.id,
        'a' : pairs['a'],
        'b' : pairs['b'],
        'num_response_b_a' : len(pairs['b_a']),
        'num_response_a_b' : len(pairs['a_b']),
        'C_b_a' : C_b_a,
        'C_a_b' : C_a_b,
        'LSM' : LSM,
        'mean_C_b_a' : mean_C_b_a,
        'mean_C_a_b' : mean_C_a_b,
        'mean_LSM' : mean_LSM,
        'valid_markers' : valid_markers
    } 

    if print_output:
        print('pairs: ', pairs)
        print('\nraw_b_a: ', raw_b_a)
        print('raw_a_b: ', raw_a_b)
        print('\nelicit_b_a: ', elicit_b_a)
        print('elicit_a_b: ', elicit_a_b)
        print('\nbaseline_b_a: ', baseline_b_a)
        print('baseline_a_b: ', baseline_a_b)
        print('\nC_b_a: ', C_b_a)
        print('C_a_b: ', C_a_b)
        print('\nLSM: ', LSM)
        print('\nmean_C_b_a: ', mean_C_b_a)
        print('mean_C_a_b: ', mean_C_a_b)
        
    return C





In [41]:
full_results = []

for i, conv_id in enumerate(valid_conv_ids):
    if i%15==0:
        print(i)
    conv = corpus.get_conversation(conv_id)
    try:
        results = measure_coordination(conv, print_output=False, extended=True)
    except Exception as e:
        print(Exception, conv_id)
        continue
    full_results.append(results)
    
pickle.dump(full_results, open("full_results_wikipedia.pkl", "wb"))

0
15
30
45
60
75
90
105
120
135
150
<class 'Exception'> 143206656.10.10
165
<class 'Exception'> 97524250.56095.56095
180
<class 'Exception'> 368853045.12992.12992
195
210
<class 'Exception'> 724965073.205638.205638
<class 'Exception'> 48973343.25345.25345
<class 'Exception'> 709553734.4699.4699
225
240
<class 'Exception'> 230840298.7670.7670
255
270
285
<class 'Exception'> 468953735.92455.92455
300
<class 'Exception'> 45970779.4829.4829
315
330
345
360
<class 'Exception'> 139407294.103416.103416
375
390
405
<class 'Exception'> 593962760.54308.54308
420
435
450
465
<class 'Exception'> 13521263.3340.3340
480
495
510
525
540
555
570
585
<class 'Exception'> 598664341.8324.8324
600
615
630


In [43]:
q_full_results = []

# Determine number of valid conversations
corpus = Corpus(filename=download("conversations-gone-awry-cmv-corpus"))
num_valid = 0

valid_conv_ids = []  # Will hold IDs of all valid converations
for conv in corpus.iter_conversations():
    if detect_interlocution(conv, 5):
        valid_conv_ids.append(conv.id)
        num_valid += 1

print('({}/{}) {:.1f}% conversations valid'.format(num_valid, len(corpus.get_conversation_ids()), num_valid*100/len(corpus.get_conversation_ids())))


for i, conv_id in enumerate(valid_conv_ids):
    if i%15==0:
        print(i)
    conv = corpus.get_conversation(conv_id)
    try:
        results = measure_coordination(conv, print_output=False, extended=True)
    except Exception as e:
        print(Exception, conv_id)
        continue
    q_full_results.append(results)
    
pickle.dump(q_full_results, open("full_results_reddit.pkl", "wb"))

Dataset already exists at /home/gaoag/.convokit/downloads/conversations-gone-awry-cmv-corpus
(1224/6842) 17.9% conversations valid
0
15
30
<class 'Exception'> cxx5b28
45
60
75
90
105
120
135
150
165
180
195
210
225
240
255
270
285
300
315
330
345
360
375
390
405
420
435
450
465
480
495
510
525
540
555
570
<class 'Exception'> dyw3uva
585
600
615
630
645
660
675
690
705
<class 'Exception'> e6hmhtc
720
735
750
765
780
795
810
825
840
855
870
885
900
915
930
945
960
975
990
1005
<class 'Exception'> e7ycyr3
<class 'Exception'> cy1fo2m
1020
1035
1050
1065
1080
1095
1110
1125
1140
<class 'Exception'> dnpzmco
1155
<class 'Exception'> dusor10
1170
1185
<class 'Exception'> e1rufjv
1200
1215


In [12]:
def print_coordination(C):
    '''
    Prints a coordination dictionary (output from measure_coordination) legibly
    '''
    for k in C.keys():
        if isinstance(C[k], dict):
            print('\n~~ {} ~~'.format(k))
            for m in C[k].keys():
                if C[k][m] is not None:
                    print('     {} : {:.2f}'.format(m, C[k][m]))
                else:
                    print('     {} : None'.format(m))
            if k == 'LSM':
                print('\n')
        else:
            print('{} : {}'.format(k, C[k]))

print_coordination(C)

convID : 162766001.2409.2409
a : The undertow
b : Indubitably
num_response_b_a : 2
num_response_a_b : 3

~~ C_b_a ~~
     ppron : 0.00
     ipron : 0.00
     article : 0.00
     conj : 0.00
     prep : 0.00
     auxverb : 0.00
     adverb : 0.00
     negate : 0.00
     quant : None

~~ C_a_b ~~
     ppron : 0.50
     ipron : 0.00
     article : 0.00
     conj : 0.50
     prep : 0.00
     auxverb : 0.00
     adverb : 0.00
     negate : 0.50
     quant : None

~~ LSM ~~
     ppron : 0.67
     ipron : 1.00
     article : 0.80
     conj : 1.00
     prep : 0.67
     auxverb : 1.00
     adverb : 0.80
     negate : 0.67
     quant : None


mean_C_b_a : 0.0
mean_C_a_b : 0.1875
mean_LSM : 0.8250051665077829
valid_markers : ['ppron', 'ipron', 'article', 'conj', 'prep', 'auxverb', 'adverb', 'negate']
