# Conversations gone awry, Wikipedia version

In [291]:
from convokit import Corpus, download
import spacy
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

Dataset already exists at C:\Users\ewais\.convokit\downloads\conversations-gone-awry-corpus


In [3]:
corpus.print_summary_stats()

Number of Speakers: 8069
Number of Utterances: 30021
Number of Conversations: 4188


In [4]:
utt = corpus.random_utterance()
print(utt.speaker.id)

Radgeek


In [5]:
convo = corpus.random_conversation()
print(convo)

Conversation('id': '134588266.14633.14633', 'utterances': ['134588266.14633.14633', '134588266.14679.14633', '134588899.15511.15511', '134590586.15724.15724', '134705986.16293.16293', '134706505.17026.17026', '134735593.17628.17628', '134741187.19072.19072', '134746639.19354.19354'], 'meta': {'page_title': 'Talk:Final Fantasy XIII', 'page_id': 4920823, 'pair_id': '127102700.50633.50633', 'conversation_has_personal_attack': True, 'verified': False, 'pair_verified': False, 'annotation_year': '2019', 'split': 'test'})


In [6]:
paths = convo.get_longest_paths()
for path in paths:
    print([utt.id for utt in path])
    print([utt.get_speaker().id for utt in path])

['134588266.14633.14633', '134588266.14679.14633', '134588899.15511.15511', '134590586.15724.15724', '134706505.17026.17026', '134735593.17628.17628']
['72.49.194.145', '72.49.194.145', 'Corpsedust', 'NicholaiDaedalus', '72.49.194.145', 'NicholaiDaedalus']
['134588266.14633.14633', '134588266.14679.14633', '134588899.15511.15511', '134590586.15724.15724', '134706505.17026.17026', '134741187.19072.19072']
['72.49.194.145', '72.49.194.145', 'Corpsedust', 'NicholaiDaedalus', '72.49.194.145', 'Bluerfn']


## Utterance features
- **id**: index of the utterance
- **conversation_id**: id of the first utterance in the converstaion this utterance belongs to
- **reply-to**: index of the utterance to which this utterance replies to (None if not a reply)
- **speaker**: the speaker who authored the utterance
- **timestamp**: timestamp of utterance
- **text**: textual content of the utterance
- **meta**: metadata for each utterance
    - **is_section_header**: whether the utterance is a conversation "title" or "subject" (if true, the utterance should be ignored)
    - **comment_has_personal_attack**: whether this comment was judged by 3 crowdsourced annotators to contain a personal comment_has_personal_attack
    - **parsed**: SpaCy parsed version of the utterance text
        - **rt**: ??
        - **toks**: List of parsed tokens
            - **tok**: the token (word, punctuation, etc.)
            - **tag**: Detailed part of speech tag
            - **dep**: syntactic dependency, i.e. the relation between the tokens
            - **up**: list related to dn, not sure how
            - **dn**: list related to up, not sure how

## Conversation features

- **id**: id of the conversation
- **utterances**: ids of utterances in the conversation (in order I presume)
- **meta**: conversation metadata
    - **page_title**: title of page under which conversation is occurring
    - **page_id**: unique numerical id of the talk page
    - **pair_id**: the id of the conversation that this comment's conversation is paired with
    - **conversation_has_personal_attack**: whether any comment in this comment's conversation contains a personal attack
    - **verified**: whether the personal attack label has been verified by an internal annotator
    - **pair_verified**: whether the personal attack label has been double checked by the internal annotator
    - **annotation_year**: self explanatory
    - **split**: (train, test, or val) whether this conversation was used as train, test, or val in "Trouble on the Horizon"


First, we want to find the conversations that are easy to analyze, i.e. have a structure like (a -> b -> a -> b -> ...). detect_interlocution should reveal those conversations

In [254]:
# We want to consider conversations with a call-reply structure between two speakers, having at least five utterances
def detect_interlocution(conv, min_utts, print=False):
    '''
    Finds whether the conversation has a call-reply structure between two speakers with at least min_utts utterances

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
        > min_utts - the minimum number of utterances that constitute a valid conversation
        > print - whether or not to print why the conversation was rejected
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > bool representing whether or not the conversation's longest path has the aforementioned structure
    '''
    # At the moment, only considering first longest path if there are multiple # TODO: Add functionality to examine all paths
    try:
        longest_path = conv.get_longest_paths()[0]
    except ValueError as v:
        if print:
            print(v)
            print('skipping...')
        return False
    
    if len(longest_path) < min_utts:
        if print:
            print('Less than {} utterances in conversation\nskipping...'.format(min_utts))
        return False

    speakers = []

    for utt in longest_path:
        speakers.append(utt.get_speaker().id)
    
    if len(set(speakers)) > 2:
        if print:
            print('More than 2 speakers in conversation\nskipping...')
        return False

    return True

# Determine number of valid conversations
num_valid = 0
valid_conv_ids = []  # Will hold IDs of all valid converations
for conv in corpus.iter_conversations():
    if detect_interlocution(conv, 5):
        valid_conv_ids.append(conv.id)
        num_valid += 1

print('({}/{}) {:.1f}% conversations valid'.format(num_valid, len(corpus.get_conversation_ids()), num_valid*100/len(corpus.get_conversation_ids())))


(639/4188) 15.3% conversations valid


How to get a single conversation from list of valid IDs

In [257]:
corpus.get_conversation(valid_conv_ids[0])

Conversation({'obj_type': 'conversation', 'meta': {'page_title': 'Talk:Niger uranium forgeries', 'page_id': 1005730, 'pair_id': '66813686.23567.23567', 'conversation_has_personal_attack': True, 'verified': True, 'pair_verified': True, 'annotation_year': '2018', 'split': 'train'}, 'vectors': [], 'tree': <convokit.model.utteranceNode.UtteranceNode object at 0x0000022E29BF5250>, 'owner': <convokit.model.corpus.Corpus object at 0x0000022DC866C910>, 'id': '68000691.25417.25417'})

Now that we have a list of valid conversations, let's find a way to isolate the utterances in a way that we can easily analyze for style accommodation

In [285]:
def get_speaker_utt_lists(conv):
    '''
    Generates lists of speaker IDs corresponding to utterances in conv, and gets utterances

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > speakers - list of speakers corresponding to each utterance
        > utts - list of utterances from conversation
    '''
    # Recall, we use the first index longest path in our list of valid conversations
    longest_path = conv.get_longest_paths()[0]

    speakers = []
    utts = []

    for utt in longest_path:
        utts.append(utt.id)
        speakers.append(utt.get_speaker().id)

    return speakers, utts

def get_pairs(speakers, utts):
    '''
    Generates a dictionary of pairs of utterances, each pair representing a back and forth interaction

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > speakers - list of speakers corresponding to each utterance
        > utts - list of utterances from conversation
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > pairs - dictionary with the following structure
            a : ID of speaker a
            b : ID of speaker b
            a_b : [(tuple of 2 utterance IDs, first being from speaker a and second from speaker b), (...), ...]
            b_a : [(tuple of 2 utterance IDs, first being from speaker b and second from speaker a), (...), ...]
    '''
    # TODO: Account for instances where a speaker speaks multiple times in a row. Combine those into a list of utteranes within the tuples
    print(list(set(speakers)))
    pairs = {
        'a' : list(set(speakers))[0],
        'b' : list(set(speakers))[1],
        'a_b' : [],
        'b_a' : []
    }

    # We'll say speaker a is the first speaker, and speaker b is the second.False
    speaker_shift = [1 if speakers[i] != speakers[i-1] else 0 for i in range(1, len(speakers))]
    speaker_shift.insert(0,0) # Prepend 0 (first utterance isn't a response)

    for i in range(1, len(speakers)):
       if speakers[i] == pairs['b'] and speakers[i-1] == pairs['a']:
           pairs['a_b'].append((utts[i-1], utts[i]))
       elif speakers[i] == pairs['a'] and speakers[i-1] == pairs['b']:
           pairs['b_a'].append((utts[i-1], utts[i]))
    print(speakers)
    print(utts)
    print(speaker_shift)
    print(pairs)

    return

speakers, utts = get_speaker_utt_lists(corpus.get_conversation(valid_conv_ids[3]))
get_pairs(speakers, utts)

['Mike Garcia', '66.36.136.123']
['66.36.136.123', '66.36.136.123', 'Mike Garcia', '66.36.136.123', 'Mike Garcia']
['15832773.3019.3019', '15832773.3035.3019', '15832939.3151.3151', '15833000.3256.3256', '15833036.3275.3275']
[0, 0, 1, 1, 1]
{'a': 'Mike Garcia', 'b': '66.36.136.123', 'a_b': [('15832939.3151.3151', '15833000.3256.3256')], 'b_a': [('15832773.3035.3019', '15832939.3151.3151'), ('15833000.3256.3256', '15833036.3275.3275')]}


To measure style accommodation we have to measure the style markers in each utterance. That's what this following function is for.

see https://spacy.io/models/en

In [363]:
spacy.explain('IN')

'conjunction, subordinating or preposition'

## spaCy tags


| spaCy tag 	| our tag 	| intended definition 	| actual                                   	|
|-----------	|---------	|---------------------	|------------------------------------------	|
| PRP       	| ppron   	| personal pronoun    	| personal pronoun                         	|
|           	| ipron   	| impersonal pronoun  	|                                          	|
|           	| article 	| article             	|                                          	|
| CC        	| conj    	| conjunction         	| coordinating conjunction                 	|
| IN        	| prep    	| preposition         	| subordinating or preposition conjunction 	|
| MD        	| auxverb 	| auxiliary verb      	| modal auxiliary verb                     	|
| RB        	| adverb  	| common adverb       	| adverb                                   	|
|           	| negate  	| negation            	|                                          	|
|           	| quant   	| quantifier          	|                                          	|


In [289]:
def get_style_markers(utt):
    '''
    Returns a dictionary containing the number of style markers in an utterance

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > utt - a single utterance
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > m - dictionary with the following key value pairs
            ppron : # personal pronouns
            ipron : # impersonal pronouns
            article : # articles
            conj : # conjunctions
            prep : # prepositions
            auxverb : # auxiliary verbs
            adverb : # common adverbs
            negate : # negations
            quant : # quantifiers
    '''
    print(utt)

get_style_markers(corpus.get_utterance(utts[1])) # utts defined in the cell above

Utterance(id: '15832773.3035.3019', conversation_id: 15832773.3019.3019, reply-to: 15832773.3019.3019, speaker: Speaker(id: 66.36.136.123, vectors: [], meta: {}), timestamp: 1119740089.0, text: 'Just because Mike Garcia believes this to be true does not mean it should be in the article. I follow SOAD news very carefully and this has never been an issue, let alone being a confirmed fact as Mike Garcia seems to think it is. I would like to see a source or have this deleted. ', vectors: [], meta: {'is_section_header': False, 'comment_has_personal_attack': False, 'toxicity': 0.030666184, 'parsed': [{'rt': 11, 'toks': [{'tok': 'Just', 'tag': 'RB', 'dep': 'advmod', 'up': 4, 'dn': []}, {'tok': 'because', 'tag': 'IN', 'dep': 'mark', 'up': 4, 'dn': []}, {'tok': 'Mike', 'tag': 'NNP', 'dep': 'compound', 'up': 3, 'dn': []}, {'tok': 'Garcia', 'tag': 'NNP', 'dep': 'nsubj', 'up': 4, 'dn': [2]}, {'tok': 'believes', 'tag': 'VBZ', 'dep': 'advcl', 'up': 11, 'dn': [0, 1, 3, 7]}, {'tok': 'this', 'tag': 'DT

In [66]:
def populate_dict(conv):
    '''
    Initializes dictionary with ID information for measure_coordination()
    Assumes the conversation will only have two speakers

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > C - dictionary with the following key value pairs
            convID : ID of conversation
            a : ID of speaker a
            b : ID of speaker b
    '''

def measure_coordination(conv):
    '''
    Assumes the converation will only have two speakers

    ~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~
        > conv - entire conversation object
    ~~~~~~~~~~~~ RETURNS ~~~~~~~~~~~~
        > C - dictionary with following key value pairs
            convID : ID of conversation
            a : ID of speaker a
            b : ID of speaker b
            a_b : asymmetric accomodation from speaker a to speaker b
            b_a : asymmetric accomodation from speaker b to speaker a
            LSA : symmetric accomodation between bot speakers
        > C_m - dictionary with following key value pairs
            convID : ID of conversation
            a : ID of speaker a
            b : ID of speaker b
            <m>_a_b : asymmetric accommodation from speaker a to speaker b in style marker m
            <m>_b_a : asymmetric accommodation from speaker b to speaker a in style marker m
                where <m> can take on the values
                    ppron : personal pronouns
                    ipron : impersonal pronouns
                    article : articles
                    conj : conjunctions
                    prep : prepositions
                    auxverb : auxiliary verbs
                    adverb : common adverbs
                    negate : negations
                    quant : quantifiers

    '''
    C = populate_dict(conv)
    print(conv)
    longest_paths = conv.get_longest_paths()
    
    for path in longest_paths:
        print([utt.id for utt in path])
        print([utt.get_speaker().id for utt in path])
    return

conv = corpus.random_conversation()
measure_coordination(conv)

Conversation('id': '637809448.22348.22348', 'utterances': ['637809448.22369.22348', '637809448.22348.22348', '637811596.22648.22648', '637812747.23256.23256', '637813419.23856.23856', '637814004.24097.24097', '637814481.24395.24395', '637815918.25399.25399', '637825263.25890.25890', '637842043.26148.26148'], 'meta': {'page_title': 'Talk:Regulation of electronic cigarettes', 'page_id': 42877834, 'pair_id': '639648099.38003.38003', 'conversation_has_personal_attack': False, 'verified': False, 'pair_verified': False, 'annotation_year': '2019', 'split': 'test'})
['637809448.22348.22348', '637809448.22369.22348', '637811596.22648.22648', '637812747.23256.23256', '637814004.24097.24097', '637814481.24395.24395', '637815918.25399.25399', '637825263.25890.25890', '637842043.26148.26148']
['QuackGuru', 'QuackGuru', 'AlbinoFerret', 'QuackGuru', 'AlbinoFerret', 'QuackGuru', 'AlbinoFerret', 'QuackGuru', 'AlbinoFerret']
