## Project Milestone 4 - Linguistic Harbingers of Betrayal Extension

First, import all the libraries we need to produce the plots:

In [20]:
import json

from scipy.stats import sem
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Since the data file contains only the selected games, there is no need to apply the selection criteria when retrieving the message data.
*The condition "relationships that contain at least two consecutive and reciprocated acts of support that span at least three seasons in game time, with no more than five seasons passing between two acts of friendship" is already fulfilled.

Here, we read the data from the JSON file and get the number of games, which is 500: 250 selected games that ended in betrayal and 250 selected games that ended in a lasting friendship between the two players.


In [9]:
# Load the pre-selected Diplomacy games; each element of `data` is one game.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the read
# does not depend on the platform's default locale encoding.
with open("diplomacy_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)
len(data)

500

Since we only consider dialogs that happen before the last act of friendship, we need to know when the last act of friendship happens. Later, when extracting feature values, we only use the seasons up to the last support season.

In [10]:
# a function to get the season where the last support between two players happened
# entry refers to one entry in the dataset, i.e. one game
def last_support(entry):
    """Return the season label of the last support act in a game.

    Every season except the final one is inspected; a season counts as a
    support season when the string 'support' is one of the values of its
    'interaction' mapping.

    Returns the 'season' value of the latest such season, or None when no
    inspected season contains a support (including games with fewer than
    two seasons).
    """
    # The final season is deliberately left out of the search.
    candidates = entry['seasons'][:-1]
    support_seasons = [
        s['season']
        for s in candidates
        if 'support' in s['interaction'].values()
    ]
    if not support_seasons:
        return None
    return support_seasons[-1]

Given the messages sent by a player from a season in a game, we need to calculate the different variable values of the messages.

Variables we consider:

1. sent_pos: Positive sentiment
2. sent_neu: Neutral sentiment
3. sent_neg: Negative sentiment
4. discourse_comp: Discourse complexity (calculated through comparison, contingency, expansive and temporal)
5. plan: Planning level (calculated through future)
6. argu_level: Argumentation level (calculated through claim and premise)
7. n_request: Number of requests
8. politeness: Politeness
9. subj: Subjectivity
10. n_words: Number of words
11. n_sentences: Number of sentences


In [16]:
# a function to get the aggregated feature values of a group of messages
# msgs: messages sent by a player from a season in a game
def extract_features(msgs):
    """Aggregate the linguistic features of a list of messages.

    Each message is a dict from which the following keys are read:
    'n_sentences', 'n_words', 'n_requests', 'politeness', 'sentiment'
    (a dict with optional 'positive' / 'neutral' / 'negative' entries) and
    'lexicon_words' (a dict mapping a lexicon category to a list of hits).

    Two kinds of aggregates are produced:
      * per-message averages (total divided by the number of messages):
        sentPos, sentNeu, sentNeg, nRequest, politeness, nWords, nSentences;
      * per-sentence rates (total lexicon hits divided by the total number
        of sentences): discourseComp, plan, arguLevel, subjectivity.

    A ratio whose denominator is zero (no messages, or messages that
    contain no sentences) is reported as NaN instead of raising
    ZeroDivisionError. A sentiment entry missing from a message counts as 0,
    in the same way a missing lexicon category counts as an empty list.

    Returns a dict with the eleven feature names listed above as keys.
    """
    nan = float('nan')
    n_msgs = len(msgs)
    n_sents = sum(m['n_sentences'] for m in msgs)

    def per_message(values):
        # Mean of a per-message quantity; undefined without any message.
        return sum(values) / n_msgs if n_msgs else nan

    def per_sentence(category):
        # Lexicon hits of one category per sentence; undefined without sentences.
        hits = sum(len(m['lexicon_words'].get(category, [])) for m in msgs)
        return hits / n_sents if n_sents else nan

    # Sentiment: average positive / neutral / negative score per message
    sent_pos = per_message(m['sentiment'].get("positive", 0) for m in msgs)
    sent_neu = per_message(m['sentiment'].get("neutral", 0) for m in msgs)
    sent_neg = per_message(m['sentiment'].get("negative", 0) for m in msgs)

    # Argumentation and Discourse
    # discourse complexity: comparison + contingency + expansion + temporal markers
    discourse_comp = (per_sentence("disc_comparison")
                      + per_sentence("disc_contingency")
                      + per_sentence("disc_expansion")
                      + per_sentence("disc_temporal_rest"))
    # planning: markers that refer to the future
    plan = per_sentence("disc_temporal_future")
    # argumentation level: claim + premise markers
    argu_level = per_sentence("claim") + per_sentence("premise")
    # average number of requests per message
    n_request = per_message(m['n_requests'] for m in msgs)

    # Politeness: average score per message
    politeness = per_message(m['politeness'] for m in msgs)

    # Subjectivity using allsubj
    subj = per_sentence("allsubj")

    # Talkativeness: average number of words and sentences per message
    n_words = per_message(m['n_words'] for m in msgs)
    n_sentences = per_message(m['n_sentences'] for m in msgs)

    return dict(
        sentPos=sent_pos,
        sentNeu=sent_neu,
        sentNeg=sent_neg,
        discourseComp=discourse_comp,
        plan=plan,
        arguLevel=argu_level,
        nRequest=n_request,
        politeness=politeness,
        subjectivity=subj,
        nWords=n_words,
        nSentences=n_sentences)

From the paper: "To ensure that we are studying conversational patterns that occur only when the two individuals in the dyad are ostensibly being friends, we only extract features from the messages exchanged before the last act of friendship."
Therefore, the messages we extract should come from seasons no later than the last support season (the code below keeps seasons with `season <= last support season`).

In [17]:
# a function to get the message features of one player from dataset entries
# data: the dataset where we extract the features from
# betrayal: if the games we consider end up betrayal
# betrayer: if the person to be analyzed is the (potential) betrayer
def process_data(data, betrayal, betrayer):
    """Collect per-season feature dicts for one player role and game outcome.

    Parameters:
        data: iterable of game entries; each entry provides 'betrayal' and
            'seasons', and each season provides 'season' and
            'messages' with 'betrayer' and 'victim' message lists.
        betrayal: keep only games whose 'betrayal' flag equals this value.
        betrayer: if true, analyse the (potential) betrayer's messages,
            otherwise the (potential) victim's.

    A season contributes one feature dict when it is not later than the
    game's last support season and both players sent at least one message
    in it. Games for which last_support() finds no support act contribute
    nothing.

    Returns a list with one extract_features() result per retained season.
    """
    role = 'betrayer' if betrayer else 'victim'
    results = []

    # loop over every game in data
    for entry in data:
        # filter on the outcome first so non-matching games cost nothing
        if entry['betrayal'] != betrayal:
            continue

        # the cut-off is the same for every season of a game: compute it once
        cutoff = last_support(entry)
        if cutoff is None:
            # no act of support found, hence no "friendship" period to analyse
            continue

        for season in entry['seasons']:
            messages = season['messages']
            # only seasons up to the last act of friendship in which both
            # players actually talked are considered
            if (season['season'] <= cutoff
                    and messages['betrayer'] and messages['victim']):
                results.append(extract_features(messages[role]))

    # return value is a list of features of the indicated player in the indicated type of games
    return results

Get the feature values (including the politeness score) for the four conditions below:

In [21]:
# Build one feature table per (game outcome, player role) combination.
# The generator yields the tables in this order:
#   betray_er  - games ending in betrayal, features of the betrayer
#   betray_vi  - games ending in betrayal, features of the victim
#   control_er - games without betrayal, features of the potential betrayer
#   control_vi - games without betrayal, features of the potential victim
betray_er, betray_vi, control_er, control_vi = (
    pd.DataFrame(process_data(data, ends_in_betrayal, is_betrayer))
    for ends_in_betrayal in (True, False)
    for is_betrayer in (True, False)
)

In [22]:
betray_er

Unnamed: 0,sentPos,sentNeu,sentNeg,discourseComp,plan,arguLevel,nRequest,politeness,subjectivity,nWords,nSentences
0,1.333333,1.333333,1.500000,0.680000,0.120000,0.320000,3.666667,0.803328,3.680000,81.500000,4.166667
1,0.142857,0.857143,1.285714,0.375000,0.125000,0.250000,1.285714,0.560083,2.125000,40.000000,2.285714
2,2.000000,2.500000,2.000000,1.538462,0.307692,0.461538,5.500000,0.982703,4.384615,166.500000,6.500000
3,1.800000,0.800000,2.200000,0.958333,0.208333,0.291667,3.200000,0.748802,3.416667,89.800000,4.800000
4,1.000000,1.000000,1.000000,0.333333,0.166667,0.333333,2.000000,0.899161,3.166667,39.000000,3.000000
...,...,...,...,...,...,...,...,...,...,...,...
658,1.666667,3.000000,4.000000,0.730769,0.076923,0.384615,3.000000,0.395679,2.692308,136.333333,8.666667
659,2.000000,1.000000,2.000000,1.400000,0.400000,0.200000,2.000000,0.785834,2.200000,71.000000,5.000000
660,0.500000,1.500000,3.500000,0.090909,0.000000,0.090909,2.000000,0.354644,1.545455,62.000000,5.500000
661,0.500000,0.500000,1.250000,0.000000,0.000000,0.111111,1.000000,0.510478,1.222222,21.500000,2.250000


In [23]:
betray_vi

Unnamed: 0,sentPos,sentNeu,sentNeg,discourseComp,plan,arguLevel,nRequest,politeness,subjectivity,nWords,nSentences
0,1.500000,1.250000,5.000000,0.483871,0.129032,0.677419,3.250000,0.809993,2.193548,120.000000,7.750000
1,1.333333,0.666667,1.000000,0.000000,0.000000,0.111111,0.666667,0.785508,1.555556,31.000000,3.000000
2,1.000000,3.666667,4.666667,0.178571,0.035714,0.214286,3.666667,0.607331,1.571429,120.000000,9.333333
3,1.000000,0.000000,2.000000,0.833333,0.333333,0.500000,0.500000,0.571134,1.833333,28.000000,3.000000
4,0.666667,1.666667,2.666667,0.266667,0.133333,0.266667,2.333333,0.563423,1.866667,85.333333,5.000000
...,...,...,...,...,...,...,...,...,...,...,...
658,1.500000,3.000000,1.500000,0.125000,0.000000,0.125000,3.500000,0.787815,1.625000,75.000000,6.000000
659,0.000000,1.333333,0.666667,0.000000,0.000000,0.333333,0.333333,0.345412,0.333333,7.666667,2.000000
660,0.500000,2.000000,4.000000,0.538462,0.153846,0.384615,2.000000,0.522554,1.461538,75.000000,6.500000
661,0.250000,3.000000,1.250000,0.388889,0.111111,0.166667,2.000000,0.501783,0.944444,34.750000,4.500000


In [24]:
control_er

Unnamed: 0,sentPos,sentNeu,sentNeg,discourseComp,plan,arguLevel,nRequest,politeness,subjectivity,nWords,nSentences
0,0.0,1.0,1.0,1.500000,0.500000,0.000000,2.0,0.448337,0.000000,14.0,2.0
1,0.0,0.0,1.0,0.000000,0.000000,0.500000,1.0,0.511553,3.000000,11.0,1.0
2,0.0,1.0,1.0,0.000000,0.000000,0.000000,1.0,0.390865,0.000000,17.0,2.0
3,0.0,0.5,1.0,0.333333,0.000000,0.000000,0.5,0.608813,1.666667,17.5,1.5
4,0.0,0.0,1.0,0.000000,0.000000,0.000000,0.0,0.384741,3.000000,50.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
714,0.0,0.0,1.0,0.000000,0.000000,0.000000,1.0,0.208446,1.000000,14.0,1.0
715,1.0,6.0,1.0,0.375000,0.000000,0.375000,3.0,0.934349,2.125000,111.0,8.0
716,3.0,5.0,8.0,0.562500,0.250000,0.687500,8.0,0.910394,3.250000,305.0,16.0
717,0.0,3.0,6.0,0.888889,0.333333,0.888889,6.0,0.822667,4.666667,215.0,9.0


In [25]:
control_vi

Unnamed: 0,sentPos,sentNeu,sentNeg,discourseComp,plan,arguLevel,nRequest,politeness,subjectivity,nWords,nSentences
0,0.0,3.0,1.0,1.750,0.250,0.000000,1.0,0.447457,0.75,47.0,4.0
1,0.0,1.0,0.0,0.000,0.000,0.000000,1.0,0.376480,0.00,8.0,1.0
2,0.0,0.0,1.0,0.000,0.000,0.000000,0.0,0.441322,1.00,5.0,1.0
3,1.0,1.0,0.0,1.000,0.500,0.500000,2.0,0.975253,2.00,23.0,2.0
4,0.0,0.0,1.0,0.000,0.000,1.000000,0.0,0.381919,0.00,28.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
714,1.0,0.5,2.5,0.625,0.375,0.500000,1.0,0.627844,2.25,44.0,4.0
715,0.0,1.0,0.0,0.000,0.000,0.000000,1.0,0.474890,0.00,10.0,1.0
716,1.0,1.0,2.0,0.000,0.000,0.000000,2.0,0.856649,3.00,56.0,4.0
717,2.0,0.0,1.0,0.000,0.000,0.333333,0.0,0.794841,1.00,20.0,3.0
