## Project Milestone 4 - Linguistic Harbingers of Betrayal Extension

First, import all the libraries we need for the analysis:

In [1]:
import json

from scipy.stats import sem
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

Since the data file contains only the selected games, there is no need to apply the selection criteria when retrieving the message data. 
*The condition "relationships that contain at least two consecutive and reciprocated acts of support that span at least three seasons in game time, with no more than five seasons passing between two acts of friendship" is already fulfilled.

Here, we read the data from the JSON file and get the number of games, which is 500: 250 selected games that ended in betrayal and 250 selected games that ended in lasting friendship between the two players.


In [2]:
# Load the selected games from diplomacy_data.json.
# The encoding is given explicitly so the file is decoded as UTF-8 on every
# platform instead of with the locale's default encoding.
with open("diplomacy_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)
# Number of games in the dataset (shown as the cell output).
len(data)

500

Since we only consider dialogs that happen before the last act of friendship, we need to know when the last act of friendship happens. Later, when extracting feature values, we take them only from the seasons that are not later than the last support season.

In [3]:
def last_support(entry):
    """Return the season in which the last support between two players happened.

    entry is one entry of the dataset, i.e. one game.  The final season of
    the game is never inspected.  Returns the 'season' value of the latest
    inspected season whose 'interaction' values include 'support', or None
    when no inspected season does.
    """
    supported = [
        season['season']
        for season in entry['seasons'][:-1]
        if 'support' in season['interaction'].values()
    ]
    return supported[-1] if supported else None

In [4]:
def extract_features(msgs):
    """Compute the 14 linguistic features of the messages one player sent in one season.

    msgs: messages sent by a player from a season in a game.  Each message
    is read through the keys 'n_sentences', 'n_words', 'n_requests',
    'politeness', 'sentiment' ('positive' / 'neutral' / 'negative') and
    'lexicon_words' (marker lists keyed by lexicon name).

    Sentiment, requests, politeness, word count and sentence count are
    averaged per message; lexicon marker counts are averaged per sentence.

    Returns a numpy array ordered as
    [sent_pos, sent_neu, sent_neg, comparison, contingency, expansive,
     temporal, plan, argu_level, n_request, politeness, subj, n_words,
     n_sentences].

    An empty msgs yields an all-zero vector, and messages that contain no
    sentences yield 0.0 for every per-sentence rate, instead of raising
    ZeroDivisionError.
    """
    n_msgs = len(msgs)
    if n_msgs == 0:
        # Nothing was sent: there is nothing to average.
        return np.zeros(14)

    n_sents = sum(m['n_sentences'] for m in msgs) * 1.0

    def per_message(values):
        # Average of a per-message quantity.
        return sum(values) / n_msgs

    def per_sentence(lexicon):
        # Number of markers of the given lexicon per sentence.
        if n_sents == 0.0:
            return 0.0
        return sum(len(m['lexicon_words'].get(lexicon, [])) for m in msgs) / n_sents

    # Sentiment: average positive, neutral and negative scores
    sent_pos = per_message(m['sentiment'].get("positive") for m in msgs)
    sent_neu = per_message(m['sentiment'].get("neutral") for m in msgs)
    sent_neg = per_message(m['sentiment'].get("negative") for m in msgs)

    # Argumentation and Discourse
    # discourse complexity through comparison, contingency, expansive and temporal
    comparison = per_sentence("disc_comparison")
    contingency = per_sentence("disc_contingency")
    expansive = per_sentence("disc_expansion")
    temporal = per_sentence("disc_temporal_rest")
    # average number of markers that refer to the future
    plan = per_sentence("disc_temporal_future")
    # argumentation level through claim and premise
    argu_level = per_sentence("claim") + per_sentence("premise")
    # average number of requests
    n_request = per_message(m['n_requests'] for m in msgs)

    # Politeness
    politeness = per_message(m['politeness'] for m in msgs)

    # Subjectivity using allsubj
    subj = per_sentence("allsubj")

    # Talkativeness: average number of words and of sentences
    n_words = per_message(m['n_words'] for m in msgs)
    n_sentences = per_message(m['n_sentences'] for m in msgs)

    return np.array([
        sent_pos, sent_neu, sent_neg,
        comparison, contingency, expansive, temporal,
        plan, argu_level, n_request,
        politeness, subj,
        n_words, n_sentences,
    ])

From the paper: To ensure that we are studying conversational patterns that occur only when the two individuals in the dyad are ostensibly being friends, we only extract features from the messages exchanged before the last act of friendship.
Therefore, the messages we extract should come from seasons no later than the last support season (the code below keeps the seasons whose `season` value is `<=` the last support season).

In [5]:
def process_data(data, betrayal, betrayer):
    """Build one feature vector per qualifying game for one player role.

    data: the dataset (a list of games) the features are extracted from
    betrayal: if the games we consider end up in betrayal
    betrayer: if the person to be analyzed is the (potential) betrayer;
        otherwise the (potential) victim is analyzed

    For every game whose 'betrayal' flag equals `betrayal`, the 14 features
    of `extract_features` are computed for each season up to and including
    the last support season.  Seasons in which the player sent no messages,
    or only messages without sentences, keep an all-zero row.  The four
    seasons preceding the first season after the last support are then
    flattened into one vector of 4 * 14 values.

    Games with no support before the final season are skipped rather than
    raising a TypeError on the season comparison.

    Returns a list of 1-D numpy arrays, one per retained game.
    """
    n_features = 14
    window = 4
    role = 'betrayer' if betrayer else 'victim'
    results = []

    # loop over every game in data
    for entry in data:
        if entry['betrayal'] != betrayal:
            continue

        # The last support season is the same for every season of the game,
        # so look it up once per game.
        final_support = last_support(entry)
        if final_support is None:
            continue

        seasons = entry['seasons']
        # a matrix to store the features of the different seasons of one game
        data41game = np.zeros((len(seasons), n_features))
        # index of the first season after the last support
        cut_ind = 0
        # whether any season up to the last support held usable messages
        has_messages = False

        for i, season in enumerate(seasons):
            if season['season'] > final_support:
                cut_ind = i
                break
            msgs = season['messages'][role]
            # Both roles need at least one sentence; the per-sentence rates
            # are undefined otherwise.
            if msgs and sum(m['n_sentences'] for m in msgs) > 0:
                has_messages = True
                data41game[i, :] = extract_features(msgs)

        # NOTE(review): the original comment says "at least four seasons",
        # but this threshold requires more than four; kept as is so the
        # selected sample does not change -- confirm which one is intended.
        if cut_ind > window and has_messages:
            # keep only the four seasons before the cut, as a flat array
            results.append(data41game[cut_ind - window:cut_ind, :].flatten())

    # a list of feature vectors of the indicated player in the indicated type of games
    return results

Get the feature vectors for the four conditions below:

In [6]:
# One DataFrame per condition; each row is the feature vector of one game.

# games that ended in betrayal: the betrayer's features
betray_er = pd.DataFrame(process_data(data, betrayal=True, betrayer=True))

# games that ended in betrayal: the victim's features
betray_vi = pd.DataFrame(process_data(data, betrayal=True, betrayer=False))

# games that did not end in betrayal: the potential betrayer's features
control_er = pd.DataFrame(process_data(data, betrayal=False, betrayer=True))

# games that did not end in betrayal: the potential victim's features
control_vi = pd.DataFrame(process_data(data, betrayal=False, betrayer=False))

In [30]:

# Label each row with the outcome of its game: 1 = betrayal, 0 = no betrayal.
betray_er['betray'], control_er['betray'] = 1, 0
betray_vi['betray'], control_vi['betray'] = 1, 0

# Stack the betrayal and control rows of the (potential) betrayers ...
frames = [betray_er, control_er]
Betrayer = pd.concat(frames, ignore_index=True)

# ... and of the (potential) victims.
frames2 = [betray_vi, control_vi]
Victim = pd.concat(frames2, ignore_index=True)

# Finally stack both roles into a single table.
frames = [Betrayer, Victim]
total = pd.concat(frames, ignore_index=True)







Given the messages sent by a player from a season in a game, we need to calculate the different variable values of the messages.

Variables we consider:

1. sent_pos: Positive sentiment
2. sent_neu: Neutral sentiment
3. sent_neg: Negative sentiment
4. discourse_comp: Discourse complexity (kept as four separate features: comparison, contingency, expansive and temporal)
5. plan: Planning level (calculated through future)
6. argu_level: Argumentation level (calculated through claim and premise)
7. n_request: Number of requests
8. politeness: Politeness
9. subj: Subjectivity
10. n_words: Number of words
11. n_sentences: Number of sentences


In [68]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Features are every column except the label; the label is the 'betray' column.
XI = total.drop(['betray'], axis=1)
yi = total['betray']

# Hold out 20% of the rows for the final evaluation (fixed seed for reproducibility).
XI_train, XI_test, yi_train, yi_test = train_test_split(XI, yi, test_size=0.2, random_state=10)

# Polynomial feature expansion: fitted on the training rows, then applied to
# BOTH splits so they end up with the same columns.
poly = PolynomialFeatures().fit(XI_train)
XI_train = poly.transform(XI_train)
XI_test = poly.transform(XI_test)

# Standardization: statistics come from the training rows only and are
# reused unchanged for the test rows.
standardizer = StandardScaler().fit(XI_train)
XI_train = standardizer.transform(XI_train)
XI_test = standardizer.transform(XI_test)



In [60]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Support vector classifier with an RBF kernel.
sv = SVC(
    C=5,
    kernel='rbf',
    gamma='auto',
    coef0=0.0,
    shrinking=True,
    probability=False,
    decision_function_shape='ovr',
    break_ties=False,
    random_state=None,
)

# 5-fold cross-validation on the training split; the cell shows the mean fold score.
predicted = cross_validate(sv, XI_train, yi_train, cv=5)
predicted['test_score'].mean()

0.755223880597015

In [61]:
# Fit on the whole training split, then report the score on the held-out test split.
sv.fit(XI_train, yi_train).score(XI_test, yi_test)

0.5294117647058824

In [62]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each at most 15 levels deep.  The seed is fixed
# (same value as the train/test split) so that re-running the cell gives the
# same scores.
rf = RandomForestClassifier(n_estimators=100, max_depth=15, criterion='gini', random_state=10)

# 5-fold cross-validation on the training split; the cell shows the mean fold score.
predicted = cross_validate(rf, XI_train, yi_train, cv=5)
np.mean(predicted['test_score'])


0.6488586479367866

In [67]:
# Fit on the whole training split, then report the score on the held-out test split.
rf.fit(XI_train, yi_train).score(XI_test, yi_test)

0.5176470588235295