In [1]:
%matplotlib inline

import json
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
import gensim.downloader as api

info = api.info()  # show info about available models/datasets
model = api.load("glove-twitter-25")  # download the model and return as object ready for use

nlp = spacy.load('en')

Read the Readme here:
https://github.com/RaRe-Technologies/gensim-data

# Getting the dataset

We will use here the dataset provided by the authors we will create a helper function to extract the features that we want.

In [46]:
data = pd.read_json("diplomacy_data/diplomacy_data.json")
data

Unnamed: 0,seasons,game,betrayal,idx,people
0,"[{'season': 1906.5, 'interaction': {'victim': ...",74,True,0,AT
1,"[{'season': 1911.5, 'interaction': {'victim': ...",165,False,1,EG
2,"[{'season': 1911.0, 'interaction': {'victim': ...",157,False,2,AR
3,"[{'season': 1902.0, 'interaction': {'victim': ...",58,False,3,AR
4,"[{'season': 1910.0, 'interaction': {'victim': ...",45,False,4,IT
...,...,...,...,...,...
495,"[{'season': 1908.0, 'interaction': {'victim': ...",197,False,495,RT
496,"[{'season': 1911.5, 'interaction': {'victim': ...",207,True,496,AR
497,"[{'season': 1905.5, 'interaction': {'victim': ...",158,True,497,RE
498,"[{'season': 1903.0, 'interaction': {'victim': ...",252,True,498,GE


In [61]:
data['seasons']

0      [{'season': 1906.5, 'interaction': {'victim': ...
1      [{'season': 1911.5, 'interaction': {'victim': ...
2      [{'season': 1911.0, 'interaction': {'victim': ...
3      [{'season': 1902.0, 'interaction': {'victim': ...
4      [{'season': 1910.0, 'interaction': {'victim': ...
                             ...                        
495    [{'season': 1908.0, 'interaction': {'victim': ...
496    [{'season': 1911.5, 'interaction': {'victim': ...
497    [{'season': 1905.5, 'interaction': {'victim': ...
498    [{'season': 1903.0, 'interaction': {'victim': ...
499    [{'season': 1903.5, 'interaction': {'victim': ...
Name: seasons, Length: 500, dtype: object

In [9]:
def last_support(entry):
    """
    This function returns the last season of friendship. The code is inspired by the provided code from
    the authors
    """
    last_support = None
    for season in entry[:-1]:
        if 'support' in season['interaction'].values():
            last_support = season['season']
    return last_support

def treat_msg_season(df):
    """
    This function loops over the whole dataset and creates a dictionnary with the set of features for each season 
    with its associated boolean (betrayal or not )
    """
    data_victim = {'features':[], 'betrayed':[]} # data of the (potential) victim 
    data_betrayer = {'features':[], 'betrayed':[]} # data of the (potential) betrayer
    for i in range(len(df.seasons.values)):
        entry = df['seasons'][i] # pick each entry
        for j in range(len(entry)): # pick each season
            season = entry[j]
            tab_vi = []
            tab_be = []
            if season['season'] <= last_support(entry): # check if the season is below the last season of friendship
                tab_vi.append(season['messages']['victim'])
                tab_be.append(season['messages']['betrayer'])
                if len(tab_be) != 0 and len(tab_vi) != 0: # keep only cases where both players have sent messages
                    data_victim['features'].append(tab_vi)
                    data_victim['betrayed'].append(df.betrayal.values[i])
                    data_betrayer['features'].append(tab_be)   
                    data_betrayer['betrayed'].append(df.betrayal.values[i])
    return data_victim, data_betrayer

data_victim, data_betrayer = treat_msg_season(data)

In [92]:
def to_dict(message):
    sentiment_positive = message['sentiment']['positive']
    sentiment_neutral = message['sentiment']['neutral']
    sentiment_negative = message['sentiment']['negative']
    n_requests = message['n_requests']
    frequent_words = message['frequent_words']
    n_words = message['n_words']
    politeness = message['politeness']
    n_sentences = message['n_sentences']
    return {"sentiment_positive": sentiment_positive,
           "sentiment_neutral": sentiment_neutral,
           'sentiment_negative': sentiment_negative,
           'n_requests': n_requests,
           'frequent_words': frequent_words,
           'n_words': n_words,
           'politeness': politeness,
           'n_sentences': n_sentences}
    


def preprocessing(df):
    result = []
    for row in df.iterrows():
        row = row[1]
        betrayal = row['betrayal']
        idx = row['idx']
        for season in row['seasons']:
            s = season['season']
            if s < last_support(row['seasons']):
                interaction_vitim = season['interaction']['victim']
                interaction_betrayer = season ['interaction']['betrayer']
                for m_vic in season['messages']['victim']:
                    data = to_dict(m_vic)
                    data['role'] = 'victim'
                    data['season'] = s
                    data['betrayal'] = betrayal
                    data['idx'] = idx
                    result.append(data)
                for m_bet in season['messages']['betrayer']:
                    data = to_dict(m_bet)
                    data['role'] = 'betrayer'
                    data['season'] = s
                    data['betrayal'] = betrayal
                    data['idx'] = idx
                    result.append(data)
                            
    return pd.DataFrame(result).set_index(['idx', 'season'])

In [95]:
df = preprocessing(data)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,sentiment_positive,sentiment_neutral,sentiment_negative,n_requests,frequent_words,n_words,politeness,n_sentences,role,betrayal
idx,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1906.5,0,0,2,1,"[just, bot, ,, ., take, unit, war, retreat, di...",35,0.367200,2,victim,True
0,1906.5,1,1,4,2,"[armies, north, the, armies, on, ., your, with...",77,0.932326,6,victim,True
0,1906.5,1,2,1,2,"[?, going, for, ser, balance, a, to, of, give,...",55,0.983373,4,victim,True
0,1906.5,4,2,13,8,"[only, he, alb, ., forced, italy's, is, be, .,...",313,0.957072,19,victim,True
0,1906.5,1,3,5,7,"[more, let, keep, we, side, we, don't, to, ., ...",146,0.832023,9,betrayer,True
...,...,...,...,...,...,...,...,...,...,...,...
499,1904.0,0,3,0,0,"[he, on, please, kaiser, italy, ..., ., move, ...",12,0.323602,3,victim,True
499,1904.0,1,0,2,2,"[request, can, as, save, i, ., your, consider,...",32,0.777903,3,betrayer,True
499,1904.0,0,0,1,1,"[do, ?, the, ,, things, thing, ,, taking, both...",22,0.599901,1,betrayer,True
499,1904.0,1,2,0,0,"[!!, !, and, dmz, win, agreed, ok, ,, you, all...",13,0.389337,3,betrayer,True


In [4]:
def get_nb_msg(data):
    """
    Get the mean number of messages sent per season
    """
    tab = []
    for features in data["features"]:
        tab.append(len(features[0]))
    return tab

print("In each season, potential betrayers send in average {}, with a maximum of {} messages".format(np.mean(get_nb_msg(data_betrayer)), np.max(get_nb_msg(data_betrayer))))
print("In each season, potential victims send in average {}, with a maximum of {} messages".format(np.mean(get_nb_msg(data_victim)), np.max(get_nb_msg(data_victim))))

In each season, potential betrayers send in average 1.627498001598721, with a maximum of 38 messages
In each season, potential victims send in average 1.515587529976019, with a maximum of 28 messages


In [23]:
def get_lexicon_words(entry):
    """
    get the set of lexicon words for each entry of the dataset
    1 entry = 1 row = 1 set of messages
    Can be improved
    """
    for entries in entry[0]: #loop over the messages
        # get the lexicon words
        di_words = entries["lexicon_words"]
        tab_words = []
        for key in di_words:
            tab = di_words[key]
            for words in tab:
                word = words.split(' ')
                for w in word:
                    if w not in tab_words:
                        tab_words.append(w)
    return tab_words

test = get_lexicon_words(data_victim["features"][-1])


[{'sentiment': {'positive': 0, 'neutral': 0, 'negative': 2}, 'lexicon_words': {'allsubj': ['just', 'war', 'prefer', 'really', 'light', 'retreat'], 'premise': ['in light of']}, 'n_requests': 1, 'frequent_words': ['just', 'bot', ',', '.', 'take', 'unit', 'war', 'retreat', "didn't", 'to', 'turn', 'really', 'mos', 'from', '.', ',', 'nwy', 'prefer', 'using', 'stp', 'if', 'that', 'of', 'i', 'can', 'and', 'me', 'in', "i'd", 'your', 'this'], 'n_words': 35, 'politeness': 0.36720018621437905, 'n_sentences': 2}, {'sentiment': {'positive': 1, 'neutral': 1, 'negative': 4}, 'lexicon_words': {'allsubj': ['against', 'lose', 'even', 'loss', 'support', 'attack', 'good', 'will'], 'disc_expansion': ['later', 'next'], 'disc_comparison': ['after'], 'disc_temporal_future': ['next', 'after', 'later'], 'premise': ['for', 'for']}, 'n_requests': 2, 'frequent_words': ['armies', 'north', 'the', 'armies', 'on', '.', 'your', 'with', 'mos', 'we', 'against', 'the', ',', '.', "i'll", 'be', ',', 'after', 'sounds', '.', 

TypeError: list indices must be integers or slices, not str

# NLP pipeline

### stopwords removal

In [None]:
with codecs.open("helpers/stopwords.txt", encoding='utf-8') as h:
    stopwords = h.read().split('\n')

# Spacy & Glove word embedding

Here we use the spacy library to compute the embeddings. We have to try also with Glove and Word2Vec

In [6]:
from scipy.spatial import distance

def get_word_embedding(word):
    return nlp(word).vector

def get_word_embedding_model(model, word):
    return model[word]

print("Word :{} , embedding : {}".format(test[1], get_word_embedding_model(model, test[1])))
#model.most_similar("man")

Word :still , embedding : [-0.22114   0.67529   0.59344  -1.0235   -0.6578    0.64357   1.6461
 -0.13754  -0.20652   0.41388  -0.12224   0.92581  -5.0168   -0.11061
  0.034176  0.35356   0.027989 -0.55968  -0.2286   -0.79967   0.58868
  0.56942   0.29349   0.3104   -0.50146 ]


# Word2Vec embedding