# 1. Data reading and preprocessing
 

## 1.1 Initial setup

In [None]:
# Package installs
# !pip install unidecode
# !pip install pytorch-pretrained-bert
# !pip install pytorch-transformers
# !pip install politenessr
# !pip install stanza

In [2]:
import json
import requests
import nltk
import pandas as pd
import pickle
import stanza as stanza

from nltk.tokenize.punkt import PunktSentenceTokenizer

from tqdm import tqdm
from politenessr import Politenessr

In [3]:
# Downloads: fetch the model/resource files used by the cells below.
# Stanza: default packages for English.
stanza.download('en')
# NLTK: the 'punkt' package. As the last expression in the cell, the return
# value of this call is what the cell displays.
nltk.download('punkt')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 18.7MB/s]                    
2020-12-09 21:10:47 INFO: Downloading default packages for language: en (English)...
12/09/2020 21:10:47 - INFO - stanza -   Downloading default packages for language: en (English)...
2020-12-09 21:10:48 INFO: File exists: /home/david/stanza_resources/en/default.zip.
12/09/2020 21:10:48 - INFO - stanza -   File exists: /home/david/stanza_resources/en/default.zip.
2020-12-09 21:10:51 INFO: Finished downloading models and saved to /home/david/stanza_resources.
12/09/2020 21:10:51 - INFO - stanza -   Finished downloading models and saved to /home/david/stanza_resources.
[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Combine all datasets into one here
def load_dataset():
    """Download the train, validation and test splits and merge them.

    Each split is a JSON-lines file; every non-blank line is parsed with
    ``json.loads``.

    Returns:
        list: the parsed records of all splits, concatenated in the order
        train, validation, test.

    Raises:
        requests.HTTPError: if a split is answered with an HTTP error status.
        requests.RequestException: on connection problems or timeout.
    """
    base_url = "https://raw.githubusercontent.com/DenisPeskov/2020_acl_diplomacy/master/data/"

    deception = []
    for split in ("train", "validation", "test"):
        response = requests.get(f"{base_url}{split}.jsonl", timeout=60)
        # Fail loudly instead of trying to JSON-decode an error page.
        response.raise_for_status()
        deception.extend(
            json.loads(jline)
            for jline in response.content.splitlines()
            if jline.strip()  # tolerate blank lines (e.g. a trailing newline)
        )

    return deception

In [5]:
# Fetch the merged dataset and report how many records it contains.
deception = load_dataset()
print(len(deception))

# Keep a copy of the raw records, before politeness/sentiment features are added.
with open('deception_without_politeness_sentiment.pkl', mode='wb') as pickle_file:
    pickle.dump(deception, pickle_file)

252


In [6]:
def flatten_deception(dataset):
    """Flatten per-game records into a single column-oriented dict.

    Args:
        dataset: iterable of dicts, one per game. Each game must have a
            ``"messages"`` list; its length defines how many rows the game
            contributes. ``"game_id"`` (a scalar) and ``"players"`` (a
            sequence of strings, joined with ``","``) are repeated once per
            message. Every other value is expected to be a per-message
            sequence and is concatenated as-is.

    Returns:
        dict: maps each key to one flat list covering all games, in input
        order. The input games are left untouched.
    """
    flat_data = {}
    for game in dataset:
        game_length = len(game["messages"])
        for k, v in game.items():
            if k == "game_id":
                v = [v] * game_length
            elif k == "players":
                v = [",".join(v)] * game_length

            # Always extend a list owned by flat_data. Storing `v` itself on
            # first sight would alias the first game's lists, so later
            # extends would silently mutate the caller's dataset.
            flat_data.setdefault(k, []).extend(v)

    return flat_data

# Build the DataFrame from the flattened dict: each key becomes a column.
df = pd.DataFrame.from_dict(flatten_deception(deception))

In [7]:
# Sanity check: (number of rows, number of columns) of the flattened frame.
df.shape

(17289, 13)

## 1.2 Politeness

The Stanford Politeness Classifier used in "Linguistic Harbingers of Betrayal" is no longer well maintained, and reusing it is very difficult nowadays (even in a Python 2 environment). Since we could not load and use it for our analysis, we decided to use an alternative classifier from the Politenessr package instead. This classifier was fine-tuned by Prof. [David Jurgens from the University of Michigan](https://jurgens.people.si.umich.edu/) and is based on the pre-trained [`bert-base-cased`](https://huggingface.co/transformers/pretrained_models.html) model distributed by Hugging Face. You might need a GPU to run this classifier.

In [8]:
# Instantiate the Politenessr classifier.
pr = Politenessr()
# Score all messages in one call; the messages column is passed as a plain list.
# NOTE(review): presumably returns one score per message, in input order — confirm
# against the Politenessr documentation.
politeness = pr.predict(list(df['messages']))
# Store the scores as a new column alongside the messages.
df['politeness'] = politeness

12/09/2020 21:10:55 - INFO - pytorch_transformers.modeling_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at /home/david/.cache/torch/pytorch_transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391
12/09/2020 21:10:55 - INFO - pytorch_transformers.modeling_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torchscript": false,
  "type_voca

## 1.3 Sentiment analysis


For sentiment analysis, we reuse the Stanford sentiment analyzer used by the authors of "Linguistic Harbingers of Betrayal", accessed through the Stanza package.

In [9]:
# Stanza pipeline: tokenization (which also splits sentences) followed by
# per-sentence sentiment classification.
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

# For each message, the fraction of its sentences falling in each sentiment class.
negative_sentiments = []
neutral_sentiments = []
positive_sentiments = []

for message in tqdm(df['messages']):
    doc = nlp(message)
    n_sentences = len(doc.sentences)

    # counts[label] = number of sentences with that sentiment label; labels
    # 0 / 1 / 2 are stored below as negative / neutral / positive.
    counts = [0, 0, 0]
    for sentence in doc.sentences:
        counts[sentence.sentiment] += 1

    # Normalise once per message (not once per sentence); a message without
    # any sentence gets all-zero fractions.
    if n_sentences > 0:
        normalized_counts = [count / n_sentences for count in counts]
    else:
        normalized_counts = [0, 0, 0]

    negative_sentiments.append(normalized_counts[0])
    neutral_sentiments.append(normalized_counts[1])
    positive_sentiments.append(normalized_counts[2])

df["negative_sentiment"] = negative_sentiments
df["neutral_sentiment"] = neutral_sentiments
df["positive_sentiment"] = positive_sentiments

2020-12-09 21:14:09 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| sentiment | sstplus |

12/09/2020 21:14:09 - INFO - stanza -   Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| sentiment | sstplus |

2020-12-09 21:14:09 INFO: Use device: gpu
12/09/2020 21:14:09 - INFO - stanza -   Use device: gpu
2020-12-09 21:14:09 INFO: Loading: tokenize
12/09/2020 21:14:09 - INFO - stanza -   Loading: tokenize
2020-12-09 21:14:09 INFO: Loading: sentiment
12/09/2020 21:14:09 - INFO - stanza -   Loading: sentiment
2020-12-09 21:14:10 INFO: Done loading processors!
12/09/2020 21:14:10 - INFO - stanza -   Done loading processors!
100%|██████████| 17289/17289 [01:33<00:00, 185.63it/s]


In addition, we add the compound sentiment score provided by [VADER](http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf), which could prove useful for some analyses.

In [10]:
# Optional: VADER sentiment analysis
# The lexicon must be available before the analyzer can be instantiated.
nltk.download("vader_lexicon")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Keep only the "compound" entry of each message's polarity scores.
vader_score = [
    sid.polarity_scores(text)["compound"]
    for text in tqdm(df['messages'])
]

df["vader_score"] = vader_score

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/david/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
100%|██████████| 17289/17289 [00:02<00:00, 6774.79it/s]


## 1.4 Saving

In [12]:
# Eyeball five randomly chosen rows (no seed is set, so the selection differs per run).
df.sample(5)

Unnamed: 0,messages,sender_labels,receiver_labels,speakers,receivers,absolute_message_index,relative_message_index,seasons,years,game_score,game_score_delta,players,game_id,politeness,negative_sentiment,neutral_sentiment,positive_sentiment,vader_score
7985,Do you still plan to go for Rum?,True,True,turkey,austria,167,7,Fall,1901,3,0,"austria,turkey",5,3.206943,0.0,1.0,0.0,0.0
12606,I have briefly talked to both Italy and Austri...,True,True,germany,england,92,10,Spring,1901,3,0,"germany,england",10,3.40771,0.0,1.0,0.0,0.7506
15806,How are your relations with Russia and France?,True,True,italy,england,1101,9,Fall,1904,4,-2,"italy,england",4,3.266819,0.0,1.0,0.0,0.0
17114,Literally just to take Belgium- after that it ...,True,True,france,england,809,149,Fall,1903,5,1,"england,france",4,3.318613,0.0,1.0,0.0,0.4019
6327,"But yes, mostly true!",True,True,italy,england,241,46,Fall,1901,3,0,"italy,england",3,3.509423,0.0,0.0,1.0,0.6996


In [13]:
# Persist the DataFrame, including the politeness and sentiment columns added above.
df.to_pickle("deception_df.pkl")