In [1]:
%cd /Users/georgi/dev/dialogue_modeling

/Users/georgi/dev/dialogue_modeling


In [2]:
from collections import defaultdict

In [3]:
from supporting_classifiers.agreement_classifier import *
from solution_tracker.simple_sol import solution_tracker, process_raw_to_solution_tracker
import spacy
import string
from read_data import read_solution_annotaions, read_wason_dump, read_3_lvl_annotation_file
import pandas as pd
from featurisers.raw_wason_featuriser import get_y

In [4]:
raw_data = read_wason_dump('data/all/')


In [5]:
conv = raw_data[3].raw_db_conversation

In [6]:
conv

[{'message_id': '48e5ef64-7e50-4f92-a19e-200b19539ca1',
  'user_name': 'SYSTEM',
  'user_id': '-1',
  'message_type': 'WASON_INITIAL',
  'content': [{'value': 'G', 'checked': False},
   {'value': '2', 'checked': False},
   {'value': '7', 'checked': False},
   {'value': 'A', 'checked': False}],
  'user_status': 'UKN',
  'timestamp': '2020-08-19 18:22:18.553825+00:00',
  'user_type': 'participant'},
 {'message_id': 'f993decc-a252-448c-8e6a-ce23da89d84f',
  'user_name': 'Bat',
  'user_id': 'f72989875dbe4a63a214055059ca0fc4',
  'message_type': 'JOIN_ROOM',
  'content': '',
  'user_status': 'USR_ONBOARDING',
  'timestamp': '2020-08-19 18:22:18.971115+00:00',
  'user_type': 'participant'},
 {'message_id': 'acbee860-1734-448d-8ba7-9dcc786fa9a0',
  'user_name': 'Hedgehog',
  'user_id': '3846adbce13f46f987bf284ce3dad593',
  'message_type': 'JOIN_ROOM',
  'content': '',
  'user_status': 'USR_ONBOARDING',
  'timestamp': '2020-08-19 18:22:36.466840+00:00',
  'user_type': 'participant'},
 {'message

In [7]:
allowed = {
    'vowels': {'A', 'O', 'U', 'E', 'I', 'Y'},
    'consonants': {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'X', 'Z', 'W'},
    'odds': {'1', '3', '5', '7', '9'},
    'evens': {'0', '2', '4', '6', '8'}
}
def process_solution(solution):
    res = set()
    for item in solution:
        for a_k, a_v in allowed.items():
            if item['checked'] and item['value'] in a_v:
                res.update(a_k[0])
    return res

In [8]:
def process_conversation(conversation):
    user_map = defaultdict(lambda: [])
    conversation_map = []
    for item in conversation:
        if item['message_type'] == 'WASON_SUBMIT':
            if item['user_status'] == 'USR_ONBOARDING':
                user_map[item['user_name']] = [process_solution(item['content'])]
            else:
                user_map[item['user_name']].append(process_solution(item['content']))
                conversation_map.append(process_solution(item['content']))
    return user_map, conversation_map

In [9]:
all_conv_stats_per_user = []
all_conv_stats_per_game = []

for item in raw_data:
    user, conv_m = process_conversation(item.raw_db_conversation)
    all_conv_stats_per_user.append(user)
    all_conv_stats_per_game.append(conv_m)

In [10]:
total_submissions = 0
total_users = 0
total_convs = 0
for item in all_conv_stats_per_user:
    total_convs += 1
    total_users += len(item)
    for subm in item:
        total_submissions += len(subm)

In [11]:
total_submissions / total_users

5.475

In [12]:
total_submissions / total_convs

17.139130434782608

In [13]:
len([a for s in all_conv_stats_per_game for a in s]) / len(all_conv_stats_per_game)

3.121739130434783

## Calculate probabilities

In [14]:
from collections import Counter

In [15]:
# BIGRAMS - per user submissions

In [16]:
prob_counter = Counter()
total_pairs = 0
for conv in all_conv_stats_per_user:
    for user, elements in conv.items():
        els = ["".join(e) for e in elements]
        processed = ['BEGIN', *els, "END"]
        for pr, current in zip(processed[:-1], processed[1:]):
            total_pairs += 1
            prob_counter.update(["{}->{}".format(pr, current)])

In [17]:
prob_counter.most_common(50)

[('BEGIN->ev', 119),
 ('ev->END', 119),
 ('ov->END', 107),
 ('BEGIN->v', 68),
 ('v->END', 62),
 ('ev->ev', 53),
 ('BEGIN->ov', 41),
 ('ov->ov', 33),
 ('v->v', 28),
 ('v->ev', 24),
 ('ev->ov', 22),
 ('eov->END', 22),
 ('BEGIN->c', 16),
 ('BEGIN->eov', 16),
 ('ev->v', 15),
 ('BEGIN->e', 15),
 ('v->ov', 14),
 ('BEGIN->eocv', 12),
 ('eocv->END', 12),
 ('BEGIN->ec', 11),
 ('BEGIN->o', 11),
 ('eov->ov', 11),
 ('ev->eov', 9),
 ('BEGIN->co', 9),
 ('e->END', 9),
 ('BEGIN->oecv', 8),
 ('e->ev', 8),
 ('c->ev', 8),
 ('BEGIN->cv', 8),
 ('BEGIN->eo', 6),
 ('ec->END', 5),
 ('ocev->END', 5),
 ('ec->ov', 4),
 ('eocv->ev', 4),
 ('eov->eov', 4),
 ('ev->ocev', 4),
 ('oecv->ov', 4),
 ('c->v', 4),
 ('BEGIN->', 4),
 ('->v', 4),
 ('ov->ev', 4),
 ('e->e', 4),
 ('ev->eocv', 4),
 ('cv->ev', 4),
 ('ec->v', 3),
 ('cv->v', 3),
 ('eocv->eocv', 3),
 ('o->ov', 3),
 ('BEGIN->ecv', 3),
 ('co->END', 3)]

In [18]:
# Bigrams - per game submissions

In [19]:
all_conv_stats_per_game

[[{'o', 'v'}],
 [{'o', 'v'}, {'o', 'v'}],
 [{'v'}, {'e', 'v'}, {'e', 'v'}, {'v'}, {'v'}, {'v'}],
 [{'e'}, {'e', 'v'}, {'o'}, {'e', 'o'}, {'e', 'v'}, {'e', 'v'}, {'e', 'v'}],
 [{'v'}, {'v'}, {'v'}],
 [{'v'}, {'o', 'v'}, {'o', 'v'}],
 [{'e', 'v'}, {'e', 'v'}, {'e', 'v'}],
 [{'o', 'v'}, {'o', 'v'}, {'o', 'v'}],
 [{'c', 'e', 'o', 'v'}, {'e', 'o', 'v'}, {'e', 'o', 'v'}, {'e', 'o', 'v'}],
 [{'e', 'o', 'v'}, {'e', 'o', 'v'}],
 [{'o', 'v'}, {'o', 'v'}, {'o', 'v'}, {'o', 'v'}],
 [{'e', 'v'},
  {'c', 'e', 'o', 'v'},
  {'c', 'e', 'o', 'v'},
  {'c', 'e', 'o', 'v'}],
 [{'o', 'v'}, {'o', 'v'}, {'o', 'v'}, {'o', 'v'}],
 [{'o', 'v'}, {'o', 'v'}, {'o', 'v'}],
 [{'v'}, {'v'}, {'v'}],
 [{'v'}, {'v'}, {'v'}],
 [{'e', 'o', 'v'}, {'e', 'o', 'v'}, {'e', 'o', 'v'}],
 [{'v'}, {'v'}, {'v'}],
 [{'o', 'v'}, {'o', 'v'}],
 [{'e', 'v'}, {'e', 'v'}, {'e', 'v'}, {'e', 'v'}],
 [{'e', 'v'}, {'e', 'v'}],
 [{'c', 'e', 'o', 'v'}, {'e', 'o', 'v'}, {'c', 'e', 'o', 'v'}],
 [{'e', 'v'}, {'e', 'v'}, {'e', 'v'}],
 [{'v'}, {'v'},

In [20]:
prob_counter_per_game = Counter()
total_pairs_per_game = 0
for conv in all_conv_stats_per_game:
    els = ["".join(e) for e in conv]
    processed = ['BEGIN', *els, "END"]
    for pr, current in zip(processed[:-1], processed[1:]):
        total_pairs += 1
        prob_counter_per_game.update(["{}->{}".format(pr, current)])

In [21]:
prob_counter_per_game.most_common(30)

[('ov->ov', 55),
 ('ev->ev', 55),
 ('ov->END', 41),
 ('ev->END', 34),
 ('BEGIN->ev', 34),
 ('BEGIN->ov', 31),
 ('v->v', 30),
 ('BEGIN->v', 23),
 ('v->END', 19),
 ('eov->eov', 11),
 ('v->ev', 10),
 ('BEGIN->e', 8),
 ('ev->ov', 8),
 ('ev->v', 7),
 ('v->ov', 6),
 ('eov->END', 6),
 ('BEGIN->eov', 6),
 ('eocv->eocv', 6),
 ('e->ev', 5),
 ('ov->ev', 4),
 ('ocev->ocev', 3),
 ('eocv->END', 3),
 ('BEGIN->coev', 2),
 ('coev->eov', 2),
 ('ev->ocev', 2),
 ('ocev->END', 2),
 ('BEGIN->eo', 2),
 ('eov->ev', 2),
 ('eov->ov', 2),
 ('ec->ec', 2)]

In [22]:
# Trigrams per user:

In [23]:
prob_counter_trigrams = Counter()
total_triples = 0
for conv in all_conv_stats_per_user:
    for user, elements in conv.items():
        els = ["".join(e) for e in elements]
        processed = ['BEGIN', *els, "END"]
        for pr, current, next_ in zip(processed[:-2], processed[1:-1], processed[2:]):
            total_triples += 1
            prob_counter_trigrams.update(["{}->{}->{}".format(pr, current, next_)])

In [24]:
prob_counter_trigrams.most_common(50)

[('BEGIN->ev->ev', 51),
 ('ev->ev->END', 50),
 ('ov->ov->END', 32),
 ('BEGIN->ov->ov', 29),
 ('BEGIN->v->v', 24),
 ('ev->ov->END', 22),
 ('v->v->END', 22),
 ('v->ev->END', 20),
 ('BEGIN->ev->END', 19),
 ('BEGIN->ev->ov', 19),
 ('BEGIN->v->ev', 15),
 ('v->ov->END', 12),
 ('BEGIN->v->ov', 12),
 ('ev->v->END', 11),
 ('BEGIN->v->END', 11),
 ('BEGIN->ev->v', 11),
 ('eov->ov->END', 11),
 ('BEGIN->eov->ov', 10),
 ('c->ev->END', 8),
 ('BEGIN->ev->eov', 8),
 ('BEGIN->c->ev', 7),
 ('ev->eov->END', 7),
 ('BEGIN->ov->END', 6),
 ('BEGIN->ec->ov', 4),
 ('ec->ov->END', 4),
 ('BEGIN->eocv->ev', 4),
 ('BEGIN->eov->eov', 4),
 ('BEGIN->e->ev', 4),
 ('ev->ocev->END', 4),
 ('BEGIN->oecv->ov', 4),
 ('oecv->ov->END', 4),
 ('BEGIN->c->v', 4),
 ('BEGIN->->v', 4),
 ('e->ev->END', 4),
 ('BEGIN->ov->ev', 4),
 ('ov->ev->END', 4),
 ('BEGIN->e->e', 4),
 ('BEGIN->e->END', 4),
 ('ev->eocv->END', 4),
 ('BEGIN->ec->v', 3),
 ('ec->v->END', 3),
 ('eocv->ev->END', 3),
 ('eov->eov->END', 3),
 ('c->v->END', 3),
 ('cv->v->END

In [25]:
#Trigrams per game:

In [26]:
prob_counter_per_game_tri = Counter()
total_pairs_per_game_tri = 0
for conv in all_conv_stats_per_game:
    els = ["".join(e) for e in conv]
    processed = ['BEGIN', *els, "END"]
    for pr, current, next_ in zip(processed[:-2], processed[1:-1], processed[2:]):
        total_pairs += 1
        prob_counter_per_game_tri.update(["{}->{}->{}".format(pr, current, next_)])

In [27]:
prob_counter_per_game_tri.most_common(50)

[('ov->ov->END', 32),
 ('ev->ev->END', 27),
 ('BEGIN->ov->ov', 26),
 ('ev->ev->ev', 23),
 ('ov->ov->ov', 21),
 ('BEGIN->ev->ev', 20),
 ('BEGIN->v->v', 15),
 ('v->v->END', 14),
 ('v->v->v', 13),
 ('eov->eov->END', 6),
 ('BEGIN->e->ev', 5),
 ('eov->eov->eov', 5),
 ('BEGIN->ev->ov', 5),
 ('BEGIN->v->ev', 4),
 ('v->ev->ev', 4),
 ('BEGIN->eov->eov', 4),
 ('ev->ov->ov', 4),
 ('BEGIN->v->ov', 3),
 ('v->ov->ov', 3),
 ('BEGIN->ev->v', 3),
 ('ev->v->END', 3),
 ('eocv->eocv->eocv', 3),
 ('eocv->eocv->END', 3),
 ('ev->v->ev', 3),
 ('v->v->ev', 3),
 ('v->ov->END', 3),
 ('BEGIN->ov->END', 2),
 ('ev->ev->v', 2),
 ('BEGIN->coev->eov', 2),
 ('ev->ocev->ocev', 2),
 ('ocev->ocev->END', 2),
 ('e->ev->ev', 2),
 ('v->ev->END', 2),
 ('ev->ov->ev', 2),
 ('ov->ev->END', 2),
 ('eov->ev->ev', 2),
 ('v->ev->v', 2),
 ('BEGIN->e->v', 2),
 ('BEGIN->ev->eov', 2),
 ('ev->v->v', 1),
 ('e->ev->o', 1),
 ('ev->o->eo', 1),
 ('o->eo->ev', 1),
 ('eo->ev->ev', 1),
 ('coev->eov->eov', 1),
 ('BEGIN->ev->ocev', 1),
 ('ocev->ocev

# Solution Tracker

In [28]:
from featurisers.raw_wason_featuriser import calculate_stats, preprocess_conversation_dump
from solution_tracker.augment_with_solution import merge_with_solution_raw, merge_with_solution_annotation_message_level

In [29]:
nlp = spacy.load("en_core_web_sm")

In [37]:
def process_solution_simple(solution):
    res = set()
    for item in solution:
        for a_k, a_v in allowed.items():
            if item in a_v:
                res.update(a_k[0])
    return res

{'o', 'v'}

In [41]:
solutions_per_dialogue = []
for item in raw_data:
    prepr = preprocess_conversation_dump(item.raw_db_conversation)    
    item.wason_messages_from_raw()
    item.preprocess_everything(nlp)
    agreement_predictor = Predictor('models/agreement.pkl')
    sol_tracker = solution_tracker(item, False, agreement_predictor)
    
    res = []
    for m in sol_tracker:
        res.append(process_solution_simple(m['value']))
    solutions_per_dialogue.append(res)

In [42]:
solutions_per_dialogue

[[{'o', 'v'},
  {'e', 'v'},
  {'e', 'v'},
  {'o', 'v'},
  {'v'},
  {'o'},
  {'e'},
  {'c'},
  {'o', 'v'}],
 [{'o', 'v'},
  {'c', 'e'},
  {'e', 'v'},
  {'c', 'e'},
  {'o', 'v'},
  {'c', 'e', 'o', 'v'},
  {'e', 'o', 'v'},
  {'e'},
  {'v'},
  {'o', 'v'},
  {'o', 'v'},
  {'o', 'v'},
  {'o', 'v'}],
 [{'e', 'v'},
  {'c', 'e', 'o', 'v'},
  {'e', 'v'},
  {'v'},
  {'e', 'v'},
  {'v'},
  {'e'},
  {'e'},
  {'o'},
  {'c'},
  {'e', 'v'},
  {'v'},
  {'e', 'v'},
  {'e', 'v'},
  {'e'},
  {'c', 'e', 'o', 'v'},
  {'v'},
  {'v'},
  {'v'}],
 [{'e', 'v'},
  {'o'},
  {'e', 'v'},
  {'c'},
  {'e', 'v'},
  {'e', 'v'},
  {'e'},
  {'e'},
  {'e', 'v'},
  {'o'},
  {'o'},
  {'o'},
  {'e', 'o'},
  {'c'},
  {'c', 'v'},
  {'e'},
  {'o'},
  {'v'},
  {'e', 'v'},
  {'e', 'v'},
  {'e', 'v'},
  {'e', 'v'},
  {'e', 'v'}],
 [{'c', 'e', 'o', 'v'},
  {'c', 'e'},
  {'v'},
  {'c', 'e'},
  {'v'},
  {'e', 'v'},
  {'c', 'e', 'v'},
  {'e'},
  {'c', 'e'},
  {'o', 'v'},
  {'v'},
  {'v'},
  {'e'},
  {'e'},
  {'c', 'e'},
  {'e', 'v'},
 

In [56]:
solution_tracker_bigrams = Counter()
total_pairs = 0
for conv in solutions_per_dialogue:
    elements = list(conv)
    els = ["".join(e) for e in elements]
    processed = ['BEGIN', *els, "END"]
    for pr, current in zip(processed[:-1], processed[1:]):
        total_pairs += 1
        solution_tracker_bigrams.update(["{}->{}".format(pr, current)])

In [65]:
solution_tracker_bigrams.most_common(1000)

[('v->v', 168),
 ('ev->ev', 157),
 ('ov->ov', 116),
 ('v->ev', 80),
 ('ev->v', 64),
 ('e->v', 40),
 ('e->e', 40),
 ('BEGIN->ev', 39),
 ('ov->END', 38),
 ('ev->END', 33),
 ('oecv->oecv', 33),
 ('ov->v', 32),
 ('v->e', 32),
 ('ocev->ocev', 31),
 ('ev->ov', 29),
 ('ev->oecv', 29),
 ('v->ov', 28),
 ('o->o', 28),
 ('ev->e', 27),
 ('ov->ev', 26),
 ('v->o', 26),
 ('oecv->ev', 25),
 ('e->ov', 24),
 ('c->ev', 23),
 ('ev->o', 22),
 ('BEGIN->v', 20),
 ('c->v', 20),
 ('v->oecv', 19),
 ('o->ov', 19),
 ('oecv->v', 17),
 ('v->END', 17),
 ('c->c', 17),
 ('eov->eov', 17),
 ('o->v', 16),
 ('c->ov', 15),
 ('ev->c', 15),
 ('e->ev', 15),
 ('e->c', 14),
 ('o->ev', 14),
 ('oecv->ov', 14),
 ('oecv->e', 14),
 ('o->e', 13),
 ('v->c', 13),
 ('ov->o', 13),
 ('ov->oecv', 12),
 ('ov->e', 12),
 ('v->ocev', 12),
 ('ocev->v', 12),
 ('ev->ocev', 12),
 ('BEGIN->ov', 11),
 ('ec->v', 11),
 ('v->eov', 11),
 ('o->oecv', 11),
 ('ov->c', 10),
 ('eocv->ev', 10),
 ('e->oecv', 9),
 ('ocev->ov', 9),
 ('oecv->END', 9),
 ('oecv->c'

In [61]:
solution_tracker_trigrams = Counter()
total_pairs = 0
for conv in solutions_per_dialogue:
    elements = list(conv)
    els = ["".join(e) for e in elements]
    processed = ['BEGIN', *els, "END"]
    for pr, current, next_ in zip(processed[:-2], processed[1:-1], processed[2:]):
        total_pairs += 1
        solution_tracker_trigrams.update(["{}->{}->{}".format(pr, current, next_)])

In [62]:
solution_tracker_trigrams.most_common(50)

[('v->v->v', 74),
 ('ev->ev->ev', 65),
 ('ov->ov->ov', 49),
 ('v->ev->ev', 31),
 ('ov->ov->END', 29),
 ('ev->v->ev', 25),
 ('v->v->ev', 22),
 ('v->ev->v', 21),
 ('ev->v->v', 21),
 ('ev->ev->END', 20),
 ('ocev->ocev->ocev', 17),
 ('ev->ev->e', 15),
 ('ev->ev->v', 15),
 ('v->ov->ov', 14),
 ('v->v->END', 14),
 ('e->v->v', 14),
 ('BEGIN->ev->ev', 13),
 ('ev->oecv->ev', 11),
 ('v->v->oecv', 11),
 ('e->ov->ov', 11),
 ('e->e->v', 11),
 ('v->e->v', 11),
 ('ov->ev->ev', 10),
 ('ov->v->v', 10),
 ('o->ov->ov', 10),
 ('oecv->oecv->oecv', 10),
 ('c->v->v', 10),
 ('ev->ev->oecv', 10),
 ('v->e->e', 9),
 ('c->ev->ev', 9),
 ('e->v->ev', 9),
 ('ov->ov->oecv', 9),
 ('e->v->e', 9),
 ('ev->e->e', 8),
 ('o->o->o', 8),
 ('v->o->o', 8),
 ('ov->ov->v', 8),
 ('eov->eov->eov', 8),
 ('ev->ov->ov', 8),
 ('oecv->v->v', 7),
 ('ev->o->ev', 7),
 ('v->v->e', 7),
 ('v->v->ov', 7),
 ('ov->e->ov', 7),
 ('ocev->v->v', 7),
 ('BEGIN->v->ev', 7),
 ('ev->ov->ev', 7),
 ('c->ov->ov', 7),
 ('ev->ev->ocev', 7),
 ('ev->e->v', 7)]