In [1]:
import pathlib as pl
import pandas as pd

import spacy
from spacy import displacy
# from spacy.matcher import Matcher
# from spacy.lang.en import English
from spacy.pipeline import EntityRuler

from read_utils import read_tables, read_network

In [2]:
!python3 -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


## Define paths.

In [3]:
# Inputs.
data_dir = pl.Path('../data')
output_dir = pl.Path('../outputs')
interm_dir = output_dir.joinpath('intermediate')
corpus_dir = interm_dir.joinpath('corpus')
network_file = data_dir.joinpath('metadata/network.json')

# Outputs.
annot_corpus_dir = interm_dir.joinpath('annotated_corpus')

# Create the output directories that do not exist yet.
for d in [annot_corpus_dir]:
    if not d.exists():
        # parents=True: also create missing ancestors (e.g. a fresh
        # '../outputs/intermediate') instead of raising FileNotFoundError.
        # exist_ok=True: tolerate the directory appearing between the
        # exists() check and the mkdir() call.
        d.mkdir(parents=True, exist_ok=True)
        print('Created {}'.format(d))

## Load data.

### Load corpus tables (logs with transcripts).

In [4]:
# Read the transcript tables found under corpus_dir.
# NOTE(review): a later cell indexes the result by team number
# (corpus_dfs[team_no]) - presumably a dict of DataFrames keyed by team;
# confirm against read_utils.read_tables.
corpus_dfs = read_tables(corpus_dir, form='transcript')

Reading transcript files from ../outputs/intermediate/corpus.
transcript 10 files found.
File justhink19_corpus_07 belongs to team  7
File justhink19_corpus_08 belongs to team  8
File justhink19_corpus_09 belongs to team  9
File justhink19_corpus_10 belongs to team 10
File justhink19_corpus_11 belongs to team 11
File justhink19_corpus_17 belongs to team 17
File justhink19_corpus_18 belongs to team 18
File justhink19_corpus_20 belongs to team 20
File justhink19_corpus_28 belongs to team 28
File justhink19_corpus_47 belongs to team 47
Transcript of  7 has 1059 utterances
Transcript of  8 has  932 utterances
Transcript of  9 has 1076 utterances
Transcript of 10 has  769 utterances
Transcript of 11 has  910 utterances
Transcript of 17 has  451 utterances
Transcript of 18 has  506 utterances
Transcript of 20 has  653 utterances
Transcript of 28 has  490 utterances
Transcript of 47 has  642 utterances


### Load the background network.

In [5]:
# Load the background network and report its size.
network = read_network(network_file)
# Report the *file* the network came from: formatting the graph object
# itself rendered as an empty string in the message.
print('Network read from {}: {} nodes, {} edges'.format(
    network_file, network.number_of_nodes(), network.number_of_edges()))

Network read from : 10 nodes, 20 edges


## Prepare parsers and rulers.

### Define intent keywords.

In [6]:
# Keywords that signal an edit intent, grouped by intent label.
# The quoted utterances are examples in which the keyword occurs.
intent_keywords = dict(
    ADD={
        'add',      # "adding zurich to bern ."
        'do',
        'go',
        'put',      # "putting that one there"
        'connect',
        'build',    # "i'll build mount luzern to zermatt"
    },
    REMOVE={
        'remove',
        'delete',   # "okay so delete that ."
        'erase',
        'cut',      # "yeah then cut out mount basel to mount interlaken ."
        'away',     # as in "take away"
        'rub',      # as in "rub that out": "it's 3 francs rub that out" ;
                    # "no wait let me rub that out again ." ; "oh then rub that out"
    },
)

# Show the keyword set of every intent.
for intent_name, keyword_set in intent_keywords.items():
    print(intent_name, keyword_set)

ADD {'add', 'put', 'build', 'connect', 'go', 'do'}
REMOVE {'cut', 'remove', 'rub', 'away', 'delete', 'erase'}


### Define function to recognise intent.

In [7]:
def prepare_ruler(network, intent_keywords):
    """Build a spaCy pipeline that tags nodes, intent keywords and demonstratives.

    Loads "en_core_web_sm" with its "ner" component disabled and adds an
    EntityRuler holding single-token, case-insensitive patterns:

    * one 'NODE' pattern per network node, matching the last word of the
      node's 'label' attribute; the pattern id is the node key as a string,
    * one pattern per intent keyword, with both label and id set to the
      intent label (e.g. 'ADD'),
    * one 'DEM' pattern for each of the demonstratives 'it' and 'that'.

    Args:
        network: graph whose `nodes(data=True)` yields (key, attributes)
            pairs with a 'label' attribute.
        intent_keywords: mapping from intent label to an iterable of
            keyword strings.

    Returns:
        Tuple (nlp, node_ids, intent_ids): the pipeline, the node ids as
        strings, and the intent labels (one entry per keyword, so labels
        repeat).

    NOTE(review): `EntityRuler(nlp)` followed by `nlp.add_pipe(ruler)` is the
    spaCy v2 calling convention; spaCy v3 expects a component name in
    `add_pipe` - confirm the installed version before upgrading.
    """
    nlp = spacy.load("en_core_web_sm", disable=["ner"])
    ruler = EntityRuler(nlp)

    node_ids = list()
    node_patterns = list()
    for u, d in network.nodes(data=True):
        # Only the last word of the label is matched.
        word = d['label'].split()[-1]
        identifier = str(u)
        pattern = {'id': identifier, 'label': 'NODE', "pattern": [
            {'LOWER': word.lower()}]}
        node_patterns.append(pattern)
        node_ids.append(identifier)

    intent_ids = list()
    intent_patterns = list()
    for label, words in intent_keywords.items():
        for word in words:
            identifier = str(label)
            pattern = {'id': identifier, 'label': label, "pattern": [
                {'LOWER': word.lower()}]}
            intent_patterns.append(pattern)
            intent_ids.append(identifier)

    demonstrative_patterns = list()
    for word in ['it', 'that']:
        identifier = 'DEM'
        pattern = {'id': identifier, 'label': 'DEM', "pattern": [
            {'LOWER': word.lower()}]}
        # Collect in the demonstrative list (these used to be appended to
        # node_patterns, which left demonstrative_patterns always empty).
        demonstrative_patterns.append(pattern)

    patterns = [
        *node_patterns,
        *intent_patterns,
        *demonstrative_patterns,
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    return nlp, node_ids, intent_ids


def get_node_ids(text, doc):
    """Return the integer ids of all 'NODE' entities in `doc`, in order.

    `text` is accepted for signature symmetry with the other helpers but is
    not used.
    """
    found = []
    for entity in doc.ents:
        if entity.label_ == 'NODE':
            found.append(int(entity.ent_id_))
    return found


class Edge(object):
    """Undirected edge between two node ids.

    Endpoints are converted with int(); an endpoint may be None, in which
    case it is stored as None. Two edges are equal when they have the same
    endpoints in either order.
    """

    def __init__(self, u, v):
        self.u = int(u) if u is not None else u
        self.v = int(v) if v is not None else v

    def __eq__(self, other):
        if not isinstance(other, Edge):
            # Let Python fall back to its default comparison instead of
            # implicitly returning None.
            return NotImplemented
        return (self.u == other.u and self.v == other.v) \
            or (self.u == other.v and self.v == other.u)

    def __hash__(self):
        # Order-insensitive, so that edges that compare equal (u-v vs v-u)
        # also hash equal, as sets and dict keys require.
        return hash(frozenset((self.u, self.v)))

    def __str__(self):
        return '{}, {}'.format(self.u, self.v)


class Action(object):
    """A named action (e.g. 'ADD', 'REMOVE') with a content object.

    Actions are equal when both name and content are equal, and hash on the
    (name, content) pair.
    """

    def __init__(self, name, content):
        self.name = name
        self.content = content

    def __eq__(self, other):
        if not isinstance(other, Action):
            # Let Python fall back to its default comparison instead of
            # implicitly returning None.
            return NotImplemented
        return self.name == other.name \
            and self.content == other.content

    def partial_equals(self, other):
        """Loosely compare two edge actions.

        Returns False when the names differ. If either edge has an unknown
        (None) second endpoint, the actions match when they share at least
        one known endpoint. Otherwise every endpoint of this edge must
        occur in the other edge.

        Raises:
            AssertionError: if either content is not an Edge.
        """
        assert isinstance(self.content, Edge) and isinstance(
            other.content, Edge), 'not implemented'
        if self.name != other.name:
            return False
        u, v = self.content.u, self.content.v
        ou, ov = other.content.u, other.content.v
        if v is None or ov is None:
            # Compare known endpoints only: two missing endpoints
            # (None == None) must not count as a shared node.
            own = {n for n in (u, v) if n is not None}
            theirs = {n for n in (ou, ov) if n is not None}
            return bool(own & theirs)
        else:
            return (u == ou or u == ov) and (v == ou or v == ov)

    def __str__(self):
        return self.__repr__()

    def __hash__(self):
        return hash((self.name, self.content))

    def __repr__(self):
        return '{}({})'.format(self.name, self.content)


class AddAction(Action):
    """Action named 'ADD' whose content is the given edge."""

    def __init__(self, edge):
        # The base class stores the name and the content (the edge).
        super().__init__('ADD', edge)
        # Keep the edge reachable under its own attribute name as well.
        self.edge = edge


class RemoveAction(Action):
    """Action named 'REMOVE' whose content is the given edge."""

    def __init__(self, edge):
        # The base class stores the name and the content (the edge).
        super().__init__('REMOVE', edge)
        # Keep the edge reachable under its own attribute name as well.
        self.edge = edge


class CoreAct(object):
    """A dialogue/physical act: a name, a content object and the acting agent.

    Rendered as '<name>_<agent>(<content>)'. Acts are equal when name,
    content and agent are all equal, and hash on that triple.
    """

    def __init__(self, name, content, agent='X'):
        self.name = name
        self.content = content
        self.agent = agent

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return '{}_{}({})'.format(self.name, self.agent, self.content)

    def __hash__(self):
        return hash((self.name, self.content, self.agent))

    def __eq__(self, other):
        if not isinstance(other, CoreAct):
            # Let Python fall back to its default comparison instead of
            # implicitly returning None.
            return NotImplemented
        return self.name == other.name \
            and self.content == other.content \
            and self.agent == other.agent


class SuggestAct(CoreAct):
    """Act named 'SUGGEST': `agent` suggests `action`."""

    def __init__(self, action, agent='X'):
        # The base class stores name, content (the action) and agent.
        super().__init__('SUGGEST', action, agent)
        # Keep the action reachable under its own attribute name as well.
        self.action = action


class FreeAct(CoreAct):
    """Act named 'FREE' wrapping `action`, performed by `agent`."""

    def __init__(self, action, agent='X'):
        # The base class stores name, content (the action) and agent.
        super().__init__('FREE', action, agent)
        # Keep the action reachable under its own attribute name as well.
        self.action = action


class PhysicalAct(CoreAct):
    """Act named 'DO': `agent` carries out `action`."""

    def __init__(self, action, agent='X'):
        # The base class stores name, content (the action) and agent.
        super().__init__('DO', action, agent)
        # Keep the action reachable under its own attribute name as well.
        self.action = action


class AcceptAct(CoreAct):
    """Act named 'ACCEPT': `agent` accepts the given suggest act."""

    def __init__(self, suggest_act, agent='X'):
        # The base class stores name, content (the suggest act) and agent.
        super().__init__('ACCEPT', suggest_act, agent)
        # Keep the suggest act reachable under its own attribute name.
        self.suggest_act = suggest_act


class RejectAct(CoreAct):
    """Act named 'REJECT': `agent` rejects the given suggest act."""

    def __init__(self, suggest_act, agent='X'):
        # The base class stores name, content (the suggest act) and agent.
        super().__init__('REJECT', suggest_act, agent)
        # Keep the suggest act reachable under its own attribute name.
        self.suggest_act = suggest_act


def recognize_intents(text, doc, node_ids, intent_ids, default_intent_id='ADD'):
    """Turn the entities of a parsed utterance into a list of edit actions.

    Scans `doc.ents` in order, tracking one candidate intent made of a verb
    (an entity id found in `intent_ids`) and up to two nodes (entity ids
    found in `node_ids`). Entities with any other id are ignored.

    * A verb fills the candidate's verb slot; if the slot is already
      filled, the candidate is discarded and a new one starts with the
      new verb.
    * The first node fills the first endpoint. The next node closes the
      candidate: it becomes the second endpoint unless it repeats the
      first, a missing verb is taken from the previously emitted action
      (or `default_intent_id` if there is none), and the action is emitted.
    * A candidate left with only one node at the end of the utterance is
      emitted with a None second endpoint, using `default_intent_id` when
      it has no verb.

    Args:
        text: the utterance; not used by this function.
        doc: parsed document exposing `ents`, each with an `ent_id_`.
        node_ids: collection of entity ids that denote nodes.
        intent_ids: collection of entity ids that denote intent verbs.
        default_intent_id: verb assumed when none can be determined.

    Returns:
        List of the actions built by `make_edit_action` (None results are
        skipped).
    """
    intents = list()
    # Candidate under construction: [verb, [first node, second node]].
    intent = [None, [None, None]]
    for ent in doc.ents:
        ent_id = ent.ent_id_

        if ent_id in intent_ids:
            if intent[0] is None:
                intent[0] = ent_id
            else:
                # A second verb restarts the candidate.
                # NOTE(review): a node already collected for the previous
                # verb is dropped here without emitting an action - confirm
                # this is intended.
                intent = [ent_id, [None, None]]

        elif ent_id in node_ids:
            if intent[1][0] is None:
                intent[1][0] = ent_id
            elif intent[1][1] is None:
                if ent_id != intent[1][0]:
                    intent[1][1] = ent_id
                # No verb seen: reuse the previous action's verb, else the
                # default.
                if intent[0] is None:
                    if len(intents) > 0:
                        intent[0] = intents[-1].name
                    else:
                        intent[0] = default_intent_id

                action = make_edit_action(intent[0], intent[1])
                if action is not None:
                    intents.append(action)
                intent = [None, [None, None]]

    # Flush a trailing candidate that has a single node.
    if intent[1][0] is not None:
        if intent[0] is None:
            # Honour the caller's default (this used to be a hard-coded
            # 'ADD', which ignored the default_intent_id argument).
            intent[0] = default_intent_id
        action = make_edit_action(intent[0], intent[1])
        if action is not None:
            intents.append(action)

    return intents


def make_edit_action(action_name, action_edge):
    """Build the edit action called `action_name` on the edge `action_edge`.

    `action_edge` is unpacked into its two endpoints and wrapped in an Edge.
    Returns an AddAction for 'ADD', a RemoveAction for 'REMOVE', and None
    for any other name.
    """
    first, second = action_edge
    # The edge is built before the name is inspected, so a malformed edge
    # fails regardless of the action name.
    edge = Edge(first, second)
    if action_name == 'ADD':
        return AddAction(edge)
    if action_name == 'REMOVE':
        return RemoveAction(edge)
    return None


# Try the recogniser on one sample utterance.
# Other utterances that were tried here:
#   "go from basel to zurich and then from zurich to saint gallen ."
#   "then rub that out and then go , interlaken to mount bern ."
#   "okay rub it out and go bern to interlaken ."
#   'is that how much that ?'
#   "how do i get off this screen ?"
#   "then rub that out and then go , interlaken ."
text = 'to mount davos .'

nlp, node_ids, intent_ids = prepare_ruler(network, intent_keywords)
doc = nlp(text)

# Show the recognised intents, then the tagged entities.
intents = recognize_intents(text, doc, node_ids, intent_ids)
display(intents)

displacy.render(doc, style="ent")

[ADD(9, None)]

### Parser for edge objects in the extended transcripts.

In [8]:
def parse_edge_object(obj, names=False):
    """Parse an edge description such as 'Zurich-Gallen (2-8)'.

    The description is '<name>-<name> (<index>-<index>)'. Place names may
    consist of several words ('Mount Zurich-Saint Gallen (2-8)'); the index
    pair is always read from the last whitespace-separated token.

    Args:
        obj: the edge description.
        names: if True return the two place names, otherwise (default) the
            two node indices as ints.

    Returns:
        A two-item list: [2, 8] for the example above, or
        ['Zurich', 'Gallen'] with names=True.

    Raises:
        ValueError, IndexError: if `obj` does not follow the format.
    """
    tokens = obj.split()
    if names:  # Parse for names.
        # Drop the trailing "(u-v)" index token when there is one.
        if len(tokens) > 1 and tokens[-1].startswith('('):
            tokens = tokens[:-1]
        (u, v) = ' '.join(tokens).split('-')
    else:  # Parse for node indices.
        (u, v) = tokens[-1].strip('()').split('-')
        u = int(u)
        v = int(v)
    return [u, v]

# Quick check with the example from the docstring; the recorded output of
# this cell is [2, 8].
obj = 'Zurich-Gallen (2-8)'
parse_edge_object(obj)

[2, 8]

### process_intents

In [9]:
def process_svo(sbj, verb, obj, nlp, node_ids, intent_ids):
    """Derive nodes, intents and acts from one subject-verb-object row.

    * If speaker 'A' or 'B' 'says' something, the utterance (`obj`) is
      parsed with `nlp`; the mentioned nodes and the recognised intents are
      collected, and every intent yields a SuggestAct by that speaker.
    * If the verb is 'adds' or 'removes', `obj` is parsed as an edge
      description and a PhysicalAct with the matching edit action is
      appended.

    Returns:
        Tuple (node_list, intent_list, act_list); the first two are empty
        for rows that are not utterances of 'A' or 'B'.
    """
    node_list, intent_list = [], []
    if verb == 'says' and sbj in ('A', 'B'):
        utterance = obj
        parsed = nlp(utterance)
        node_list = get_node_ids(utterance, parsed)
        intent_list = recognize_intents(
            utterance, parsed, node_ids, intent_ids)
        if intent_list is None:
            intent_list = []

    # Every recognised intent counts as a suggestion by the speaker.
    act_list = [SuggestAct(intent, agent=sbj) for intent in intent_list]

    # Logged edits map onto edit-action names; anything else maps to None.
    act_verb = 'ADD' if verb == 'adds' else (
        'REMOVE' if verb == 'removes' else None)
    if act_verb is not None:
        edit_action = make_edit_action(act_verb, parse_edge_object(obj))
        act_list.append(PhysicalAct(edit_action, sbj))

    return node_list, intent_list, act_list

In [12]:
def process_intents(df, network, intent_keywords, inplace=True):
    """Annotate every row of `df` with its nodes, intents and acts.

    Builds the ruler pipeline once, runs `process_svo` on each row's
    'subject', 'verb' and 'object' values and stores the three results in
    the new columns 'nodes', 'intents' and 'acts'.

    Args:
        df: table with 'subject', 'verb' and 'object' columns.
        network: background network, passed on to `prepare_ruler`.
        intent_keywords: intent keyword mapping, passed on to
            `prepare_ruler`.
        inplace: if True (default) the columns are added to `df` itself;
            otherwise to a copy.

    Returns:
        The annotated table (`df` itself when inplace is True).
    """
    nlp, node_ids, intent_ids = prepare_ruler(network, intent_keywords)

    target = df if inplace else df.copy()

    # One (node_list, intent_list, act_list) triple per row, in row order.
    triples = [
        process_svo(row['subject'], row['verb'], row['object'],
                    nlp, node_ids, intent_ids)
        for _, row in target.iterrows()
    ]

    target['nodes'] = [triple[0] for triple in triples]
    target['intents'] = [triple[1] for triple in triples]
    target['acts'] = [triple[2] for triple in triples]

    return target

# Try:
# NOTE(review): the recorded output of this cell is "KeyError: 'subject'".
# process_intents reads the 'subject', 'verb' and 'object' columns, so the
# tables loaded into corpus_dfs apparently do not provide them - check the
# column names produced by read_tables before relying on this cell.
# Also note that process_intents defaults to inplace=True, so this call adds
# columns to the table stored in corpus_dfs.
team_no = 28 #8
df = corpus_dfs[team_no]
process_intents(df, network, intent_keywords)

KeyError: 'subject'

### process_suggests

In [None]:
def process_suggests(df, inplace=True, verbose=False):
    """Match physical edits against pending suggestions, row by row.

    Walks the rows in order while maintaining the list of pending
    SuggestActs. For every row:

    * pending suggestions are cleared when 'turn_no' advances by exactly
      one over the last seen turn (rows with turn_no == -1 never clear),
    * the row's SuggestActs are added to the pending list,
    * if the row has a PhysicalAct (at most one is allowed), it is compared
      with the pending suggestions made by other agents:
        - no such suggestion: a FreeAct is appended to the row's acts,
        - some suggestion partially equals the edit: an AcceptAct for the
          last matching suggestion is appended,
        - otherwise: a RejectAct for the last suggestion is appended;
      in the last two cases every pending suggestion that partially equals
      the other agents' last suggestion is dropped.

    The pending list after each row is stored in the new column
    'pending_suggests'. New acts are appended to the lists held in the
    'acts' column.

    Args:
        df: table with 'acts', 'turn_no' and 'subject' columns.
        inplace: if True (default) `df` is modified; otherwise a copy
            (including copies of the per-row act lists) is modified.
        verbose: print the matching decisions.

    Returns:
        The annotated table (`df` itself when inplace is True).

    NOTE(review): the pending list is only cleared on a +1 turn step, so a
    jump in turn numbers never clears it again - confirm this is intended.
    NOTE(review): after an accept, the suggestions dropped are those
    matching the other agents' *last* suggestion, not necessarily the
    accepted one - confirm this is intended.
    """
    if not inplace:
        df = df.copy()
        # DataFrame.copy() copies the cells but not the Python lists they
        # reference; copy the act lists too so the appends below do not
        # leak into the caller's table.
        df['acts'] = [list(acts) for acts in df['acts']]

    pending_suggest_actlist = list()
    pending_suggest_acts = list()
    turn_no = 1
    for i, row in df.iterrows():
        # Work on a fresh list so the snapshot stored for the previous row
        # is not modified.
        pending_suggest_acts = list(pending_suggest_acts)
        # Same list object as the one held in the 'acts' cell: appending to
        # it updates the table.
        act_list = row['acts']
        current_turn_no = row['turn_no']

        # Flush the pending suggestions at a turn change.
        if current_turn_no != -1 and current_turn_no == turn_no + 1:
            if verbose:
                print('Cleared at {} at row {}'.format(current_turn_no, i))
            pending_suggest_acts = list()
            turn_no = current_turn_no

        suggest_acts = [act for act in act_list if isinstance(act, SuggestAct)]
        edit_acts = [act for act in act_list if isinstance(act, PhysicalAct)]
        assert len(edit_acts) <= 1, 'more than one edit act? at {}'.format(row)

        pending_suggest_acts = pending_suggest_acts + suggest_acts

        if len(edit_acts) > 0:
            edit_act = edit_acts[0]

            if verbose and len(pending_suggest_acts) > 0:
                print()
                print('Matching {} to {}'.format(
                    pending_suggest_acts, edit_acts))

            # Only suggestions by the other speaker can be accepted or
            # rejected by this edit.
            others_acts = [
                a for a in pending_suggest_acts if a.agent != row['subject']]
            if len(others_acts) > 0:
                new_act = None

                # The last matching suggestion wins.
                for suggest_act in others_acts:
                    if suggest_act.action.partial_equals(edit_act.action):
                        new_act = AcceptAct(suggest_act, agent=edit_act.agent)
                        if verbose:
                            print('Matched {} to {}'.format(
                                edit_act, suggest_act))

                suggest_act = others_acts[-1]
                if new_act is None:
                    new_act = RejectAct(suggest_act, agent=edit_act.agent)
                    if verbose:
                        print('No match {}: Create {}'.format(
                            suggest_act, new_act))

                # Drop every pending suggestion that matches suggest_act.
                pending_suggest_acts = [
                    s for s in pending_suggest_acts
                    if not s.action.partial_equals(suggest_act.action)]
                act_list.append(new_act)

            else:
                # No suggestion by the other speaker is pending.
                act_list.append(FreeAct(action=edit_act, agent=row['subject']))

        pending_suggest_actlist.append(pending_suggest_acts)

    df['pending_suggests'] = pending_suggest_actlist
    return df


# Try:
# NOTE(review): this cell used to read from `e_transcript_dfs`, a name that
# no earlier cell defines; it now uses the tables loaded into corpus_dfs,
# like the process_intents try-out above - confirm these are the intended
# tables.
task_index = 28 #10  # 8
df = corpus_dfs[task_index]
# process_intents needs the network and the keyword mapping as well; it
# annotates df in place, adding the 'acts' column process_suggests reads.
process_intents(df, network, intent_keywords)
df = process_suggests(df, verbose=False)

In [None]:
def _count_acts(frame, act_type):
    """Count the acts of type `act_type` over all rows of frame['acts']."""
    return frame['acts'].apply(
        lambda acts: len([act for act in acts if isinstance(act, act_type)])
    ).sum()


# Summary counts for the annotated table.
num_intents = df['intents'].apply(len).sum()
num_rejects = _count_acts(df, RejectAct)
num_accepts = _count_acts(df, AcceptAct)
num_edits = _count_acts(df, PhysicalAct)
# Edits made while no suggestion by the other speaker was pending are
# recorded as FreeAct by process_suggests (this line used to reference
# `DeliberationAct`, which is not defined in this notebook).
num_unmatcheds = _count_acts(df, FreeAct)
num_intents, num_accepts, num_rejects, num_edits, len(df[df.verb.isin(['adds', 'removes'])]), num_unmatcheds