*Praktische Dialogmodellierung, Universität Potsdam, SoSe 19, David Schlangen*

# NLU mit RegExp, Vorüberlegungen

In [33]:
import pandas as pd
import json
import sys

from __future__ import division

from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_colwidth', -1)

Den Pfad zum lokalen DSTC-Verzeichnis müssen Sie natürlich an Ihr eigenes System anpassen:

In [8]:
# Root directory of the local DSTC material. The scripts, the data and the
# ontology file used in the following cells are all addressed relative to it,
# so this value has to be adjusted on every machine.
DSTC_path = '/Users/das/work/svn/Gits/t_Teaching/sose19-anw1-dialmod/material/DSTC'

In [9]:
# Put the DSTC directory on the import path so that the `_dstc2_scripts`
# package located inside it can be imported.
sys.path.append(DSTC_path)
from _dstc2_scripts.dataset_walker import dataset_walker
from _dstc2_scripts import misc

In [10]:
# Open the "dstc2_dev" dataset found under DSTC_path/_Data/. Iterating over the
# result yields calls, and iterating over a call yields (turn, label) pairs.
# NOTE(review): labels=True presumably makes the walker attach the annotation
# to every turn -- confirm in dataset_walker.
dataset = dataset_walker("dstc2_dev", dataroot=DSTC_path + "/_Data/", labels=True)

## Die unterliegende Ontologie

In [11]:
# Load the DSTC2 ontology, which is stored as a JSON file next to the scripts.
ontology_file = DSTC_path + '/_dstc2_scripts/config/ontology_dstc2.json'
with open(ontology_file, 'r') as f:
    ontology = json.loads(f.read())

In [12]:
# Show the parsed ontology (a nested dict) as the cell output.
ontology

{u'informable': {u'area': [u'centre', u'north', u'west', u'south', u'east'],
  u'food': [u'afghan',
   u'african',
   u'afternoon tea',
   u'asian oriental',
   u'australasian',
   u'australian',
   u'austrian',
   u'barbeque',
   u'basque',
   u'belgian',
   u'bistro',
   u'brazilian',
   u'british',
   u'canapes',
   u'cantonese',
   u'caribbean',
   u'catalan',
   u'chinese',
   u'christmas',
   u'corsica',
   u'creative',
   u'crossover',
   u'cuban',
   u'danish',
   u'eastern european',
   u'english',
   u'eritrean',
   u'european',
   u'french',
   u'fusion',
   u'gastropub',
   u'german',
   u'greek',
   u'halal',
   u'hungarian',
   u'indian',
   u'indonesian',
   u'international',
   u'irish',
   u'italian',
   u'jamaican',
   u'japanese',
   u'korean',
   u'kosher',
   u'latin american',
   u'lebanese',
   u'light bites',
   u'malaysian',
   u'mediterranean',
   u'mexican',
   u'middle eastern',
   u'modern american',
   u'modern eclectic',
   u'modern european',
   u'modern

## Als Hilfe beim Erstellen von Regeln

In [13]:
# Flatten the labelled user turns into a table with one row per dialogue act
# and slot/value pair:
#   [dialogue index, turn index, transcription, act, slot, value]
out = []
for m, call in enumerate(dataset):
    for n, (turn, label) in enumerate(call):
        usr_utt = label['transcription']
        usr_sem = label['semantics']['json']
        for full_da in usr_sem:
            # Every row gets exactly six fields so it always matches `columns`.
            # An act without slots still yields one row, with slot and val left
            # empty. An act with several slot/value pairs yields one row per
            # pair; the data seems to contain at most one pair per act, but the
            # format allows arbitrarily many, and a longer row would make the
            # DataFrame construction below fail.
            slot_vals = full_da['slots'] or [(None, None)]
            for slot_value in slot_vals:
                out.append([m, n, usr_utt, full_da['act'],
                            slot_value[0], slot_value[1]])

columns = 'dial_id turn_id usr_utt intent slot val'.split()
dstc_intent_df = pd.DataFrame(out, columns=columns)

In [19]:
# Preview the first 15 rows of the flattened table.
dstc_intent_df.head(15)

Unnamed: 0,dial_id,turn_id,usr_utt,intent,slot,val
0,0,0,i would like to find an expensive restaurant in the south part,inform,pricerange,expensive
1,0,0,i would like to find an expensive restaurant in the south part,inform,area,south
2,0,1,does not matter,inform,this,dontcare
3,0,2,any type of food is okay,inform,food,dontcare
4,0,3,what is the address,request,slot,addr
5,0,4,what is the phone number,request,slot,phone
6,0,5,what type of food,request,slot,food
7,0,6,okay thank,thankyou,,
8,0,7,thank you good bye,thankyou,,
9,0,7,thank you good bye,bye,,


In [15]:
# Distinct dialogue-act labels stored in the 'intent' column.
dstc_intent_df['intent'].unique()

array([u'inform', u'request', u'thankyou', u'bye', u'affirm', u'reqalts',
       u'repeat', u'negate', u'confirm', u'hello', u'restart', u'deny',
       u'reqmore'], dtype=object)

In [16]:
# Number of rows per intent, most frequent first.
dstc_intent_df['intent'].value_counts()

inform      1942
request     1165
bye         526 
thankyou    510 
reqalts     275 
affirm      144 
negate      68  
confirm     39  
hello       18  
repeat      7   
deny        4   
restart     3   
reqmore     1   
Name: intent, dtype: int64

In [17]:
# Group the rows by intent so that statistics can be computed per intent.
grouped = dstc_intent_df.groupby('intent')

In [18]:
# Within each intent, count how often every slot occurs; rows whose slot is
# empty do not show up in these counts.
grouped['slot'].value_counts()

intent   slot      
confirm  food          25  
         area          7   
         pricerange    7   
deny     food          4   
inform   food          838 
         area          454 
         pricerange    409 
         this          231 
         name          10  
request  slot          1165
Name: slot, dtype: int64

In [23]:
# Look at 10 randomly drawn 'affirm' rows. No random_state is passed, so a
# re-run shows a different selection.
dstc_intent_df[dstc_intent_df['intent'] == 'affirm'].sample(10)

Unnamed: 0,dial_id,turn_id,usr_utt,intent,slot,val
3931,422,0,yes,affirm,,
1206,136,6,yes,affirm,,
359,41,1,yes,affirm,,
2523,275,4,yes,affirm,,
2436,267,19,yes,affirm,,
4263,456,4,yes,affirm,,
2480,271,13,correct,affirm,,
4061,435,3,yes danish food,affirm,,
179,20,2,yes,affirm,,
3249,354,1,yes,affirm,,


In [21]:
# Look at 20 randomly drawn 'inform' rows. No random_state is passed, so a
# re-run shows a different selection.
dstc_intent_df[dstc_intent_df['intent'] == 'inform'].sample(20)

Unnamed: 0,dial_id,turn_id,usr_utt,intent,slot,val
2339,259,6,curry prince address and phone number,inform,name,curry prince
3306,360,4,how about turkish food,inform,food,turkish
697,79,0,south part of town italian food,inform,food,italian
2640,287,6,expensive,inform,pricerange,expensive
2636,287,3,korean food expensive,inform,food,korean
1666,194,2,i dont care,inform,this,dontcare
676,77,1,south,inform,area,south
4614,496,1,any,inform,this,dontcare
1616,190,0,im looking for a restaurant that serves mexican food,inform,food,mexican
849,94,4,american,inform,food,north american


In [24]:
# Show every 'repeat' row; the intent is rare enough to list all of them.
dstc_intent_df[dstc_intent_df['intent'] == 'repeat']

Unnamed: 0,dial_id,turn_id,usr_utt,intent,slot,val
35,4,4,again please,repeat,,
38,4,6,again please,repeat,,
43,4,11,again,repeat,,
44,4,12,repeat,repeat,,
46,4,15,repeat,repeat,,
2582,281,0,okay let me try this again,repeat,,
4194,450,2,repeat that,repeat,,


In [25]:
# Look at 10 randomly drawn rows whose value is 'dontcare'. No random_state is
# passed, so a re-run shows a different selection.
dstc_intent_df[dstc_intent_df['val'] == 'dontcare'].sample(10)

Unnamed: 0,dial_id,turn_id,usr_utt,intent,slot,val
3862,414,4,no in any area,inform,area,dontcare
3033,328,1,i dont care,inform,this,dontcare
4547,488,1,i dont care,inform,this,dontcare
2790,303,1,i dont care,inform,this,dontcare
3684,396,2,any,inform,this,dontcare
1176,133,1,it doesnt matter,inform,this,dontcare
1722,199,6,i dont care about the price range,inform,pricerange,dontcare
4362,466,0,im looking for a restaurant in any area that,inform,area,dontcare
4582,492,1,any,inform,this,dontcare
4529,486,1,any,inform,this,dontcare


In [38]:
this_intent = 'hello'
# Collect the whitespace-separated tokens of all utterances with this intent.
intent_rows = dstc_intent_df['intent'] == this_intent
all_words = []
for utterance in dstc_intent_df.loc[intent_rows, 'usr_utt']:
    all_words.extend(utterance.split())
wc = Counter(all_words)
# Types are the distinct words, tokens the running words.
n_types, n_tokens = len(wc), len(all_words)
print("Types: {}  Tokens: {}  Type/Token: {:.2f}".format(n_types, n_tokens, n_types / n_tokens))

Types: 23  Tokens: 100  Type/Token: 0.23


In [39]:
# The ten most frequent words in the current `wc` counter, with their counts.
wc.most_common(10)

[(u'hello', 11),
 (u'for', 9),
 (u'looking', 9),
 (u'a', 9),
 (u'im', 7),
 (u'hi', 7),
 (u'restaurant', 7),
 (u'in', 5),
 (u'the', 5),
 (u'town', 4)]

In [40]:
this_intent = 'inform'
# Collect the whitespace-separated tokens of all utterances with this intent.
intent_rows = dstc_intent_df['intent'] == this_intent
all_words = []
for utterance in dstc_intent_df.loc[intent_rows, 'usr_utt']:
    all_words.extend(utterance.split())
wc = Counter(all_words)
# Types are the distinct words, tokens the running words.
n_types, n_tokens = len(wc), len(all_words)
print("Types: {}  Tokens: {}  Type/Token: {:.2f}".format(n_types, n_tokens, n_types / n_tokens))

Types: 276  Tokens: 10630  Type/Token: 0.03


In [41]:
this_intent = 'request'
# Collect the whitespace-separated tokens of all utterances with this intent.
intent_rows = dstc_intent_df['intent'] == this_intent
all_words = []
for utterance in dstc_intent_df.loc[intent_rows, 'usr_utt']:
    all_words.extend(utterance.split())
wc = Counter(all_words)
# Types are the distinct words, tokens the running words.
n_types, n_tokens = len(wc), len(all_words)
print("Types: {}  Tokens: {}  Type/Token: {:.2f}".format(n_types, n_tokens, n_types / n_tokens))

Types: 112  Tokens: 5159  Type/Token: 0.02


In [43]:
# Most frequent utterance openings (first three words) among 'request' rows.
request_utts = dstc_intent_df[dstc_intent_df['intent'] == 'request']['usr_utt'].tolist()
Counter(' '.join(utt.split()[:3]) for utt in request_utts).most_common(20)

[(u'what is the', 193),
 (u'phone number', 127),
 (u'address', 101),
 (u'can i have', 79),
 (u'whats the address', 69),
 (u'can i get', 63),
 (u'whats the phone', 42),
 (u'what type of', 39),
 (u'may i have', 37),
 (u'price range', 29),
 (u'and the phone', 21),
 (u'post code', 18),
 (u'type of food', 15),
 (u'what is their', 15),
 (u'could i have', 15),
 (u'the address', 13),
 (u'address and phone', 12),
 (u'phone number and', 11),
 (u'address phone number', 10),
 (u'and the post', 10)]

In [44]:
def intent_counter(df, intent, n=3):
    """Count the opening word sequences of all utterances labelled `intent`.

    df     -- frame with an 'intent' and a 'usr_utt' column
    intent -- intent label used to select the rows
    n      -- number of leading whitespace-separated words to keep (default 3)

    Returns a Counter mapping each space-joined prefix to its frequency.
    Utterances with fewer than n words are counted in full.
    """
    prefixes = Counter()
    for utterance in df.loc[df['intent'] == intent, 'usr_utt']:
        prefixes[' '.join(utterance.split()[:n])] += 1
    return prefixes

In [45]:
# The 20 most frequent three-word openings among 'inform' utterances.
intent_counter(dstc_intent_df, 'inform').most_common(20)

[(u'im looking for', 283),
 (u'i dont care', 76),
 (u'any', 66),
 (u'dont care', 41),
 (u'i need a', 33),
 (u'moderately priced restaurant', 25),
 (u'i want a', 24),
 (u'how about indian', 22),
 (u'cheap', 22),
 (u'restaurant in the', 22),
 (u'north', 22),
 (u'i am looking', 21),
 (u'i would like', 18),
 (u'thai food', 17),
 (u'how about chinese', 15),
 (u'expensive', 15),
 (u'east', 15),
 (u'doesnt matter', 15),
 (u'i want to', 15),
 (u'european food', 14)]