# 12. Getting chatty (dialog engines)

###  12.2.1 A pattern-matching chatbot with AIML

In [5]:
!pip install AIML-Bot==0.0.3



In [6]:
import os
# from nlpia.constants import DATA_PATH
import aiml_bot

DATA_PATH = "../../data"

""" AIML Step 1
<category><pattern>HELLO ROSA </pattern><template>Hi Human!</template></category>
<category><pattern>HELLO TROLL </pattern><template>Good one, human.</template></category>
"""
bot = aiml_bot.Bot(learn=os.path.join(DATA_PATH, 'greeting_step1.aiml'))

AttributeError: module 'time' has no attribute 'clock'

In [None]:
bot.respond("Hello Rosa,")
# 'Hi there!'

In [None]:
bot.respond("hello **troll** !!!")
# 'Good one, human.'

In [None]:
bot.respond("Helo Rosa")
# WARNING: No match found for input: Helo Rosa

In [None]:
bot.respond("Hello Ro-sa)
# WARNING: No match found for input: Hello Ro-sa

In [None]:
""" AIML Patterns Step2: Synonyms """
bot.learn(os.path.join(DATA_PATH, 'greeting_step2.aiml'))

In [None]:
bot.respond("Hey Rosa")
# 'Hi there!'

In [None]:
bot.respond("Hi Rosa")
# 'Hi there!'

In [None]:
bot.respond("Helo Rosa")
# 'Hi there!'

In [None]:
bot.respond("hello **troll** !!!")
# 'Good one, human.'

In [None]:
""" AIML Patterns Step3: Random Responses and Lists """
bot.learn(os.path.join(DATA_PATH, 'greeting_step3.aiml'))

In [None]:
bot.respond("Hey Rosa")
# 'Hello friend'


In [None]:
bot.respond("Hey Rosa")
# 'Hey you :)'


In [None]:
bot.respond("Hey Rosa")
# 'Hi Human!'

### 12.4.2 Example retrieval-based chatbot

In [1]:
# from nlpia.data.loaders import get_data

# df = get_data('ubuntu_dialog')

import pandas as pd
df = pd.read_csv("../../bigdata/ubuntu_dialog_1500k.csv.gz", index_col=0)
df.head(4)

Unnamed: 0,Context,Utterance
0,i think we could import the old comments via r...,basically each xfree86 upload will NOT force u...
1,I'm not suggesting all - only the ones you mod...,oh? oops. __eou__
2,afternoon all __eou__ not entirely related to ...,we'll have a BOF about this __eou__ so you're ...
3,interesting __eou__ grub-install worked with /...,i fully endorse this suggestion </quimby> __eo...


In [2]:
import re
from tqdm import tqdm

def split_turns(s, splitter=re.compile('__eot__')):
    """ Split a string on __eot__ markders (turns) """
    for utterance in splitter.split(s):
        utterance = utterance.replace('__eou__', '\n')
        utterance = utterance.replace('__eot__', '')
        if len(utterance.strip()):
            yield utterance


def preprocess_ubuntu_corpus(df):
    """Split all strings in df.Context and df.Utterance on __eot__ (turn) markers """
    statements = []
    replies = []
    for i, record in tqdm(df.iterrows(), total=df.shape[0]):
        turns = list(split_turns(record.Context))
        statement = turns[-1] if len(turns) else '\n'  # <1>
        statements.append(statement)
        turns = list(split_turns(record.Utterance))
        reply = turns[-1] if len(turns) else '\n'
        replies.append(reply)
    df['statement'] = statements
    df['reply'] = replies
    return df


def format_ubuntu_dialog(df):
    """ Print statements paired with replies, formatted for easy review """
    s = ''
    for i, record in df.iterrows():
        statement = list(split_turns(record.Context))[-1]  # <1>
        reply = list(split_turns(record.Utterance))[-1]  # <2>
        s += 'Statement: {}\n'.format(statement)
        s += 'Reply: {}\n\n'.format(reply)
    return s
    # <1> We need to use `list` to force iteration through the generator
    # <2> The `[-1]` index retrievs the last "turn" in the sequence, discarding everything else

In [3]:
print(format_ubuntu_dialog(df.head(4)))

Statement:  I would prefer to avoid it at this stage.  this is something that has gone into XSF svn, I assume? 
 
Reply: basically each xfree86 upload will NOT force users to upgrade 100Mb of fonts for nothing 
 no something i did in my spare time. 


Statement:  ok, it sounds like you're agreeing with me, then 
 though rather than "the ones we modify", my idea is "the ones we need to merge" 
 
Reply: oh? oops. 


Statement:  should g2 in ubuntu do the magic dont-focus-window tricks? 
 join the gang, get an x-series thinkpad 
 sj has hung on my box, again. 
 what is monday mornings discussion actually about? 
 
Reply: we'll have a BOF about this 
 so you're coming tomorrow ? 


Statement:  i want it on in sarge too but nobody else agrees 
 
Reply: i fully endorse this suggestion </quimby> 
 how did your reinstall go? 





In [4]:
df = preprocess_ubuntu_corpus(df)
df.head(4)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1499744/1499744 [01:44<00:00, 14357.62it/s]


Unnamed: 0,Context,Utterance,statement,reply
0,i think we could import the old comments via r...,basically each xfree86 upload will NOT force u...,I would prefer to avoid it at this stage. th...,basically each xfree86 upload will NOT force u...
1,I'm not suggesting all - only the ones you mod...,oh? oops. __eou__,"ok, it sounds like you're agreeing with me, t...",oh? oops. \n
2,afternoon all __eou__ not entirely related to ...,we'll have a BOF about this __eou__ so you're ...,should g2 in ubuntu do the magic dont-focus-w...,we'll have a BOF about this \n so you're comin...
3,interesting __eou__ grub-install worked with /...,i fully endorse this suggestion </quimby> __eo...,i want it on in sarge too but nobody else agr...,i fully endorse this suggestion </quimby> \n h...


In [10]:
df_sample = df.sample(n=14000, random_state=42)
df_sample.reset_index(inplace=True)
df_sample.head(4)

Unnamed: 0,index,Context,Utterance,statement,reply
0,415808,"Hi, can someone tell me how to install OTR (Of...",PriceChild: thanks a lot! __eou__,install the pidgin-otr package \n,PriceChild: thanks a lot! \n
1,503946,﻿Jabar: do it in a terminal/shell: cat ~/.xses...,you're welcome __eou__,thanks \n,you're welcome \n
2,1675140,"I was currently running Windows 7, downloaded ...",Right. You'll need to run your live-cd and go...,: That would be very nice due to the fact tha...,Right. You'll need to run your live-cd and go...
3,605818,"Alright, I'm having serious issues with the li...",I'm not in the directory! __eou__,: or cd out of devices the directory... that ...,I'm not in the directory! \n


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf = TfidfVectorizer(min_df=8, max_df=.3, max_features=50000)
tfidf = TfidfVectorizer(min_df=8, max_df=.3, max_features=500)
# tfidf.fit(df.statement)  # <1>
tfidf.fit(df_sample.statement)  # <1>

# <1> Notice you only need to compute the statement (not reply) TF-IDFs, because those are the
#     things you want to search.

In [12]:
# X = tfidf.transform(df.statement)
X = tfidf.transform(df_sample.statement)
X = pd.DataFrame(X.todense(), columns=tfidf.get_feature_names())



In [13]:
X.shape

(14000, 500)

In [14]:
x = tfidf.transform(['This is an example statement that\
    we want to retrive the best reply for.'])

cosine_similarities = x.dot(X.T)
# reply = df.loc[cosine_similarities.argmax()]
reply = df_sample.loc[cosine_similarities.argmax()]

In [21]:
reply

index                                                   429085
Context      some one here that can help me merg two small ...
Utterance                            thanx for that :) __eou__
statement      #friendly-coders is the best place for that \n 
reply                                     thanx for that :) \n
Name: 6972, dtype: object