In [390]:
import pandas as pd
import numpy as np
from  sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
import string
import codecs
from nltk.stem.wordnet import WordNetLemmatizer
# Single lemmatizer instance reused by clean_string for every token.
lmtzr = WordNetLemmatizer()
# One-time corpus download, left disabled; uncommenting it also requires `import nltk`.
#nltk.download()


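<p><b>IMPORTANT: You need to download the WordNet corpus before running the cells below (for example, run <code>nltk.download('wordnet')</code> once); the lemmatizer depends on it.</b></p>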

In [738]:
# Hand-picked high-frequency words to be ignored when building text features.
stopwords = (
    'to is of the are am if a at in it '
    'an and i we on id be do u have my or'
).split()

In [851]:
def clean_string(line):
    '''
    Normalizes one raw line of text and lemmatizes each of its words as a verb.

    Apostrophes are turned into spaces; newlines, carriage returns, commas,
    question marks and periods are removed; the remainder is stripped and
    lower-cased before being split on whitespace. A few hand-written rules
    expand compound words so that every spelling ends up in the vocabulary.

    Returns the space-joined tokens encoded as UTF-8.
    '''
    text = line.replace('\'', ' ')
    for unwanted in ('\n', '\r', ',', '?', '.'):
        text = text.replace(unwanted, '')
    text = text.strip().lower()

    lemmas = []
    for token in text.split():
        if token == 'bookable':
            # mapped straight to its root; the lemmatizer is not involved
            lemmas.append('book')
            continue

        if token == 'breakfast':
            # breakfast also appears as "break fast": emit all three tokens
            pieces = ['break', 'fast', 'breakfast']
        elif '-' in token:
            # hyphenated word: keep every part plus the parts fused together
            pieces = [part for part in token.split('-') if part]
            fused = ''.join(pieces)
            if fused:
                pieces.append(fused)
        else:
            pieces = [token]

        lemmas.extend(lmtzr.lemmatize(piece, 'v') for piece in pieces)

    return ' '.join(lemmas).encode('utf-8')

In [852]:
def read_files(filepath, questions, answers):
    '''
    Reads a dataset file whose lines alternate between questions and answers.

    Every non-blank line is cleaned with clean_string. Lines on odd physical
    line numbers (1, 3, 5, ...) are appended to `questions`, lines on even
    line numbers to `answers`. Blank lines are skipped but still count toward
    the line number, so they shift the parity of the lines that follow.

    filepath  -- path of the UTF-8 text file to read; undecodable bytes are ignored
    questions -- list that is extended in place with the cleaned questions
    answers   -- list that is extended in place with the cleaned answers

    Returns the same (questions, answers) lists that were passed in.
    '''
    with codecs.open(filepath, encoding='utf-8', errors='ignore') as dataset:
        for line_number, line in enumerate(dataset, 1):
            # codecs.open does not translate newlines, so a blank line from a
            # CRLF file arrives as '\r\n' (length 2); test for real content
            # instead of relying on the raw length.
            if not line.strip():
                continue
            target = answers if line_number % 2 == 0 else questions
            target.append(clean_string(line))
    return questions, answers

In [853]:
# Start from empty accumulators so re-running this cell never duplicates rows.
training_data_questions, training_data_answers = [], []
test_data_questions, test_data_answers = [], []

# Each call adds the file's questions/answers to the lists it is given and returns them.
for dataset_path in ('training_dataset.txt', 'training_dataset_2.txt'):
    training_data_questions, training_data_answers = read_files(
        dataset_path, training_data_questions, training_data_answers)

for dataset_path in ('test_dataset.txt', 'test-data.txt'):
    test_data_questions, test_data_answers = read_files(
        dataset_path, test_data_questions, test_data_answers)

In [854]:
# TF-IDF features with sublinear term-frequency scaling; the custom stop words
# and every ASCII punctuation character are passed as the stop-word list.
vectorizer = TfidfVectorizer(
    stop_words=stopwords + list(string.punctuation),
    sublinear_tf=True,
)

In [855]:
#Learns the vocabulary (and IDF weights) from the training questions and the
#training answers together, so terms that occur only in answers are included too
vectorizer = vectorizer.fit(np.append(training_data_questions, training_data_answers))

In [856]:
# Disabled experiment, kept for reference: for every training question, score all
# training answers by the dot product of their TF-IDF vectors with the question's
# vector and print the best-scoring answer next to the expected one.
# for i, question in enumerate(training_data_questions):
#     question_transform = vectorizer.transform([question])
#     available_answers_transform = vectorizer.transform(training_data_answers)
    
#     result = np.dot(available_answers_transform, question_transform.T).todense()
    
#     result = np.asarray(result).flatten()
#     # Sort by top results and return the indices in descending order
#     possible_answers = np.argsort(result, axis=0)[::-1]
#     print 'Question: ', question
#     print 
#     print 'Answer: ', training_data_answers[i]
#     print
#     print 'Predicted answer: ', training_data_answers[possible_answers[0]]
#     print 

In [857]:
#Converts the training and test questions into dense document-term arrays.
#toarray() yields a plain numpy.ndarray; todense() yields a numpy.matrix, which
#recent scikit-learn releases reject as estimator input.
features = vectorizer.transform(training_data_questions).toarray()
test_features = vectorizer.transform(test_data_questions).toarray()

In [858]:
# Every training question is its own class: the label is the row index, which is
# later used to look up the matching entry in training_data_answers.
clf = RandomForestClassifier(
    n_estimators=2000,
    max_features=1.,
    n_jobs=-1,
    random_state=0,
)
# Alternatives that were tried:
# clf = GradientBoostingClassifier(n_estimators=3000, learning_rate=0.01, random_state=0)
# clf = LogisticRegression(random_state=0)
clf.fit(features, range(len(training_data_answers)))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=1.0, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=-1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [859]:
# One predicted class per test question; a class is the index of a training answer.
test_predictions = clf.predict(test_features)
print(test_predictions)

[ 2  3  4  5  5  6  7  8  9 10  0  1 11 12 13 13 14 15 16 17 18 18 20 21 22
 23 24 25 26 27 28 28 29 30 37 24 31 31 31 32 32 32 32 33 33 33 34 34 34 35
 35 35 35 35 36  9 36 36 37 37 37 38 38 38 39 39 39 39 40 40 40 41 41 41 42
 42 43 43 43 44 44 45 45 45 46 37 46 47 47 48 48 49 49 49 50 50 50 51 51 51
 52  5 52 53 53 54 54 54 55 55 58 58 59 59 60 60 61 61 62 62 63 63 64 64 64
 65 65 66 66 63 63]


In [860]:
#Show every test question with its expected and predicted answer,
#and collect the cases where the two answers differ
wrong_answers = []
for i, prediction in enumerate(test_predictions):
    print('Question:  %s' % test_data_questions[i])
    print('')
    print('Answer:  %s' % test_data_answers[i])
    print('')
    print('Predicted answer:  %s' % training_data_answers[prediction])
    print('')

    if test_data_answers[i] != training_data_answers[prediction]:
        # stored as (question, expected answer, predicted answer)
        wrong_answers.append((test_data_questions[i], test_data_answers[i], training_data_answers[prediction]))

Question:  hey

Answer:  hi

Predicted answer:  hi

Question:  what s up

Answer:  nothing much

Predicted answer:  nothing much

Question:  greet

Answer:  greet

Predicted answer:  greet

Question:  how be you

Answer:  i m do good

Predicted answer:  i m do good

Question:  how be you do today

Answer:  i m do good

Predicted answer:  i m do good

Question:  what s your name

Answer:  i m nameless

Predicted answer:  i m nameless

Question:  what can you do

Answer:  anything that you want

Predicted answer:  anything that you want

Question:  when may i check in

Answer:  the date specify on your reservation

Predicted answer:  the date specify on your reservation

Question:  when will i be able to i check out

Answer:  whenever you want to

Predicted answer:  whenever you want to

Question:  who be you

Answer:  i be your little assistant

Predicted answer:  i be your little assistant

Question:  hi

Answer:  hi

Predicted answer:  hi

Question:  hello

Answer:  hi

Predicted answ

In [864]:
#Share of test questions whose predicted answer differs from the expected one, in percent
print('Error rate: %s %%' % (float(len(wrong_answers)) / len(test_data_questions) * 100))

Error rate: 6.87022900763 %


In [862]:
# Inspect the mismatches as (question, expected answer, predicted answer) tuples
wrong_answers

[('how to come to your locals if i come from brazil',
  'none',
  'you need to take the tgv then at paris train station you take the subway'),
 ('i want to book a room',
  'hello you can book online on hyphenai on our mobile app by give us a call on 555 800 4567 or by email on reservations@hyphenai do not hesitate to let us know if we can be of any other help best wish',
  'hello physically challenge room be also available and the hotel have be design to make all guests feel comfortable do not hesitate to let us know if we can be of any other help best wish'),
 ('i need to book a room',
  'hello you can book online on hyphenai on our mobile app by give us a call on 555 800 4567 or by email on reservations@hyphenai do not hesitate to let us know if we can be of any other help best wish',
  'sure about what'),
 ('at what time be check out checkout',
  'hello check in checkin be at 300pm should you wish to arrive earlier we advise you to get in touch with our reservations team on reservat

In [863]:
# Number of test questions whose predicted answer differs from the expected one
len(wrong_answers)

9

In [836]:
# Term -> feature column index mapping learned by the fitted vectorizer
vectorizer.vocabulary_

{u'1200pm': 0,
 u'125': 1,
 u'140m': 2,
 u'24': 3,
 u'25': 4,
 u'300pm': 5,
 u'30mins': 6,
 u'32m': 7,
 u'3in1': 8,
 u'40mins': 9,
 u'4567': 10,
 u'555': 11,
 u'5mins': 12,
 u'600': 13,
 u'600pm': 14,
 u'800': 15,
 u'80mins': 16,
 u'about': 17,
 u'accommodate': 18,
 u'across': 19,
 u'adapt': 20,
 u'addition': 21,
 u'additional': 22,
 u'address': 23,
 u'advance': 24,
 u'advise': 25,
 u'aed': 26,
 u'after': 27,
 u'all': 28,
 u'also': 29,
 u'any': 30,
 u'anything': 31,
 u'anytime': 32,
 u'anywhere': 33,
 u'app': 34,
 u'appointment': 35,
 u'area': 36,
 u'areas': 37,
 u'arrival': 38,
 u'arrive': 39,
 u'as': 40,
 u'assistance': 41,
 u'assistant': 42,
 u'automatically': 43,
 u'availability': 44,
 u'available': 45,
 u'away': 46,
 u'before': 47,
 u'below': 48,
 u'best': 49,
 u'bike': 50,
 u'board': 51,
 u'book': 52,
 u'bordeaux': 53,
 u'bottom': 54,
 u'boulevard': 55,
 u'break': 56,
 u'breakfast': 57,
 u'bring': 58,
 u'buffet': 59,
 u'but': 60,
 u'by': 61,
 u'calculate': 62,
 u'call': 63,
 u'ca