In [16]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from fuzzywuzzy import fuzz

def select_noun_verbs_base(sentence, list_only=False):                                                                                                                                                 
    t_sentence = nltk.word_tokenize(sentence)                                                                            
    tags_words = nltk.pos_tag(t_sentence)
    select_words = [(word,tag) for word,tag in tags_words if tag in ('NN','VB')]
    if list_only:
        return [word for word,tag in select_words]
    return select_words

def select_noun_verbs(sentence, list_only=False):                                                                                                                                                      
    t_sentence = nltk.word_tokenize(sentence)                                                                            
    tags_words = nltk.pos_tag(t_sentence)
    select_words = [(word,tag) for word,tag in tags_words if tag.startswith('NN') or 
        tag.startswith('VB')]
    if list_only:
        return [word for word,tag in select_words]
    return select_words

def lemmatize_word(word):
    lemma = WordNetLemmatizer()
    return lemma.lemmatize(word)

def fuzzymatcher(question, query, partial=False):
    if partial:
        return fuzz.partial_ratio(question, query)
    return fuzz.ratio(question, query)


In [17]:
import shelve
from chatbot.faq_db import set_all_keys
all_questions = set_all_keys()

In [23]:
questions_pos = [select_noun_verbs(que[0],list_only=True) for que in all_questions][:-9]

In [24]:
import re
from collections import Counter


def words(text):
    return re.findall(r'\w+', text.lower())


WORDS = Counter(words(open('chatbot/big.txt').read()))


def P(word, N=sum(WORDS.values())):
    "Probability of `word`."
    return WORDS[word] / N


def correction(word):
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)


def candidates(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])


def known(words):
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)


def edits1(word):
    "All edits that are one edit away from `word`."
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


# Synonym will be the combined list of Topic modeled words 'K1, K2, ...' and their synonyms 'S1, S2, ...'
synonym_r = ['register', 'registration']
synonym_m = ['merojob', 'merojobs']
synonym_p = ['password']


In [45]:
import nltk
user_input = input('>')
user_input_corrected = ' '.join([correction(word) for word in nltk.word_tokenize(user_input)]) 
user_input_corrected_pos_tagged = select_noun_verbs(user_input_corrected, list_only=True)
user_input_corrected_pos_tagged_lemmatized = ' '.join(lemmatize_word(w) for w in user_input_corrected_pos_tagged)
que = ' '.join(user_input_corrected_pos_tagged_lemmatized)
que_value = list()
for each_question in questions_pos:
    que_value.append((fuzzymatcher(que, ' '.join(each_question), partial=False), ' '.join(each_question)))
print('Selected :: {}'.format(max(que_value)))
t_list = sorted(que_value, reverse=True)[:5]
print('Top 5')
print('\n'.join([str(li) for li in t_list]))

>merojob
Selected :: (70, 'merojob')
Top 5
(70, 'merojob')
(61, 'is merojob')
(60, 'meroJob')
(50, 'emails merojob send')
(45, 'Do need pay get job merojob')
