# 1. Import Libraries

In [1]:
import random
import copy
import time
import pandas as pd
import numpy as np
import gc
import re
import pickle
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import gensim
from sklearn.metrics.pairwise import pairwise_distances_argmin
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# get data
!wget 'https://raw.githubusercontent.com/xlogix/buddy-bot/master/data.zip'
!unzip data.zip
!mv data/dialogues.tsv ./dialogues.tsv
!mv data/tagged_posts.tsv ./tagged_posts.tsv

--2020-01-15 17:14:44--  https://raw.githubusercontent.com/xlogix/buddy-bot/master/data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 63453527 (61M) [application/zip]
Saving to: ‘data.zip’


2020-01-15 17:14:48 (155 MB/s) - ‘data.zip’ saved [63453527/63453527]

Archive:  data.zip
   creating: data/
  inflating: data/dialogues.tsv      
  inflating: data/tagged_posts.tsv   


In [3]:
!ls

data  data.zip	dialogues.tsv  sample_data  tagged_posts.tsv


# 2. Read the Data

In [0]:
dialogues = pd.read_csv("dialogues.tsv",sep="\t")

In [0]:
posts = pd.read_csv("tagged_posts.tsv",sep="\t")

In [6]:
dialogues.head()

Unnamed: 0,text,tag
0,Okay -- you're gonna need to learn how to lie.,dialogue
1,I'm kidding. You know how sometimes you just ...,dialogue
2,Like my fear of wearing pastels?,dialogue
3,I figured you'd get to the good stuff eventually.,dialogue
4,Thank God! If I had to hear one more story ab...,dialogue


In [7]:
posts.head()

Unnamed: 0,post_id,title,tag
0,9,Calculate age in C#,c#
1,16,Filling a DataSet or DataTable from a LINQ que...,c#
2,39,Reliable timer in a console application,c#
3,42,Best way to allow plugins for a PHP application,php
4,59,"How do I get a distinct, ordered list of names...",c#


In [8]:
print("Num Posts:",len(posts))
print("Num Dialogues:",len(dialogues))

Num Posts: 2171575
Num Dialogues: 218609


# 3. Create training data for intent classifier - Chitchat/SO Question

In [0]:
texts  =  list(dialogues[:200000].text.values) + list(posts[:200000].title.values)
labels =  ['dialogue']*200000 + ['stackoverflow']*200000

In [0]:
data = pd.DataFrame({'text':texts,'target':labels})

In [0]:
def text_prepare(text):
    """Performs tokenization and simple preprocessing."""
    
    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    stopwords_set = set(stopwords.words('english'))

    text = text.lower()
    text = replace_by_space_re.sub(' ', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])

    return text.strip()

In [0]:
# Doing some data cleaning
data['text'] = data['text'].apply(lambda x : text_prepare(x))

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data['text'],data['target'],test_size = .1 , random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))


Train size = 360000, test size = 40000


# 4. Create Intent classifier

In [0]:
# We will keep our models and vectorizers in this folder

In [0]:
!mkdir resources

In [16]:
'''
def tfidf_features(X_train, X_test, vectorizer_path):
    """Performs TF-IDF transformation and dumps the model."""
    tfv = TfidfVectorizer(dtype=np.float32, min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')
    
    X_train = tfv.fit_transform(X_train)
    X_test = tfv.transform(X_test)
    
    pickle.dump(tfv,vectorizer_path)
    return X_train, X_test
'''

'\ndef tfidf_features(X_train, X_test, vectorizer_path):\n    """Performs TF-IDF transformation and dumps the model."""\n    tfv = TfidfVectorizer(dtype=np.float32, min_df=3,  max_features=None, \n            strip_accents=\'unicode\', analyzer=\'word\',token_pattern=r\'\\w{1,}\',\n            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,\n            stop_words = \'english\')\n    \n    X_train = tfv.fit_transform(X_train)\n    X_test = tfv.transform(X_test)\n    \n    pickle.dump(tfv,vectorizer_path)\n    return X_train, X_test\n'

In [17]:
'''
X_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test, open("resources/tfidf.pkl",'wb'))
'''

'\nX_train_tfidf, X_test_tfidf = tfidf_features(X_train, X_test, open("resources/tfidf.pkl",\'wb\'))\n'

In [18]:
tfv = TfidfVectorizer(dtype=np.float32, min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

X_train = tfv.fit_transform(X_train)
X_test = tfv.transform(X_test)

print(X_train.shape)
print(X_test.shape)

(360000, 124576)
(40000, 124576)


In [0]:
# initalizer model
intent_recognizer = LogisticRegression(C=10,random_state=0, max_iter=2000)
# save vectorizer inside model
intent_recognizer._vectorizer = tfv

In [21]:
'''
# train
intent_recognizer.fit(X_train_tfidf,y_train)
'''

'\n# train\nintent_recognizer.fit(X_train_tfidf,y_train)\n'

In [22]:
# train
intent_recognizer.fit(X_train,y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
'''
# Check test accuracy.
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))
'''

"\n# Check test accuracy.\ny_test_pred = intent_recognizer.predict(X_test_tfidf)\ntest_accuracy = accuracy_score(y_test, y_test_pred)\nprint('Test accuracy = {}'.format(test_accuracy))\n"

In [24]:
# Check test accuracy.
y_test_pred = intent_recognizer.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.98985


In [0]:
# dump model as well as _vectorizer
pickle.dump(intent_recognizer, open("resources/intent_clf.pkl" , 'wb'))

# 5 Create Programming Language classifier

In [0]:
X = posts['title'].values
y = posts['tag'].values

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print('Train size = {}, test size = {}'.format(len(X_train), len(X_test)))

Train size = 1737260, test size = 434315


In [0]:
'''
vectorizer = pickle.load(open("resources/tfidf.pkl", 'rb'))
X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)
'''

In [0]:
intent_recognizer = pickle.load(open('resources/intent_clf.pkl','rb'))
vectorizer = intent_recognizer._vectorizer
X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [29]:
tag_classifier = OneVsRestClassifier(LogisticRegression(C=5,random_state=0, max_iter=2000))
tag_classifier.fit(X_train_tfidf,y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=2000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=0, solver='lbfgs',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False),
                    n_jobs=None)

In [30]:
# Check test accuracy.
y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Test accuracy = {}'.format(test_accuracy))

Test accuracy = 0.8043862173767887


In [0]:
'''
pickle.dump(tag_classifier, open("resources/tag_clf.pkl", 'wb'))
'''

In [0]:
# save vectorizer
tag_classifier._vectorizer = vectorizer

# save _vectorizer as well as model
pickle.dump(tag_classifier, open("resources/tag_clf.pkl", 'wb'))


# 6. Store Question database Embeddings

You can use [pre-trained word vectors](https://code.google.com/archive/p/word2vec/) from Google.

In [32]:
 !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
 !gunzip "GoogleNews-vectors-negative300.bin.gz"

# Load Google's pre-trained Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) 

--2020-01-15 17:54:10--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.129.85
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.129.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2020-01-15 17:56:18 (12.4 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


We want to convert every question to an embedding and store them. Whenever user asks a stack overflow question we want to use cosine similarity to get the most similar question

In [0]:
def question_to_vec(question, embeddings, dim=300):
    """
        question: a string
        embeddings: dict where the key is a word and a value is its' embedding
        dim: size of the representation

        result: vector representation for the question
    """
    word_tokens = question.split(" ")
    question_len = len(word_tokens)
    question_mat = np.zeros((question_len,dim), dtype = np.float32)
    
    for idx, word in enumerate(word_tokens):
        if word in embeddings:
            question_mat[idx,:] = embeddings[word]
            
    # remove zero-rows which stand for OOV words       
    question_mat = question_mat[~np.all(question_mat == 0, axis = 1)]
    
    # Compute the mean of each word along the sentence
    if question_mat.shape[0] > 0:
        vec = np.array(np.mean(question_mat, axis = 0), dtype = np.float32).reshape((1,dim))
    else:
        vec = np.zeros((1,dim), dtype = np.float32)
        
    return vec

In [0]:
counts_by_tag = posts.groupby(by=['tag'])["tag"].count().reset_index(name = 'count').sort_values(['count'], ascending = False)

In [36]:
counts_by_tag = list(zip(counts_by_tag['tag'],counts_by_tag['count']))
print(counts_by_tag)

[('c#', 394451), ('java', 383456), ('javascript', 375867), ('php', 321752), ('c_cpp', 281300), ('python', 208607), ('ruby', 99930), ('r', 36359), ('vb', 35044), ('swift', 34809)]


In [0]:
! mkdir resources/embeddings_folder

In [0]:
for tag, count in counts_by_tag:
    tag_posts = posts[posts['tag'] == tag]
    tag_post_ids = tag_posts['post_id'].values
    tag_vectors = np.zeros((count, 300), dtype=np.float32)
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = question_to_vec(title, model, 300)
    # Dump post ids and vectors to a file.
    filename = 'resources/embeddings_folder/'+ tag + '.pkl'
    pickle.dump((tag_post_ids, tag_vectors), open(filename, 'wb'))

# 7. Given a question and tag can I retrieve the most similar question post_id


In [0]:
def get_similar_question(question,tag):
    # get the path where all question embeddings are kept and load the post_ids and post_embeddings
    embeddings_path = 'resources/embeddings_folder/' + tag + ".pkl"
    post_ids, post_embeddings = pickle.load(open(embeddings_path, 'rb'))
    # Get the embeddings for the question
    question_vec = question_to_vec(question, model, 300)
    # find index of most similar post
    best_post_index = pairwise_distances_argmin(question_vec,
                                                post_embeddings)
    # return best post id
    return post_ids[best_post_index]

In [40]:
get_similar_question("how to use list comprehension in python?",'python')

array([5947137])

You can find this question at:
    
https://stackoverflow.com/questions/5947137

# 8. Try and Test it out!


In [0]:
#### Install dependencies
!pip install chatterbot
!pip install chatterbot-corpus

In [0]:
#!/usr/bin/env python3

import requests
import time
import argparse
import os
import json
from requests.compat import urljoin
import gensim
import pickle
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin


class BotHandler(object):
    """
        BotHandler is a class which implements all back-end of the bot.
        It has three main functions:
            'get_updates' — checks for new messages
            'send_message' – posts new message to user
            'get_answer' — computes the most relevant on a user's question
    """

    def __init__(self, token, dialogue_manager):
        # Put the Telegram Access token here
        self.token = token
        self.api_url = "https://api.telegram.org/bot{}/".format(token)
        self.dialogue_manager = dialogue_manager

    def get_updates(self, offset=None, timeout=30):
        params = {"timeout": timeout, "offset": offset}
        raw_resp = requests.get(urljoin(self.api_url, "getUpdates"), params)
        try:
            resp = raw_resp.json()
        except json.decoder.JSONDecodeError as e:
            print("Failed to parse response {}: {}.".format(raw_resp.content, e))
            return []

        if "result" not in resp:
            return []
        return resp["result"]

    def send_message(self, chat_id, text):
        params = {"chat_id": chat_id, "text": text}
        return requests.post(urljoin(self.api_url, "sendMessage"), params)

    def get_answer(self, question):
        if question == '/start':
            return "Hi, I am your project bot. How can I help you today?"
        return self.dialogue_manager.generate_answer(question)


def is_unicode(text):
    return len(text) == len(text.encode())


# We will need this function to prepare text at prediction time
def text_prepare(text):
    """Performs tokenization and simple preprocessing."""

    replace_by_space_re = re.compile('[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile('[^0-9a-z #+_]')
    stopwords_set = set(stopwords.words('english'))

    text = text.lower()
    text = replace_by_space_re.sub(' ', text)
    text = bad_symbols_re.sub('', text)
    text = ' '.join([x for x in text.split() if x and x not in stopwords_set])

    return text.strip()

# need this to convert questions asked by user to vectors


def question_to_vec(question, embeddings, dim=300):
    """
        question: a string
        embeddings: dict where the key is a word and a value is its' embedding
        dim: size of the representation
        result: vector representation for the question
    """
    word_tokens = question.split(" ")
    question_len = len(word_tokens)
    question_mat = np.zeros((question_len, dim), dtype=np.float32)

    for idx, word in enumerate(word_tokens):
        if word in embeddings:
            question_mat[idx, :] = embeddings[word]

    # remove zero-rows which stand for OOV words
    question_mat = question_mat[~np.all(question_mat == 0, axis=1)]

    # Compute the mean of each word along the sentence
    if question_mat.shape[0] > 0:
        vec = np.array(np.mean(question_mat, axis=0),
                       dtype=np.float32).reshape((1, dim))
    else:
        vec = np.zeros((1, dim), dtype=np.float32)

    return vec


class SimpleDialogueManager(object):
    """
    This is a simple dialogue manager to test the telegram bot.
    The main part of our bot will be written here.
    """

    def __init__(self):

        # Instantiate all the models and TFIDF Objects.
        print("Loading resources...")
        # Instantiate a Chatterbot for Chitchat type questions
        from chatterbot import ChatBot
        from chatterbot.trainers import ChatterBotCorpusTrainer
        chatbot = ChatBot('ZoeyChatterbot')
        trainer = ChatterBotCorpusTrainer(chatbot)
        trainer.train('chatterbot.corpus.english')
        self.chitchat_bot = chatbot
        print("Loading Word2vec model...")
        # Instantiate the Google's pre-trained Word2Vec model.
        self.model = gensim.models.KeyedVectors.load_word2vec_format(
            'GoogleNews-vectors-negative300.bin', binary=True)
        print("Loading Classifier objects...")
        # Load the intent classifier and tag classifier
        self.intent_recognizer = pickle.load(
            open('resources/intent_clf.pkl', 'rb'))
        self.tag_classifier = pickle.load(open('resources/tag_clf.pkl', 'rb'))
        # Load the TFIDF vectorizer object
        self.tfidf_vectorizer = self.tag_classifier._vectorizer
        print("Finished Loading Resources")

    # We created this function just above. We just need to have a function to get most similar question's *post id* in the dataset given we know the programming Language of the question. Here it is:
    def get_similar_question(self, question, tag):
        # get the path where all question embeddings are kept and load the post_ids and post_embeddings
        embeddings_path = 'resources/embeddings_folder/' + tag + ".pkl"
        post_ids, post_embeddings = pickle.load(open(embeddings_path, 'rb'))
        # Get the embeddings for the question
        question_vec = question_to_vec(question, self.model, 300)
        # find index of most similar post
        best_post_index = pairwise_distances_argmin(question_vec,
                                                    post_embeddings)
        # return best post id
        return post_ids[best_post_index]

    def generate_answer(self, question):
        prepared_question = text_prepare(question)
        features = self.tfidf_vectorizer.transform([prepared_question])
        # find intent
        intent = self.intent_recognizer.predict(features)[0]
        # Chit-chat part:
        if intent == 'dialogue':
            response = self.chitchat_bot.get_response(question)
        # Stack Overflow Question
        else:
            # find programming language
            tag = self.tag_classifier.predict(features)[0]
            # find most similar question post id
            post_id = self.get_similar_question(question, tag)[0]
            # respond with
            response = 'I think its about %s\nThis thread might help you: https://stackoverflow.com/questions/%s' % (
                tag, post_id)
        return response


def main():
    token = '999394784:AAGG2PY_F49Z2w39oxY3wcjZ6P_LQP9tsag'
    simple_manager = SimpleDialogueManager()
    bot = BotHandler(token, simple_manager)
    ###############################################################

    print("Ready to talk!")
    offset = 0
    while True:
        updates = bot.get_updates(offset=offset)
        for update in updates:
            print("An update received.")
            if "message" in update:
                chat_id = update["message"]["chat"]["id"]
                if "text" in update["message"]:
                    text = update["message"]["text"]
                    if is_unicode(text):
                        print("Update content: {}".format(update))
                        bot.send_message(chat_id, bot.get_answer(
                            update["message"]["text"]))
                    else:
                        bot.send_message(
                            chat_id, "Hmm, you are sending some weird characters to me...")
            offset = max(offset, update['update_id'] + 1)
        time.sleep(1)


if __name__ == "__main__":
    main()

Loading resources...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Training ai.yml: [####################] 100%

Training computers.yml: [####################] 100%
Training conversations.yml: [####################] 100%
Training emotion.yml: [####################] 100%
Training food.yml: [####################] 100%
Training gossip.yml: [####################] 100%
Training greetings.yml: [####################] 100%
Training health.yml: [####################] 100%
Training history.yml: [####################] 100%
Training humor.yml: [####################] 100%
Training literature.yml: [####################] 100%
Training money.yml: [###################

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Loading Classifier objects...
Finished Loading Resources
Ready to talk!
An update received.
Update content: {'update_id': 937061020, 'message': {'message_id': 218, 'from': {'id': 794766696, 'is_bot': False, 'first_name': 'Anah Veronica', 'last_name': 'Immanuel', 'language_code': 'en'}, 'chat': {'id': 794766696, 'first_name': 'Anah Veronica', 'last_name': 'Immanuel', 'type': 'private'}, 'date': 1579106455, 'text': 'Remove NA values'}}
An update received.
Update content: {'update_id': 937061021, 'message': {'message_id': 219, 'from': {'id': 794766696, 'is_bot': False, 'first_name': 'Anah Veronica', 'last_name': 'Immanuel', 'language_code': 'en'}, 'chat': {'id': 794766696, 'first_name': 'Anah Veronica', 'last_name': 'Immanuel', 'type': 'private'}, 'date': 1579106474, 'text': '/start', 'entities': [{'offset': 0, 'length': 6, 'type': 'bot_command'}]}}
An update received.
Update content: {'update_id': 937061022, 'message': {'message_id': 220, 'from': {'id': 794766696, 'is_bot': False, 'first