In [1]:
import os
import re
import tqdm
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from gensim.models import KeyedVectors

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data

To detect the intent of users' questions, we will need two text collections:

- `tagged_posts.tsv` — StackOverflow posts, tagged with one programming language (positive samples).
- `dialogues.tsv` — dialogue phrases from movie subtitles (negative samples).

For questions that have a programming-related intent, we will proceed as follows: predict the programming language (we allow only one tag per question here) and rank candidates within the tag using embeddings. For the ranking part, we will need:

- `word_embeddings.tsv` — word embeddings, that you trained with StarSpace in the 3rd assignment. It's not a problem if you didn't do it, because we can offer an alternative solution for you.

As a result of this notebook, we should obtain the following new objects that we will then use in the running bot:

- `intent_recognizer.pkl` — intent recognition model;
- `tag_classifier.pkl` — programming language classification model;
- `tfidf_vectorizer.pkl` — vectorizer used during training;
- `thread_embeddings_by_tags` — folder with thread embeddings, arranged by tags.

In [2]:
# Create the data directory in pure Python rather than shelling out to
# `mkdir`: this works without a POSIX shell and is safe to re-run.
os.makedirs('./data', exist_ok=True)

In [3]:
# Download the tagged StackOverflow posts into ./data/tagged_posts.tsv.
# NOTE(review): --no-check-certificate disables TLS certificate verification
# and the redirect to /dev/null hides any download error — confirm both are
# intended.
!wget --no-check-certificate \
    https://github.com/hse-aml/natural-language-processing/releases/download/project/tagged_posts.tsv \
    -O ./data/tagged_posts.tsv > /dev/null 2>&1

In [4]:
# Download the movie-subtitle dialogue phrases into ./data/dialogues.tsv.
# NOTE(review): --no-check-certificate disables TLS certificate verification
# and the redirect to /dev/null hides any download error — confirm both are
# intended.
!wget --no-check-certificate \
    https://github.com/hse-aml/natural-language-processing/releases/download/project/dialogues.tsv \
    -O ./data/dialogues.tsv > /dev/null 2>&1

In [5]:
# Seed shared by every random operation in the notebook, and the number of
# rows drawn from each text collection.
seed = 781
sample_size = 200000


def _read_sample(tsv_path):
    """Read a tab-separated file and draw `sample_size` rows using `seed`."""
    frame = pd.read_csv(tsv_path, sep='\t')
    return frame.sample(n=sample_size, random_state=seed)


df_stackoverflow = _read_sample('./data/tagged_posts.tsv')
df_dialogues = _read_sample('./data/dialogues.tsv')

In [6]:
df_stackoverflow.head()

Unnamed: 0,post_id,title,tag
631024,9071076,C++ virtual method overload/override compiler ...,c_cpp
353311,5298353,Check a condition and also identify the patter...,php
1547617,23947511,isset($_POST['x']) only works if the submit bu...,php
70588,1353559,Trying to make this star output using a for lo...,c_cpp
534998,7753016,"Django+Postgres: ""current transaction is abort...",python


In [7]:
df_dialogues.head()

Unnamed: 0,text,tag
154349,What's that got to do with you?,dialogue
105643,Nooo. Is it your story?,dialogue
122343,"No Bela, that's ""incorporates."" Look, just sa...",dialogue
183491,For getting a divorce?,dialogue
129003,"No danger of attack, as long as you don't trig...",dialogue


# Part I. Intent and language recognition

We want to write a bot that will not only **answer programming-related questions**, but will also be able to **maintain a dialogue**. We would also like to detect the *intent* of the user from the question (we could have had a 'Question answering mode' check-box in the bot, but it wouldn't be fun at all). So the first thing we need to do is to **distinguish programming-related questions from general ones**.

It would also be good to predict which programming language a particular question refers to. By doing so, we will speed up question search by a factor of the number of languages (10 here).

## Data preparation

In [8]:
# Compiled once at definition time instead of on every call. Raw strings keep
# the backslashes literal (non-raw '\[' is an invalid escape sequence).
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopword set, building it only on first use."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def text_prepare(text, stopwords_set=None):
    """Performs tokenization and simple preprocessing.

    text: a string
    stopwords_set: optional collection of tokens to drop; when omitted, the
        NLTK English stopword list is used (loaded once and cached)

    result: lower-cased text in which the separator characters
        ``/(){}[]|@,;`` became spaces, every character outside
        ``0-9a-z #+_`` was deleted and stopwords were removed; the remaining
        tokens are joined by single spaces
    """
    if stopwords_set is None:
        stopwords_set = _english_stopwords()

    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)
    # str.split() never yields empty tokens, so only the stopword filter is needed.
    return ' '.join(token for token in text.split() if token not in stopwords_set)

In [9]:
%%time
df_stackoverflow.title = df_stackoverflow.title.apply(text_prepare)
df_dialogues.text = df_dialogues.text.apply(text_prepare)

CPU times: user 47.5 s, sys: 5.2 s, total: 52.7 s
Wall time: 52.7 s


## Intent recognition

We will do a binary classification on TF-IDF representations of texts. Labels will be either `dialogue` for general questions or `stackoverflow` for programming-related questions. First, we prepare the data for this task:

- concatenate dialogue and stackoverflow examples into one sample
- split it into train and test in proportion 90/10 %, using a fixed `random_state` (the notebook's `seed`) for reproducibility
- transform it into TF-IDF features

In [10]:
def extract_tfidf_features(X_train, X_test, to_='./out'):
    """Fit a TF-IDF vectorizer on the training texts and vectorize both splits.

    The fitted vectorizer is pickled to ``<to_>/tfidf_vectorizer.pkl`` so the
    same vocabulary can be reused later.

    X_train: iterable of training texts; the vocabulary is learned from it
    X_test: iterable of test texts
    to_: output directory for the pickled vectorizer (created if missing)

    result: tuple (X_train, X_test) of TF-IDF feature matrices
    """
    # Pure-Python directory creation: no shell interpolation of `to_`, and it
    # also creates missing parent directories.
    os.makedirs(to_, exist_ok=True)

    # Raw string so that `\S` reaches the regex engine as written.
    vect = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')
    # fit_transform is equivalent to fit followed by transform, but makes a
    # single pass over the training texts.
    X_train = vect.fit_transform(X_train)
    with open(os.path.join(to_, 'tfidf_vectorizer.pkl'), 'wb') as file:
        pickle.dump(vect, file)
    X_test = vect.transform(X_test)
    return X_train, X_test

In [11]:
%%time
X = np.concatenate([df_dialogues.text.values, df_stackoverflow.title.values])
y = ['dialogue'] * df_dialogues.shape[0] + ['stackoverflow'] * df_stackoverflow.shape[0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)
print(f'Train size={len(X_train)}, test size={len(X_test)}')

X_train_tfidf, X_test_tfidf = extract_tfidf_features(X_train, X_test)

Train size=360000, test size=40000
CPU times: user 17.5 s, sys: 302 ms, total: 17.8 s
Wall time: 17.8 s


In [12]:
%%time
intent_recognizer = LogisticRegression(penalty='l2', C=10, random_state=seed, solver='liblinear')
intent_recognizer.fit(X_train_tfidf, y_train)

CPU times: user 13.6 s, sys: 30.4 s, total: 43.9 s
Wall time: 6.55 s


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=781, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
# Score the intent recognizer on the held-out test split.
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test accuracy={test_accuracy}')

Test accuracy=0.99125


In [14]:
# Persist the trained intent model. The context manager guarantees the file
# is flushed and closed, which a bare open() passed to pickle.dump does not.
with open('./out/intent_recognizer.pkl', 'wb') as file:
    pickle.dump(intent_recognizer, file)

## Programming Language Classification

We will train one more classifier for the programming-related questions. It will predict exactly one tag (=programming language) and will be also based on Logistic Regression with TF-IDF features.

First, let us prepare the data for this task.

In [15]:
# Language classification uses only the StackOverflow titles, labelled by tag.
# Note: this rebinds X, y and the train/test split variables that the
# intent-recognition cells defined earlier.
X = df_stackoverflow.title.values
y = df_stackoverflow.tag.values

# 80/20 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
print(f'Train size={len(X_train)}, test size={len(X_test)}')

Train size=160000, test size=40000


In [16]:
# Reuse the vectorizer fitted for intent recognition so both models share one
# feature space. The context manager closes the file handle after loading.
with open('./out/tfidf_vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

X_train_tfidf = vectorizer.transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [17]:
%%time
tag_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', C=5, random_state=seed, solver='liblinear'))
tag_classifier.fit(X_train_tfidf, y_train)

CPU times: user 37.6 s, sys: 1min 27s, total: 2min 5s
Wall time: 18.1 s


OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=781,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [18]:
# Score the tag classifier on the held-out test split.
y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test accuracy={test_accuracy}')

Test accuracy=0.801625


In [19]:
# Persist the trained tag classifier. The context manager guarantees the file
# is flushed and closed, which a bare open() passed to pickle.dump does not.
with open('./out/tag_classifier.pkl', 'wb') as file:
    pickle.dump(tag_classifier, file)

# Part II: Ranking questions with embeddings

To find a relevant answer (a thread from StackOverflow) on a question we will use vector representations to calculate similarity between the question and existing threads. We create `question_to_vec` function, which can make such a representation based on word vectors.

However, it would be costly to compute such a representation for all possible answers in the online mode of the bot (e.g. when the bot is running and answering questions from many users). This is the reason why we will create a database with pre-computed representations. These representations will be arranged by non-overlapping tags (programming languages), so that the search for the answer can be performed within only one tag each time. This will make our bot even more efficient and avoid keeping the whole database in RAM.

In [20]:
def question_to_vec(question, embeddings, dim=300):
    """
        Averages the embeddings of the question's known words.

        question: a string
        embeddings: mapping where the key is a word and the value is its embedding
        dim: size of the representation

        result: vector representation for the question; a zero vector of
                length `dim` when none of its words has an embedding
    """
    known_vectors = [
        embeddings[word]
        for word in question.split()
        if word in embeddings
    ]
    if not known_vectors:
        return np.zeros((dim,))
    return np.stack(known_vectors).mean(axis=0)

In [21]:
# Fetch the pre-trained GoogleNews word2vec archive into ./data
# (-c asks wget to continue a partially downloaded file).
!wget -c https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz \
    -O ./data/GoogleNews-vectors-negative300.bin.gz

--2020-03-30 14:03:44--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.37.158
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.37.158|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘./data/GoogleNews-vectors-negative300.bin.gz’


2020-03-30 14:07:26 (7.08 MB/s) - ‘./data/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [25]:
# Decompress into a separate .bin file; -c writes to stdout, so the .gz
# archive is kept.
!gunzip -c ./data/GoogleNews-vectors-negative300.bin.gz > ./data/GoogleNews-vectors-negative300.bin

In [32]:
# Load the binary word2vec vectors.
w2v_embeddings = KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)
# Read the dimensionality from the model itself rather than probing one
# particular token, which would fail for a vocabulary lacking that token.
embeddings_dim = w2v_embeddings.vector_size

Since we want to precompute representations for all possible answers, we need to load the whole posts dataset, unlike what we did for the intent classifier:

In [33]:
# Load the complete posts table (no sampling here).
posts_df = pd.read_csv('./data/tagged_posts.tsv', sep='\t')

In [34]:
posts_df.head()

Unnamed: 0,post_id,title,tag
0,9,Calculate age in C#,c#
1,16,Filling a DataSet or DataTable from a LINQ que...,c#
2,39,Reliable timer in a console application,c#
3,42,Best way to allow plugins for a PHP application,php
4,59,"How do I get a distinct, ordered list of names...",c#


In [35]:
# Number of posts per tag. size() counts rows directly, instead of taking the
# maximum of the per-column non-null counts.
counts_by_tag = posts_df.groupby('tag').size()
counts_by_tag.head()

tag
c#            394451
c_cpp         281300
java          383456
javascript    375867
php           321752
dtype: int64

Now, for each tag, we need to create two data structures, which will serve as an online search index:

- `tag_post_ids` — a list of post_ids with shape `(counts_by_tag[tag],)`. It will be needed to show the title and link to the thread;
- `tag_vectors` — a matrix with shape `(counts_by_tag[tag], embeddings_dim)` where embeddings for each answer are stored.


In [37]:
# Create the output folder (and ./out itself if it is missing) in pure
# Python; safe to re-run and independent of a shell `mkdir`.
os.makedirs('./out/thread_embeddings_by_tags', exist_ok=True)

In [39]:
# Build and store the per-tag search index: the post ids plus one embedding
# row per post title.
for tag, count in tqdm.tqdm(counts_by_tag.items()):
    tag_posts = posts_df[posts_df['tag'] == tag]

    tag_post_ids = tag_posts.post_id.tolist()

    # One row per post of this tag, filled in the same order as tag_post_ids.
    tag_vectors = np.zeros((count, embeddings_dim), dtype=np.float32)
    for i, title in enumerate(tag_posts['title']):
        tag_vectors[i, :] = question_to_vec(title, w2v_embeddings, embeddings_dim)

    # Dump post ids and vectors to a file. The context manager closes each
    # handle as soon as its tag is written instead of leaving it open.
    filename = os.path.join('./out/thread_embeddings_by_tags', os.path.normpath('%s.pkl' % tag))
    with open(filename, 'wb') as file:
        pickle.dump((tag_post_ids, tag_vectors), file)

10it [01:56, 11.62s/it]
