In [1]:
import os
import re
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Create the download directory with the standard library instead of a shell
# `!mkdir`: this works on any OS, and exist_ok=True makes the cell safe to re-run.
os.makedirs('./data', exist_ok=True)

In [3]:
!wget --no-check-certificate \
    https://github.com/hse-aml/natural-language-processing/releases/download/project/tagged_posts.tsv \
    -O ./data/tagged_posts.tsv

--2020-03-22 18:38:21--  https://github.com/hse-aml/natural-language-processing/releases/download/project/tagged_posts.tsv
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/112112945/8156d746-7325-11e8-8049-a22b3f232b02?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200322%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200322T183821Z&X-Amz-Expires=300&X-Amz-Signature=4bfaf524ac74eafaef10b87308139b8859078c49018d7798aad465e1618860a4&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dtagged_posts.tsv&response-content-type=application%2Foctet-stream [following]
--2020-03-22 18:38:21--  https://github-production-release-asset-2e65be.s3.amazonaws.com/112112945/8156d746-7325-11e8-8049-a22b3f232b02?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-

In [4]:
!wget --no-check-certificate \
    https://github.com/hse-aml/natural-language-processing/releases/download/project/dialogues.tsv \
    -O ./data/dialogues.tsv

--2020-03-22 18:38:56--  https://github.com/hse-aml/natural-language-processing/releases/download/project/dialogues.tsv
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/112112945/72da0808-eda3-11e7-80c5-8ee61be1a33e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20200322%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200322T183856Z&X-Amz-Expires=300&X-Amz-Signature=34ab34c7575d14578de0a89ece667df334a7f1ace74ded28ef672f5d500f4383&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Ddialogues.tsv&response-content-type=application%2Foctet-stream [following]
--2020-03-22 18:38:56--  https://github-production-release-asset-2e65be.s3.amazonaws.com/112112945/72da0808-eda3-11e7-80c5-8ee61be1a33e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Creden

In [5]:
def extract_tfidf_features(X_train, X_test, to_='./out'):
    """Fit a TF-IDF vectorizer on the training texts and vectorize both splits.

    The fitted vectorizer is pickled to ``<to_>/tfidf.pkl`` so it can be
    reloaded later without refitting.

    Args:
        X_train: Iterable of training documents (strings). The vocabulary and
            IDF weights are learned from these documents only.
        X_test: Iterable of documents to transform with the fitted vectorizer.
        to_: Output directory for ``tfidf.pkl``; created if it does not exist.

    Returns:
        Tuple ``(X_train, X_test)`` of TF-IDF document-term matrices.
    """
    # os.makedirs replaces the IPython-only `!mkdir {to_}`: it runs outside a
    # notebook, creates missing parent directories and tolerates an existing one.
    os.makedirs(to_, exist_ok=True)

    # Raw string keeps the regex identical while avoiding the invalid `\S`
    # escape-sequence warning of a plain string literal.
    vect = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')

    # fit_transform is equivalent to fit() followed by transform() on the same
    # data, but tokenizes the training corpus only once.
    X_train = vect.fit_transform(X_train)

    with open(os.path.join(to_, 'tfidf.pkl'), 'wb') as file:
        pickle.dump(vect, file)

    X_test = vect.transform(X_test)
    return X_train, X_test

In [6]:
# Fixed seed for repeatable sampling; sample_size rows are drawn from each corpus.
seed = 781
sample_size = 200000

# Both files are tab-separated; sample the same number of rows from each.
df_stackoverflow = (
    pd.read_csv('./data/tagged_posts.tsv', sep='\t')
    .sample(n=sample_size, random_state=seed)
)
df_dialogues = (
    pd.read_csv('./data/dialogues.tsv', sep='\t')
    .sample(n=sample_size, random_state=seed)
)

In [7]:
# Preview the first five sampled StackOverflow rows.
df_stackoverflow.head(n=5)

Unnamed: 0,post_id,title,tag
631024,9071076,C++ virtual method overload/override compiler ...,c_cpp
353311,5298353,Check a condition and also identify the patter...,php
1547617,23947511,isset($_POST['x']) only works if the submit bu...,php
70588,1353559,Trying to make this star output using a for lo...,c_cpp
534998,7753016,"Django+Postgres: ""current transaction is abort...",python


In [8]:
# Preview the first five sampled dialogue rows.
df_dialogues.head(n=5)

Unnamed: 0,text,tag
154349,What's that got to do with you?,dialogue
105643,Nooo. Is it your story?,dialogue
122343,"No Bela, that's ""incorporates."" Look, just sa...",dialogue
183491,For getting a divorce?,dialogue
129003,"No danger of attack, as long as you don't trig...",dialogue


In [9]:
# Compiled once at definition time rather than on every call: text_prepare is
# applied element by element to whole DataFrame columns.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopword set, loading it on first use only."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def text_prepare(text):
    """Performs tokenization and simple preprocessing.

    Steps, in order: lowercase the text; replace the separator characters
    ``/ ( ) { } [ ] | @ , ;`` with spaces; delete every character that is not
    a digit, a lowercase ASCII letter, a space, ``#``, ``+`` or ``_``; split
    on whitespace and drop English stopwords.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces (empty string if none).
    """
    stopwords_set = _english_stopwords()

    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)

    # str.split() never yields empty tokens, so the joined result has no
    # leading or trailing whitespace to strip.
    return ' '.join(token for token in text.split() if token not in stopwords_set)

In [10]:
%%time
# Overwrite each text column with its text_prepare-normalized version.
df_stackoverflow['title'] = df_stackoverflow['title'].apply(text_prepare)
df_dialogues['text'] = df_dialogues['text'].apply(text_prepare)

CPU times: user 51.4 s, sys: 3.9 s, total: 55.3 s
Wall time: 55.4 s


# Intent recognition

In [11]:
%%time
# Dialogue lines come first, StackOverflow titles second; the label list is
# built in the same order so texts and labels stay aligned.
dialogue_texts = df_dialogues.text.values
stackoverflow_titles = df_stackoverflow.title.values
X = np.concatenate((dialogue_texts, stackoverflow_titles))
y = ['dialogue'] * len(df_dialogues) + ['stackoverflow'] * len(df_stackoverflow)

# Hold out 10% of the combined corpus for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=seed
)
print(f'Train size={len(X_train)}, test size={len(X_test)}')

# Fits the vectorizer on the training split and pickles it under the default './out'.
X_train_tfidf, X_test_tfidf = extract_tfidf_features(X_train, X_test)

Train size=360000, test size=40000
CPU times: user 17.6 s, sys: 404 ms, total: 18 s
Wall time: 18.1 s


In [12]:
%%time
# L2-penalized logistic regression over the TF-IDF features of the two-label
# (dialogue / stackoverflow) training split.
intent_recognizer = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=10,
    random_state=seed,
)
intent_recognizer.fit(X_train_tfidf, y_train)

CPU times: user 18.9 s, sys: 1.09 s, total: 20 s
Wall time: 5.45 s


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=781, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [13]:
# Score the intent recognizer on the held-out split.
y_test_pred = intent_recognizer.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f'Test accuracy={test_accuracy}')

Test accuracy=0.99125


In [14]:
# Persist the fitted intent recognizer. The context manager closes the file
# deterministically so the pickle is fully written before later cells run.
with open('./out/intent_recognizer.pkl', 'wb') as intent_file:
    pickle.dump(intent_recognizer, intent_file)

# Programming language classification

In [15]:
# Features are the StackOverflow titles, targets their language tags.
X, y = df_stackoverflow.title.values, df_stackoverflow.tag.values

# Hold out 20% of the posts for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)
print(f'Train size={len(X_train)}, test size={len(X_test)}')

Train size=160000, test size=40000


In [16]:
# Reload the TF-IDF vectorizer pickled by extract_tfidf_features. The context
# manager closes the file handle instead of leaving it to the garbage collector.
# Only unpickle files produced by this notebook: pickle.load can execute arbitrary code.
# NOTE(review): the vectorizer was fit on the intent-recognition training split,
# which may contain titles that fall into this section's X_test - confirm this is acceptable.
with open('./out/tfidf.pkl', 'rb') as tfidf_file:
    vectorizer = pickle.load(tfidf_file)

X_train_tfidf, X_test_tfidf = vectorizer.transform(X_train), vectorizer.transform(X_test)

In [17]:
%%time
# One-vs-rest wrapper: one L2-penalized logistic regression per language tag.
tag_base_estimator = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=5,
    random_state=seed,
)
tag_classifier = OneVsRestClassifier(tag_base_estimator)
tag_classifier.fit(X_train_tfidf, y_train)

CPU times: user 44.6 s, sys: 2.18 s, total: 46.8 s
Wall time: 12.4 s


OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=781,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [18]:
# Score the tag classifier on the held-out posts.
y_test_pred = tag_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print(f'Test accuracy={test_accuracy}')

Test accuracy=0.801575


In [19]:
# Persist the fitted tag classifier. The context manager closes the file
# deterministically so the pickle is fully written to disk.
with open('./out/tag_classifier.pkl', 'wb') as tag_file:
    pickle.dump(tag_classifier, tag_file)

# Ranking questions with embeddings