In [1]:
import os
import re
import pickle
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
%%time
# Both corpora are stored as tab-separated files, so they share parser options.
tsv_options = {'sep': '\t'}

df_dialogues = pd.read_csv('./data/dialogues.tsv', **tsv_options)
df_stackoverflow = pd.read_csv('./data/tagged_posts.tsv', **tsv_options)

# Report the size of each raw frame as a quick sanity check.
print(f'df_dialogues shape: {df_dialogues.shape}')
print(f'df_stackoverflow shape: {df_stackoverflow.shape}')

df_dialogues shape: (218609, 2)
df_stackoverflow shape: (2171575, 3)
CPU times: user 2.38 s, sys: 358 ms, total: 2.74 s
Wall time: 2.74 s


In [3]:
# Draw the same number of random rows from each corpus so the two
# classes end up balanced; the fixed seed keeps the draw reproducible.
seed = 781
sample_size = 200_000

df_dialogues = df_dialogues.sample(n=sample_size, random_state=seed)
df_stackoverflow = df_stackoverflow.sample(n=sample_size, random_state=seed)

In [4]:
# Preview the first rows of the sampled dialogue frame (raw text, before cleaning).
df_dialogues.head()

Unnamed: 0,text,tag
154349,What's that got to do with you?,dialogue
105643,Nooo. Is it your story?,dialogue
122343,"No Bela, that's ""incorporates."" Look, just sa...",dialogue
183491,For getting a divorce?,dialogue
129003,"No danger of attack, as long as you don't trig...",dialogue


In [5]:
# Preview the first rows of the sampled StackOverflow frame (raw titles, before cleaning).
df_stackoverflow.head()

Unnamed: 0,post_id,title,tag
631024,9071076,C++ virtual method overload/override compiler ...,c_cpp
353311,5298353,Check a condition and also identify the patter...,php
1547617,23947511,isset($_POST['x']) only works if the submit bu...,php
70588,1353559,Trying to make this star output using a for lo...,c_cpp
534998,7753016,"Django+Postgres: ""current transaction is abort...",python


In [6]:
# Compiled once at definition time instead of on every call. Raw strings keep
# the regex escapes (\[ \] \|) from being read as invalid Python string escapes.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# Filled lazily by _english_stopwords() so the NLTK corpus is read only once.
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def text_prepare(text):
    """Normalise a raw text string for vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. replace the separators ``/ ( ) { } [ ] | @ , ;`` with a space;
      3. delete every character that is not ``0-9``, ``a-z``, space, ``#``,
         ``+`` or ``_`` (so tokens such as ``c++`` and ``c#`` survive);
      4. drop English stopwords and collapse runs of whitespace.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or the float NaN that
        pandas uses for missing cells) is treated as empty text.

    Returns
    -------
    str
        The cleaned, space-joined tokens; ``''`` when nothing is left.
    """
    # Missing cells arrive as NaN/None; treat them as empty instead of
    # crashing on ``.lower()``.
    if not isinstance(text, str):
        return ''

    stopwords_set = _english_stopwords()

    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)

    # str.split() never yields empty tokens, and joining non-empty tokens
    # leaves no surrounding whitespace, so no further filtering or strip is needed.
    return ' '.join(token for token in text.split() if token not in stopwords_set)

In [7]:
%%time
# Clean the text column of each frame with text_prepare, overwriting the raw values.
df_dialogues['text'] = df_dialogues['text'].apply(text_prepare)
df_stackoverflow['title'] = df_stackoverflow['title'].apply(text_prepare)

CPU times: user 50 s, sys: 5.22 s, total: 55.2 s
Wall time: 58 s


In [8]:
# Preview the dialogue rows again, now that `text` has been passed through text_prepare.
df_dialogues.head()

Unnamed: 0,text,tag
154349,whats got,dialogue
105643,nooo story,dialogue
122343,bela thats incorporates look say casket,dialogue
183491,getting divorce,dialogue
129003,danger attack long dont trigger injury system,dialogue


In [9]:
# Preview the StackOverflow rows again, now that `title` has been passed through text_prepare.
df_stackoverflow.head()

Unnamed: 0,post_id,title,tag
631024,9071076,c++ virtual method overload override compiler ...,c_cpp
353311,5298353,check condition also identify pattern url rege...,php
1547617,23947511,isset _post x works submit button namesubmit,php
70588,1353559,trying make star output using loop c,c_cpp
534998,7753016,django+postgres current transaction aborted co...,python


In [10]:
%%time
# Build one corpus: dialogue texts first, StackOverflow titles second.
X = np.concatenate((df_dialogues['text'].values, df_stackoverflow['title'].values))
# Labels follow the same order: 0 = dialogue, 1 = StackOverflow post.
y = [0] * len(df_dialogues) + [1] * len(df_stackoverflow)

# Hold out 10% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=seed,
)
print(f'Train size={len(X_train):,}, test size={len(X_test):,}')

Train size=360,000, test size=40,000
CPU times: user 120 ms, sys: 11.4 ms, total: 132 ms
Wall time: 131 ms


In [11]:
def tfidf(train, test):
    """Fit a TF-IDF vectorizer on the training texts and vectorise both splits.

    The vocabulary and IDF weights are learned from ``train`` only, so no
    information from ``test`` leaks into the features.

    Parameters
    ----------
    train : iterable of str
        Training documents used to fit the vectorizer.
    test : iterable of str
        Documents transformed with the already fitted vectorizer.

    Returns
    -------
    tuple
        ``(vectorizer, train_tfidf, test_tfidf)``: the fitted
        ``TfidfVectorizer`` and the TF-IDF matrices for the two splits.
    """
    vectorizer = TfidfVectorizer(
        # Ignore terms seen in fewer than 5 documents or in more than 90% of them.
        min_df=5, max_df=0.9,
        # Use unigrams and bigrams.
        ngram_range=(1, 2),
        # Tokens are maximal runs of non-whitespace, so 'c++' and 'c#' stay intact.
        # Raw string: '\S' is not a valid Python string escape.
        token_pattern=r'(\S+)'
    )
    # fit_transform is documented as equivalent to fit followed by transform,
    # without tokenising the training corpus a second time.
    train_tfidf = vectorizer.fit_transform(train)
    test_tfidf = vectorizer.transform(test)
    return vectorizer, train_tfidf, test_tfidf

In [12]:
%%time
# Fit the vectorizer on the training texts and build TF-IDF features for both splits.
vectorizer, X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

CPU times: user 11.8 s, sys: 222 ms, total: 12 s
Wall time: 12 s


In [13]:
%%time
# Train model: L2-regularised logistic regression on the TF-IDF features
# (label 0 = dialogue, 1 = StackOverflow post).
intent_recognizer = LogisticRegression(
    penalty='l2', C=10,
    random_state=seed,
    solver='liblinear'
)
intent_recognizer.fit(X_train_tfidf, y_train)

# Evaluate on the held-out split
y_pred = intent_recognizer.predict(X_test_tfidf)
acc = metrics.accuracy_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred)
print(f'acc={acc*100:.3f}% - f1={f1*100:.3f}%')

# Save models. The vectorizer is stored together with the classifier because
# new text must be transformed with the same fitted vocabulary.
# `with` guarantees each file is flushed and closed even if pickling fails.
with open('../api/src/dumps/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open('../api/src/dumps/intent_recognizer.pkl', 'wb') as model_file:
    pickle.dump(intent_recognizer, model_file)

acc=99.125% - f1=99.124%
CPU times: user 12.5 s, sys: 28.4 s, total: 40.9 s
Wall time: 6.53 s
