In [1]:
import re
import ast
import tqdm
import pickle
import collections
import numpy as np
import scipy as sp
import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, make_scorer

import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

SEED = 78

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Text processing

In [2]:
train = pd.read_csv('./data/train.tsv', sep='\t')
test = pd.read_csv('./data/test.tsv', sep='\t')

print('Shape of data')
print(f'Train: {train.shape}')
print(f'Test: {test.shape}')

train.head()

Shape of data
Train: (100000, 2)
Test: (30000, 2)


Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"
2,How to terminate windows phone 8.1 app,['c#']
3,get current time in a specific country via jquery,"['javascript', 'jquery']"
4,Configuring Tomcat to Use SSL,['java']


In [3]:
help(ast.literal_eval)

Help on function literal_eval in module ast:

literal_eval(node_or_string)
    Safely evaluate an expression node or a string containing a Python
    expression.  The string or node provided may only consist of the following
    Python literal structures: strings, bytes, numbers, tuples, lists, dicts,
    sets, booleans, and None.



In [4]:
train.tags = train.tags.apply(ast.literal_eval)
test.tags = test.tags.apply(ast.literal_eval)

In [5]:
X_train, y_train = train.title.values, train.tags.values
X_test, y_test = test.title.values, test.tags.values

In [6]:
def clean_text(text):
    text = text.strip().lower()
    text = re.sub('[/(){}\[\]\|@,;]', ' ', text)
    text = re.sub('[^0-9a-z #+_]', '', text)
    text = ' '.join([word for word in str(text).split()
                     if word not in set(stopwords.words('english'))])
    return text

In [7]:
X_train = [clean_text(text) for text in tqdm.tqdm(X_train)]
X_test = [clean_text(text) for text in tqdm.tqdm(X_test)]

100%|██████████| 100000/100000 [01:33<00:00, 1072.50it/s]
100%|██████████| 30000/30000 [00:28<00:00, 1064.83it/s]


In [8]:
X_train[:3]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app']

In [9]:
word2count = collections.defaultdict(lambda: 0)
for text in X_train:
    for word in text.split():
        word2count[word] += 1

tag2count = collections.defaultdict(lambda: 0)
for tags in y_train:
    for tag in tags:
        tag2count[tag] += 1

In [10]:
most_common_tags = sorted(tag2count.items(),
                          key=lambda x: x[1],
                          reverse=True)
most_common_words = sorted(word2count.items(),
                          key=lambda x: x[1],
                          reverse=True)
print(most_common_tags[:3])
print(most_common_words[:3])

[('javascript', 19078), ('c#', 19077), ('java', 18661)]
[('using', 8278), ('php', 5614), ('java', 5501)]


## Label processing

In [11]:
%%time
mlb = MultiLabelBinarizer(classes=sorted(tag2count.keys()))
y_train = mlb.fit_transform(y_train)
y_test = mlb.fit_transform(y_test)

print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

y_train shape: (100000, 100)
y_test shape: (30000, 100)
CPU times: user 148 ms, sys: 52 ms, total: 200 ms
Wall time: 199 ms


## Feature extraction

In [12]:
print(f'Number of words: {len(word2count)}')

Number of words: 31497


In [13]:
VOCAB_SIZE = 5000

WORD2INDEX = {token[0]: i
              for i, token in enumerate(most_common_words[:VOCAB_SIZE])}
INDEX2WORD = {v: k for k, v in WORD2INDEX.items()}

In [14]:
def get_bow(text, word2index=WORD2INDEX, vocab_size=VOCAB_SIZE):
    vect = np.zeros(vocab_size)
    for word in text.split():
        if word in word2index:
            vect[word2index[word]] += 1
    return vect

In [15]:
X_train_bow = sp.sparse.vstack([sp.sparse.csr_matrix(get_bow(text))
                                for text in tqdm.tqdm(X_train)])
X_test_bow = sp.sparse.vstack([sp.sparse.csr_matrix(get_bow(text))
                               for text in tqdm.tqdm(X_test)])

print('X_train_bow shape:', X_train_bow.shape)
print('X_test_bow shape:', X_test_bow.shape)

100%|██████████| 100000/100000 [00:27<00:00, 3667.19it/s]
100%|██████████| 30000/30000 [00:07<00:00, 3793.12it/s]


X_train_bow shape: (100000, 5000)
X_test_bow shape: (30000, 5000)


In [16]:
%%time
tfidf_vectorizer = TfidfVectorizer(token_pattern='\S+',
                                   min_df=5, max_df=0.9,
                                   ngram_range=(1, 2))
tfidf_vectorizer.fit(X_train)

X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print('X_train_tfidf shape:', X_train_tfidf.shape)
print('X_test_tfidf shape:', X_test_tfidf.shape)

X_train_tfidf shape: (100000, 18300)
X_test_tfidf shape: (30000, 18300)
CPU times: user 4.4 s, sys: 28 ms, total: 4.43 s
Wall time: 4.43 s


## Modeling

In [17]:
def train_model(clf, X_tr, y_tr, params=None):
    if params is None:
        model = OneVsRestClassifier(estimator=clf)
        model.fit(X_tr, y_tr)
        return model
    else:
        params = {f'estimator__{k}': v for k, v in params.items()}
        # print(params)
        gs = GridSearchCV(OneVsRestClassifier(estimator=clf),
                          param_grid=params, cv=5,
                          scoring=make_scorer(f1_score,
                                              average='weighted'),
                          verbose=10, n_jobs=-1)
        gs.fit(X_tr, y_tr)
        print(f'Best CV score: {gs.best_score_:.5f}')
        return gs.best_estimator_

In [18]:
lr_params = {
    'C': [0.1, 1, 5, 10],
    'penalty': ['l1', 'l2'],
    'random_state': [SEED],
    'max_iter': [1000]
}

In [19]:
%%time
clf_bow = train_model(LogisticRegression(),
                      X_tr=X_train_bow,
                      y_tr=y_train,
                      params=lr_params)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 20.6min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 26.1min finished


Best CV score: 0.64910
CPU times: user 2min 57s, sys: 272 ms, total: 2min 57s
Wall time: 29min 3s


In [28]:
clf_bow.

OneVsRestClassifier(estimator=LogisticRegression(C=5, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=78,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [20]:
%%time
clf_tfidf = train_model(LogisticRegression(),
                        X_tr=X_train_tfidf,
                        y_tr=y_train,
                        params=lr_params)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 19.2min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 24.5min finished


Best CV score: 0.64637
CPU times: user 3min 31s, sys: 152 ms, total: 3min 31s
Wall time: 28min 2s


In [29]:
clf_tfidf

OneVsRestClassifier(estimator=LogisticRegression(C=10, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=1000,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=78,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [35]:
y_pred_bow = clf_bow.predict(X_test_bow)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)

f1_score_bow = f1_score(y_test, y_pred_bow, average='weighted')
f1_score_tfidf = f1_score(y_test, y_pred_tfidf, average='weighted')

print(f'LR + BOW - F1 score: {f1_score_bow:.5f}')
print(f'LR + TF-IDF - F1 score: {f1_score_tfidf:.5f}')

LR + BOW - F1 score: 0.65434
LR + TF-IDF - F1 score: 0.65400


In [34]:
for idx in np.random.randint(0, len(test), size=10):
    x_test_bow = X_test_bow[idx].reshape(1, -1)
    x_test_tfidf = X_test_tfidf[idx].reshape(1, -1)
    y_pred_tags_bow = mlb.inverse_transform(clf_bow.predict(x_test_bow))
    y_pred_tags_tfidf = mlb.inverse_transform(clf_tfidf.predict(x_test_tfidf))
    print('Content: \t', test.title.iloc[idx])
    print('\tTrue tags: \t\t', ', '.join(test.tags.iloc[idx]))
    print('\tBoW pred tags: \t\t', ', '.join(y_pred_tags_bow[0]))
    print('\tTF-IDF pred tags: \t', ', '.join(y_pred_tags_tfidf[0]))
    print('====================================================')

Content: 	 How to check whether SQL Server 2005 or 2008 is installed
	True tags: 		 c#, sql-server
	BoW pred tags: 		 
	TF-IDF pred tags: 	 c#
Content: 	 Making HTML elements editable after a click of a button and save on the database
	True tags: 		 javascript, php, html, css
	BoW pred tags: 		 html, javascript
	TF-IDF pred tags: 	 html, javascript
Content: 	 Time.zone & in_time_zone display wrong time in views
	True tags: 		 ruby-on-rails, ruby
	BoW pred tags: 		 datetime
	TF-IDF pred tags: 	 datetime
Content: 	 Combine two integer arrays into one array in java
	True tags: 		 java, arrays
	BoW pred tags: 		 arrays, java
	TF-IDF pred tags: 	 arrays, java
Content: 	 Importing From Sister Subdirectories in Python?
	True tags: 		 python
	BoW pred tags: 		 python
	TF-IDF pred tags: 	 python
Content: 	 Django - Post ajax request forbidden 403 with ExtJS
	True tags: 		 ajax, django
	BoW pred tags: 		 ajax, django
	TF-IDF pred tags: 	 ajax, django, javascript
Content: 	 Java: simplest integer

## Most important features

In [45]:
def print_most_k_important_features(clf, tag, tags_classes, k, index2word):
    coef = clf.coef_[tags_classes.index(tag)]
    indexes = coef.argsort()
    top_pos_words = [index2word[idx] for idx in indexes[-k:]]
    top_neg_words = [index2word[idx] for idx in indexes[:k]]
    print('\tTop 5 positive words: {}'.format(', '.join(top_pos_words)))
    print('\tTop 5 negative words: {}'.format(', '.join(top_neg_words)))

In [49]:
tfidf_inv_vocab = {v:k for k, v in tfidf_vectorizer.vocabulary_.items()}

for idx in np.random.randint(0, len(tag2count), size=10):
    tag = [*tag2count.keys()][idx]
    print(f'Tag: {tag}')
    print('    BoW')
    print_most_k_important_features(clf_bow, tag, mlb.classes, 5, INDEX2WORD)
    print('    TF-IDF')
    print_most_k_important_features(clf_tfidf, tag, mlb.classes, 5, tfidf_inv_vocab)
    print('============================================================')

Tag: qt
    BoW
	Top 5 positive words: qtcreator, qt5, qstring, qml, qt
	Top 5 negative words: outside, work, 51, containing, calling
    TF-IDF
	Top 5 positive words: qt5, qstring, qtableview, qml, qt
	Top 5 negative words: java, 51, php, python, javascript
Tag: parsing
    BoW
	Top 5 positive words: jobject, parsecom, parse, parsing, parser
	Top 5 negative words: built, class, efficient, problems, iphone
    TF-IDF
	Top 5 positive words: read_csv, parsecom, parser, parsing, parse
	Top 5 negative words: class, xml string, iphone, write, json array
Tag: google-maps
    BoW
	Top 5 positive words: markers, maps, smoothly, google, polyline
	Top 5 negative words: initialize, oracle, displaying, extern, id
    TF-IDF
	Top 5 positive words: maps, markers, googlemap, polyline, google
	Top 5 negative words: displaying, disappear, extern, redirect, style
Tag: cocoa-touch
    BoW
	Top 5 positive words: nsuserdefaults, obtained, afnetworking, uipickerview, customizing
	Top 5 negative words: php, 

In [27]:
pickle.dump(clf_bow, open('./trained models/clf_bow.pkl', 'wb'))
pickle.dump(clf_bow, open('./trained models/clf_tfidf.pkl', 'wb'))