In [1]:
import re
import ast
import tqdm
import collections
import numpy as np
import scipy as sp
import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, make_scorer

import warnings
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

SEED = 78

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Text processing

In [2]:
# Read the tab-separated train and test splits from the local data directory.
train = pd.read_csv('./data/train.tsv', sep='\t')
test = pd.read_csv('./data/test.tsv', sep='\t')

# Report the size of each split before any processing happens.
print('Shape of data',
      f'Train: {train.shape}',
      f'Test: {test.shape}',
      sep='\n')

# Last expression of the cell: preview the first training rows.
train.head()

Shape of data
Train: (100000, 2)
Test: (30000, 2)


Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"
2,How to terminate windows phone 8.1 app,['c#']
3,get current time in a specific country via jquery,"['javascript', 'jquery']"
4,Configuring Tomcat to Use SSL,['java']


In [3]:
# Exploratory: show the docstring of ast.literal_eval, which is used in the
# next cell to turn the stringified tag lists back into Python lists.
help(ast.literal_eval)

Help on function literal_eval in module ast:

literal_eval(node_or_string)
    Safely evaluate an expression node or a string containing a Python
    expression.  The string or node provided may only consist of the following
    Python literal structures: strings, bytes, numbers, tuples, lists, dicts,
    sets, booleans, and None.



In [4]:
# The tags column is stored as text such as "['php', 'mysql']";
# literal_eval converts each cell into a real Python list.
train['tags'] = train['tags'].apply(ast.literal_eval)
test['tags'] = test['tags'].apply(ast.literal_eval)

In [5]:
# Raw titles are the model inputs; the parsed tag lists are the targets.
X_train = train['title'].values
y_train = train['tags'].values
X_test = test['title'].values
y_test = test['tags'].values

In [6]:
# Lazily-filled cache so the NLTK stop-word corpus is read only once per kernel.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Normalise a raw title into a space-separated string of tokens.

    Steps:
      1. strip surrounding whitespace and lower-case the text;
      2. replace each of the characters ``/(){}[]|@,;`` with a space;
      3. drop every character that is not a digit, a lower-case ASCII
         letter, a space, ``#``, ``+`` or ``_``;
      4. remove stop words and collapse runs of whitespace.

    Parameters
    ----------
    text : str
        Raw title.
    stop_words : collection of str, optional
        Words to remove. When omitted, the NLTK English stop-word list is
        used (loaded once and cached).

    Returns
    -------
    str
        The cleaned text.
    """
    if stop_words is None:
        # Previously the stop-word set was rebuilt for every single word,
        # which dominated the run time of the cleaning cell.
        stop_words = _english_stopwords()

    text = text.strip().lower()
    # Raw strings keep the regex escapes from being read as (invalid)
    # Python string escapes.
    text = re.sub(r'[/(){}\[\]\|@,;]', ' ', text)
    text = re.sub(r'[^0-9a-z #+_]', '', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [7]:
# Clean every title; tqdm wraps the input so a progress bar is shown per split.
X_train = list(map(clean_text, tqdm.tqdm(X_train)))
X_test = list(map(clean_text, tqdm.tqdm(X_test)))

100%|██████████| 100000/100000 [01:33<00:00, 1074.06it/s]
100%|██████████| 30000/30000 [00:27<00:00, 1071.99it/s]


In [8]:
# Sanity check: show the first three cleaned training titles.
X_train[:3]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app']

In [9]:
# Frequency of every token across the cleaned training titles.
word2count = collections.defaultdict(int)
for token in (tok for title in X_train for tok in title.split()):
    word2count[token] += 1

# Frequency of every tag across the training labels.
tag2count = collections.defaultdict(int)
for label in (lbl for tag_list in y_train for lbl in tag_list):
    tag2count[label] += 1

In [10]:
# Order (item, count) pairs from most to least frequent. Sorting on the
# negated count with a stable sort keeps ties in their original order.
most_common_tags = sorted(tag2count.items(), key=lambda pair: -pair[1])
most_common_words = sorted(word2count.items(), key=lambda pair: -pair[1])

# Show the three most frequent tags, then the three most frequent words.
for ranking in (most_common_tags, most_common_words):
    print(ranking[:3])

[('javascript', 19078), ('c#', 19077), ('java', 18661)]
[('using', 8278), ('php', 5614), ('java', 5501)]


## Label processing

In [11]:
%%time
# Fix the column order explicitly (sorted training tags) so train and test
# label matrices share the same layout.
mlb = MultiLabelBinarizer(classes=sorted(tag2count.keys()))
y_train = mlb.fit_transform(y_train)
# Encode the test labels with the binarizer fitted on the training labels;
# the test split must only be transformed, never used for fitting.
y_test = mlb.transform(y_test)

print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

y_train shape: (100000, 100)
y_test shape: (30000, 100)
CPU times: user 136 ms, sys: 76 ms, total: 212 ms
Wall time: 211 ms


## Feature extraction

In [12]:
# Number of distinct tokens seen in the cleaned training titles.
print('Number of words:', len(word2count))

Number of words: 31497


In [13]:
# Size of the bag-of-words vocabulary: only the most frequent words are kept.
VOCAB_SIZE = 5000

# word -> column index, assigned in order of decreasing frequency.
WORD2INDEX = {word: rank
              for rank, (word, _count) in enumerate(most_common_words[:VOCAB_SIZE])}
# column index -> word (inverse of WORD2INDEX).
INDEX2WORD = dict(zip(WORD2INDEX.values(), WORD2INDEX.keys()))

In [14]:
def get_bow(text, word2index=WORD2INDEX, vocab_size=VOCAB_SIZE):
    """Build a bag-of-words count vector for a whitespace-tokenised text.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by whitespace.
    word2index : mapping
        Maps a known word to its position in the output vector.
    vocab_size : int
        Length of the output vector.

    Returns
    -------
    numpy.ndarray
        Vector of length ``vocab_size``; entry ``i`` is how many times the
        word mapped to ``i`` occurs in ``text``. Unknown words are ignored.
    """
    counts = np.zeros(vocab_size)
    # Look up the slot of every in-vocabulary token first, then tally.
    positions = [word2index[token] for token in text.split() if token in word2index]
    for position in positions:
        counts[position] += 1
    return counts

In [15]:
def _bow_matrix(texts):
    """Stack one sparse bag-of-words row per text into a single sparse matrix."""
    rows = [sp.sparse.csr_matrix(get_bow(text)) for text in tqdm.tqdm(texts)]
    return sp.sparse.vstack(rows)


X_train_bow = _bow_matrix(X_train)
X_test_bow = _bow_matrix(X_test)

print(f'X_train_bow shape: {X_train_bow.shape}')
print(f'X_test_bow shape: {X_test_bow.shape}')

100%|██████████| 100000/100000 [00:26<00:00, 3733.64it/s]
100%|██████████| 30000/30000 [00:07<00:00, 3817.42it/s]


X_train_bow shape: (100000, 5000)
X_test_bow shape: (30000, 5000)


In [16]:
%%time
# Tokens are any run of non-whitespace characters, so tags such as "c#" and
# "c++" survive intact. The pattern is a raw string: '\S' in a normal string
# literal is an invalid escape sequence.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r'\S+',
                                   min_df=5, max_df=0.9,
                                   ngram_range=(1, 2))
# Fit the vocabulary and IDF weights on the training titles only.
tfidf_vectorizer.fit(X_train)

X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print('X_train_tfidf shape:', X_train_tfidf.shape)
print('X_test_tfidf shape:', X_test_tfidf.shape)

X_train_tfidf shape: (100000, 18300)
X_test_tfidf shape: (30000, 18300)
CPU times: user 4.43 s, sys: 48 ms, total: 4.48 s
Wall time: 4.47 s


## Modeling

In [17]:
def train_model(clf, X_tr, y_tr, params=None):
    """Fit a one-vs-rest wrapper around ``clf``, optionally with grid search.

    Parameters
    ----------
    clf : estimator
        Base estimator that is wrapped in ``OneVsRestClassifier``.
    X_tr, y_tr
        Training features and targets, passed straight to ``fit``.
    params : dict, optional
        Hyper-parameter grid for ``clf`` (name -> list of values). When
        given, a 10-fold ``GridSearchCV`` scored by weighted F1 is run.

    Returns
    -------
    The fitted one-vs-rest model: fitted directly when ``params`` is None,
    otherwise the best estimator found by the grid search.
    """
    ovr = OneVsRestClassifier(estimator=clf)

    # No grid supplied: plain fit with the estimator's current settings.
    if params is None:
        ovr.fit(X_tr, y_tr)
        return ovr

    # The base estimator sits behind the wrapper's ``estimator`` attribute,
    # so every grid key needs the ``estimator__`` prefix.
    grid = {}
    for name, values in params.items():
        grid[f'estimator__{name}'] = values

    weighted_f1 = make_scorer(f1_score, average='weighted')
    search = GridSearchCV(ovr,
                          param_grid=grid,
                          cv=10,
                          scoring=weighted_f1,
                          verbose=3,
                          n_jobs=-1)
    search.fit(X_tr, y_tr)
    print(f'Best CV score: {search.best_score_:.5f}')
    return search.best_estimator_

In [18]:
# Hyper-parameter grid for the logistic-regression base estimator.
# Single-element lists pin a setting while keeping the grid format.
lr_params = dict(
    C=[0.1, 1, 5, 10],
    penalty=['l1', 'l2'],
    random_state=[SEED],
    max_iter=[1000],
)

In [None]:
%%time
# The grid searches both 'l1' and 'l2' penalties. The solver is pinned to
# liblinear because it supports both; the default solver in recent
# scikit-learn releases (lbfgs) rejects 'l1', which would make half of the
# grid candidates fail.
clf_bow = train_model(LogisticRegression(solver='liblinear'),
                      X_tr=X_train_bow,
                      y_tr=y_train,
                      params=lr_params)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  5.2min


In [None]:
%%time
# The grid searches both 'l1' and 'l2' penalties. The solver is pinned to
# liblinear because it supports both; the default solver in recent
# scikit-learn releases (lbfgs) rejects 'l1', which would make half of the
# grid candidates fail.
clf_tfidf = train_model(LogisticRegression(solver='liblinear'),
                        X_tr=X_train_tfidf,
                        y_tr=y_train,
                        params=lr_params)

In [None]:
# Weighted F1 on the held-out test split. Despite the ``y_pred_*`` names,
# these variables hold scores, not predictions.
y_pred_bow = f1_score(y_test, clf_bow.predict(X_test_bow),
                      average='weighted')
# Each model must be evaluated on the feature representation it was trained
# on: the TF-IDF model takes the TF-IDF test matrix, not the BoW one.
y_pred_tfidf = f1_score(y_test, clf_tfidf.predict(X_test_tfidf),
                        average='weighted')

print('LR + BOW - F1 score:', y_pred_bow)
print('LR + TF-IDF - F1 score:', y_pred_tfidf)

## Analysis

In [None]:
def print_most_important_features(clf, tag, tags_classes,
                                  index2word=None):
    """Print the five most positive and five most negative features for a tag.

    Parameters
    ----------
    clf : fitted one-vs-rest model
        Must expose ``estimators_``, one fitted binary estimator per tag,
        each with a ``coef_`` array whose first row holds the feature weights.
    tag : str
        Tag to inspect; must be present in ``tags_classes``.
    tags_classes : list of str
        Tag names in the same order as the label columns the model was
        trained on.
    index2word : mapping, optional
        Maps a feature (column) index to its word or n-gram. Defaults to the
        notebook-level ``INDEX2WORD``, looked up when the function is called.

    Raises
    ------
    ValueError
        If ``tag`` is not in ``tags_classes``.
    """
    if index2word is None:
        # Resolved at call time so a re-built vocabulary is picked up.
        index2word = INDEX2WORD

    print(f'Tag: {tag}')
    # Use the ``clf`` argument (the body previously referenced an undefined
    # name) and read the weights from the per-tag binary estimator.
    coef = clf.estimators_[tags_classes.index(tag)].coef_[0]
    order = coef.argsort()  # ascending: most negative weight first
    top_pos_words = [index2word[idx] for idx in order[-1:-6:-1]]
    top_neg_words = [index2word[idx] for idx in order[:5]]
    print('Top 5 positive words: \t{}'.format(', '.join(top_pos_words)))
    print('Top 5 negative words: \t{}'.format(', '.join(top_neg_words)))
    

# Invert the fitted TF-IDF vocabulary (term -> column index) so coefficient
# positions can be translated back into the n-grams they stand for.
tfidf_reversed_vocab = {idx: term
                        for term, idx in tfidf_vectorizer.vocabulary_.items()}

for tag in ('c', 'c++', 'linux'):
    print_most_important_features(clf_tfidf, tag,
                                  mlb.classes, tfidf_reversed_vocab)