In [1]:
import re
import ast
import tqdm
import collections
import numpy as np
import scipy as sp
import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

SEED = 78

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Text processing

In [2]:
# Read the tab-separated train and test splits into DataFrames.
train = pd.read_csv('./data/train.tsv', sep='\t')
test = pd.read_csv('./data/test.tsv', sep='\t')

# Report the (rows, columns) shape of each split, one line per item.
print('Shape of data',
      f'Train: {train.shape}',
      f'Test: {test.shape}',
      sep='\n')

# Preview the first rows of the training split.
train.head()

Shape of data
Train: (100000, 2)
Test: (30000, 2)


Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"
2,How to terminate windows phone 8.1 app,['c#']
3,get current time in a specific country via jquery,"['javascript', 'jquery']"
4,Configuring Tomcat to Use SSL,['java']


In [3]:
# Look up what ast.literal_eval accepts before using it to parse the tags column.
help(ast.literal_eval)

Help on function literal_eval in module ast:

literal_eval(node_or_string)
    Safely evaluate an expression node or a string containing a Python
    expression.  The string or node provided may only consist of the following
    Python literal structures: strings, bytes, numbers, tuples, lists, dicts,
    sets, booleans, and None.



In [4]:
# The tags column is read from the TSV as the string form of a Python list
# (e.g. "['php', 'mysql']"); parse every cell back into a real list.
train['tags'] = train['tags'].apply(ast.literal_eval)
test['tags'] = test['tags'].apply(ast.literal_eval)

In [5]:
# Features are the question titles; labels are the per-question tag lists.
X_train = train['title'].values
y_train = train['tags'].values

X_test = test['title'].values
y_test = test['tags'].values

In [6]:
# Patterns are compiled once; raw strings avoid invalid escape sequences
# such as '\[' in a plain string literal.
_SEPARATOR_RE = re.compile(r'[/(){}\[\]\|@,;]')
_DISALLOWED_RE = re.compile(r'[^0-9a-z #+_]')
_STOPWORDS_CACHE = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def clean_text(text, stop_words=None):
    """Normalize a title for bag-of-words / TF-IDF feature extraction.

    Steps, in order:
      1. strip surrounding whitespace and lowercase;
      2. replace the separator characters ``/(){}[]|@,;`` with a space;
      3. delete every character other than digits, lowercase ASCII letters,
         space, ``#``, ``+`` and ``_``;
      4. drop stopwords and collapse runs of whitespace to single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Words to remove. Defaults to the NLTK English stopword list, which is
        loaded once and cached rather than rebuilt for every word.

    Returns
    -------
    str
        The cleaned text with tokens separated by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = text.strip().lower()
    text = _SEPARATOR_RE.sub(' ', text)
    text = _DISALLOWED_RE.sub('', text)
    return ' '.join(word for word in text.split() if word not in stop_words)

In [7]:
# Clean every title in both splits; tqdm shows progress over each pass.
X_train = list(map(clean_text, tqdm.tqdm(X_train)))
X_test = list(map(clean_text, tqdm.tqdm(X_test)))

100%|██████████| 100000/100000 [01:35<00:00, 1043.24it/s]
100%|██████████| 30000/30000 [00:27<00:00, 1073.10it/s]


In [8]:
# Spot-check the first three cleaned training titles.
X_train[:3]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app']

In [14]:
# Token frequencies over the cleaned training titles (whitespace tokenization).
# Counter is the standard-library tool for tallying: it keeps the same dict
# interface and first-seen key order as the manual loops, but is picklable and
# does not insert a key when a missing word is looked up.
word2count = collections.Counter(
    word
    for text in X_train
    for word in text.split()
)

# Tag frequencies over the training label lists.
tag2count = collections.Counter(
    tag
    for tags in y_train
    for tag in tags
)

In [31]:
def _by_count_desc(counts):
    """Return the (key, count) pairs of `counts`, highest count first."""
    return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)


# Rank tags and words by how often they occur in the training data.
most_common_tags = _by_count_desc(tag2count)
most_common_words = _by_count_desc(word2count)

# Show the three most frequent of each.
print(most_common_tags[:3])
print(most_common_words[:3])

[('javascript', 19078), ('c#', 19077), ('java', 18661)]
[('using', 8278), ('php', 5614), ('java', 5501)]


## Label processing

In [44]:
%%time
mlb = MultiLabelBinarizer(classes=sorted(tag2count.keys()))
y_train = mlb.fit_transform(y_train)
y_test = mlb.fit_transform(y_test)

print(f'y_train shape: {y_train.shape}')
print(f'y_test shape: {y_test.shape}')

CPU times: user 160 ms, sys: 132 ms, total: 292 ms
Wall time: 293 ms


## Feature extraction

In [32]:
# Number of distinct tokens counted in the cleaned training titles.
print(f'Number of words: {len(word2count)}')

Number of words: 31498


In [38]:
# Keep only the VOCAB_SIZE most frequent training words as bag-of-words features.
VOCAB_SIZE = 5000

# word -> column index; index 0 is the most frequent word.
WORD2INDEX = {token[0]: i
              for i, token in enumerate(most_common_words[:VOCAB_SIZE])}
# Inverse lookup, column index -> word. This must iterate WORD2INDEX: the
# lowercase name `word2index` is not defined at module level, so the previous
# expression raised NameError on a fresh kernel.
INDEX2WORD = {v: k for k, v in WORD2INDEX.items()}

In [39]:
def get_bow(text, word2index=WORD2INDEX, vocab_size=VOCAB_SIZE):
    """Build a bag-of-words count vector for a whitespace-tokenized text.

    Parameters
    ----------
    text : str
        Text whose tokens are separated by whitespace.
    word2index : dict
        Maps a word to its position in the output vector. Words missing from
        this mapping are ignored.
    vocab_size : int
        Length of the output vector.

    Returns
    -------
    numpy.ndarray
        Vector of length `vocab_size` holding, at each mapped position, the
        number of times that word occurs in `text`.
    """
    bow = np.zeros(vocab_size)
    known_positions = (word2index[token]
                       for token in text.split()
                       if token in word2index)
    for position in known_positions:
        bow[position] += 1
    return bow

In [40]:
def _texts_to_bow_matrix(texts):
    """Stack the bag-of-words vector of each text into one sparse matrix."""
    bow_rows = [sp.sparse.csr_matrix(get_bow(text))
                for text in tqdm.tqdm(texts)]
    return sp.sparse.vstack(bow_rows)


# One sparse row per title, one column per vocabulary word.
X_train_bow = _texts_to_bow_matrix(X_train)
X_test_bow = _texts_to_bow_matrix(X_test)

print('X_train_bow shape:', X_train_bow.shape)
print('X_test_bow shape:', X_test_bow.shape)

100%|██████████| 100000/100000 [00:28<00:00, 3536.00it/s]
100%|██████████| 30000/30000 [00:08<00:00, 3723.86it/s]


X_train_bow shape: (100000, 5000)
X_test_bow shape: (30000, 5000)


In [41]:
%%time
tfidf_vectorizer = TfidfVectorizer(token_pattern='\S+',
                                   min_df=5, max_df=0.9,
                                   ngram_range=(1, 2))
tfidf_vectorizer.fit(X_train)

X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print('X_train_tfidf shape:', X_train_tfidf.shape)
print('X_test_tfidf shape:', X_test_tfidf.shape)

CPU times: user 4.44 s, sys: 60 ms, total: 4.5 s
Wall time: 4.86 s


## Modeling