In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ACER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
from ast import literal_eval
import pandas as pd
import numpy as np

In [3]:
def read_data(filename):
    data = pd.read_csv(filename, sep='\t')
    data['tags'] = data['tags'].apply(literal_eval)
    return data

In [4]:
train = read_data('data/train.tsv')
validation = read_data('data/validation.tsv')
test = pd.read_csv('data/test.tsv', sep='\t')

In [5]:
validation.head()

Unnamed: 0,title,tags
0,Why odbc_exec always fail?,"[php, sql]"
1,Access a base classes variable from within a c...,[javascript]
2,"Content-Type ""application/json"" not required i...","[ruby-on-rails, ruby]"
3,Sessions in Sinatra: Used to Pass Variable,"[ruby, session]"
4,"Getting error - type ""json"" does not exist - i...","[ruby-on-rails, ruby, json]"


In [6]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

In [7]:
import re

In [8]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()
    text = re.sub(REPLACE_BY_SPACE_RE," ",text)
    text = re.sub(BAD_SYMBOLS_RE,"",text)
    text = text.split();
    return ' '.join([i for i in text if i not in STOPWORDS])

In [9]:
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]

In [10]:
X_train[:3]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app']

In [11]:
from collections import defaultdict
# Dictionary of all tags from train corpus with their counts.
tags_counts =  defaultdict(int)
# Dictionary of all words from train corpus with their counts.
words_counts =  defaultdict(int)

In [12]:
for text in X_train:
    for word in text.split():
        words_counts[word] += 1


for tags in y_train:
    for tag in tags:
        tags_counts[tag] += 1

In [13]:
# for creating word to index and vice versa mappings
def create_vocabulary_mappings(X_train, word_counts, DICT_SIZE=4500):
    # word to index mapping
    word_to_idx = {word:idx for idx,(word,f) in enumerate(
                sorted(word_counts.items(), key=lambda v:v[1], reverse=True)[:DICT_SIZE])}
    # reverse index to word mapping
    idx_to_word= {word_to_idx[word]:word for word in word_to_idx.keys()}
    
    return word_to_idx, idx_to_word

In [15]:
DICT_SIZE=4500
word_to_idx, idx_to_word = create_vocabulary_mappings(X_train, words_counts, DICT_SIZE=4500)

In [16]:
# for creating BOW representation
def create_bag_of_words(text, word_to_idx, DICT_SIZE):
    # Intial Matrix for holding the features
    feature_vector = np.zeros(DICT_SIZE)
    
    # update the word frequencies
    for word in text.split():
        if word in word_to_idx.keys():
            feature_vector[word_to_idx[word]] += 1 
    
    return feature_vector

In [20]:
# create the bag of words feature vector
# we will use a sparse representation , here we will be using csr matrix representation
# for storing it
from scipy import sparse
X_train_bow = sparse.vstack([sparse.csr_matrix(create_bag_of_words(text, word_to_idx, DICT_SIZE)) for text in X_train])
X_val_bow = sparse.vstack([sparse.csr_matrix(create_bag_of_words(text, word_to_idx, DICT_SIZE)) for text in X_val])

print('X_train shape ', X_train_bow.shape)
print('X_val shape ', X_val_bow.shape)

X_train shape  (100000, 4500)
X_val shape  (30000, 4500)


In [26]:

from sklearn.preprocessing import MultiLabelBinarizer
# create an instance
mlb_object = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
# transform the tags 
y_train = mlb_object.fit_transform(y_train)
y_val = mlb_object.transform(y_val)

In [28]:
from sklearn.linear_model import LogisticRegression
import pickle

In [33]:
# define the classifier and fit it to the training data
from sklearn.multiclass import OneVsRestClassifier
def train_classifier(X_train, y_train, inner_clf):
    # define the classifier
    clf = OneVsRestClassifier(inner_clf)
    # train it
    clf.fit(X_train, y_train)
    return clf


In [34]:
lr_clf = LogisticRegression(penalty="l2", C=1)

In [35]:
# for bag of words
clf_bow_lr = train_classifier(X_train_bow, y_train, lr_clf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [36]:
from sklearn.metrics import accuracy_score
pred_val_bow_lr = clf_bow_lr.predict(X_val_bow)

In [39]:
# gives evaluation statistics
def evaluate_classifiers(y_val, predicted):
    print('Accuracy: '+ str(accuracy_score(y_val, predicted)*100))
    #print(average_precision_score(y_val, predicted))

In [40]:
evaluate_classifiers(y_val, pred_val_bow_lr)

Accuracy: 35.723333333333336


In [42]:
# convert back the predictions to the original tags they are suppose to
pred_val_inverse = mlb_object.inverse_transform(pred_val_bow_lr)
# convert the original tag labels
y_val_inverse = mlb_object.inverse_transform(y_val)

for i in range(10):
    print('Query:\t' + str(X_val[i]))
    print('True tags:\t' + str(y_val_inverse[i]))
    print('Predicted tags:\t' + str(pred_val_inverse[i]))
    print()

Query:	odbc_exec always fail
True tags:	('php', 'sql')
Predicted tags:	()

Query:	access base classes variable within child class
True tags:	('javascript',)
Predicted tags:	()

Query:	contenttype application json required rails
True tags:	('ruby', 'ruby-on-rails')
Predicted tags:	('ruby-on-rails',)

Query:	sessions sinatra used pass variable
True tags:	('ruby', 'session')
Predicted tags:	('ruby',)

Query:	getting error type json exist postgresql rake db migrate
True tags:	('json', 'ruby', 'ruby-on-rails')
Predicted tags:	('json', 'ruby-on-rails')

Query:	library found
True tags:	('c++', 'ios', 'iphone', 'xcode')
Predicted tags:	()

Query:	csproj file programmatic adding deleting files
True tags:	('c#',)
Predicted tags:	()

Query:	typeerror makedirs got unexpected keyword argument exists_ok
True tags:	('django', 'python')
Predicted tags:	('python',)

Query:	pan div using jquery
True tags:	('html', 'javascript', 'jquery')
Predicted tags:	('javascript', 'jquery')

Query:	hibernate interme