# Loading Data

In [1]:
import pandas as pd
df=pd.read_csv('stackoverflowtags.csv')
df.head(2)

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"


In [2]:
df.title.value_counts() # counts the number of occurances in the data

Conversion failed when converting date and/or time from character string    3
Could not find default endpoint element that references contract            2
Uncaught Reference Error: function is not defined                           2
Object reference not set to an instance of an object                        2
JavaScript runtime error: '$' is undefined                                  2
                                                                           ..
Implementing Smileys into my Chat Room MessageBox                           1
Importing Objective-C JSON Framework in Xcode 4                             1
Parse Html with Python and lxml.html                                        1
Using URL Routing for Web Forms and StopRoutingHandler for Favicon          1
Difference between onMouseOver and onMouseEnter                             1
Name: title, Length: 99984, dtype: int64

# Droping Duplicates

In [3]:
df.title=df.title.drop_duplicates(keep='first') # drops the duplicates
df.title.value_counts()

Encrypting Zip Entries but not the entire Zip File in Java            1
How to set background color in MAC OSX in Objective c                 1
Query parameters in the URL, with REST Framework                      1
Unusual antialias while using basic texture material in three.js      1
What is this expression in Java ( 1 << 2)?                            1
                                                                     ..
Parse Html with Python and lxml.html                                  1
Using URL Routing for Web Forms and StopRoutingHandler for Favicon    1
Bootstrap typeahead updater option                                    1
Play Framework renderJSON Issue                                       1
Difference between onMouseOver and onMouseEnter                       1
Name: title, Length: 99984, dtype: int64

# Text Cleaning

In [4]:
import re       # importing regular expressions used for cleaning texts

# importing natural language toolkit
# that helps in cleaning texts by using
# stopwords, SnowballStemmer, WordNetLemmatizer libraries

from nltk.corpus import stopwords 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# function to remove html tags and other 
# unwanted stuff in the question asked by the user

def clean(s):
    s=str(s)
    s=s.lower()
    html=re.compile('<.*?>')   #removing html tags
    cleaned = re.sub(html,' ',s)
    fil=[]
    for i in cleaned.split():    # splits the text and repalces the unwanted characters with ''
        if i!='c++':
            cleaned=re.sub('[^A-Za-z]', '', i) #search the pattern !(A-Z & a-z) and replace with ''
            fil.append(cleaned)
        else:
            fil.append(i)
    return fil         # returns the splitted text with removed stopwords and html tags

stop=set(stopwords.words('english'))  #loading stopwords in english to compare and remove
sno=SnowballStemmer('english')
clean(df.title[0])                # function call to clean the text

['how', 'to', 'draw', 'a', 'stacked', 'dotplot', 'in', 'r']

# Stemming

In [5]:
# function to stem the data
# stemming means grouping the words
# after cleaning the data for further processing

def stem(s):
    fil=[]
    for i in s:
        if i not in stop:
            s=(sno.stem(i).encode('utf8'))  # encoding the data into a clean file
            fil.append(s)
    s=b' '.join(fil)
    return s

In [6]:
# creating the new columns 
# cleaned questions and cleaned tags
# after cleaning the data
ques=[]
for j in df.title:
    ques.append(stem(clean(j)))
df['cleanQues'] = ques

import re
ctags=[]
for i in df.tags:
    ctags.append(re.sub('[^A-Za-z#+-]', ' ', i)) # search the pattern !(A-Z & a-z) and replace with ''
df['cleanTags']=ctags

# After Cleaning and Stemming

In [7]:
df.head(10)

Unnamed: 0,title,tags,cleanQues,cleanTags
0,How to draw a stacked dotplot in R?,['r'],b'draw stack dotplot r',r
1,mysql select all records where a datetime fiel...,"['php', 'mysql']",b'mysql select record datetim field less speci...,php mysql
2,How to terminate windows phone 8.1 app,['c#'],b'termin window phone app',c#
3,get current time in a specific country via jquery,"['javascript', 'jquery']",b'get current time specif countri via jqueri',javascript jquery
4,Configuring Tomcat to Use SSL,['java'],b'configur tomcat use ssl',java
5,Awesome nested set plugin - how to add new chi...,['ruby-on-rails'],b'awesom nest set plugin add new children tre...,ruby-on-rails
6,How to create map from JSON response in Ruby o...,"['ruby', 'ruby-on-rails-3', 'json']",b'creat map json respons rubi rail ',ruby ruby-on-rails- json
7,rspec test if method is called,['ruby'],b'rspec test method call',ruby
8,SpringBoot Catalina LifeCycle Exception,"['java', 'spring', 'spring-mvc']",b'springboot catalina lifecycl except',java spring spring-mvc
9,How to import data from excel to mysql databas...,"['php', 'codeigniter']",b'import data excel mysql databas use php',php codeigniter


In [8]:
# creating the new dataset that only 
# consists cleaned questions and tags

d=pd.DataFrame()
d['text']=df.cleanQues
d['tags']=df.cleanTags
d.to_csv('datafinal',index=False)
df = pd.read_csv('datafinal')
df.head()

Unnamed: 0,text,tags
0,b'draw stack dotplot r',r
1,b'mysql select record datetim field less speci...,php mysql
2,b'termin window phone app',c#
3,b'get current time specif countri via jqueri',javascript jquery
4,b'configur tomcat use ssl',java


# Splitting Dataset

In [9]:
# splitting the dataset to train and test in 80% and 20%

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df.text, df.tags, test_size=0.2, random_state=9)

# Converting text and tags to vectors

In [10]:
# Used ti-idf , bow

# importing TfidfVectorizer , CountVectorizer from sklearn.feature_extraction
# to convert the text and tags to vectors
# so that we can train and test the dataset

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
tfvectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2",
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))
x_train_multilabel = tfvectorizer.fit_transform(x_train)
x_test_multilabel = tfvectorizer.transform(x_test)

vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
y_train_multilabel = vectorizer.fit_transform(y_train)
y_test_multilabel = vectorizer.transform(y_test)

# Training using One vs Rest

In [11]:
# using multi-class classification to classify the tags 
# classifiers like One-Vs-Rest classifier and Stochastic Gradient Descent Classifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score


classifier = OneVsRestClassifier(SGDClassifier(loss='log', max_iter = 5, tol = None, alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_multilabel, y_train_multilabel)
predictions = classifier.predict(x_test_multilabel)

print("accuracy :",metrics.accuracy_score(y_test_multilabel,predictions))
print("macro f1 score :",metrics.f1_score(y_test_multilabel, predictions, average = 'macro'))
print("micro f1 scoore :",metrics.f1_score(y_test_multilabel, predictions, average = 'micro'))
print("hamming loss :",metrics.hamming_loss(y_test_multilabel,predictions))

accuracy : 0.33195
macro f1 score : 0.491694282973174
micro f1 scoore : 0.640108098033734
hamming loss : 0.011586


# Classification Report

In [12]:
# classification report
print("Precision recall report :\n",metrics.classification_report(y_test_multilabel, predictions))

Precision recall report :
               precision    recall  f1-score   support

           0       0.78      0.49      0.60       327
           1       0.67      0.12      0.21        96
           2       0.93      0.52      0.67       555
           3       0.95      0.71      0.81       294
           4       0.69      0.22      0.33        83
           5       0.56      0.38      0.45       447
           6       0.85      0.40      0.55       950
           7       0.71      0.31      0.43       588
           8       0.83      0.61      0.70      3763
           9       0.89      0.46      0.61      1295
          10       0.35      0.07      0.12        96
          11       0.20      0.01      0.02       101
          12       0.97      0.75      0.85       140
          13       0.64      0.22      0.33       359
          14       0.75      0.64      0.69        87
          15       0.14      0.02      0.04       150
          16       0.51      0.22      0.31       132


  'precision', 'predicted', average, warn_for)


In [14]:
# using dill saving the classification, tfvectorization nad vectorization 
# in three files and using them to predict the tags

import dill
model_data = 'model_data.sav'
tfidf_data = 'tfidf_data.sav'
bow_data = 'bow_data.sav'
dill.dump(classifier, open(model_data, 'wb'))
dill.dump(tfvectorizer, open(tfidf_data, 'wb'))
dill.dump(vectorizer, open(bow_data, 'wb'))