# Loading Data

In [1]:
import pandas as pd
df=pd.read_csv('stackoverflowtags.csv')
df.head(2)

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,['r']
1,mysql select all records where a datetime fiel...,"['php', 'mysql']"


In [2]:
df.title.value_counts()

Conversion failed when converting date and/or time from character string               3
No rule to make target                                                                 2
Object reference not set to an instance of an object                                   2
System.NullReferenceException: Object reference not set to an instance of an object    2
c# and excel automation - ending the running instance                                  2
                                                                                      ..
cannot implicitly convert type 'int' to 'string' c#                                    1
Non-ajax GET/POST using jQuery (plugin?)                                               1
How to get/find an object by property value in a list                                  1
Set MySQL Variables with MySQLi (not PHP variables)                                    1
Pandas version of rbind                                                                1
Name: title, Length: 

# Droping Duplicates

In [3]:
df.title=df.title.drop_duplicates(keep='first')
df.title.value_counts()

Where is the startup method of a WCF Service?                                                                                        1
getSelection() not working in IE                                                                                                     1
Failed to lookup view "error" in views directory using handlebars                                                                    1
GWT JSNI return a js-function                                                                                                        1
jQuery post div content and php echo                                                                                                 1
                                                                                                                                    ..
Set MySQL Variables with MySQLi (not PHP variables)                                                                                  1
Generating (pseudo)random alpha-numeric strings        

# Text Cleaning

In [4]:
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
def clean(s):
    s=str(s)
    s=s.lower()
    html=re.compile('<.*?>')   #removing html tags
    cleaned = re.sub(html,' ',s)
    fil=[]
    for i in cleaned.split():
        if i!='c++':
            cleaned=re.sub('[^A-Za-z]', '', i) #search the pattern !(A-Z & a-z) and replace with ''
            fil.append(cleaned)
        else:
            fil.append(i)
    return fil
stop=set(stopwords.words('english'))
sno=SnowballStemmer('english')
clean(df.title[0])

['how', 'to', 'draw', 'a', 'stacked', 'dotplot', 'in', 'r']

# Stemming

In [5]:
def stem(s):
    fil=[]
    for i in s:
        if i not in stop:
            s=(sno.stem(i).encode('utf8'))
            fil.append(s)
    s=b' '.join(fil)
    return s

In [6]:
l=[]
for j in df.title:
    l.append(stem(clean(j)))
df['cleanQues'] = l

In [7]:
import re
l=[]
for i in df.tags:
    l.append(re.sub('[^A-Za-z#+-]', ' ', i)) #search the pattern !(A-Z & a-z) and replace with ''
df['cleanTags']=l

# After Cleaning and Stemming

In [8]:
df.head(10)

Unnamed: 0,title,tags,cleanQues,cleanTags
0,How to draw a stacked dotplot in R?,['r'],b'draw stack dotplot r',r
1,mysql select all records where a datetime fiel...,"['php', 'mysql']",b'mysql select record datetim field less speci...,php mysql
2,How to terminate windows phone 8.1 app,['c#'],b'termin window phone app',c#
3,get current time in a specific country via jquery,"['javascript', 'jquery']",b'get current time specif countri via jqueri',javascript jquery
4,Configuring Tomcat to Use SSL,['java'],b'configur tomcat use ssl',java
5,Awesome nested set plugin - how to add new chi...,['ruby-on-rails'],b'awesom nest set plugin add new children tre...,ruby-on-rails
6,How to create map from JSON response in Ruby o...,"['ruby', 'ruby-on-rails-3', 'json']",b'creat map json respons rubi rail ',ruby ruby-on-rails- json
7,rspec test if method is called,['ruby'],b'rspec test method call',ruby
8,SpringBoot Catalina LifeCycle Exception,"['java', 'spring', 'spring-mvc']",b'springboot catalina lifecycl except',java spring spring-mvc
9,How to import data from excel to mysql databas...,"['php', 'codeigniter']",b'import data excel mysql databas use php',php codeigniter


In [9]:
d=pd.DataFrame()
d['text']=df.cleanQues
d['tags']=df.cleanTags
d.to_csv('datafinal',index=False)

In [10]:
df = pd.read_csv('datafinal')
df.head()

Unnamed: 0,text,tags
0,b'draw stack dotplot r',r
1,b'mysql select record datetim field less speci...,php mysql
2,b'termin window phone app',c#
3,b'get current time specif countri via jqueri',javascript jquery
4,b'configur tomcat use ssl',java


# Splitting Dataset

In [11]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df.text, df.tags, test_size=0.2, random_state=9)

# Converting text and tags to vectors

In [12]:
# Used ti-idf , bow
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
tfvectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2",
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))
x_train_multilabel = tfvectorizer.fit_transform(x_train)
x_test_multilabel = tfvectorizer.transform(x_test)

vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
y_train_multilabel = vectorizer.fit_transform(y_train)
y_test_multilabel = vectorizer.transform(y_test)

# Training using One vs Rest

In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


classifier = OneVsRestClassifier(SGDClassifier(loss='log', max_iter = 5, tol = None, alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_multilabel, y_train_multilabel)
predictions = classifier.predict(x_test_multilabel)

print("accuracy :",metrics.accuracy_score(y_test_multilabel,predictions))
print("macro f1 score :",metrics.f1_score(y_test_multilabel, predictions, average = 'macro'))
print("micro f1 scoore :",metrics.f1_score(y_test_multilabel, predictions, average = 'micro'))
print("hamming loss :",metrics.hamming_loss(y_test_multilabel,predictions))

accuracy : 0.33075
macro f1 score : 0.48876273246938884
micro f1 scoore : 0.6397114831568965
hamming loss : 0.0115885


In [14]:
print("Precision recall report :\n",metrics.classification_report(y_test_multilabel, predictions))

Precision recall report :
               precision    recall  f1-score   support

           0       0.78      0.48      0.59       327
           1       0.62      0.10      0.18        96
           2       0.92      0.52      0.66       555
           3       0.94      0.72      0.82       294
           4       0.71      0.20      0.32        83
           5       0.55      0.36      0.44       447
           6       0.85      0.41      0.55       950
           7       0.71      0.30      0.42       588
           8       0.83      0.61      0.70      3763
           9       0.89      0.46      0.61      1295
          10       0.35      0.06      0.11        96
          11       0.14      0.01      0.02       101
          12       0.97      0.75      0.85       140
          13       0.65      0.21      0.32       359
          14       0.74      0.63      0.68        87
          15       0.19      0.03      0.05       150
          16       0.47      0.18      0.26       132


  'precision', 'predicted', average, warn_for)


In [21]:
import dill
model = 'model.sav'
tfidf='tfidf.sav'
bow='bow.sav'
dill.dump(classifier, open(model, 'wb'))
dill.dump(tfvectorizer, open(tfidf, 'wb'))
dill.dump(vectorizer, open(bow, 'wb'))