# StackOverflow Tag Prediction - Natural Language Processing

## Reading the Data

In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval

def read_data(filename, sep='\t'):
    data = pd.read_csv(filename, sep='\t')
    data['tags'] = data['tags'].apply(literal_eval) # To get rid of the quotes around tags
    return data

train = read_data('data/train.tsv').head(10000) # Only taking the first 5000 (I have potato PC)
validation = read_data('data/validation.tsv')
test = pd.read_csv('data/test.tsv', sep='\t') # read_csv because it doesn't have labels, so no literal_eval

train.head(5)

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [2]:
len(train)

10000

In [3]:
# Data is already partitioned, splitting target and feature
X_train, y_train = train['title'].values, train['tags'].values
X_validation, y_validation = validation['title'].values, validation['tags'].values
X_test =test['title'].values

In [4]:
X_train[0]

'How to draw a stacked dotplot in R?'

## Preparing the Text Data

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# All the special characters and brackets should be replaced by space
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')

# tokens that start with numbers, usernames, phone numbers etc
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

try:
# Loading stopwords
    from nltk.corpus import stopwords
    STOPWORDS = set(stopwords.words('english'))
except Exception:
# if stopwords not yet downloaded
    import nltk
    nltk.download('stopwords')
finally:
# English stopwords
    from nltk.corpus import stopwords
    STOPWORDS = set(stopwords.words('english'))
    
# Now STOPWORDS is a finite sest of string of the stopwords of the english language

# Function to prepare the text: Convert to LowerCase, 
# Replace special chars by space
# Replace bad symbols and names starting with numbers,
# Replace numbers
# Replace stopwords
def text_prepare(text): # returns processed text: String
    text = text.lower() # converts to lower
    text = re.sub(REPLACE_BY_SPACE_RE, ' ', text) # Replacing with space
    text = re.sub(BAD_SYMBOLS_RE, '', text) # Replacing with '' ie deleting
    
    ### The following code does stemming using Porter Stemmer Algo
    try:
        text = word_tokenize(text)
    except Exception:
        nltk.download('punkt')
        text = word_tokenize(text)
    text2 = []
    
    try:
        for literal in text:
            text2.append(stemmer.stem(literal)) 
    except Exception:
        nltk.download('wordnet')
        for literal in text:
            text2.append(stemmer.stem(literal))
    text2 = [w for w in text2 if not w in STOPWORDS]
    return " ".join(text2)
    
    # The following is another way to remove stop words
    ''' # The following is one way to remove stopwords
    text = text.split(' ') # removing stopwords
    new_text = ""
    for word in text:
        if word in STOPWORDS:
            text.remove(word)
        else:
            new_text = new_text + word + " "
    return new_text[:-1]
    ''' 
    
    ### The following code is for lemmatization - replace with stemming
    '''
    try:
        text = word_tokenize(text)
    except Exception:
        nltk.download('punkt')
        text = word_tokenize(text)
    text2 = []
    
    try:
        for literal in text:
            text2.append(lemmatizer.lemmatize(literal, 'v')) # 'v' for verb based
    except Exception:
        nltk.download('wordnet')
        for literal in text:
            text2.append(lemmatizer.lemmatize(literal, 'v'))
    text2 = [w for w in text2 if not w in STOPWORDS]
    return " ".join(text2)
    '''
    
    

### Applying the text preparation function to the datasets

In [6]:
X_train = [text_prepare(x) for x in X_train]
print("Train data processed")
X_validation = [text_prepare(x) for x in X_validation]
print("Validation data processed")
X_test = [text_prepare(x) for x in X_test]
print("Test data processed")

Train data processed
Validation data processed
Test data processed


### Exploring the tags ie target

In [7]:
tags = list(x[0] for x in y_train)
print("There are ", len(tags), "in the training set")
print("Out of these", len(set(tags)), "are distinct")

There are  10000 in the training set
Out of these 44 are distinct


There are 100000 datapoints in train set and each of them have a tag

### Finding the ten most common tags

In [8]:
from collections import Counter
tag_occurence_count = Counter(tags)
tag_occurence_count.most_common(n=10)

[('c#', 1940),
 ('java', 1871),
 ('javascript', 1687),
 ('php', 1355),
 ('python', 878),
 ('c++', 603),
 ('ruby-on-rails', 302),
 ('c', 216),
 ('ios', 164),
 ('iphone', 156)]

### Finding the ten most common words in text

In [9]:
# Function to get the word 
def get_word_list(data):
    tokenized_sentences = []
    for i in data:
        tokenized_sentences.append(word_tokenize(i))
    wordList = []
    for sentence in tokenized_sentences:
        for word in sentence:
            wordList.append(word)
    return wordList

word_occurence_count = Counter(get_word_list(X_train))
word_occurence_count.most_common(n=10)


[('use', 1157),
 ('file', 623),
 ('c', 602),
 ('java', 578),
 ('php', 573),
 ('get', 569),
 ('valu', 484),
 ('#', 459),
 ('error', 440),
 ('python', 433)]

In [10]:
print("There are", len(word_occurence_count), "unique words in the training set")  
#8200 before lemm # 6969 before stem

There are 6524 unique words in the training set


## Data Preprocessing

### TFIDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.6, min_df=5, ngram_range=(1,2), token_pattern='(\S+)') #max-1.0, min-10

# Transforming data
def transform_into_tfidf(data, train_data=False):
    if train_data == True: # Fit only for training data
        return pd.DataFrame(tfidf.fit_transform(data).todense(), columns=tfidf.get_feature_names())
    else:
        try:
            return pd.DataFrame(tfidf.transform(data).todense(), columns=tfidf.get_feature_names())
        except Exception:
            print("Fit Using train data first to transform test and validation data")
            return -1

def tfidf_preprocess_data(X_train, X_validation, X_test):
    X_train_tfidf = transform_into_tfidf(data=X_train, train_data=True)
    print("Fitted and transformed train data")
    X_validation_tfidf = transform_into_tfidf(data=X_validation)
    print("Transformed validation data")
    X_test_tfidf = transform_into_tfidf(data=X_test)
    print("Transformed test data")
    return X_train_tfidf, X_validation_tfidf, X_test_tfidf

In [22]:
X_train_tfidf, X_validation_tfidf, X_test_tfidf = tfidf_preprocess_data(X_train, X_validation, X_test)

Fitted and transformed train data
Transformed validation data
Transformed test data


In [23]:
X_train_tfidf.head(10) # 2299 before lemming

Unnamed: 0,#,# aspnet,# code,# get,# net,# use,# window,+,0,1,...,yet,yii,yii2,youtub,zend,zend framework,zero,zip,zip file,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Preprocessing the label - OHE

In [29]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=sorted(tag_occurence_count.keys()))

y_train = mlb.fit_transform(y_train)
y_validation = mlb.transform(y_validation)

  .format(sorted(unknown, key=str)))


## Classifying the data

### Training the Model

In [32]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

model = OneVsRestClassifier(LogisticRegression(solver='lbfgs')).fit(X_train_tfidf, y_train)

### Making Predictions

In [34]:
y_pred_validation = model.predict(X_validation_tfidf)

In [35]:
y_pred_test = model.predict(X_test_tfidf)

## Evaluating the Model

In [54]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

print("Accuracy: ", accuracy_score(y_validation, y_pred_validation))
print("F1: ", f1_score(y_validation, y_pred_validation, average='micro'))
print("ROC_AUC ", roc_auc_score(y_validation, y_pred_validation, average='micro'))
print("Average Precision ", average_precision_score(y_validation, y_pred_validation))
print("Recall aka sensitivity ", recall_score(y_validation, y_pred_validation, average='micro'))

Accuracy:  0.30823333333333336
F1:  0.5408044366436083
ROC_AUC  0.6902026735691748
Average Precision  0.22893354893263507
Recall aka sensitivity  0.3815162424822592


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
%matplotlib inline

## This snippet to plot wont work with multilabel
'''
fpr, tpr, thresholds = roc_curve(y_pred_validation, y_validation)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='Logistic Regression (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()'''

# Conclusion

This being a 44 labelled classification, baseline is:

In [64]:
1/44

0.022727272727272728

With this, an accuracy of 30.32% is not bad. ROC_AUC of 70% is also pretty good.

The research team who published the paper got an F1 of .60 by taking 100% of title data only - Good for them!

# Future Work

1. Make automated scrips to take data, preprocess and predict the outcome
2. Apply various other techniques - word embeddings, DeepNLP, and other traditional ML Algorithms as well (think SVM)
3. Buy a better PC that can load 100% of the data as opposed to the 10% I loaded now to train set
4. Try this using the college GPUs with 100% of the dataset - both title and post texts

# References

This project follows the research paper: Autonomous Tagging of Stack Overflow Questions by Mihail Eric, Ana Klimovic, Victor Zhong
http://cs229.stanford.edu/proj2014/Mihail%20Eric,%20Ana%20Klimovic,%20Victor%20Zhong,MLNLP-Autonomous%20Tagging%20Of%20Stack%20Overflow%20Posts.pdf

### Note to self:

learn how to plot multiclass ROCs