In [8]:
import pandas as pd
from collections import Counter
import numpy as np 
import string
import re
import warnings
from itertools import chain

# Ignore warnings when training data due to empty tags predicted 
warnings.warn("deprecated", DeprecationWarning)
warnings.simplefilter("ignore")

# Read first 20000 rows from Dataset
data = pd.read_csv('Train.csv',nrows=100000)
              
# Function to clean text in data
def clean_text(text):
    
    if not isinstance(text, str): 
        return text
    text = re.sub('<pre><code>.*?</code></pre>', '', text)

    def replace_link(match):
        return '' if re.match('[a-z]+://', match.group(1)) else match.group(1)
    
    text = re.sub('<a[^>]+>(.*)</a>', replace_link, text)
    return re.sub('<[^>]+>', '', text)

In [9]:
# Adding higher weightage to Title 
Weightage = 5

# Combining Title and Body while applying clean_text function to pre-process data
data['Text'] = (data['Title'] + " ")*Weightage + data['Body']
data['Text'] = data['Text'].apply(clean_text).str.lower()
data['Text'] = data.Text.apply(lambda x: x.replace('"','').replace("\n","").replace("\t",""))

# Apply split on Tags 
f = lambda x: x["Tags"].split()
data["Tags"] = data.apply(f, axis=1)


In [10]:
import collections

# Initialize variables
totaltags = []
toptags = []

# Number of top tags used for the model
n = 1000

# Retrieve total tags for each row
for i in range(data['Tags'].size):
    totaltags.extend(data['Tags'][i])

# Number of usages for the n of most repeated tags
tagcount = collections.Counter(totaltags).most_common(n)

# Append most used tags into array
for i in range(len(tagcount)):
    toptags.append(tagcount[i][0])
toptags = np.array(toptags)


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Split Data into 80% Train and 20% Test sets 
X_train, X_test, y_train, y_test = train_test_split(data['Text'], data['Tags'], random_state=42, test_size=0.2, shuffle=True)

# Apply multilabel binarizer for n top tags used
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit([toptags])
y_initial = y_test

# Transform y_train and y_test into binary form
y_train = multilabel_binarizer.transform(y_train)
y_test = multilabel_binarizer.transform(y_test)

# OneVsRestClassifier with Logistic Regression
classifier = OneVsRestClassifier(LogisticRegression(penalty='l1'))

# TF-IDF approach 
vectorizer = TfidfVectorizer(min_df = 2, max_df = 0.95, stop_words='english', max_features=10000, smooth_idf=True, norm="l2",sublinear_tf=False, ngram_range=(1,3))
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# Fitting Train data into classifier
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)


In [12]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

prediction = multilabel_binarizer.inverse_transform(y_pred)
multilabel_binarizer.fit(data['Tags'])
y_test_all = multilabel_binarizer.transform(y_initial)
y_pred_all = multilabel_binarizer.transform(prediction)

# Comparison table of predicted and actual values
comp = pd.Series(y_initial).reset_index()
comp2 = pd.Series(prediction)
comparison = pd.concat([comp,comp2], axis = 1)

# Micro f1 score
precision = precision_score(y_test_all, y_pred_all, average = 'micro')
recall = recall_score(y_test_all, y_pred_all, average = 'micro')
f1 = f1_score(y_test_all, y_pred_all, average="micro")
print('Micro')
print('Precision: {:.4f}\nRecall: {:.4f}\nF1: {:4f}'.format(precision, recall, f1))

# Macro f1 score
precision = precision_score(y_test_all, y_pred_all, average = 'macro')
recall = recall_score(y_test_all, y_pred_all, average = 'macro')
f1 = f1_score(y_test_all, y_pred_all, average="macro")
print('Macro')
print('Precision: {:.4f}\nRecall: {:.4f}\nF1: {:4f}'.format(precision, recall, f1))

# Accuracy score
print("Accuracy :",accuracy_score(y_test_all,y_pred_all))

# Hamming loss score
print("Hamming loss ",hamming_loss(y_test_all,y_pred_all))

Micro
Precision: 0.7134
Recall: 0.2446
F1: 0.364320
Macro
Precision: 0.0275
Recall: 0.0133
F1: 0.016909
Accuracy : 0.06665
Hamming loss  0.00013294395056374675


In [13]:
# Comparison Table for visualization purposes
comparison.columns = ['Index', 'Actual Tags', 'Predicted Tags']
comparison

Unnamed: 0,Index,Actual Tags,Predicted Tags
0,75721,"[html, iframe, google-analytics, p3p]","(google-analytics,)"
1,80184,"[python, class, inheritance, attributes, derived]","(python,)"
2,19864,"[perl, string]","(perl,)"
3,76699,[numerical-methods],()
4,92991,"[php, css, codeigniter]","(templates,)"
5,76434,"[c, unix, path, terminal, execvp]",()
6,84004,"[php, string, algorithm, hash, rabin-karp]","(php,)"
7,80917,"[osx, osx-mountain-lion, iterm2]",()
8,60767,"[c#, regex, string]","(regex,)"
9,50074,[sql],"(sql,)"
