In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from datetime import datetime
from wordcloud import WordCloud
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kinchang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/kinchang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
df_processed = pd.read_csv("processed_sample.csv")
df_enhanced = pd.read_csv("enhanced_sample.csv")

# 1st Model

In [6]:
# binary='true' will give a binary vectorizer
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
multilabel_y = vectorizer.fit_transform(df_processed['tags'])

In [12]:
total_tags=multilabel_y.shape[1]
total_qs=df_processed.shape[0]
def tags_to_choose(n):
    t = multilabel_y.sum(axis=0).tolist()[0]
    sorted_tags_i = sorted(range(len(t)), key=lambda i: t[i], reverse=True)
    multilabel_yn=multilabel_y[:,sorted_tags_i[:n]]
    return multilabel_yn

def questions_explained_fn(n):
    multilabel_yn = tags_to_choose(n)
    x= multilabel_yn.sum(axis=1)
    return (np.count_nonzero(x==0))

In [13]:
multilabel_yx = tags_to_choose(5500)
print("number of questions that are not covered :", questions_explained_fn(5500),"out of ", total_qs)

number of questions that are not covered : 509 out of  60057


In [14]:
print("Number of tags in sample :", multilabel_y.shape[1])
print("number of tags taken :", multilabel_yx.shape[1],"(",(multilabel_yx.shape[1]/multilabel_y.shape[1])*100,"%)")

Number of tags in sample : 15054
number of tags taken : 5500 ( 36.53514016208317 %)


In [15]:
total_size=df_processed.shape[0]
train_size=int(0.80*total_size)

x_train=df_processed.head(train_size)
x_test=df_processed.tail(total_size - train_size)

y_train = multilabel_yx[0:train_size,:]
y_test = multilabel_yx[train_size:total_size,:]

In [16]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))
x_train_multilabel = vectorizer.fit_transform(x_train['question'])
x_test_multilabel = vectorizer.transform(x_test['question'])

In [17]:
print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape)

Dimensions of train data X: (48045, 94933) Y : (48045, 5500)
Dimensions of test data X: (12012, 94933) Y: (12012, 5500)


In [18]:
# Applying Logistic Regression with OneVsRest Classifier
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))



Micro-average quality numbers
Precision: 0.6806, Recall: 0.2497, F1-measure: 0.3653
Macro-average quality numbers
Precision: 0.1168, Recall: 0.0559, F1-measure: 0.0712
              precision    recall  f1-score   support

           0       0.61      0.25      0.35       972
           1       0.77      0.43      0.55       813
           2       0.80      0.55      0.65       762
           3       0.73      0.44      0.55       744
           4       0.96      0.76      0.85       646
           5       0.83      0.66      0.73       634
           6       0.72      0.30      0.42       421
           7       0.90      0.57      0.70       361
           8       0.68      0.42      0.52       347
           9       0.85      0.59      0.70       389
          10       0.71      0.45      0.55       315
          11       0.49      0.11      0.18       342
          12       0.50      0.16      0.25       349
          13       0.53      0.27      0.36       290
          14       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Enhanced Model with more Title Weight

In [21]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')
multilabel_y = vectorizer.fit_transform(df_enhanced['tags'])

In [22]:
multilabel_yx = tags_to_choose(5500)
print("number of questions that are not covered :", questions_explained_fn(5500),"out of ", total_qs)

number of questions that are not covered : 509 out of  60057


In [23]:
print("Number of tags in sample :", multilabel_y.shape[1])
print("number of tags taken :", multilabel_yx.shape[1],"(",(multilabel_yx.shape[1]/multilabel_y.shape[1])*100,"%)")

Number of tags in sample : 15054
number of tags taken : 5500 ( 36.53514016208317 %)


In [24]:
total_size=df_enhanced.shape[0]
train_size=int(0.80*total_size)

x_train=df_enhanced.head(train_size)
x_test=df_enhanced.tail(total_size - train_size)

y_train = multilabel_yx[0:train_size,:]
y_test = multilabel_yx[train_size:total_size,:]

In [25]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))
x_train_multilabel = vectorizer.fit_transform(x_train['question'])
x_test_multilabel = vectorizer.transform(x_test['question'])

In [26]:
start = datetime.now()
classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict (x_test_multilabel)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions))
print("Time taken to run this cell :", datetime.now() - start)



Accuracy : 0.0864968364968365
Hamming loss  0.00040639663366936096
Micro-average quality numbers
Precision: 0.6945, Recall: 0.2755, F1-measure: 0.3945
Macro-average quality numbers
Precision: 0.1422, Recall: 0.0776, F1-measure: 0.0943
              precision    recall  f1-score   support

           0       0.62      0.28      0.38       972
           1       0.79      0.43      0.56       813
           2       0.82      0.54      0.65       762
           3       0.72      0.43      0.54       744
           4       0.96      0.73      0.83       646
           5       0.85      0.64      0.73       634
           6       0.69      0.32      0.44       421
           7       0.93      0.58      0.71       361
           8       0.68      0.44      0.53       347
           9       0.90      0.60      0.72       389
          10       0.74      0.41      0.53       315
          11       0.54      0.13      0.21       342
          12       0.55      0.25      0.34       349
        

In [None]:
start = datetime.now()
classifier_2 = OneVsRestClassifier(LogisticRegression(C=1, penalty='l1', solver='liblinear'), n_jobs=-1)
classifier_2.fit(x_train_multilabel, y_train)
predictions_2 = classifier_2.predict(x_test_multilabel)
print("Accuracy :",metrics.accuracy_score(y_test, predictions_2))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions_2))


precision = metrics.precision_score(y_test, predictions_2, average='micro')
recall = metrics.recall_score(y_test, predictions_2, average='micro')
f1 = metrics.f1_score(y_test, predictions_2, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions_2, average='macro')
recall = metrics.recall_score(y_test, predictions_2, average='macro')
f1 = metrics.f1_score(y_test, predictions_2, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

print (metrics.classification_report(y_test, predictions_2))
print("Time taken to run this cell :", datetime.now() - start)

