In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import hamming_loss, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pyodbc
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re


df_text = pd.read_csv('./dataset/TextPreprocessed.csv', encoding='iso-8859-1')
# print(df_text.head())
df_tags = pd.read_csv('./dataset/Tag.csv', encoding='iso-8859-1')

num_classes = 14
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")
df_tags.Tag = df_tags.Tag.apply(lambda tag : tag if tag in most_common_tags.Tag.values else None)
df_tags = df_tags.dropna()

counts = df_tags.Tag.value_counts()
firstlast = counts[:5].append(counts[-5:])
firstlast.reset_index(name="count")

  firstlast = counts[:5].append(counts[-5:])


Unnamed: 0,index,count
0,extract method ...,4225
1,rename method ...,2940
2,move method ...,1441
3,move attribute ...,957
4,rename class ...,801
5,pull up attribute ...,186
6,extract interface ...,149
7,extract superclass ...,132
8,push down method ...,112
9,push down attribute ...,102


In [9]:
def tags_for_question(question_id):
    return df_tags[df_tags['Id'] == question_id].Tag.values

def add_tags_column(row):
    row['Tags'] = tags_for_question(row['Id'])
    return row

df_questions = df_text.apply(add_tags_column, axis=1)

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions.Tags)
Y = multilabel_binarizer.transform(df_questions.Tags)

count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(df_questions.Text.values.astype('U'))

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

ros = RandomOverSampler(random_state=9000)
X_tfidf_resampled, Y_tfidf_resampled = ros.fit_resample(X_tfidf, Y)

#X_tfidf_resampled, Y_tfidf_resampled = ros.fit_sample(X_tfidf, Y)

x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf_resampled, Y_tfidf_resampled, test_size=0.2, random_state=9000)

In [42]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    print(y_true.shape[0])
    print(y_pred)

    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            # tmp_a = len(set_true.union(set_pred))
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    # print(acc_list)
    return np.mean(acc_list)

def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    cm = multilabel_confusion_matrix(y_test_tfidf, y_pred)
    cm_p = (np.round(cm / np.sum(cm, axis=1, keepdims=True) * 100, 2))
  
    #identifying failed cases
    for i in range(len(y_pred)):
        for j in range(len(y_pred[i])):
            if y_test_tfidf[i][j] != y_pred[i][j]:
                print(f"{i}____{j}")

    # Print confusion matrix
    for i in range(len(cm)):
        print(f"Confusion Matrix for Label {i + 1}:\n", cm_p[i])
        print()
    #print("Confusion Matrix:{}".format)
    print("Hamming loss: {}".format(hamming_loss(y_test_tfidf, y_pred)))
    print("Hamming score: {}".format(hamming_score(y_test_tfidf, y_pred)))
    print('Subset accuracy: {0}'.format(accuracy_score(y_test_tfidf, y_pred, normalize=True, sample_weight=None)))
    print('Subset precision: {0}'.format(precision_score(y_test_tfidf, y_pred, average='samples')))
    print("---")

# sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None)
#lr = LogisticRegression()
mnb = MultinomialNB()
#svm = LinearSVC()
#rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [41]:
for classifier in [mnb]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(x_train_tfidf, y_train_tfidf)
    y_pred = clf.predict(x_test_tfidf)
    print(y_test_tfidf.shape)
    print_score(y_pred, classifier)
    print(classification_report(y_test_tfidf, y_pred))

(7300, 14)
Clf:  MultinomialNB
0____7
1____5
2____11
3____8
4____7
5____9
6____9
7____9
8____2
9____9
10____5
11____12
13____4
14____12
15____7
17____1
18____9
21____10
22____9
23____12
25____3
26____9
27____8
28____8
30____5
31____6
32____6
33____10
34____2
35____2
36____8
37____8
39____11
40____6
41____12
42____6
43____4
44____13
45____3
46____9
47____0
50____1
51____11
52____4
53____4
54____0
55____11
56____8
57____13
58____2
59____1
62____8
63____6
65____13
66____10
67____10
68____7
69____5
70____4
71____9
72____11
73____9
74____3
76____1
77____3
78____11
79____11
82____0
83____11
84____6
85____12
87____7
88____1
89____2
90____5
91____9
92____6
94____12
95____8
96____3
97____5
98____0
99____7
101____5
102____3
103____1
104____12
105____8
106____1
107____9
108____5
109____7
110____3
111____13
112____8
113____12
114____7
115____1
116____9
117____11
118____13
119____10
120____1
121____3
122____13
124____1
125____6
126____0
127____3
128____9
129____2
130____1
132____5
133____10
134____

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
