In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.decomposition import TruncatedSVD
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.metrics import TopKCategoricalAccuracy
from sklearn.model_selection import StratifiedKFold
from losses import categorical_focal_loss
from plot_keras_history import plot_history
import matplotlib.pyplot as plt
import json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten
from keras.optimizers import Adam
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras.metrics import TopKCategoricalAccuracy
from keras import regularizers

In [2]:
# Load the Mozilla Firefox CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../Dataset/mozilla_firefox.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Assignee,Summary,Description,text,words
0,nobody@mozilla.org,Dialup properties needs to be exposed in prefs,The dialup properties of the profile should be...,dialup properties needs to be exposed in prefs...,63
1,nobody@mozilla.org,[Find] Find whole word only,"Please add ""Match Whole Word Only"" option to b...",find find whole word only. please add match wh...,19
2,nobody@mozilla.org,Plug-In Manager (ui for choosing mimetype-plug...,I would really like a plug-in manager for my b...,plugin manager ui for choosing mimetypeplugin ...,92
3,nobody@mozilla.org,add font-list support to the font pref front end,Subject: Re: font selection interface\nFrom: J...,add fontlist support to the font pref front en...,132
4,nobody@mozilla.org,Ctrl-Alt-T to show networking debug info,"This is a 4.x farity feature request, it is us...",ctrlaltt to show networking debug info. this i...,49


In [4]:
# Keep only the rows that actually have text to tokenize.
# The explicit .copy() makes the result an independent frame, so the later
# column assignment on `df` does not raise pandas' SettingWithCopyWarning.
df = df[df['text'].notnull()].copy()

In [5]:
# Name of the column used as the classification target.
label_name = 'Assignee'

In [6]:
# Give every distinct value of the label column its own 1-based integer id.
unique_developers = df[label_name].unique()
developer_dict = {
    developer: idx
    for idx, developer in enumerate(unique_developers, start=1)
}

In [7]:
# Replace each label with its integer id from developer_dict.
# NOTE(review): the dict keys are the raw column values, while the column is cast
# to str before the lookup; a non-string value (e.g. NaN) would not match its key
# and would map to NaN — confirm the column contains only strings.
df[label_name] = df[label_name].astype(str).map(developer_dict)

In [8]:
# NOTE(review): not referenced anywhere else in the visible notebook — presumably
# left over from another experiment; confirm before removing.
ann_components = 1100

In [9]:
# Inputs (the 'text' column) and targets (the integer-mapped label column) as arrays.
X = df['text'].values
Y = df[label_name].values

In [10]:
# Fit the label encoder on the targets (fit() returns the fitted encoder itself),
# then map every target to its encoded class index.
encoder = LabelEncoder().fit(Y)
encoded_Y = encoder.transform(Y)

In [11]:
# One-hot encode the class indices; these rows are the training targets of the model.
Y_encoded = np_utils.to_categorical(encoded_Y)

In [12]:
# Output dimension of each Embedding layer built in cnn2_cnn3_model.
EMBEDDING_DIM = 300

In [13]:
# Top-k categorical accuracy for k = 1 .. 10; each metric is named after its k.
metrics = [TopKCategoricalAccuracy(k, name=str(k)) for k in range(1, 11)]

In [14]:
# Focal-loss settings: the same alpha (0.25) for every class, and gamma = 2.
# NOTE(review): alpha is wrapped in a list, so it is passed with an extra leading
# axis — presumably what `categorical_focal_loss` expects; confirm against the
# `losses` module.
alpha = np.full(df[label_name].nunique(), 0.25)
loss = [categorical_focal_loss(alpha=[alpha], gamma=2)]

In [15]:
def cnn2_cnn3_model(summary, vocab_size, MAX_SEQUENCE_LENGTH):
    """Build and compile the two-branch 1-D CNN classifier.

    Each branch has its own input and its own Embedding layer, followed by a
    256-filter ``Conv1D`` (kernel size 2 in the first branch, 3 in the second),
    max pooling and flattening. The two flattened branches are concatenated,
    passed through a 1024-unit ReLU layer and finished with a softmax layer
    that has one unit per distinct value of ``df[label_name]``.

    Relies on the notebook-level names ``df``, ``label_name``,
    ``EMBEDDING_DIM``, ``loss`` and ``metrics``.

    Args:
        summary: If truthy, print the Keras model summary after compiling.
        vocab_size: Input dimension of the Embedding layers.
        MAX_SEQUENCE_LENGTH: Length of the (padded) integer sequences fed to
            both inputs.

    Returns:
        The compiled Keras ``Model`` taking ``[input1, input2]``.
    """
    input1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    input2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

    # Pool over roughly 20% of the sequence. For sequences shorter than 5 tokens
    # the plain int() truncation would give a pool size of 0, which is invalid,
    # so the window is clamped to at least 1.
    pool_size = max(1, int(MAX_SEQUENCE_LENGTH * 0.2))

    # CNN2: convolution over windows of 2 tokens
    embed2 = Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input1)
    cnv2 = Conv1D(256, 2, activation="relu")(embed2)
    pool2 = MaxPooling1D(pool_size)(cnv2)
    flat2 = Flatten()(pool2)

    # CNN3: convolution over windows of 3 tokens
    embed3 = Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input2)
    cnv3 = Conv1D(256, 3, activation="relu")(embed3)
    pool3 = MaxPooling1D(pool_size)(cnv3)
    flat3 = Flatten()(pool3)

    # Merge both branches into a single feature vector
    merged = concatenate([flat2, flat3])

    dense2 = Dense(1024, activation="relu")(merged)

    # One softmax unit per distinct label
    n_classes = df[label_name].nunique()
    outputs = Dense(n_classes, activation='softmax')(dense2)

    model = Model(inputs=[input1, input2], outputs=outputs)

    model.compile(loss=loss, optimizer='rmsprop', metrics=metrics)

    if summary:
        # summary() prints the table itself and returns None; wrapping it in
        # print() would emit an extra "None" line.
        model.summary()

    return model

In [16]:
# 10-fold stratified cross-validation, shuffled with a fixed random_state.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=101)
cvscores = []  # per-fold evaluation metrics (everything returned by evaluate() except the loss)
histories = []  # per-fold Keras training histories
miss_classified = []  # per-fold split indices, true labels and predicted labels
fold_no = 1  # 1-based number of the fold currently being processed

In [1]:
# Train and evaluate a fresh model on every cross-validation fold.
for train, test in kfold.split(X, Y):

    # Texts and one-hot targets of this fold
    x_train = X[train]
    x_test = X[test]

    y_train = Y_encoded[train]
    y_test = Y_encoded[test]

    # Tokenizing & padding. The vocabulary is fitted on the training texts only.
    # oov_token must be a string placeholder for unseen words (the original passed
    # the boolean True); it still receives index 1, so sequences are unchanged.
    tokenizer = Tokenizer(oov_token='<OOV>')
    tokenizer.fit_on_texts(x_train)
    vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
    train_seq = tokenizer.texts_to_sequences(x_train)
    test_seq = tokenizer.texts_to_sequences(x_test)
    # Pad/truncate every sequence to the longest training sequence
    MAX_SEQUENCE_LENGTH = max([len(s) for s in train_seq])
    train_cnn = pad_sequences(train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    test_cnn = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # Model training: both CNN branches receive the same padded sequences
    model = cnn2_cnn3_model(False, vocab_size, MAX_SEQUENCE_LENGTH)
    history = model.fit([train_cnn, train_cnn], y_train, epochs=5, validation_data = ([test_cnn, test_cnn], y_test), batch_size=32, verbose=0).history

    # Record this fold's split indices, true labels, predictions and training history
    pred = model.predict([test_cnn, test_cnn])
    actual_developers = np.argmax(y_test, axis=-1)
    train_developers =  np.argmax(y_train, axis=-1)
    pred_developers = np.argmax(pred, axis=-1)
    miss = {'fold': fold_no, 'train_documents': train, 'train_developers': train_developers ,'test_documents': test, 'test_developers': actual_developers, 'pred_developers': pred_developers}
    miss_classified.append(miss)
    histories.append(history)
    fold_no = fold_no + 1

    # Scores: evaluate() returns [loss, metric_1, ...]; keep only the metrics
    scores = model.evaluate([test_cnn, test_cnn], y_test, batch_size=32, verbose=1)
    cvscores.append(scores[1:])
    del model

In [18]:
# Mean of every top-k accuracy over the evaluated folds.
# Divide by the number of folds actually collected instead of a hard-coded 3,
# which does not match the 10 splits configured on `kfold`.
np.array(cvscores).sum(axis=0) / len(cvscores)

array([0.88560949, 0.94713293, 0.97075442, 0.97654976, 0.97941073,
       0.98148922, 0.98278522, 0.9841301 , 0.98491259, 0.98540165])

In [None]:
# Turn every NumPy array stored in the per-fold records into a plain list
# so the records can be written out as JSON.
for fold_record in miss_classified:
    for field, value in list(fold_record.items()):
        if isinstance(value, np.ndarray):
            fold_record[field] = value.tolist()

In [None]:
# Accumulator for the per-metric training curves gathered across folds.
history_dic = {}

In [None]:
# Add up the per-epoch curves of all folds, metric by metric.
# NOTE: the result is the element-wise SUM over the folds; nothing here divides
# by the number of folds.
for fold_history in histories:
    for metric_name, values in fold_history.items():
        curve = np.array(values)
        if metric_name not in history_dic:
            history_dic[metric_name] = curve
        else:
            history_dic[metric_name] = history_dic[metric_name] + curve

In [None]:
# Plot the accumulated training/validation curves.
plot_history(history_dic)
plt.show()

In [None]:
def convert(o):
    """``json.dump`` ``default`` hook that converts NumPy values to plain Python.

    Handles every NumPy integer, float and bool scalar type (not only
    ``np.int64``) as well as arrays, so records holding e.g. ``np.int32``
    or ``np.float32`` values serialize too.

    Args:
        o: The object the JSON encoder could not serialize by itself.

    Returns:
        An ``int``, ``float``, ``bool`` or ``list`` equivalent of ``o``.

    Raises:
        TypeError: If ``o`` is not a supported NumPy value.
    """
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')

In [None]:
# Swap every accumulated array for a plain list so the dict is JSON-serializable.
for metric_name, curve in list(history_dic.items()):
    history_dic[metric_name] = curve.tolist()

In [None]:
# Write the accumulated training curves to a JSON file in the working directory.
with open('cnn2_cnn3_history.json', mode='w') as history_file:
    json.dump(history_dic, history_file)

In [None]:
# Write the per-fold records to JSON; `convert` is the fallback for values
# the json module cannot serialize by itself.
with open('cnn2_cnn3_classification_report.json', 'w') as report_file:
  json.dump(miss_classified, report_file, default=convert)