In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.decomposition import TruncatedSVD
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.metrics import TopKCategoricalAccuracy
from sklearn.model_selection import StratifiedKFold
from losses import categorical_focal_loss
from plot_keras_history import plot_history
import matplotlib.pyplot as plt
import json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Embedding, Input, Flatten
from keras.optimizers import Adam
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras.metrics import TopKCategoricalAccuracy
from keras import regularizers

In [2]:
# Load the issue dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../Dataset/google_choromium_cleaned.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,id,issue_id,Summary,reported_time,Assignee,Description,text,words
0,5,7,Errors in importing from firefox,2008-09-02 19:08:46,venkataramana@chromium.org,\nproduct version 0214927\r\nurls if app...,errors in importing from firefox. product vers...,100
1,16,18,Wishlist: Chrome does not have an addon-system,2008-09-02 19:22:41,aa@chromium.org,\nproduct version all\r\nurls if applica...,wishlist chrome does not have an addonsystem. ...,95
2,17,19,Automatic integrated windows authentication (a...,2008-09-02 19:22:46,cbentzel@chromium.org,\nproduct version 0214927\r\nurls if app...,automatic integrated windows authentication ak...,131
3,19,21,Facebook: Commenting on Status not working,2008-09-02 19:24:29,eroman@chromium.org,\nproduct version see aboutversion\r\nur...,facebook commenting on status not working. pro...,82
4,27,31,"""Become a fan"" on facebook does not work",2008-09-02 19:30:10,eroman@chromium.org,\nproduct version see aboutversion\r\nur...,become fan on facebook does not work. product ...,88


In [4]:
# Keep only the rows that actually have a 'text' value (the model input column).
df = df[df['text'].notna()]

In [5]:
# Name of the column used as the classification target.
label_name = 'Assignee'

In [6]:
# Give every distinct value of the label column a 1-based integer id.
unique_developers = df[label_name].unique()
developer_dict = {
    developer: idx
    for idx, developer in enumerate(unique_developers, start=1)
}

In [7]:
# Replace each label (cast to str) with its integer id from developer_dict.
# NOTE(review): the column is overwritten in place, so this cell is not
# re-runnable: on a second run the stringified ids no longer match the
# dictionary keys and every label would map to NaN.
df[label_name] = df[label_name].astype(str).map(developer_dict)

In [8]:
# NOTE(review): not referenced by any other cell visible in this notebook;
# possibly a leftover from another experiment — confirm before removing.
ann_components = 1100

In [9]:
# Model inputs (raw text) and integer labels as plain NumPy arrays.
X = df['text'].to_numpy()
Y = df[label_name].to_numpy()

In [10]:
# Fit the encoder on the labels and transform them to 0-based class ids in one step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

In [11]:
# One-hot encode the class ids; these rows are the targets fed to the softmax model.
Y_encoded = np_utils.to_categorical(encoded_Y)

In [12]:
# Output dimension of the Embedding layer built in cnn3_model.
EMBEDDING_DIM = 300

In [13]:
# Top-1 through top-10 categorical accuracy, each metric named after its k.
# The list order determines the column order of the per-fold evaluation scores.
metrics = [
    TopKCategoricalAccuracy(k=i + 1, name=f'{i+1}')
    for i in range(10)
]

In [14]:
# Focal-loss configuration: the same alpha value (0.25) for every class, gamma=2.
# NOTE(review): categorical_focal_loss comes from the project-local `losses`
# module; the nested [alpha] shape is presumably what that helper expects — confirm.
alpha = np.full(df[label_name].nunique(), 0.25)
loss = [categorical_focal_loss(alpha=[alpha], gamma=2)]

In [15]:
def cnn3_model(summary, vocab_size, MAX_SEQUENCE_LENGTH):
    """Build and compile the single-branch (kernel size 3) text CNN classifier.

    Architecture: Embedding -> Conv1D(256 filters, width 3, ReLU) ->
    MaxPooling1D -> Flatten -> Dense(1024, ReLU) -> softmax over all classes.
    The model is compiled with the notebook-level ``loss`` and ``metrics``
    objects and the 'rmsprop' optimizer.

    Args:
        summary: when truthy, print the Keras model summary.
        vocab_size: number of rows of the embedding matrix (vocabulary size
            including the reserved indices).
        MAX_SEQUENCE_LENGTH: length of the padded input sequences.

    Returns:
        The compiled Keras ``Model``.
    """
    input1 = Input(shape=(MAX_SEQUENCE_LENGTH,))

    # CNN3 branch: convolution over windows of 3 tokens.
    embed3 = Embedding(vocab_size, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)(input1)
    cnv3 = Conv1D(256, 3, activation="relu")(embed3)
    # Pool over 20% of the sequence length; clamp to 1 so that very short
    # sequences (< 5 tokens) do not produce an invalid pool size of 0.
    pool_size = max(1, int(MAX_SEQUENCE_LENGTH * 0.2))
    pool3 = MaxPooling1D(pool_size)(cnv3)
    flat3 = Flatten()(pool3)

    dense2 = Dense(1024, activation="relu")(flat3)

    # One output unit per distinct label value in the dataframe.
    n_classes = df[label_name].nunique()
    outputs = Dense(n_classes, activation='softmax')(dense2)

    model = Model(inputs=input1, outputs=outputs)

    model.compile(loss=loss, optimizer='rmsprop', metrics=metrics)

    if summary:
        # summary() prints by itself and returns None, so it must not be
        # wrapped in print() (that would emit a stray "None" line).
        model.summary()

    return model

In [16]:
# 10-fold stratified cross-validation with a fixed seed for reproducible splits.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=101)
# Per-fold accumulators filled by the training loop:
cvscores = []         # evaluation metrics per fold (loss excluded)
histories = []        # Keras fit() history dict per fold
miss_classified = []  # per-fold indices, true labels and predicted labels
fold_no = 1           # 1-based counter of the fold being processed

In [1]:
# Train and evaluate one fresh model per cross-validation fold.
for train, test in kfold.split(X, Y):

    # Raw texts and one-hot targets of this fold.
    x_train = X[train]
    x_test = X[test]

    y_train = Y_encoded[train]
    y_test = Y_encoded[test]

    # Tokenizing & padding. The vocabulary is fitted on the training texts
    # only; words unseen during fitting are replaced by the OOV token.
    # The Keras Tokenizer documents oov_token as a string placeholder, so a
    # string is used here instead of the boolean `True`.
    tokenizer = Tokenizer(oov_token='<OOV>')
    tokenizer.fit_on_texts(x_train)
    vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0
    train_seq = tokenizer.texts_to_sequences(x_train)
    test_seq = tokenizer.texts_to_sequences(x_test)
    # Pad (and truncate test sequences) to the longest training sequence.
    MAX_SEQUENCE_LENGTH = max(len(s) for s in train_seq)
    train_cnn = pad_sequences(train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    test_cnn = pad_sequences(test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

    # Model training. The held-out fold is passed as validation data, so the
    # recorded val_* curves are measured on the same data used for scoring.
    model = cnn3_model(False, vocab_size, MAX_SEQUENCE_LENGTH)
    history = model.fit(train_cnn, y_train, epochs=5, validation_data=(test_cnn, y_test), batch_size=32, verbose=0).history

    # Predictions and per-fold bookkeeping (class ids recovered via argmax).
    pred = model.predict(test_cnn)
    actual_developers = np.argmax(y_test, axis=-1)
    train_developers = np.argmax(y_train, axis=-1)
    pred_developers = np.argmax(pred, axis=-1)
    miss = {'fold': fold_no, 'train_documents': train, 'train_developers': train_developers, 'test_documents': test, 'test_developers': actual_developers, 'pred_developers': pred_developers}
    miss_classified.append(miss)
    histories.append(history)
    fold_no += 1

    # Scores: evaluate() returns [loss, metric_1, ..., metric_n]; keep the metrics only.
    scores = model.evaluate(test_cnn, y_test, batch_size=32, verbose=1)
    cvscores.append(scores[1:])
    # NOTE(review): `del model` alone may not release backend memory between
    # folds; consider keras.backend.clear_session() after confirming it is
    # compatible with the shared `metrics` objects.
    del model

In [18]:
# Mean of every top-k accuracy over the folds. np.mean divides by the actual
# number of recorded folds instead of a hard-coded 10, so the result stays
# correct if n_splits changes.
np.mean(cvscores, axis=0)

array([0.14382672, 0.22032491, 0.27191336, 0.31292419, 0.34498194,
       0.37191336, 0.39425993, 0.41382671, 0.43187725, 0.44873646])

In [19]:
# Turn every NumPy array stored in the per-fold records into a plain list so
# that the records can be serialised to JSON.
for record in miss_classified:
    for field, value in list(record.items()):
        if isinstance(value, np.ndarray):
            record[field] = value.tolist()

In [20]:
# Accumulator for the per-fold training histories, keyed by metric name.
history_dic = {}

In [21]:
# Add up each metric's per-epoch curve across folds (element-wise sum, not a mean).
for fold_history in histories:
    for metric_name, epoch_values in fold_history.items():
        epoch_values = np.array(epoch_values)
        if metric_name not in history_dic:
            history_dic[metric_name] = epoch_values
        else:
            history_dic[metric_name] = history_dic[metric_name] + epoch_values

In [2]:
# Plot the accumulated training curves.
# NOTE(review): history_dic holds sums over the folds rather than averages,
# so the plotted values are scaled by the number of folds — confirm this is intended.
plot_history(history_dic)
plt.show()

In [23]:
def convert(o):
    """JSON ``default`` hook converting NumPy values to built-in Python types.

    Handles every NumPy integer, floating and boolean scalar type (not only
    ``np.int64``) as well as ``np.ndarray`` so that ``json.dump`` does not
    fail on platform-dependent integer widths or float scalars.

    Args:
        o: the object ``json`` could not serialise on its own.

    Returns:
        An ``int``, ``float``, ``bool`` or ``list`` equivalent of ``o``.

    Raises:
        TypeError: if ``o`` is not a supported NumPy value (the contract
            expected by ``json`` for ``default`` callables).
    """
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')

In [24]:
# Convert the accumulated arrays to plain lists for JSON serialisation.
# np.asarray keeps the cell re-runnable: values that are already lists are
# converted again instead of raising AttributeError on `.tolist()`.
for key in history_dic.keys():
    history_dic[key] = np.asarray(history_dic[key]).tolist()

In [25]:
# Persist the accumulated per-epoch metrics as JSON.
with open('cnn3_history.json', 'w') as history_file:
    history_file.write(json.dumps(history_dic))

In [26]:
# Persist the per-fold records (document indices, true and predicted class ids) as JSON.
with open('cnn3_classification_report.json', mode='w') as report_file:
    report_file.write(json.dumps(miss_classified, default=convert))