In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.decomposition import TruncatedSVD
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.metrics import TopKCategoricalAccuracy
from sklearn.model_selection import StratifiedKFold
from losses import categorical_focal_loss
from plot_keras_history import plot_history
import matplotlib.pyplot as plt
import json

In [2]:
# Load the issue dataset (one row per issue) from the project's Dataset folder.
dataset_path = '../Dataset/google_choromium_cleaned_100_max.csv'
df = pd.read_csv(dataset_path)

In [3]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0,id,issue_id,Summary,reported_time,Assignee,Description,text,words
0,5,7,Errors in importing from firefox,2008-09-02 19:08:46,venkataramana@chromium.org,\nproduct version 0214927\r\nurls if app...,errors in importing from firefox. product vers...,100
1,16,18,Wishlist: Chrome does not have an addon-system,2008-09-02 19:22:41,aa@chromium.org,\nproduct version all\r\nurls if applica...,wishlist chrome does not have an addonsystem. ...,95
2,17,19,Automatic integrated windows authentication (a...,2008-09-02 19:22:46,cbentzel@chromium.org,\nproduct version 0214927\r\nurls if app...,automatic integrated windows authentication ak...,131
3,19,21,Facebook: Commenting on Status not working,2008-09-02 19:24:29,eroman@chromium.org,\nproduct version see aboutversion\r\nur...,facebook commenting on status not working. pro...,82
4,27,31,"""Become a fan"" on facebook does not work",2008-09-02 19:30:10,eroman@chromium.org,\nproduct version see aboutversion\r\nur...,become fan on facebook does not work. product ...,88


In [4]:
# Keep only the rows whose 'text' field is present.
has_text = df['text'].notna()
df = df.loc[has_text]

In [5]:
# Column used as the classification target throughout the notebook.
label_name = 'Assignee'

In [6]:
# Give every distinct assignee an integer id, numbered from 1 in order of
# first appearance in the frame.
unique_developers = df[label_name].unique()
developer_dict = {
    name: number
    for number, name in enumerate(unique_developers, start=1)
}

In [7]:
# Replace each assignee string with its integer id from developer_dict.
# NOTE(review): not idempotent -- after one run the column holds ids, so
# re-running this cell looks up '1', '2', ... and yields NaN for every row.
assignee_text = df[label_name].astype(str)
df[label_name] = assignee_text.map(developer_dict)

In [8]:
# Raw documents (the 'text' column) as an array; indexed by fold later on.
X = df['text'].values

In [10]:
# Fit a unigram+bigram TF-IDF model on the whole corpus. Tokens are produced
# by splitting on single spaces. The cross-validation loop builds its own
# vectorizer per fold, so this fit is only used for inspection.
vectorizer = TfidfVectorizer(
    tokenizer=lambda document: document.split(' '),
    ngram_range=(1, 2),
    dtype=np.float32,
)
train_tfidf = vectorizer.fit_transform(X)

In [11]:
# Number of TF-IDF features (distinct unigrams + bigrams) learned by the fit.
# `vocabulary_` maps each term to its column index, so its length is the
# feature count. `get_feature_names()` was removed in scikit-learn 1.2 and
# would build a list of every term just to measure it.
len(vectorizer.vocabulary_)

1100622

In [9]:
# Output dimensionality of the TruncatedSVD reduction; it is also the input
# width of the network built by ann_model().
n_components = 1100

In [10]:
# Integer assignee ids; used for stratification and as the encoder input.
Y = df[label_name].values

In [11]:
# Re-index the assignee ids to consecutive class indices starting at 0.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

In [12]:
# One-hot encode the class indices for the softmax output layer.
Y_encoded = np_utils.to_categorical(encoded_Y)

In [13]:
# Top-k accuracy for k = 1..10; each metric is named after its k ('1'..'10').
metrics = [
    TopKCategoricalAccuracy(k, name=f'{k}')
    for k in range(1, 11)
]

In [14]:
# One focal-loss weight of 0.25 per distinct assignee.
n_developers = df[label_name].nunique()
alpha = np.full(n_developers, 0.25)

In [15]:
# Focal loss from the project-local `losses` module, with gamma=2.
# NOTE(review): alpha is wrapped in an extra list, presumably to pass a
# (1, n_classes) row of per-class weights -- confirm against losses.py.
loss = [categorical_focal_loss(alpha=[alpha], gamma=2)]

In [16]:
def ann_model():
  """Build and compile a fresh single-hidden-layer classifier.

  The network takes `n_components` input features, passes them through an
  800-unit ReLU layer, and ends in a softmax layer with one unit per distinct
  value of `df[label_name]`. It is compiled with Adam and the notebook-level
  `loss` and `metrics` objects.

  Returns:
    The compiled `tf.keras` Sequential model (untrained).
  """
  n_outputs = df[label_name].nunique()
  hidden = tf.keras.layers.Dense(units=800, input_dim=n_components, activation='relu')
  output = tf.keras.layers.Dense(n_outputs, activation='softmax')
  network = tf.keras.models.Sequential([hidden, output])
  network.compile(optimizer='adam', loss=loss, metrics=metrics)
  return network

In [17]:
# 10-fold stratified split with a fixed shuffle seed, plus the accumulators
# that the cross-validation loop fills in.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=101)
cvscores, histories, miss_classified = [], [], []  # per-fold results
fold_no = 1  # 1-based counter of the fold being processed

In [None]:
# Cross-validation: for every fold, fit TF-IDF and SVD on the training
# documents only, train a fresh network, then record predictions, the
# training history and the evaluation metrics for that fold.
for train, test in kfold.split(X, Y):
    # `train` / `test` are index arrays into X and Y_encoded.
    x_train = X[train]
    x_test = X[test]
    
    y_train = Y_encoded[train]
    y_test = Y_encoded[test]
    
    # TF-IDF vectorizer: vocabulary is learned from the training fold only,
    # then applied unchanged to the held-out fold.
    vectorizer = TfidfVectorizer(tokenizer= lambda x: x.split(' ') , ngram_range=(1, 2), dtype=np.float32)
    
    train_tfidf = vectorizer.fit_transform(x_train)
    
    test_tfidf = vectorizer.transform(x_test)
    
    # Reduce the sparse TF-IDF matrix to n_components dense features.
    svd = TruncatedSVD(n_components=n_components, algorithm='arpack')
    
    train_svd = svd.fit_transform(train_tfidf)
    
    test_svd = svd.transform(test_tfidf)
    
    # Model training: a new network per fold; the held-out fold doubles as
    # the validation set, so `history` contains val_* curves for it.
    model = ann_model()
    history = model.fit(train_svd, y_train, validation_data = (test_svd, y_test), epochs=4, batch_size=32, verbose=0).history
    
    # Per-fold prediction record: class indices (argmax over the one-hot /
    # softmax axis) for training labels, true test labels and predictions,
    # together with the document indices of each split.
    pred = model.predict(test_svd) 
    actual_developers = np.argmax(y_test, axis=-1)
    train_developers =  np.argmax(y_train, axis=-1)
    pred_developers = np.argmax(pred, axis=-1)
    miss = {'fold': fold_no, 'train_documents': train, 'train_developers': train_developers ,'test_documents': test, 'test_developers': actual_developers, 'pred_developers': pred_developers}
    miss_classified.append(miss)
    histories.append(history)
    fold_no = fold_no + 1
    
    # Scores: evaluate() returns the loss first, followed by the compiled
    # metrics; only the metrics (top-1..top-10 accuracy) are kept.
    scores = model.evaluate(test_svd, y_test, batch_size=32, verbose=1)
    cvscores.append(scores[1:])



In [None]:
# Mean top-k accuracies over the folds that were actually evaluated.
# Taking the mean (rather than sum / 10) keeps the average correct if
# n_splits is changed or the loop was stopped before all folds finished.
np.mean(cvscores, axis=0)

In [None]:
# Turn every NumPy array stored in the per-fold records into a plain list so
# the records can be written out as JSON.
for fold_record in miss_classified:
    for field, value in fold_record.items():
        if isinstance(value, np.ndarray):
            fold_record[field] = value.tolist()

In [None]:
# Accumulator: metric name -> per-epoch values combined across folds.
history_dic = {}

In [None]:
# Add up the per-epoch curves of every fold, metric by metric.
# NOTE(review): this yields the SUM over folds, not the mean -- nothing here
# divides by the number of folds. Re-running the cell without resetting
# history_dic adds the folds a second time.
for fold_history in histories:
    for metric_name, per_epoch in fold_history.items():
        curve = np.array(per_epoch)
        if metric_name not in history_dic:
            history_dic[metric_name] = curve
        else:
            history_dic[metric_name] = history_dic[metric_name] + curve

In [None]:
# Plot the accumulated training/validation curves (values are sums over folds).
plot_history(history_dic)
plt.show()

In [None]:
def convert(o):
    """Fallback serializer for ``json.dump(..., default=convert)``.

    Translates NumPy values that the ``json`` module cannot encode into the
    equivalent built-in Python objects.

    Args:
        o: The object ``json`` failed to serialize.

    Returns:
        ``int`` for any NumPy integer, ``float`` for any NumPy float,
        ``bool`` for ``np.bool_``, or a (nested) list for an ``np.ndarray``.

    Raises:
        TypeError: If ``o`` is none of the supported NumPy types, as the
            ``default`` hook contract of ``json`` requires.
    """
    # np.integer / np.floating cover every width (int32, int64, float32, ...),
    # not just np.int64, so the result does not depend on platform dtype.
    if isinstance(o, np.integer):
        return int(o)
    if isinstance(o, np.floating):
        return float(o)
    if isinstance(o, np.bool_):
        return bool(o)
    if isinstance(o, np.ndarray):
        return o.tolist()
    raise TypeError(f'Object of type {type(o).__name__} is not JSON serializable')

In [None]:
# Convert each accumulated curve from a NumPy array to a plain list so the
# dictionary can be written as JSON.
for metric_name, curve in history_dic.items():
    history_dic[metric_name] = curve.tolist()

In [None]:
# Save the accumulated training curves next to the notebook.
with open('ann_history.json', 'w') as out_file:
    json.dump(history_dic, out_file)

In [None]:
# Save the per-fold split indices, true labels and predictions; `convert`
# handles any value the json module cannot encode on its own.
with open('ann_classification_report.json', mode='w') as out_file:
    json.dump(miss_classified, out_file, default=convert)