In [1]:
import json
import pandas as pd
from collections import Counter
from tqdm.auto import tqdm
import os

In [2]:
from nltk.tokenize import RegexpTokenizer
from scipy import sparse
from collections import Counter
import numpy as np
import torch
import torch.nn as nn

## Load and Preprocess the Data

In [3]:
# Read the article records; the encoding is given explicitly so decoding does
# not depend on the platform's default locale encoding.
with open("selected_articles.json", 'r', encoding="utf-8") as file:
    selected_articles = json.load(file)

In [None]:
# Keep only the columns used downstream, then shuffle the rows.
# The shuffle is seeded so the positional train/valid/test split made later
# is the same on every Restart & Run All.
total_df = pd.DataFrame(selected_articles)[["id", "title", "authors", "categories", "abstract"]]
total_df = total_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Category codes used as classification targets. A code's position in the list
# is also the column it occupies in the label matrix built by process_data.
category_list = ["cs.AI", "cs.DB", "cs.IT", "cs.LG", "cs.SI"]
category_dict = {code: position for position, code in enumerate(category_list)}

# Words that built_vocabulary drops from the vocabulary
# (kept as a list, in the original order and grouping).
stopwords = [
    'are', 'was', 'were', 'been', 'being', 'did', 'done', 'had', 'has', 'have',
    'will', 'would', 'may', 'might', 'should', 'can', 'could',

    'she', 'her', 'you', 'your', 'him', 'they', 'them', 'ours', 'its',
    'ourselves', 'yourself', 'itself', 'himself', 'yourselves', 'herself', 'himself',

    'that', 'there', 'here', 'this', 'the', 'these', 'those', 'other',
    'for', 'with', 'through', 'off', 'from', 'about', 'under', 'between',
    'below', 'above', 'out', 'than',

    'yes', 'not', 'and', 'because', 'nor', 'neither', 'then', 'but', 'too',
    'else', 'also', 'either', 'when', 'what', 'which', 'who', 'why', 'whose',
    'where', 'how',

    'when', 'while', 'all', 'both', 'only', 'every', 'ever', 'much', 'more',
    'many', 'very',
]

In [None]:
def to_sparse_tensor(id_matrix):
    """Convert a SciPy sparse matrix into a float32 ``torch`` sparse COO tensor.

    Args:
        id_matrix: any SciPy sparse matrix exposing ``tocoo()``.

    Returns:
        A ``torch.sparse_coo_tensor`` with the same shape and non-zero entries.
    """
    coo = id_matrix.tocoo().astype(np.float32)
    # Row 0 holds the row coordinates, row 1 the column coordinates.
    coordinates = np.stack([coo.row, coo.col]).astype(np.int64)
    return torch.sparse_coo_tensor(
        torch.from_numpy(coordinates),
        torch.from_numpy(coo.data),
        torch.Size(coo.shape),
    )

def built_vocabulary(traindata, tokenizer):
    """Build the word -> column-index vocabulary from the training abstracts.

    Side effect: adds a ``word_list`` column to ``traindata`` holding the
    lower-cased tokens of each abstract. ``process_data`` reads that column
    when it is called with ``if_train=True``.

    Args:
        traindata: DataFrame with an ``abstract`` column of strings.
        tokenizer: object with a ``tokenize(text)`` method returning tokens.

    Returns:
        dict mapping each kept word to a consecutive integer index starting at
        0, in order of first occurrence. A word is kept when it occurs at least
        5 times over all abstracts and is not in the module-level ``stopwords``.
    """
    traindata['word_list'] = traindata['abstract'].apply(
        lambda x: [word.lower() for word in tokenizer.tokenize(x)])

    # Count straight into a Counter instead of materialising one giant token
    # list; iterating the Series directly also avoids Series.iteritems(),
    # which no longer exists in pandas >= 2.0.
    voc_dict = Counter()
    for words in tqdm(traindata['word_list']):
        voc_dict.update(words)

    stopword_set = set(stopwords)  # O(1) membership tests
    kept_words = [word for word, freq in voc_dict.items()
                  if freq >= 5 and word not in stopword_set]
    return {word: position for position, word in enumerate(kept_words)}
    
# Build the term-document matrix, and generate ground truth array
def process_data(data_df, word_dict, categories, category_id,
                                if_train = True, tokenizer = None): 
    """Turn a frame of articles into model inputs and multi-hot labels.

    Args:
        data_df: DataFrame with ``abstract`` and ``categories`` columns
            (``categories`` is a space-separated string). When ``if_train`` is
            True it must already carry the ``word_list`` column that
            ``built_vocabulary`` adds.
        word_dict: mapping word -> column index in the term-document matrix.
        categories: collection of category codes to predict.
        category_id: mapping category code -> column index in the label array.
        if_train: when False, ``word_list`` is (re)computed here with
            ``tokenizer`` and stored on ``data_df``.
        tokenizer: object with ``tokenize(text)``; required when
            ``if_train`` is False.

    Returns:
        (bias_td_matrix, ground_truth): a sparse tensor of shape
        ``(len(data_df), len(word_dict) + 1)`` whose last column is all ones,
        and a NumPy array of shape ``(len(data_df), len(categories))`` with a 1
        for every listed category of each article.

    Raises:
        ValueError: if ``if_train`` is False and no tokenizer is given.
    """
    # For validation and test data the word lists have not been computed yet
    if not if_train:
        if tokenizer is None:
            raise ValueError("a tokenizer is required when if_train is False")
        data_df['word_list'] = data_df['abstract'].apply(lambda x: [word.lower() for word in tokenizer.tokenize(x)])

    n_docs = len(data_df)
    ground_truth = np.zeros((n_docs, len(categories)))
    td_matrix = sparse.lil_matrix((n_docs, len(word_dict))) # The term-document matrix

    # Enumerate positions rather than using the DataFrame index labels, so the
    # rows land correctly even if the frame's index is not 0..n-1.
    rows = zip(data_df['word_list'], data_df['categories'])
    for pos, (word_list, category_str) in enumerate(tqdm(rows, total=n_docs)):
        for word, freq in Counter(word_list).items():
            col = word_dict.get(word)
            if col is not None:
                td_matrix[pos, col] = freq

        for c in category_str.split(" "):
            if c in categories:
                ground_truth[pos][category_id[c]] = 1

    # Append a constant column of ones (bias feature) to the sparse matrix
    bias_td_matrix = torch.hstack((to_sparse_tensor(td_matrix), torch.ones(n_docs,1).to_sparse()))
    return bias_td_matrix, ground_truth

In [None]:
# Divide the (already shuffled) data into train, validation and test sets
TRAIN_END = 180000   # rows [0, TRAIN_END) are used for training
VALID_END = 192000   # rows [TRAIN_END, VALID_END) for validation, the rest for test

# reset_index returns a new frame for every split, so adding the `word_list`
# column later does not write into a slice of total_df, and each split is
# indexed 0..n-1.
train_df = total_df.iloc[:TRAIN_END].reset_index(drop=True)
valid_df = total_df.iloc[TRAIN_END:VALID_END].reset_index(drop=True)
test_df = total_df.iloc[VALID_END:].reset_index(drop=True)

In [None]:
# Preprocess Data
# NOTE(review): `regtokenizer` was used here but never defined anywhere in the
# notebook, so a fresh-kernel run failed with NameError. The original pattern
# is not recoverable from this file; r"\w+" is a stand-in. The stopword list
# only contains words of 3+ letters, so the original may have filtered short
# tokens -- confirm and adjust the pattern if so.
regtokenizer = RegexpTokenizer(r"\w+")

word_dict = built_vocabulary(train_df, regtokenizer)
train_X, train_Y_np = process_data(train_df, word_dict, category_list, category_dict)
valid_X, valid_Y_np = process_data(valid_df, word_dict, category_list, category_dict, False, regtokenizer)
test_X, test_Y_np = process_data(test_df, word_dict, category_list, category_dict, False, regtokenizer)

In [None]:
# Input width: one column per vocabulary word plus the bias column that
# process_data appends.
dim = len(word_dict) + 1
count = total_df.shape[0]

# The label arrays are NumPy arrays; convert each one to a float32 tensor.
train_Y, valid_Y, test_Y = (
    torch.from_numpy(labels).float()
    for labels in (train_Y_np, valid_Y_np, test_Y_np)
)

## Define the Evaluation and Test Functions

In [None]:
def evaluate(devX, devY, Model, device, loss_fn, batch_size = 16, iftest = False):
    """Compute mean accuracy and mean loss of ``Model`` over a held-out set.

    The data is processed in consecutive batches of ``batch_size`` rows; a
    trailing partial batch is skipped. Accuracy is the fraction of
    (sample, class) entries where ``output > 0.5`` agrees with the label,
    averaged over batches.

    Args:
        devX: feature tensor supporting ``index_select(0, indices)``.
        devY: label tensor with one row per sample.
        Model: the network; it is switched to eval mode here (the caller is
            responsible for switching back to train mode).
        device: device the batches are moved to.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        batch_size: number of rows per batch.
        iftest: unused; kept for backward compatibility.

    Returns:
        (mean_accuracy, mean_loss) as plain Python floats.

    Raises:
        ValueError: if there are fewer than ``batch_size`` samples.
    """
    Model.eval()
    dev_count = devY.shape[0]
    dev_num_batch = dev_count//batch_size
    if dev_num_batch == 0:
        raise ValueError("need at least batch_size samples to evaluate")

    total_acc = 0.0
    total_loss = 0.0
    # No gradients are needed for evaluation; skipping them saves memory/time.
    with torch.no_grad():
        for i in tqdm(range(dev_num_batch), desc = "Evaluation step", leave = False):
            indices = torch.arange(i*batch_size, (i+1)*batch_size)
            outputs = Model(devX.index_select(0, indices).to(device))
            ground_truth = devY[indices].to(device)
            # .item() keeps the running totals as Python floats, so callers can
            # store them in NumPy arrays even when running on CUDA.
            total_acc += ((outputs>0.5).float() == ground_truth).float().mean().item()
            total_loss += loss_fn(outputs, ground_truth).item()

    return total_acc/dev_num_batch, total_loss/dev_num_batch

def test(devX, devY, Model, device, batch_size = 16):
    Model.eval()
    dev_count = devY.shape[0]
    cag_num = devY.shape[1]
    dev_num_batch = dev_count//batch_size
    indices = torch.arange(batch_size)
    total_acc = 0
    tp_fp_fn = np.ones((3, cag_num))
    for i in tqdm(range(dev_num_batch), desc = "Evaluation step", leave = False):
        outputs = Model(devX.index_select(0, indices).to(device))
        ground_truth = devY[indices].to(device)
        classify_results = (outputs>0.5).float()
        total_acc += ((classify_results == ground_truth).float()).mean()
        for j in range(batch_size):
            for k in range(cag_num):
                if classify_results[j][k]==1:
                    if classify_results[j][k] == ground_truth[j][k]:
                        tp_fp_fn[0][k] += 1
                    else:
                        tp_fp_fn[1][k] += 1
                        
                else:
                    if ground_truth[j][k]==1:
                        tp_fp_fn[2][k] += 1
                
        indices = indices + batch_size
        
    precision = tp_fp_fn[0]/(tp_fp_fn[0]+tp_fp_fn[1])
    recall = tp_fp_fn[0]/(tp_fp_fn[0]+tp_fp_fn[2])
    macrof1 = 2/(1/precision.mean() + 1/recall.mean())
    mean_tp_fp_fn = tp_fp_fn.mean(axis = 1)
    microf1 = 2*mean_tp_fp_fn[0]/(2*mean_tp_fp_fn[0] + mean_tp_fp_fn[1] + 2*mean_tp_fp_fn[2])
    print(tp_fp_fn)
        
    return total_acc/dev_num_batch, macrof1, microf1

## Define the MLP Model

In [None]:
class DNN(nn.Module):
    """Two-layer perceptron with a sigmoid output per class.

    Architecture: Linear(n, hidden_dim) -> ReLU -> Linear(hidden_dim, num_class)
    -> Sigmoid, so every output lies in (0, 1).

    Args:
        n: number of input features.
        num_class: number of output classes.
        hidden_dim: width of the hidden layer (default 256).
    """
    def __init__(self, n, num_class, hidden_dim = 256):
        super(DNN, self).__init__()
        self.first_layer = nn.Linear(n, hidden_dim)
        self.activation = nn.ReLU()
        # The output width follows num_class instead of a hard-coded 5
        self.second_layer = nn.Linear(hidden_dim, num_class)
        self.output_layer = nn.Sigmoid()

        # Initialize both weight matrices with very small uniform values
        # (biases keep nn.Linear's default initialization)
        nn.init.uniform_(self.first_layer.weight, -1e-5, 1e-5)
        nn.init.uniform_(self.second_layer.weight, -1e-5, 1e-5)

    def forward(self, feature):
        """Return per-class sigmoid outputs for a batch of feature rows."""
        hidden = self.activation(self.first_layer(feature))
        return self.output_layer(self.second_layer(hidden))

## Define Parameters and Train

In [None]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Training hyperparameters
num_epochs, batch_size = 5, 32

In [None]:
# Multi-label problem: an article can belong to several categories, the model
# ends in a Sigmoid and predictions are thresholded at 0.5. The matching loss
# is binary cross-entropy on the per-class probabilities. CrossEntropyLoss
# would treat the sigmoid outputs as logits of one softmax distribution.
loss_function = nn.BCELoss()
model = DNN(dim, len(category_list))
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
train_count = len(train_df)
loss_record = []          # mean training loss of every 100-step window
eval_accuracy_list = []   # validation accuracy, recorded every 1000 steps
eval_loss_list = []       # validation loss, recorded every 1000 steps
k = 0                     # global step counter across epochs
num_batch = train_count//batch_size   # a trailing partial batch is skipped
tr_loss = 0
model.train()
for epoch in tqdm(range(num_epochs), desc = "epoch", leave = True):
    indices = torch.arange(batch_size)
    for i in tqdm(range(num_batch), desc = "step", leave = False):
        optimizer.zero_grad()
        # Call the module itself (not .forward) so registered hooks still run
        outputs = model(train_X.index_select(0, indices).to(device))
        loss = loss_function(outputs, train_Y[indices].to(device))
        loss.backward()
        optimizer.step()
        k += 1
        tr_loss += loss.item()
        indices = indices + batch_size
        if k%100 == 0 and k!=0:
            loss_record.append(tr_loss/100)            
            if k%1000 == 0:
                eval_acc, eval_loss = evaluate(valid_X, valid_Y, model, device, loss_function, batch_size)
                eval_accuracy_list.append(eval_acc)
                eval_loss_list.append(eval_loss)
                print("Mean Train Loss", tr_loss/100, "Mean evaluation accuracy", eval_acc, 
                     "Mean evaluation loss", eval_loss)
                # evaluate() leaves the model in eval mode; switch back
                model.train()
                
            tr_loss = 0
     
    # save model after each epoch
    save_path = "./MLP_checkpoint/checkpoint-"+str(epoch+1)
    os.makedirs(save_path, exist_ok=True)
    torch.save(model, save_path + "/mlp_model_"+str(epoch+1)+".pt")
    

## Save Results and Test the Model

In [None]:
# Pair every recorded mean training loss with the global step at which it was
# logged (one entry per 100 steps); row 0 holds the losses, row 1 the steps.
logged_steps = np.arange(100, num_epochs*num_batch+1, 100)
train_loss = np.array([loss_record, logged_steps])
train_loss[0]

In [None]:
# Evaluate once more after the last training step and record the final scores
final_scores = evaluate(valid_X, valid_Y, model, device, loss_function, batch_size)
end_eval_acc, end_eval_loss = final_scores
eval_accuracy_list += [end_eval_acc]
eval_loss_list += [end_eval_loss]
final_scores

In [None]:
# Steps at which validation was run: every 1000 steps, plus the final step
eval_steps = np.arange(1000, num_epochs*num_batch+1, 1000)
eval_steps = np.append(eval_steps, num_epochs*num_batch)
# The accuracies may be 0-dim torch tensors (possibly on the GPU), which
# np.array cannot convert directly; float() turns each into a Python number.
eval_accuracy_values = [float(acc) for acc in eval_accuracy_list]
# Rows: accuracy, loss, step
eval_results = np.array([eval_accuracy_values, eval_loss_list, eval_steps])
eval_results 

In [None]:
# Save the training-loss curve and the evaluation history to one .npz archive.
# Both arrays are passed positionally, as before.
results_path = "MLP_results.npz"
np.savez(results_path, train_loss, eval_results)

In [None]:
# Score the trained model on the held-out test set and show
# (accuracy, macro-F1, micro-F1)
test_metrics = test(test_X, test_Y, model, device, batch_size)
test_metrics