### Imports

In [61]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import spacy
from sklearn.datasets import fetch_openml
from sklearn.multioutput import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import jaccard_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

import pickle
import time
import os
from collections import Counter
from pprint import pprint

### Dataset Preparation

In [5]:
PATH = r'C:\Users\gufra\Desktop\Work\Projects\MultiThemed\Researcher\Dataset'
model_dst_path = r'C:\Users\gufra\Desktop\Work\Projects\MultiThemed\Researcher\Models'
df = pd.read_csv(os.path.join(PATH,"data.csv"))

In [6]:
df['total_text'] = df.titles+" "+df.abstracts
df.head()

Unnamed: 0,terms,titles,abstracts,total_text
0,['cs.LG'],Multi-Level Attention Pooling for Graph Neural...,Graph neural networks (GNNs) have been widely ...,Multi-Level Attention Pooling for Graph Neural...
1,"['cs.LG', 'cs.AI']",Decision Forests vs. Deep Networks: Conceptual...,Deep networks and decision forests (such as ra...,Decision Forests vs. Deep Networks: Conceptual...
2,"['cs.LG', 'cs.CR', 'stat.ML']",Power up! Robust Graph Convolutional Network v...,Graph convolutional networks (GCNs) are powerf...,Power up! Robust Graph Convolutional Network v...
3,"['cs.LG', 'cs.CR']",Releasing Graph Neural Networks with Different...,With the increasing popularity of Graph Neural...,Releasing Graph Neural Networks with Different...
4,['cs.LG'],Recurrence-Aware Long-Term Cognitive Network f...,Machine learning solutions for pattern classif...,Recurrence-Aware Long-Term Cognitive Network f...


In [7]:
df.total_text.iloc[0]

"Multi-Level Attention Pooling for Graph Neural Networks: Unifying Graph Representations with Multiple Localities Graph neural networks (GNNs) have been widely used to learn vector\nrepresentation of graph-structured data and achieved better task performance\nthan conventional methods. The foundation of GNNs is the message passing\nprocedure, which propagates the information in a node to its neighbors. Since\nthis procedure proceeds one step per layer, the range of the information\npropagation among nodes is small in the lower layers, and it expands toward the\nhigher layers. Therefore, a GNN model has to be deep enough to capture global\nstructural information in a graph. On the other hand, it is known that deep GNN\nmodels suffer from performance degradation because they lose nodes' local\ninformation, which would be essential for good model performance, through many\nmessage passing steps. In this study, we propose multi-level attention pooling\n(MLAP) for graph-level classification

In [8]:
df.shape

(56181, 4)

In [9]:
cat_unique, cat_list = set(), []
for i in range(len(df.terms)):
    cats = df.terms.iloc[i].replace("]","").replace("[","").replace("'","").replace(" ","")
    cats = cats.split(",")
    
    for c in cats:
        cat_unique.add(c)
        cat_list.append(c)

print(len(cat_list), len(cat_unique))

114920 1238


In [17]:
min_number_occurrences = 470

counts = dict(Counter(cat_list))
sorted_counts = dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))

filtered_sorted_counts = {k: v for k, v in sorted_counts.items() if v > 50}
print(len(filtered_sorted_counts))
pprint(filtered_sorted_counts)

65
{'62H30': 53,
 '68T05': 112,
 '68T07': 125,
 '68T10': 55,
 '68T45': 125,
 '68U10': 73,
 'I.2.10': 51,
 'I.2.6': 154,
 'I.4.6': 52,
 'cond-mat.dis-nn': 71,
 'cond-mat.stat-mech': 66,
 'cs.AI': 8391,
 'cs.AR': 62,
 'cs.CE': 77,
 'cs.CG': 190,
 'cs.CL': 1659,
 'cs.CR': 739,
 'cs.CV': 33433,
 'cs.CY': 266,
 'cs.DB': 100,
 'cs.DC': 279,
 'cs.DM': 53,
 'cs.DS': 197,
 'cs.GR': 1614,
 'cs.GT': 117,
 'cs.HC': 419,
 'cs.IR': 469,
 'cs.IT': 331,
 'cs.LG': 30940,
 'cs.LO': 64,
 'cs.MA': 385,
 'cs.MM': 598,
 'cs.NA': 186,
 'cs.NE': 1378,
 'cs.NI': 120,
 'cs.PF': 53,
 'cs.PL': 57,
 'cs.RO': 2054,
 'cs.SD': 206,
 'cs.SE': 124,
 'cs.SI': 684,
 'cs.SY': 479,
 'eess.AS': 218,
 'eess.IV': 2647,
 'eess.SP': 658,
 'eess.SY': 376,
 'math.DS': 83,
 'math.IT': 331,
 'math.NA': 192,
 'math.OC': 724,
 'math.PR': 93,
 'math.ST': 263,
 'physics.chem-ph': 134,
 'physics.comp-ph': 141,
 'physics.data-an': 124,
 'q-bio.BM': 88,
 'q-bio.NC': 181,
 'q-bio.QM': 253,
 'q-fin.ST': 54,
 'quant-ph': 71,
 'stat.AP': 324,

In [18]:
drop_indices = []
for i in range(len(df)):
    cats = df.terms.iloc[i].replace("]","").replace("[","").replace("'","").replace(" ","")
    cats = cats.split(",")
    
    cat_found = False
    for c in cats:
        if c in df.terms.iloc[i] and filtered_sorted_counts.get(c,0)!=0:
            cat_found = True
            break
    
    if not cat_found: drop_indices.append(i)
        
for i in drop_indices:
    df = df.drop(index=df.iloc[i].name)

df.shape

(56181, 4)

In [19]:
keys = list(filtered_sorted_counts.keys())
class_cat_map = {}

for i in range(len(keys)):
    class_cat_map["Class"+str(i+1)] = keys[i]
    df["Class"+str(i+1)] = df['terms'].apply(lambda text: keys[i] in text)

class_cat_map

{'Class1': 'cs.CV',
 'Class2': 'cs.LG',
 'Class3': 'stat.ML',
 'Class4': 'cs.AI',
 'Class5': 'eess.IV',
 'Class6': 'cs.RO',
 'Class7': 'cs.CL',
 'Class8': 'cs.GR',
 'Class9': 'cs.NE',
 'Class10': 'cs.CR',
 'Class11': 'math.OC',
 'Class12': 'cs.SI',
 'Class13': 'eess.SP',
 'Class14': 'cs.MM',
 'Class15': 'cs.SY',
 'Class16': 'cs.IR',
 'Class17': 'cs.HC',
 'Class18': 'cs.MA',
 'Class19': 'eess.SY',
 'Class20': 'cs.IT',
 'Class21': 'math.IT',
 'Class22': 'stat.AP',
 'Class23': 'stat.ME',
 'Class24': 'cs.DC',
 'Class25': 'cs.CY',
 'Class26': 'math.ST',
 'Class27': 'stat.TH',
 'Class28': 'q-bio.QM',
 'Class29': 'eess.AS',
 'Class30': 'cs.SD',
 'Class31': 'cs.DS',
 'Class32': 'math.NA',
 'Class33': 'cs.CG',
 'Class34': 'cs.NA',
 'Class35': 'q-bio.NC',
 'Class36': 'I.2.6',
 'Class37': 'stat.CO',
 'Class38': 'physics.comp-ph',
 'Class39': 'physics.chem-ph',
 'Class40': '68T07',
 'Class41': '68T45',
 'Class42': 'physics.data-an',
 'Class43': 'cs.SE',
 'Class44': 'cs.NI',
 'Class45': 'cs.GT',
 '

In [27]:
nlp = spacy.load("en_core_web_lg")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct or token.text == "\n":
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens) 

t = time.time()
df['preprocessed_text'] = df['total_text'].apply(preprocess) 
print("Preprocessing the text took",time.time()-t,"seconds")

Preprocessing the text took 1683855427.7375712 seconds


In [28]:
t = time.time()
df["vector"] = df['preprocessed_text'].apply(lambda text: nlp(text).vector)
print("Vectorizing the text took",time.time()-t,"seconds")

Vectorizing the text took 1683856652.4551666 seconds


In [29]:
df.drop(['terms','titles','abstracts', 'total_text', 'preprocessed_text'], axis=1, inplace=True)
df.head()

Unnamed: 0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,...,Class57,Class58,Class59,Class60,Class61,Class62,Class63,Class64,Class65,vector
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,"[-0.5645256, 0.6847013, -0.52508897, 0.8776306..."
1,False,True,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,"[-0.87305856, 0.36389616, -1.1076534, 1.024963..."
2,False,True,True,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,"[-1.4377851, 1.3583574, -0.4455826, 0.6853791,..."
3,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,"[-0.99162173, 0.8107034, -0.34841648, 0.460232..."
4,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,"[-1.3354148, 0.67077386, -0.2261729, 1.0543416..."


In [40]:
X_train, X_test, y_train, y_test = train_test_split(
                                    df.vector.values, 
                                    df.drop(['vector'], axis=1), 
                                    test_size=0.2, 
                                    random_state=0)
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

### Building NLP model

In [49]:
base_lr = LogisticRegression()

ovr = OneVsRestClassifier(base_lr)
ovr.fit(X_train_2d, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
y_pred_ovr = ovr.predict(X_test_2d)
ovr_jaccard_score = jaccard_score(y_test, y_pred_ovr, average="samples")

ovr_jaccard_score

0.6522563639676747

In [None]:
chains = [ClassifierChain(base_lr, order="random", random_state=i) for i in range(10)]
for chain in chains:
    chain.fit(X_train_2d, y_train)

In [None]:
y_pred_chains = np.array([chain.predict(X_test_2d) for chain in chains])
chain_jaccard_scores = [
    jaccard_score(y_test, y_pred_chain >= 0.5, average="samples")
    for y_pred_chain in y_pred_chains
]

In [None]:
y_pred_ensemble = y_pred_chains.mean(axis=0)
ensemble_jaccard_score = jaccard_score(y_test, y_pred_ensemble >= 0.5, average="samples")
ensemble_jaccard_score

In [None]:
model_scores = [ovr_jaccard_score] + chain_jaccard_scores
model_scores.append(ensemble_jaccard_score)

In [None]:
model_names = (
    "Independent",
    "Chain 1",
    "Chain 2",
    "Chain 3",
    "Chain 4",
    "Chain 5",
    "Chain 6",
    "Chain 7",
    "Chain 8",
    "Chain 9",
    "Chain 10",
    "Ensemble",
)

In [None]:
x_pos = np.arange(len(model_names))

# Plot the Jaccard similarity scores for the independent model, each of the
# chains, and the ensemble (note that the vertical axis on this plot does
# not begin at 0).

fig, ax = plt.subplots(figsize=(7, 4))
ax.grid(True)
ax.set_title("Classifier Chain Ensemble Performance Comparison")
ax.set_xticks(x_pos)
ax.set_xticklabels(model_names, rotation="vertical")
ax.set_ylabel("Jaccard Similarity Score")
ax.set_ylim([min(model_scores) * 0.9, max(model_scores) * 1.1])
colors = ["r"] + ["b"] * len(chain_jaccard_scores) + ["g"]
ax.bar(x_pos, model_scores, alpha=0.5, color=colors)
plt.tight_layout()
plt.show()

In [57]:
def get_classes(one_hot):
    res = ""
    for i in range(len(one_hot)):
        if one_hot[i] == 0: continue
        
        res+=class_cat_map["Class"+str(i+1)] + " "
    
    return res

In [58]:
title = 'Continuous and Realtime Road Condition Assessment Using Deep Learning'
abstract = 'Continuous and real-time monitoring of road quality conditions is essential for the maintenance of roads and to ensure the safety of drivers and their vehicles. However, the continuous monitoring of thousands of kilometers of roads and highways is a very tedious, time-consuming, error-prone, and expensive operation. A deep learning based approach that can automatically classify the road condition can help tremendously in cutting down the time, effort, accuracy, and cost for monitoring and maintenance of vast road infrastructure. This paper proposes a mechanism to continuously monitor deteriorating road conditions at the city or municipality level in real time and classify them into four different categories (good, medium, bad and unpaved) using custom-built and transfer learning from pre-trained deep learning models (VGG16 and MobileNetV2). The dataset is collected from different roads in the Kingdom of Saudi Arabia. The dataset is composed of close-up road images taken in real time (while driving the car) at regular intervals using an Android App. In the data capture model, the Android App helps to easily tag (label) the captured images for model training purposes. In the classifier mode, the Android app uses the developed deep learning model to classify the captured image and then transmits the medium, bad or unpaved road condition to the central server along with longitude and latitude information to update the centralized map of the city (or municipality). The proposed approach provides an accuracy of 98.6 % to classify the road condition based on images captured during real time driving of the vehicle.'

vector = nlp(preprocess(title+abstract)).vector
get_classes(ovr.predict([vector])[0])

'cs.CV cs.CY '

In [59]:
title = 'Knee Osteoarthritis Detection and Severity Classification Using Residual Neural Networks on Preprocessed X-ray Images'
abstract = 'One of the most common and challenging medical conditions to deal with in old-aged people is the occurrence of knee osteoarthritis (KOA). Manual diagnosis of this disease involves observing X-ray images of the knee area and classifying it under five grades using the Kellgren–Lawrence (KL) system. This requires the physician’s expertise, suitable experience, and a lot of time, and even after that the diagnosis can be prone to errors. Therefore, researchers in the ML/DL domain have employed the capabilities of deep neural network (DNN) models to identify and classify KOA images in an automated, faster, and accurate manner. To this end, we propose the application of six pretrained DNN models, namely, VGG16, VGG19, ResNet101, MobileNetV2, InceptionResNetV2, and DenseNet121 for KOA diagnosis using images obtained from the Osteoarthritis Initiative (OAI) dataset. More specifically, we perform two types of classification, namely, a binary classification, which detects the presence or absence of KOA and secondly, classifying the severity of KOA in a three-class classification. For a comparative analysis, we experiment on three datasets (Dataset I, Dataset II, and Dataset III) with five, two, and three classes of KOA images, respectively. We achieved maximum classification accuracies of 69%, 83%, and 89%, respectively, with the ResNet101 DNN model. Our results show an improved performance from the existing work in the literature.'

vector = nlp(preprocess(title+abstract)).vector
get_classes(ovr.predict([vector])[0])

'cs.CV '

In [60]:
title = 'Deep CNN based MR image denoising for tumor segmentation using watershed transform'
abstract = 'Magnetic Resonance Imaging (MRI) is considered one of the most effective imaging techniques used in the medical field for both clinical investigation and diagnosis. This is due to the fact that MRI provides many critical features of the tissue including both physiological and chemical information. Rician noise affects MR images during acquisition thereby reducing the quality of the image and complicating the accurate diagnosis. In this paper, we propose a novel technique for MR image denoising using Deep Convolutional Neural Network (Deep CNN) and anisotropic diffusion (AD) which we will refer to as Deep CNN-AD. Watershed transform is then used to segment the tumorous portion of the denoised image. The proposed method is tested on the BraTS MRI datasets. The proposed denoising method produced better results compared to previous methods. As denoising process affect the segmentation process therefore better denoised images by proposed technique produced more accurate segmentation with an average Specificity of 99.85% and dice coefficient of 90.46% thus indicating better performance of proposed technique.'

vector = nlp(preprocess(title+abstract)).vector
get_classes(ovr.predict([vector])[0])

'cs.CV eess.IV '

In [67]:
pickle.dump(ovr, open(os.path.join(model_dst_path, "ovr_en_core_web_lg.pickle"), "wb"))
ovr = pickle.load(open(os.path.join(model_dst_path, "ovr_en_core_web_lg.pickle"), "rb"))

get_classes(ovr.predict([vector])[0])

'cs.CV eess.IV '