# **Scientific journal recommender for submitting a publication**

Parser:  
https://it.wikipedia.org/wiki/BibTeX  

In [1]:
import string

folder = "datasets/"

# Dataset

For each class (journal) there is a file in BibTeX format containing the articles published in that journal. Each file was cleaned and formatted with the following online tool [BibTeX Tidy](https://flamingtempura.github.io/bibtex-tidy/index.html).

Each article is represented by a record with the following fields:
* **abstract**: Abstract of the article.
* **author**: Author of the article.
* **ENTRYTYPE**: Type of entry (article, book, inproceedings, etc.).
* **doi**: Digital Object Identifier of the article.
* **ID**: Unique identifier of the article.
* **issn**: International Standard Serial Number of the journal in which the article was published.
* **journal**: Journal in which the article was published.
* **keywords**: Keywords of the article.
* **note**: Additional information about the article.
* **pages**: Pages of the article.
* **title**: Title of the article.
* **url**: URL of the article.
* **volume**: Volume of the journal in which the article was published.
* **year**: Year of publication of the article.

The goal is to create a model that is able to predict the **journal** in which it will be published.

In [2]:
import os
import bibtexparser
import pandas as pd

# 1, 3, 4, 5
def read_bib_to_dataframe(file_path):
    #with open(file_path, 'r', encoding='utf-8') as bibtex_file:
    with open(file_path, 'r', encoding='latin-1') as bibtex_file:
        return bibtexparser.load(bibtex_file)

for filename in os.listdir(folder):
    if filename.endswith(".bib"):
        filename_path = os.path.join(folder, filename)
        bib_data = read_bib_to_dataframe(filename_path)
        if bib_data.entries:
            df = pd.DataFrame(bib_data.entries)
            df.to_csv(os.path.splitext(filename_path)[0] + '.csv', index=False)
        else:
            print("Error: ", filename, " is empty")

In [3]:
import pandas as pd
import os
import gc

dfs = []
for filename in os.listdir(folder):
    if filename.endswith(".csv"):
        dfs.append(pd.read_csv(os.path.join(folder, filename)))

df = pd.concat(dfs, ignore_index=True)
df.to_csv(folder + 'all.csv', index=False)

for tmp_df in dfs:
    tmp_df = None
dfs = None

gc.collect()
df.head()

  dfs.append(pd.read_csv(os.path.join(folder, filename)))


Unnamed: 0,abstract,keywords,author,url,doi,issn,year,pages,volume,journal,title,ENTRYTYPE,ID,note,combined_text,journal_num
0,This study proposed to investigate the thermal...,"Virtual reality headsets, Thermal comfort, Mic...",Zihao Wang and Renke He and Ke Chen,https://www.sciencedirect.com/science/article/...,https://doi.org/10.1016/j.apergo.2020.103066,0003-6870,2020.0,103066,85.0,Applied Ergonomics,Thermal comfort and virtual reality headsets,article,WANG2020103066,,,
1,A one-pedal system for operating an electric v...,"One-pedal operation, Electric vehicle, Emotion...",Fumie Sugimoto and Motohiro Kimura and Yuji Ta...,https://www.sciencedirect.com/science/article/...,https://doi.org/10.1016/j.apergo.2020.103179,0003-6870,2020.0,103179,88.0,Applied Ergonomics,Effects of one-pedal automobile operation on t...,article,SUGIMOTO2020103179,,,
2,Surgery has changed significantly in recent ye...,"Human reliability analysis (HRA), HEART, Dynam...",Rossella Onofrio and Paolo Trucco,https://www.sciencedirect.com/science/article/...,https://doi.org/10.1016/j.apergo.2020.103150,0003-6870,2020.0,103150,88.0,Applied Ergonomics,A methodology for Dynamic Human Reliability An...,article,ONOFRIO2020103150,,,
3,Truck platoon driving is a current branch of a...,"Truck platoon driving, Technology acceptance, ...",Sarah-Maria Castritius and Heiko Hecht and Joh...,https://www.sciencedirect.com/science/article/...,https://doi.org/10.1016/j.apergo.2019.103042,0003-6870,2020.0,103042,85.0,Applied Ergonomics,Acceptance of truck platooning by professional...,article,CASTRITIUS2020103042,,,
4,This paper presents a new handle for instrumen...,"Laparoscopic surgery, Handle design, Biomechanics",Ramon Sancibrian and Carlos Redondo-Figuero an...,https://www.sciencedirect.com/science/article/...,https://doi.org/10.1016/j.apergo.2020.103210,0003-6870,2020.0,103210,89.0,Applied Ergonomics,Ergonomic evaluation and performance of a new ...,article,SANCIBRIAN2020103210,,,


# Feature Selection

The following features are selected:
* **abstract**: Abstract of the article.
* **keywords**: Keywords of the article.
* **title**: Title of the article.

The target feature is:
* **journal**: Journal in which the article was published.

In [4]:
# Removing unnecessary columns
import pandas as pd
df = pd.read_csv(folder + 'all.csv')

feature_names = ['abstract', 'keywords', 'title']
target_name = 'journal'

print(df.shape)
df = df[feature_names + [target_name]]
print('\n',df.shape,'\n')
df.info()

df = df.dropna()
df.info()
df.head()

  df = pd.read_csv(folder + 'all.csv')


(661201, 16)

 (661201, 4) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 661201 entries, 0 to 661200
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   abstract  444022 non-null  object
 1   keywords  443826 non-null  object
 2   title     444311 non-null  object
 3   journal   444311 non-null  object
dtypes: object(4)
memory usage: 20.2+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 443820 entries, 0 to 444310
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   abstract  443820 non-null  object
 1   keywords  443820 non-null  object
 2   title     443820 non-null  object
 3   journal   443820 non-null  object
dtypes: object(4)
memory usage: 16.9+ MB


Unnamed: 0,abstract,keywords,title,journal
0,This study proposed to investigate the thermal...,"Virtual reality headsets, Thermal comfort, Mic...",Thermal comfort and virtual reality headsets,Applied Ergonomics
1,A one-pedal system for operating an electric v...,"One-pedal operation, Electric vehicle, Emotion...",Effects of one-pedal automobile operation on t...,Applied Ergonomics
2,Surgery has changed significantly in recent ye...,"Human reliability analysis (HRA), HEART, Dynam...",A methodology for Dynamic Human Reliability An...,Applied Ergonomics
3,Truck platoon driving is a current branch of a...,"Truck platoon driving, Technology acceptance, ...",Acceptance of truck platooning by professional...,Applied Ergonomics
4,This paper presents a new handle for instrumen...,"Laparoscopic surgery, Handle design, Biomechanics",Ergonomic evaluation and performance of a new ...,Applied Ergonomics


In [5]:
# Cleaning data

import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

language = 'english'
# Convert to lowercase
df[feature_names] = df[feature_names].applymap(lambda x: str(x).lower())
# Remove stopwords
stopwords_list = stopwords.words(language)
df[feature_names] = df[feature_names].apply(lambda x: x.apply(lambda words: ' '.join([w for w in words.split() if w not in stopwords_list])))
# Remove punctuation
df[feature_names] = df[feature_names].apply(lambda x: x.str.translate(str.maketrans('', '', string.punctuation)))
# Stemming
stemmer = nltk.stem.SnowballStemmer(language=language)
df[feature_names] = df[feature_names].apply(lambda x: x.apply(lambda words: ' '.join([stemmer.stem(w) for w in words.split()])))
# Tokenize
df[feature_names] = df[feature_names].apply(lambda x: x.apply(nltk.word_tokenize))

#df[feature_names] = df[feature_names].apply(lambda x: x.apply(lambda words: ' '.join(words)))
feature_name = 'combined_text'
df[feature_name] = df[feature_names[0]]
for i in range(1, len(feature_names)):
    df[feature_name] = df[feature_name] + df[feature_names[i]]

#add journal_num column with numeric integer to process
labels = df[target_name].unique()
labels_value = list(labels)
# Ordina la lista in ordine alfabetico e assegna i rank
ranks_ordinati = [rank for rank, _ in sorted(enumerate(labels_value, start=0), key=lambda x: x[1])] # Estrai solo i rank ordinati
target_name_num = target_name + '_num'
df[target_name_num] = df[target_name].map(dict(zip(labels_value, ranks_ordinati)))

df.head()
df.to_csv(folder + 'all_cleaned.csv', index=False)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [12]:
from sklearn.model_selection import train_test_split
import pandas as pd

gc.collect()

folder = "datasets/"
feature_name = 'combined_text'
target_name = 'journal_num'

df = pd.read_csv(folder + 'all_cleaned.csv')
#create new df with df[feature_name] and df[target_name_num]
#Selecting only feature_name and target_name_num
df = df[[feature_name, target_name_num]]

df.to_csv(folder + 'all_cleaned_' + feature_name + '.csv', index=False)
df_train, df_test = train_test_split(df, train_size=0.8)
df = None

#Clean memory
import gc
gc.collect()

0

## **Feature Extraction**

* **Objective** = Convert unstructured text into a numerical structure.
* **Result** = Feature vector valid for use by modeling algorithms (classification, clustering, etc.).
* The most commonly used approach is the concept of **"Bag of Words"** or **BoW**.
  – The sequence in which words appear and their positions in the document are not taken into account.

Create a **Bag of Words** using the **CountVectorizer** class from **scikit-learn**.

**Training** and **Transformation**: Apply `fit_transform()` to the collection of documents to train the vectorizer and transform the text into a term-document matrix. The result `X` is a sparse matrix in CSR (Compressed Sparse Row) format.

**Term-Document Matrix**: The term-document matrix can be visualized using `X.toarray()`, which converts it into a two-dimensional NumPy array based on word **frequency**.

**Vocabulary**: Obtain the vocabulary (the sorted list of words) using `vectorizer.get_feature_names_out()`.

[CountVectorizer Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer)


In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

bow_Count = CountVectorizer(max_features=1000)
bow_Tfidf = TfidfVectorizer(max_features=1000)

X_train_Count = bow_Count.fit_transform(df_train[feature_name]).toarray()
X_train_Tfidf = bow_Count.fit_transform(df_train[feature_name]).toarray()
y_train = df_train[target_name_num]

X_test_Count = bow_Count.transform(df_test[feature_name]).toarray()
X_test_Tfidf = bow_Count.transform(df_test[feature_name]).toarray()
y_test = df_test[target_name_num]

MemoryError: Unable to allocate 2.65 GiB for an array with shape (355056, 1000) and data type int64

## Clasificación

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier

def plot_class_distribution(y):
    plt.hist(y)
    plt.title("Number of instances per class")
    plt.xlabel("Class")
    plt.ylabel("Number of instances")
    plt.xticks(ranks_ordinati, labels_value)
    plt.grid(True)
    plt.show()

def performance(y_test, y_pred):
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    cm = cm.astype('float')/cm.sum(axis=1)[:,np.newaxis]

    sns.heatmap(cm, annot=True, fmt='.5%', cmap='Blues', yticklabels=labels, xticklabels=labels, cbar=False)
    plt.show()

plot_class_distribution(y_train)

# BoW ConuntVectorizer
cls = RandomForestClassifier()
cls.fit(X_train_Count, y_train)
y_pred = cls.predict(X_test_Count)

performance(y_test, y_pred)

# BoW TfidfVectorizer
cls = RandomForestClassifier()
cls.fit(X_train_Tfidf, y_train)
y_pred = cls.predict(X_test_Tfidf)

performance(y_test, y_pred)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
import gc

sampler = RandomUnderSampler(random_state=42)

# BoW ConuntVectorizer
X_train_under_sampled, y_train_under_sampled = sampler.fit_resample(X_train_Count, y_train)
plot_class_distribution(y_train_under_sampled)

cls = RandomForestClassifier()
cls.fit(X_train_Count, y_train)
y_pred = cls.predict(X_test_Count)

performance(y_test, y_pred)

# BoW TfidfVectorizer
X_train_under_sampled, y_train_under_sampled = sampler.fit_resample(X_train_Tfidf, y_train)
plot_class_distribution(y_train_under_sampled)

cls = RandomForestClassifier()
cls.fit(X_train_Tfidf, y_train)
y_pred = cls.predict(X_test_Tfidf)

performance(y_test, y_pred)

#Clean memory
X_train_under_sampled = None
y_train_under_sampled = None

gc.collect()

In [None]:
from imblearn.over_sampling import RandomOverSampler
import gc

sampler = RandomOverSampler(random_state=42)

# BoW ConuntVectorizer
X_train_over_sampled, y_train_over_sampled = sampler.fit_resample(X_train_Count, y_train)
plot_class_distribution(y_train_over_sampled)

cls = RandomForestClassifier()
cls.fit(X_train_over_sampled, y_train_over_sampled)
y_pred = cls.predict(X_test_Count)

performance(y_test, y_pred)

# BoW TfidfVectorizer
X_train_over_sampled, y_train_over_sampled = sampler.fit_resample(X_train_Tfidf, y_train)
plot_class_distribution(y_train_over_sampled)

cls = RandomForestClassifier()
cls.fit(X_train_over_sampled, y_train_over_sampled)
y_pred = cls.predict(X_test_Tfidf)

performance(y_test, y_pred)

X_train_over_sampled = None
y_train_over_sampled = None
sampler = None
gc.collect()


## Connectionist techniques

In this case, after pre-processing, a neural network based on an LSTM unit is trained.

In [None]:
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
device = tf.config.list_physical_devices('GPU')
print('Num. tarjetas: ', len(device))

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
import pandas as pd

df = pd.read_csv(folder + 'all_cleaned_' + feature_name + '.csv')

X_train, X_test, y_train, y_test = train_test_split(df[feature_name], df[target_name_num], test_size=0.2, random_state=42)

print(X_train.shape)
print(y_test.shape)

In [None]:
from keras.utils import to_categorical

num_words = 1000
sequence_length = 100

tokenizer = Tokenizer(num_words = num_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = sequence.pad_sequences(X_train_seq, maxlen=sequence_length)

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad= sequence.pad_sequences(X_test_seq, maxlen=sequence_length)


num_classes = len(labels)
# Convert your integer labels to one-hot encoded format
y_train_one_hot = to_categorical(y_train, num_classes=num_classes)
y_test_one_hot = to_categorical(y_test, num_classes=num_classes)


from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Define the model
model = Sequential()

# Add an Embedding layer with input_dim=num_words, output_dim=embedding_dim, input_length=sequence_length
embedding_dim = 50  # You can adjust this value based on your specific task
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=sequence_length))

# Add an LSTM layer with a certain number of units (you can experiment with different values)
lstm_units = 100
model.add(LSTM(units=lstm_units, dropout=0.2, recurrent_dropout=0.2))

model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

callback = EarlyStopping(monitor='val_loss', patience=2)
model.fit(X_train_pad, y_train_one_hot, batch_size=32, epochs=5, validation_data=(X_test_pad, y_test_one_hot), callbacks=[callback])