# **Scientific journal recommender for submitting a publication**

In [None]:
import string

# Output folder: the cleaned dataset (selected_cleaned.csv) is written here.
folder = "dataset/"
# Working folder: holds the raw .bib files and the intermediate CSVs derived from them.
folder_raw = "dataset_raw/"


# Dataset

For each class (journal) there is a file in BibTeX format containing the articles published in that journal. Each file was cleaned and formatted with the following online tool [BibTeX Tidy](https://flamingtempura.github.io/bibtex-tidy/index.html).

Each article is represented by a record with the following fields:
* **abstract**: Abstract of the article.
* **author**: Author of the article.
* **ENTRYTYPE**: Type of entry (article, book, inproceedings, etc.).
* **doi**: Digital Object Identifier of the article.
* **ID**: Unique identifier of the article.
* **issn**: International Standard Serial Number of the journal in which the article was published.
* **journal**: Journal in which the article was published.
* **keywords**: Keywords of the article.
* **note**: Additional information about the article.
* **pages**: Pages of the article.
* **title**: Title of the article.
* **url**: URL of the article.
* **volume**: Volume of the journal in which the article was published.
* **year**: Year of publication of the article.

The goal is to create a model that predicts the **journal** in which an article will be published, based on the content of the article.

In [None]:
import os
import bibtexparser
import pandas as pd

def read_bib_to_dataframe(file_path):
    """Parse a BibTeX file and return the parsed database.

    Despite the function name, the return value is whatever
    ``bibtexparser.load`` returns (not a DataFrame); callers read its
    ``entries`` attribute to build a DataFrame.

    The file is decoded as UTF-8 first. latin-1 accepts every byte sequence,
    so using it unconditionally would silently turn UTF-8 multi-byte
    characters into mojibake; it is therefore only used as a fallback for
    files that are not valid UTF-8.

    Args:
        file_path: Path of the ``.bib`` file to read.

    Returns:
        The object produced by ``bibtexparser.load`` for the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as bibtex_file:
            return bibtexparser.load(bibtex_file)
    except UnicodeDecodeError:
        # Not valid UTF-8: fall back to the permissive single-byte encoding.
        with open(file_path, 'r', encoding='latin-1') as bibtex_file:
            return bibtexparser.load(bibtex_file)

# Convert every BibTeX file in folder_raw into a CSV file with the same base name
for filename in os.listdir(folder_raw):
    if not filename.endswith(".bib"):
        continue  # ignore anything that is not a BibTeX file

    filename_path = os.path.join(folder_raw, filename)
    bib_data = read_bib_to_dataframe(filename_path)
    if not bib_data.entries:
        # Nothing was parsed from this file: report it and move on
        print("Error: ", filename, " is empty")
        continue

    # One row per BibTeX entry; the CSV is written next to the source file
    csv_path = os.path.splitext(filename_path)[0] + '.csv'
    df = pd.DataFrame(bib_data.entries)
    df.to_csv(csv_path, index=False)

In [None]:
import pandas as pd
import os

# CSV files that this notebook itself writes into folder_raw. They must not be
# read back as per-journal inputs, otherwise re-running the notebook would
# merge its own previous output into the dataset.
generated_files = {'all.csv', 'selected.csv'}

# Load the per-journal CSV files. The listing is sorted so that the row order
# (and therefore which duplicate counts as the "first" one) is reproducible.
dfs = []
for filename in sorted(os.listdir(folder_raw)):
    if filename.endswith(".csv") and filename not in generated_files:
        dfs.append(pd.read_csv(os.path.join(folder_raw, filename)))

df = pd.concat(dfs, ignore_index=True)

# Use the following id features to remove duplicates
for feature in ['doi', 'ID']:
    if feature not in df.columns:
        continue  # no input file provided this identifier

    # Only rows that actually carry an identifier can be duplicates of each
    # other; rows with a missing or empty identifier are always kept.
    has_value = df[feature].notna() & (df[feature] != '')
    is_repeat = has_value & df[feature].duplicated(keep='first')
    duplicates_count = is_repeat.sum()

    if duplicates_count > 0:
        print(f'Duplicates found using {feature}: {duplicates_count}\n\n')

        df = df[~is_repeat]

# DataFrame.info() prints its report itself and returns None,
# so it must not be wrapped in print().
df.info()
df.to_csv(os.path.join(folder_raw, 'all.csv'), index=False)

# Drop the reference to the per-file frames so their memory can be reclaimed
dfs = None

# Last expression of the cell, so the notebook renders the preview
df.head()

# Feature Selection

The following features are selected:
* **abstract**: Abstract of the article.
* **keywords**: Keywords of the article.
* **title**: Title of the article.

The target feature is:
* **journal**: Journal in which the article was published.

In [None]:
# Removing unnecessary columns
import pandas as pd
df = pd.read_csv(folder_raw + 'all.csv')

feature_names = ['abstract', 'keywords', 'title']
target_name = 'journal'

# Remove all the rows that contain null values in the target_name column
df = df.dropna(subset=[target_name])

# Keep only the selected features and the target. The explicit copy makes the
# result an independent frame, since later steps assign into its columns.
df = df[feature_names + [target_name]].copy()

# DataFrame.info() prints its report itself and returns None,
# so it must not be wrapped in print().
df.info()

df.to_csv(folder_raw + 'selected.csv', index=False)

# Last expression of the cell, so the notebook renders the preview
df.head()

In [None]:
# Cleaning data
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources requested by this cell
for resource in ('omw-1.4', 'wordnet', 'punkt_tab', 'stopwords', 'punkt'):
    nltk.download(resource)

language = 'english'
stopwords_list = stopwords.words(language)
# Set for constant-time membership tests while filtering words
stopword_set = set(stopwords_list)
punctuation_table = str.maketrans('', '', string.punctuation)
stemmer = nltk.stem.SnowballStemmer(language=language)


def clean_text(text):
    """Normalise one text value and return it as a list of tokens.

    The steps are applied in this order: lowercase, drop stopwords, strip
    ASCII punctuation, stem each word, tokenize.

    Args:
        text: The string to clean.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``.
    """
    # Convert to lowercase and remove stopwords
    words = [w for w in text.lower().split() if w not in stopword_set]
    # Remove punctuation
    words = ' '.join(words).translate(punctuation_table).split()
    # Stemming
    stemmed = ' '.join(stemmer.stem(w) for w in words)
    # Tokenize
    return nltk.word_tokenize(stemmed)


# Missing values are replaced by an empty string first: converting NaN with
# str() would otherwise inject the literal token "nan" into the features.
df[feature_names] = (
    df[feature_names]
    .fillna('')
    .astype(str)
    .apply(lambda column: column.map(clean_text))
)

# Make sure the output folder exists before writing into it
os.makedirs(folder, exist_ok=True)
# NOTE(review): each cell holds a list of tokens, which CSV stores as its
# string representation; confirm that downstream readers parse it back.
df.to_csv(folder + 'selected_cleaned.csv', index=False)

# Last expression of the cell, so the notebook renders the preview
df.head()