In [1]:
from pymongo import MongoClient
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import string
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Connect to the local MongoDB server and select the collection of papers.
client = MongoClient("mongodb://localhost:27017/")
db = client["Arxiv"]
collection = db["Arxiv Papers"]

# Match only papers whose "doi" field is present and not null.
query = {"doi": {"$exists": True, "$ne": None}}

# Return just the DOI and summary fields, without Mongo's internal _id.
projection = {"doi": 1, "summary": 1, "_id": 0}
documents = collection.find(query, projection)

# Each DOI is kept in its own single-element list.
doi_list = [
    [record["doi"]]
    for record in documents
]

In [3]:
# Read DOI and summary from ONE result set. Issuing a separate unsorted
# find() for each list gives no guarantee that both cursors return the
# documents in the same order, so doi_list[i] and summary[i] could end up
# describing different papers.
documents = collection.find(query, {"doi": 1, "summary": 1, "_id": 0})

# Materialise the cursor once and drop records that have no text summary;
# preprocessing() cannot handle a missing/None summary.
records = [doc for doc in documents if isinstance(doc.get("summary"), str)]

# Rebuild doi_list from the same records so both lists are index-aligned.
# Both keep the existing shape: a list of single-element lists.
doi_list = [[doc["doi"]] for doc in records]
summary = [[doc["summary"]] for doc in records]

In [4]:
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    Tags beginning with 'J', 'V', 'N' or 'R' map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [5]:
# Single WordNet lemmatizer instance, reused by preprocessing() for every token.
lemmatizer = WordNetLemmatizer()

In [6]:
def preprocessing(summaries):
    """Normalise and lemmatise the text stored in ``summaries[0]``.

    The text is lower-cased, stripped of every character listed in
    ``string.punctuation``, tokenised with ``word_tokenize``, POS-tagged with
    ``pos_tag``, and each token is lemmatised using the WordNet POS returned
    by ``get_wordnet_pos``.

    Args:
        summaries: Sequence whose first element is the raw summary string.
            It is only read; the caller's object is never modified.

    Returns:
        A single-element list containing the lemmatised tokens joined by
        single spaces.
    """
    # Work on a local string rather than writing back into summaries[0], so
    # the caller's data is left untouched and immutable sequences are accepted.
    text = summaries[0].lower()
    text = text.translate(str.maketrans("", "", string.punctuation))

    pos_tags = pos_tag(word_tokenize(text))
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tags
    ]
    return [" ".join(lemmatized_tokens)]

In [7]:
# Convert every raw entry [summary_text] into [doi_list[i], preprocessed_text].
#
# Raw entries hold one element and converted entries hold two. Skipping the
# converted ones lets this cell be re-run after an interruption: feeding an
# already converted entry back into preprocessing() would call .lower() on
# the DOI list and fail.
assert len(doi_list) == len(summary), "doi_list and summary must be index-aligned"

for i, entry in enumerate(summary):
    if len(entry) != 1:
        continue  # already converted on a previous (partial) run
    # One assignment per entry, so an interrupt cannot leave an entry
    # preprocessed but without its DOI.
    summary[i] = [doi_list[i], preprocessing(entry)[0]]

KeyboardInterrupt: 

In [142]:
# Fraction of the records used to fit the TF-IDF model; the rest is held out.
TRAIN_FRACTION = 0.9

# Index of the first held-out record (90% training / 10% test).
train_size = int(TRAIN_FRACTION * len(summary))

# Sequential split: records keep their original order, no shuffling.
train_data = summary[:train_size]
test_data = summary[train_size:]

In [143]:
# Keep only the preprocessed text (position 1) of every training record.
train_fixed = [record[1] for record in train_data]

In [129]:
# TF-IDF vectorizer that lower-cases its input and removes scikit-learn's
# built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

In [144]:
# Fit the vectorizer on the training texts and get their TF-IDF matrix.
tfidf_mat = tfidf_vectorizer.fit_transform(train_fixed)

In [145]:
# Vocabulary terms learned by the fitted vectorizer.
feature_names = tfidf_vectorizer.get_feature_names_out()
# Shape of the training matrix: (number of documents, number of terms).
print(tfidf_mat.shape)

(870, 7930)


Test the TF-IDF model

In [146]:
# Vectorise the text (position 1) of the last held-out record using the
# already fitted vectorizer; transform() expects an iterable of documents,
# hence the single-element list.
new_tfidf = tfidf_vectorizer.transform([test_data[-1][1]])

In [147]:
# Cosine similarity between the query vector and every training document.
similarities = cosine_similarity(new_tfidf, tfidf_mat)

# Flatten the result, then take the position and value of the best match.
flat_scores = similarities.ravel()
most_similar = flat_scores.argmax()
score = flat_scores[most_similar]

In [148]:
# Report the index of the best match, its DOI entry and the similarity score,
# one value per line.
print(most_similar, train_data[most_similar][0], score, sep="\n")

405
['10.3390/info10040150']
0.24018322697324754
