In [1]:
from pymongo import MongoClient
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import string
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import copy

In [2]:
# Connect to the local MongoDB instance and read the stored arXiv papers.
client = MongoClient("mongodb://localhost:27017/")
db = client["Arxiv"]
collection = db["Arxiv Papers"]
# Only the "summary" field is used, so request just that field instead of
# transferring whole documents. Each summary is wrapped in a one-element list
# because the later cells access entries as entry[0].
summary = [
    [doc["summary"]]
    for doc in collection.find({}, {"summary": 1, "_id": 0})
]

In [3]:
# Snapshot the raw summaries before any preprocessing. A deep copy is used so
# that in-place changes to the inner lists of `summary` cannot alter this copy.
original_summary = copy.deepcopy(summary)

In [4]:
# Persist the unprocessed summaries to disk in NumPy's .npy format.
np.save(file="original.npy", arr=original_summary)

In [5]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    Tags beginning with ``J``, ``V``, ``N`` and ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag falls back to ``wordnet.NOUN``.

    Args:
        treebank_tag: POS tag string (for example one produced by ``pos_tag``).

    Returns:
        The corresponding ``wordnet`` POS constant.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

In [6]:
# One module-level lemmatizer instance, reused by preprocessing() for every token.
lemmatizer = WordNetLemmatizer()

In [7]:
# Translation table that deletes every ASCII punctuation character; built once
# here rather than on each call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocessing(summaries):
    """Normalize the text in ``summaries[0]`` and return it in a one-element list.

    Steps: lowercase, delete ``string.punctuation`` characters, tokenize with
    ``word_tokenize``, POS-tag with ``pos_tag``, then lemmatize every token
    using the WordNet POS returned by ``get_wordnet_pos``.

    Args:
        summaries: Sequence whose first element is the raw summary string.
            The sequence is not modified.

    Returns:
        A new one-element list holding the lemmatized tokens joined by single
        spaces.

    Raises:
        IndexError: If ``summaries`` is empty.
    """
    # Work on a local string so the caller's list is left untouched.
    # NOTE: punctuation is deleted (not replaced by a space) before tokenizing,
    # so hyphenated or apostrophized words are fused, e.g. "state-of-the-art"
    # becomes "stateoftheart".
    text = summaries[0].lower().translate(_PUNCTUATION_TABLE)
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in tagged_tokens
    ]
    return [" ".join(lemmatized_tokens)]

In [8]:
# Replace each raw entry with its normalized form, keeping the same list object.
for position, entry in enumerate(summary):
    summary[position] = preprocessing(entry)

In [9]:
# Persist the preprocessed summaries to disk in NumPy's .npy format.
np.save(file="summary.npy", arr=summary)

# The cell below is an exploratory train/test similarity experiment; it is not required for the final model

In [None]:
# Hold out the last 10% of the preprocessed summaries as test data.
train_size = int(0.9 * len(summary))
train_data = summary[:train_size]
test_data = summary[train_size:]
# Each entry is a one-element list; unwrap to plain strings for the vectorizer.
train_fixed = [entry[0] for entry in train_data]

# Fit TF-IDF on the training summaries only.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
tfidf_mat = tfidf_vectorizer.fit_transform(train_fixed)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Use a single index for both the query and the printout so the text shown is
# the text that was actually scored.
query_index = -3
query_text = test_data[query_index][0]
new_tfidf = tfidf_vectorizer.transform([query_text])

# similarities has one row (the query) and one column per training summary,
# so argmax() is directly an index into train_data.
similarities = cosine_similarity(new_tfidf, tfidf_mat)
most_similar = similarities.argmax()
score = similarities.max()
print(most_similar)
print(query_text)
print(train_data[most_similar][0])
print(score)