In [1]:
from pymongo import MongoClient
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import string
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import copy

In [131]:
client = MongoClient("mongodb://localhost:27017/")
db = client["Arxiv"]
collection = db["Arxiv Papers"]
summary = [[doc["summary"]] for doc in collection.find()]

In [133]:
original_summary = copy.deepcopy(summary)

In [86]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun

In [87]:
lemmatizer = WordNetLemmatizer()

In [120]:
print(original_summary[0])

['lecture note on optimization for machine learn derive from a course at princeton university and tutorial give in mlss buenos aire as well a simon foundation berkeley']


In [88]:
def preprocessing(summaries):
    summaries[0] = summaries[0].lower()
    summaries[0] = summaries[0].translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(summaries[0])
    pos_tags = pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tags
    ]
    return [" ".join(lemmatized_tokens)]

In [89]:
for i in range(len(summary)):
    summary[i] = preprocessing(summary[i])

In [90]:
train_size = int(0.9 * len(summary))
train_data = summary[:train_size]
test_data = summary[train_size:]

In [91]:
train_fixed = [train_data[index][0] for index in range(len(train_data))]
summary_full = [summary[index][0] for index in range(len(summary))]

In [92]:
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')

In [93]:
tfidf_mat = tfidf_vectorizer.fit_transform(train_fixed)

In [94]:
feature_names = tfidf_vectorizer.get_feature_names_out()
print(tfidf_mat.shape)

(30554, 67363)


In [95]:
new_tfidf = tfidf_vectorizer.transform([test_data[-3][0]])

In [96]:
similarities = cosine_similarity(new_tfidf, tfidf_mat)
most_similar = similarities.argmax()
score = similarities.max()

In [97]:
print(most_similar)
print(test_data[-1][0])
print(train_data[most_similar][0])
print(score)

16988
eeg signal be usually simple to obtain but expensive to label although supervised learning have be widely use in the field of eeg signal analysis it generalization performance be limit by the amount of annotate data selfsupervised learn ssl a a popular learning paradigm in computer vision cv and natural language processing nlp can employ unlabeled data to make up for the data shortage of supervised learning in this paper we propose a selfsupervised contrastive learn method of eeg signal for sleep stage classification during the training process we set up a pretext task for the network in order to match the right transformation pair generate from eeg signal in this way the network improve the representation ability by learn the general feature of eeg signal the robustness of the network also get improve in deal with diverse data that be extract constant feature from change data in detail the network performance depend on the choice of transformation and the amount of unlabeled dat

The user can input multiple things.
1 - doi of the article they want to find (from doi we extract summary, preprocess then serves as input to vector) -- figure out how to query this later
2 - summary of article (preprocess summary then find most similar article)
3 - title if in db (use corresponding summary, preprocess then pass through model)

If the user inputs the doi

In [None]:
BASE_URL = 'http://export.arxiv.org/api/query?search_query=all:Machine%20Learning'

If the user inputs summary

In [None]:
user_input = test_data[-3][0] # Already preprocessed, but simply pass through function

If the user inputs title:
    - if title present in db get summary, preprocess then pass through model
    - if title not present in db then preprocess title and pass through model

In [98]:
tfidf_mat = tfidf_vectorizer.fit_transform(summary_full)
feature_names = tfidf_vectorizer.get_feature_names_out()


Here we get the user input to pass through model

In [99]:
new_tfidf = tfidf_vectorizer.transform([test_data[-3][0]])

In [100]:
similarities = cosine_similarity(new_tfidf, tfidf_mat)
most_similar = similarities.argmax()
score = similarities.max()

[[0.02331928 0.07301803 0.01945461 ... 1.         0.08941017 0.04466947]]


In [101]:
print(similarities[0].argmax())
similarities_np = np.array(similarities[0])
highest = np.argpartition(similarities_np, -5)[-5:]

33946


In [102]:
print(most_similar)
print(test_data[-1][0])
print(summary_full[most_similar])
print(score)

33946
eeg signal be usually simple to obtain but expensive to label although supervised learning have be widely use in the field of eeg signal analysis it generalization performance be limit by the amount of annotate data selfsupervised learn ssl a a popular learning paradigm in computer vision cv and natural language processing nlp can employ unlabeled data to make up for the data shortage of supervised learning in this paper we propose a selfsupervised contrastive learn method of eeg signal for sleep stage classification during the training process we set up a pretext task for the network in order to match the right transformation pair generate from eeg signal in this way the network improve the representation ability by learn the general feature of eeg signal the robustness of the network also get improve in deal with diverse data that be extract constant feature from change data in detail the network performance depend on the choice of transformation and the amount of unlabeled dat

We want to return the title for the objects with the 5 highest scores (this can be changed by the user)

In [103]:
clean_arr = np.array([[high, similarities_np[high]] for high in highest])
clean_arr = clean_arr[clean_arr[:,1].argsort()][::-1]
print(clean_arr)

[[3.39460000e+04 1.00000000e+00]
 [1.69880000e+04 9.05447436e-01]
 [1.30200000e+04 3.13338517e-01]
 [1.27030000e+04 2.94950493e-01]
 [2.19550000e+04 2.91983855e-01]]


In [77]:
print(np.array(summary_full)[highest])

['federate learn fl can be use in mobile edge network to train machine learn model in a distributed manner recently fl have be interpret within a modelagnostic metalearning maml framework which bring fl significant advantage in fast adaptation and convergence over heterogeneous datasets however exist research simply combine maml and fl without explicitly address how much benefit maml bring to fl and how to maximize such benefit over mobile edge network in this paper we quantify the benefit from two aspect optimize fl hyperparameters ie sample data size and the number of communication round and resource allocation ie transmit power in mobile edge network specifically we formulate the mamlbased fl design a an overall learning time minimization problem under the constraint of model accuracy and energy consumption facilitate by the convergence analysis of mamlbased fl we decompose the formulate problem and then solve it use analytical solution and the coordinate descent method with the obt

In [129]:
client = MongoClient("mongodb://localhost:27017/")
db = client["Arxiv"]
collection = db["Arxiv Papers"]

In [106]:
print(original_summary[0])

['lecture note on optimization for machine learn derive from a course at princeton university and tutorial give in mlss buenos aire as well a simon foundation berkeley']


In [134]:
#summary_top_5 = np.array(original_summary)[highest]

#print(summary_top_5[1][0])
query = {"summary" : original_summary[0][0]}
found = collection.find(query)
for doc in found:
    print(doc)

{'_id': ObjectId('679de45665551d082bb28633'), 'doi': None, 'title': 'lecture notes: optimization for machine learning', 'summary': 'lecture notes on optimization for machine learning, derived from a course at princeton university and tutorials given in mlss, buenos aires, as well as simons foundation, berkeley.', 'authors': ['elad hazan'], 'summary_embedded': [[-0.03295045346021652, -0.049884021282196045, 0.02169964089989662, -0.042217276990413666, 0.021081535145640373, -0.007480995263904333, 0.05210074409842491, -0.014372649602591991, -0.062026433646678925, -0.00013597724318969995, -0.042934391647577286, 0.08189061284065247, 0.020059265196323395, -0.04340370371937752, -0.03441626951098442, 0.06136669963598251, 0.010546771809458733, 0.01702974922955036, -0.031569842249155045, -0.09214671701192856, 0.0743427574634552, 0.007017235271632671, 0.0003597683389671147, -0.014236473478376865, 0.04310692101716995, -0.04300112649798393, 0.02624274604022503, 0.05065550655126572, 0.0194369498640298

In [126]:
print(original_summary[0][0])

lecture note on optimization for machine learn derive from a course at princeton university and tutorial give in mlss buenos aire as well a simon foundation berkeley


This is what we return to the user, namely, the title of the respective articles

In [137]:
summary_top_5 = np.array(original_summary)[highest]
for summarized in summary_top_5:
    query = {"summary" : summarized[0]}
    found = collection.find(query)
    for document in found:
        print(document['title'], "\n")

automated federated learning in mobile edge networks -- fast adaptation
  and convergence 

continual local training for better initialization of federated models 

cost-effective federated learning in mobile edge networks 

scheduling algorithms for federated learning with minimal energy
  consumption 

cost-effective federated learning design 

