In [None]:
!pip install bertopic
!pip uninstall scipy
!pip install scipy
!python -m nltk.downloader stopwords

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Found existing installation: scipy 1.9.3
Uninstalling scipy-1.9.3:
  Would remove:
    /usr/local/lib/python3.8/dist-packages/scipy-1.9.3.dist-info/*
    /usr/local/lib/python3.8/dist-packages/scipy.libs/libgfortran-040039e1.so.5.0.0
    /usr/local/lib/python3.8/dist-packages/scipy.libs/libopenblasp-r0-41284840.3.18.so
    /usr/local/lib/python3.8/dist-packages/scipy.libs/libquadmath-96973f99.so.0.0.0
    /usr/local/lib/python3.8/dist-packages/scipy/*
Proceed (y/n)? y
  Successfully uninstalled scipy-1.9.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scipy
  Using cached scipy-1.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.8 MB)
Installing collected packages: scipy
Successfully installed scipy-1.9.3
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already 

In [12]:
import pandas as pd
import numpy as np
import math
import re
import spacy 
import random
import pickle
from bertopic import BERTopic
from collections import defaultdict, Counter
from tqdm import tqdm
from nltk.corpus import stopwords

In [26]:
df = pd.read_csv("all_data/data_final.csv")

In [27]:
df = df[df.author_ids.apply(lambda x: not math.isnan(x) if not isinstance(x, str) else True)]

In [28]:
df.shape

(757871, 27)

In [29]:
df.year.unique()

array([2017., 2016., 2018., 2019., 2020., 2021.])

In [30]:
# split
train, test = df[(df.year <= 2018)], df[(df.year > 2018) & (df.year < 2022)]
train.shape[0] + test.shape[0] == df.shape[0]

True

In [31]:
print("Процент наблюдений в тестовой выборке: ")
test.shape[0] / (train.shape[0] + test.shape[0]) * 100

Процент наблюдений в тестовой выборке: 


13.68346328069025

In [32]:
def get_dct_of_links(df):

    authors = []
    for indx, item in enumerate(df.author_ids.values):
        authors.append(list(map(lambda x: x.strip(), item.split(";"))))

    for lst in authors:
        counter = Counter(lst)
        for i in range(counter[""]):
            lst.remove("")
        
    dct_of_links = defaultdict(set)
    for lst in tqdm(authors):
        if len(lst) != 1:
            for i in range(len(lst)):
                curr_lst = lst.copy()
                curr_lst.remove(lst[i])
                dct_of_links[lst[i]].update(curr_lst)
    
    return dct_of_links

In [33]:
# словари связей(автор: соавторы)
train_dct_of_links, test_dct_of_links = get_dct_of_links(train), get_dct_of_links(test)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 654168/654168 [00:02<00:00, 310251.31it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 103703/103703 [00:00<00:00, 464655.24it/s]


In [34]:
def print_top(dct, top):
    out = []
    for key, value in dct.items():
        out.append(len(value))
    print(sorted(out)[-top:])
    
# топ авторов по количеству соавторов
print("train: ")
print_top(train_dct_of_links, 20)
print("test: ")
print_top(test_dct_of_links, 20)

train: 
[267, 275, 279, 279, 285, 289, 291, 293, 294, 301, 303, 309, 314, 320, 324, 351, 359, 363, 384, 386]
test: 
[80, 80, 81, 81, 81, 85, 88, 93, 96, 98, 98, 99, 101, 101, 103, 104, 109, 110, 113, 140]


In [35]:
df.columns

Index(['references', 'year', 'lang', 'volume', 'page_start', 'doi', 'title',
       'issue', 'isbn', 'authors', 'abstract', 'pdf', 'issn', 'venue', 'fos',
       'n_citation', 'page_end', 'keywords', 'url', 'author_ids',
       'author_names', 'venue_id', 'venue_name', 'id', 'labels',
       'used_in_bert_training', 'topics'],
      dtype='object')

In [36]:
klusters_top = defaultdict(list)
for item, label in tqdm(zip(df.author_ids.values, df.labels.values)):
    for author in list(map(lambda x: x.strip(), item.split(";"))):
        if author != "":
            flag = False
            for lst in klusters_top[label]:
                if lst[0] == author:
                    lst[1] += 1
                    flag = True
                    break
            if flag is False:
                klusters_top[label].append([author, 1])

757871it [4:39:55, 45.12it/s] 


In [37]:
for key, value in tqdm(klusters_top.items()):
    value.sort(key=lambda x: -x[1])
    klusters_top[key] = np.array(klusters_top[key])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2522/2522 [00:01<00:00, 2118.09it/s]


In [38]:
with open("klusters_top.pkl", "wb") as file:
    pickle.dump(klusters_top, file)

In [None]:
class TopicModeling:

    def __init__(self, path_to_model: str):
        self.bert_model = BERTopic.load(path_to_model)
        self.topic_dict = dict(zip(self.bert_model.get_topic_info()['Topic'], 
                                   self.bert_model.get_topic_info()['Name']))
    
    def text_preprocessing(self, text: str):
        regex = re.compile('[A-Za-z]+')
        nlp = spacy.load('en_core_web_sm')
        mystopwords = stopwords.words('english') + ['paper', 'result', 'experiment', 'from', 'subject', 
                                                're', 'edu', 'use', 'data', 'method', 'based', 
                                                'new', 'approach', 'also','system', 'model', 
                                                'present', 'research', 'propose', 'base']
        
        text = ' '.join(regex.findall(text))
        doc = nlp(text)
        text = ' '.join([token.lemma_ for token in doc])
        text = ' '.join([token for token in text.split() if not token in mystopwords])
        
        return text
    
    def score_text(self, text: str):
        text = self.text_preprocessing(text)
        topic, prob = self.bert_model.transform(text)
        return self.topic_dict[topic[0]]

In [None]:
class Recommendation_system:
    
    def __init__(self, model_path, train_dct_of_links, klusters_top):
        self.train_dct_of_links = train_dct_of_links
        self.klusters_top = klusters_top
        self.model = model_path
        # with open(model_path, "rb") as f:
        #     self.model = pickle.load(f)
        self.model = model_path
        self.top_authors = self.get_top_authors()

    
    def get_top_authors(self, top=1000):
        authors_by_collaborators = [(author, len(collaborators)) for author, collaborators in self.train_dct_of_links.items()]
        authors_by_collaborators.sort(key=lambda x: x[1])

        out = [None] * top
        for i in range(top):
            out[i] = authors_by_collaborators[i][0]
        
        return out
    
    @staticmethod
    def delete_elements_from_set(st, count):
        for _ in range(count):
            st.pop()
    

    def add_recommendations_from_top(self, out, top):
        indx = 0
        while len(out) < top:
            out.add(self.top_authors[indx])
            indx += 1

    def get_recommendations_on_articles(self, top, lst_of_articles):
        out = set()
        labels = [int(self.model.score_text(article).split("_")[0]) for article in lst_of_articles]
        labels = list(filter(lambda x: x in self.klusters_top, labels))
        
        if len(labels) != 0:
            klusters_counter = Counter(labels).most_common()
            klusters_count_for_recommendation = min(len(klusters_counter), 3)
            articles_per_cluster = 1 if (top // klusters_count_for_recommendation) == 0 else (top // klusters_count_for_recommendation)


            
            for i in range(klusters_count_for_recommendation):
                curr_kluster = klusters_counter[i][0]
                out.update(self.klusters_top[curr_kluster][:articles_per_cluster, 0])
                if len(out) >= top:
                    self.delete_elements_from_set(st=out, 
                                                count=len(out) - top)
                    return out
                
            
            for i in range(klusters_count_for_recommendation):
                curr_kluster = klusters_counter[i][0]
                out.update(self.klusters_top[curr_kluster][articles_per_cluster:(articles_per_cluster * 2), 0])
                if len(out) >= top:
                    self.delete_elements_from_set(st=out, 
                                                count=len(out) - top)
                    return out
            

            for i in range(klusters_count_for_recommendation, len(klusters_counter)):
                curr_kluster = klusters_counter[i][0]
                out.update(self.klusters_top[curr_kluster][articles_per_cluster:(articles_per_cluster * 2), 0])
                if len(out) >= top:
                    self.delete_elements_from_set(st=out, 
                                                count=len(out) - top)
                    return out

        self.add_recommendations_from_top(out, top)
        return out

    
    def get_recommendation(self, top=10, author_id=None, lst_of_articles=None):
        if author_id is None:
            if lst_of_articles is None:
                return self.top_authors[-top:]

            out = self.get_recommendations_on_articles(top=top,
                                                       lst_of_articles=lst_of_articles)
            
            return out

        elif author_id in self.train_dct_of_links:
            all_recommendation = set()
            first_layer = self.train_dct_of_links[author_id]
            
            for first_layer_item in first_layer:
                for second_layer_item in self.train_dct_of_links[first_layer_item]:
                    all_recommendation.add(second_layer_item)
                    # если скорость работы будет позволять, можно добавлять всех, а потом, храня словарь 
                    # с количеством соавторов для каждого автора, добавлять их по популярности
                    if len(all_recommendation) == top:
                        return all_recommendation

            self.add_recommendations_from_top(all_recommendation, top)
            return all_recommendation

        elif author_id not in self.train_dct_of_links:
            if lst_of_articles is None:
                return self.top_authors[-top:]
            
            out = self.get_recommendations_on_articles(top=top,
                                                       lst_of_articles=lst_of_articles)
            
            return out

In [None]:
# MODEL_PATH = "drive/MyDrive/bert_model"
# model = BERTopic.load(MODEL_PATH)

In [None]:
articles = ["""k-NN is a type of classification where the function is only approximated locally and all computation is deferred until function evaluation. Since this algorithm relies on distance for classification, if the features represent different physical units or come in vastly different scales then normalizing the training data can improve its accuracy dramatically.[3][4]
Both for classification and regression, a useful technique can be to assign weights to the contributions of the neighbors, so that the nearer neighbors contribute more to the average than the more distant ones. For example, a common weighting scheme consists in giving each neighbor a weight of 1/d, where d is the distance to the neighbor.[5]
The neighbors are taken from a set of objects for which the class (for k-NN classification) or the object property value (for k-NN regression) is known. This can be thought of as the training set for the algorithm, though no explicit training step is required.
A peculiarity of the k-NN algorithm is that it is sensitive to the local structure of the data.""",
"""For specific learning algorithms, it is possible to compute the gradient with respect to hyperparameters and then optimize the hyperparameters using gradient descent. The first usage of these techniques was focused on neural networks.[11] Since then, these methods have been extended to other models such as support vector machines[12] or logistic regression.[13]
A different approach in order to obtain a gradient with respect to hyperparameters consists in differentiating the steps of an iterative optimization algorithm using automatic differentiation.[14][15][16][17] A more recent work along this direction uses the implicit function theorem to calculate hypergradients and proposes a stable approximation of the inverse Hessian. The method scales to millions of hyperparameters and requires constant memory.
In a different approach,[18] a hypernetwork is trained to approximate the best response function. One of the advantages of this method is that it can handle discrete hyperparameters as well. Self-tuning networks[19] offer a memory efficient version of this approach by choosing a compact representation for the hypernetwork. More recently, Δ-STN[20] has improved this method further by a slight reparameterization of the hypernetwork which speeds up training. Δ-STN also yields a better approximation of the best-response Jacobian by linearizing the network in the weights, hence removing unnecessary nonlinear effects of large changes in the weights.
Apart from hypernetwork approaches, gradient-based methods can be used to optimize discrete hyperparameters also by adopting a continuous relaxation of the parameters.[21] Such methods have been extensively used for the optimization of architecture hyperparameters in neural architecture search.
""",
"""Spectral methods and finite element methods are closely related and built on the same ideas; the main difference between them is that spectral methods use basis functions that are generally nonzero over the whole domain, while finite element methods use basis functions that are nonzero only on small subdomains (compact support). Consequently, spectral methods connect variables globally while finite elements do so locally. Partially for this reason, spectral methods have excellent error properties, with the so-called "exponential convergence" being the fastest possible, when the solution is smooth. However, there are no known three-dimensional single domain spectral shock capturing results (shock waves are not smooth).[1] In the finite element community, a method where the degree of the elements is very high or increases as the grid parameter h increases is sometimes called a spectral element method.
Spectral methods can be used to solve differential equations (PDEs, ODEs, eigenvalue, etc) and optimization problems. When applying spectral methods to time-dependent PDEs, the solution is typically written as a sum of basis functions with time-dependent coefficients; substituting this in the PDE yields a system of ODEs in the coefficients which can be solved using any numerical method for ODEs. Eigenvalue problems for ODEs are similarly converted to matrix eigenvalue problems[citation needed].
Spectral methods were developed in a long series of papers by Steven Orszag starting in 1969 including, but not limited to, Fourier series methods for periodic geometry problems, polynomial spectral methods for finite and unbounded geometry problems, pseudospectral methods for highly nonlinear problems, and spectral iteration methods for fast solution of steady-state problems. The implementation of the spectral method is normally accomplished either with collocation or a Galerkin or a Tau approach . For very small problems, the spectral method is unique that solutions may be written out symbolically, yielding a practical alternative to series solutions for differential equations.
Spectral methods can be computationally less expensive and easier to implement than finite element methods; they shine best when high accuracy is sought in simple domains with smooth solutions. However, because of their global nature, the matrices associated with step computation are dense and computational efficiency will quickly suffer when there are many degrees of freedom (with some exceptions, for example if matrix applications can be written as Fourier transforms). For larger problems and nonsmooth solutions, finite elements will generally work better due to sparse matrices and better modelling of discontinuities and sharp bends.
""",
"""Calculus, originally called infinitesimal calculus or "the calculus of infinitesimals", is the mathematical study of continuous change, in the same way that geometry is the study of shape, and algebra is the study of generalizations of arithmetic operations.
It has two major branches, differential calculus and integral calculus; the former concerns instantaneous rates of change, and the slopes of curves, while the latter concerns accumulation of quantities, and areas under or between curves. These two branches are related to each other by the fundamental theorem of calculus, and they make use of the fundamental notions of convergence of infinite sequences and infinite series to a well-defined limit.[1]
Infinitesimal calculus was developed independently in the late 17th century by Isaac Newton and Gottfried Wilhelm Leibniz.[2][3] Later work, including codifying the idea of limits, put these developments on a more solid conceptual footing. Today, calculus has widespread uses in science, engineering, and social science.[4]
"""]

In [None]:
model_path="drive/MyDrive/topic_modeling_pipeline.pkl"
with open(model_path, "rb") as f:
    model = pickle.load(f)

In [None]:
%%time

RecSys = Recommendation_system(train_dct_of_links=train_dct_of_links,
                               model_path=model,
                               klusters_top=klusters_top)

for author_id in tqdm(test_dct_of_links):
    value = random.randint(1, 100)
    recommendations = RecSys.get_recommendation(top=value,
                                                author_id=None,
                                                lst_of_articles=list())
    if len(recommendations) != value:
        print(author_id, len(recommendations), value)
        break

100%|██████████| 125497/125497 [00:01<00:00, 79876.24it/s]

CPU times: user 1.79 s, sys: 8.79 ms, total: 1.8 s
Wall time: 1.83 s



