In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sqlalchemy import create_engine, text
import pandas as pd
from sklearn.model_selection import train_test_split


from citations_searcher.constants import POSTGRES_URL

# Build the SQLAlchemy engine for the project database. POSTGRES_URL is imported
# from citations_searcher.constants, so no connection string is written here.
engine = create_engine(POSTGRES_URL)

In [5]:
# SQL for the three tables this notebook reads: positive and negative
# (paper, reference) pairs, plus the metadata of the training references.
positive_query = "SELECT * FROM joined_filtered_positive_references"
negative_query = "SELECT * FROM joined_filtered_negative_references"
metadata_query = "SELECT * FROM train_references_metadata"

# Run all three queries over one connection; the context manager releases it
# once the frames have been materialised.
with engine.connect() as conn:
    positive_df, negative_df, metadata_df = [
        pd.read_sql_query(text(query), conn)
        for query in (positive_query, negative_query, metadata_query)
    ]

# Row counts as a quick sanity check, followed by a preview of the positives.
print(len(positive_df), len(negative_df), len(metadata_df))
positive_df.head()

309626 219260 54163


Unnamed: 0,paper_arxiv_id,paper_title,paper_abstract,paper_tldr,reference_arxiv_id,reference_date,reference_title,reference_authors,reference_abstract,reference_category_1,reference_category_2,reference_citation_count,reference_influential_citation_count,reference_tldr
0,2201.02217,nonlocal kernel network (nkn): a stable and re...,neural operators have recently become popular ...,This work proposes a novel nonlocal neural ope...,2004.00361,2020-08-11 15:43:51,eikonet: solving the eikonal equation with dee...,"jonathan d. smith, kamyar azizzadenesheli, zac...",the recent deep learning revolution has create...,physics.comp-ph,cs.lg,94.0,6.0,EikoNet is a deep learning approach to solving...
1,2002.00585,proving the lottery ticket hypothesis: pruning...,the lottery ticket hypothesis (frankle and car...,"An even stronger hypothesis is proved, showing...",1811.03804,2019-05-28 19:01:22,gradient descent finds global minima of deep n...,"simon s. du, jason d. lee, haochuan li, liwei ...",gradient descent finds a global minimum in tra...,cs.lg,cs.ai,1002.0,140.0,The current paper proves gradient descent achi...
2,2003.11755,a survey of deep learning for scientific disco...,"over the past few years, we have seen fundamen...",This survey provides an overview of many widel...,1807.03748,2019-01-22 18:47:12,representation learning with contrastive predi...,"aaron van den oord, yazhe li, oriol vinyals",while supervised learning has enabled great pr...,cs.lg,stat.ml,7563.0,1105.0,This work proposes a universal unsupervised le...
3,2006.15057,a loss function for generative neural networks...,to train variational autoencoders (vaes) to ge...,A loss function based on Watson's perceptual m...,1412.698,2017-01-30 01:27:54,adam: a method for stochastic optimization,"diederik p. kingma, jimmy ba","we introduce adam, an algorithm for first-orde...",cs.lg,,133232.0,21032.0,"This work introduces Adam, an algorithm for fi..."
4,2102.10739,dissecting the diffusion process in linear gra...,graph convolutional networks (gcns) have attra...,This paper dissects the feature propagation st...,1412.698,2017-01-30 01:27:54,adam: a method for stochastic optimization,"diederik p. kingma, jimmy ba","we introduce adam, an algorithm for first-orde...",cs.lg,,133232.0,21032.0,"This work introduces Adam, an algorithm for fi..."


In [6]:
# Label the pairs (1 = positive, 0 = negative) and stack them into one frame.
# `assign` returns labelled copies, so the frames loaded from the database stay
# exactly as the loading cell displayed them and this cell can be re-run safely.
overall_df = pd.concat(
    [positive_df.assign(target=1), negative_df.assign(target=0)],
    ignore_index=True,
)

# Every input row must survive the concatenation.
assert len(overall_df) == len(positive_df) + len(negative_df)

print(len(overall_df))
overall_df.head()

528886


Unnamed: 0,paper_arxiv_id,paper_title,paper_abstract,paper_tldr,reference_arxiv_id,reference_date,reference_title,reference_authors,reference_abstract,reference_category_1,reference_category_2,reference_citation_count,reference_influential_citation_count,reference_tldr,target
0,2201.02217,nonlocal kernel network (nkn): a stable and re...,neural operators have recently become popular ...,This work proposes a novel nonlocal neural ope...,2004.00361,2020-08-11 15:43:51,eikonet: solving the eikonal equation with dee...,"jonathan d. smith, kamyar azizzadenesheli, zac...",the recent deep learning revolution has create...,physics.comp-ph,cs.lg,94.0,6.0,EikoNet is a deep learning approach to solving...,1
1,2002.00585,proving the lottery ticket hypothesis: pruning...,the lottery ticket hypothesis (frankle and car...,"An even stronger hypothesis is proved, showing...",1811.03804,2019-05-28 19:01:22,gradient descent finds global minima of deep n...,"simon s. du, jason d. lee, haochuan li, liwei ...",gradient descent finds a global minimum in tra...,cs.lg,cs.ai,1002.0,140.0,The current paper proves gradient descent achi...,1
2,2003.11755,a survey of deep learning for scientific disco...,"over the past few years, we have seen fundamen...",This survey provides an overview of many widel...,1807.03748,2019-01-22 18:47:12,representation learning with contrastive predi...,"aaron van den oord, yazhe li, oriol vinyals",while supervised learning has enabled great pr...,cs.lg,stat.ml,7563.0,1105.0,This work proposes a universal unsupervised le...,1
3,2006.15057,a loss function for generative neural networks...,to train variational autoencoders (vaes) to ge...,A loss function based on Watson's perceptual m...,1412.698,2017-01-30 01:27:54,adam: a method for stochastic optimization,"diederik p. kingma, jimmy ba","we introduce adam, an algorithm for first-orde...",cs.lg,,133232.0,21032.0,"This work introduces Adam, an algorithm for fi...",1
4,2102.10739,dissecting the diffusion process in linear gra...,graph convolutional networks (gcns) have attra...,This paper dissects the feature propagation st...,1412.698,2017-01-30 01:27:54,adam: a method for stochastic optimization,"diederik p. kingma, jimmy ba","we introduce adam, an algorithm for first-orde...",cs.lg,,133232.0,21032.0,"This work introduces Adam, an algorithm for fi...",1


In [7]:
# A reference may lack one or both categories; represent "missing" as an empty
# string so both columns hold plain strings.
for category_col in ("reference_category_1", "reference_category_2"):
    overall_df.loc[:, category_col] = overall_df[category_col].fillna("")

In [8]:
# Keep only the columns used for modelling: the text of the paper and of the
# reference, the reference's categories and citation counts, and the label.
model_columns = [
    "paper_title",
    "paper_abstract",
    "reference_title",
    "reference_abstract",
    "reference_category_1",
    "reference_category_2",
    "reference_citation_count",
    "reference_influential_citation_count",
    "target",
]
overall_df = overall_df[model_columns]

labels = overall_df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    overall_df.drop(columns=["target"]),  # features: everything except the label
    labels,
    test_size=0.2,    # hold out 20% of the pairs for evaluation
    random_state=42,  # fixed seed so the split is reproducible
    stratify=labels,  # stratify on the label itself
)

In [7]:
# Display the training features; pandas shows only the first and last rows of a
# frame this large.
X_train

Unnamed: 0,paper_title,paper_abstract,reference_title,reference_abstract,reference_category_1,reference_category_2,reference_citation_count,reference_influential_citation_count
164793,residual pathway priors for soft equivariance ...,there is often a trade-off between building de...,mdp homomorphic networks: group symmetries in ...,this paper introduces mdp homomorphic networks...,cs.lg,stat.ml,116.0,9.0
377136,clusterability as an alternative to anchor poi...,"the label noise transition matrix, characteriz...",skip-clip: self-supervised spatiotemporal repr...,deep neural networks require collecting and an...,cs.cv,,15.0,0.0
158387,characterizing attacks on deep reinforcement l...,recent studies show that deep reinforcement le...,playing atari with deep reinforcement learning,we present the first deep learning model to su...,cs.lg,,10374.0,1320.0
69588,scalable algorithms for physics-informed neura...,physics-informed machine learning (piml) has e...,nvidia simnet^{tm}: an ai-accelerated multi-ph...,"we present simnet, an ai-driven multi-physics ...",physics.flu-dyn,cs.lg,116.0,13.0
514587,robust pdf document conversion using recurrent...,the number of published pdf documents has incr...,oxford handbook on ai ethics book chapter on r...,from massive face-recognition-based surveillan...,cs.cy,cs.ai,29.0,0.0
...,...,...,...,...,...,...,...,...
210238,fair learning with private demographic data,sensitive attributes such as race are rarely a...,fairness through awareness,"we study fairness in classification, where ind...",cs.cc,cs.cy,3180.0,410.0
172768,robust and differentially private mean estimation,in statistical learning and analysis from shar...,fast implementation of the tukey depth,tukey depth function is one of the most famous...,stat.co,,15.0,0.0
21909,flambe: structural complexity and representati...,in order to deal with the curse of dimensional...,curiosity-driven exploration by self-supervise...,"in many real-world scenarios, rewards extrinsi...",cs.lg,cs.ai,2058.0,286.0
246269,early-learning regularization prevents memoriz...,we propose a novel framework to perform classi...,iterative learning with open-set noisy labels,large-scale datasets possessing clean label an...,cs.cv,,287.0,29.0


In [9]:
def collapse_infrequent_categories(frame, columns, infrequent, placeholder="other"):
    """Return a copy of `frame` with infrequent category values collapsed.

    Args:
        frame: DataFrame holding the category columns.
        columns: names of the columns to rewrite.
        infrequent: collection of category values to replace.
        placeholder: value written in place of every infrequent category.

    Returns:
        A new DataFrame; `frame` itself is not modified, which avoids
        assigning columns on the frames returned by `train_test_split`.
    """
    replaced = {
        column: frame[column].mask(frame[column].isin(infrequent), placeholder)
        for column in columns
    }
    return frame.assign(**replaced)


category_columns = ["reference_category_1", "reference_category_2"]

# Count category occurrences across both columns of the TRAINING split only, so
# the test split has no influence on which categories are kept.
all_categories = pd.concat([X_train[column] for column in category_columns])
category_counts = all_categories.value_counts()

# Categories seen fewer than `threshold` times are collapsed into 'other'.
# The empty string used for missing categories is counted like any other value.
threshold = 10_000
infrequent_categories = category_counts[category_counts < threshold].index

# A set gives fast membership checks and is reused for both splits.
infrequent_categories_set = set(infrequent_categories)

# Apply the same train-derived replacement to both splits.
X_train = collapse_infrequent_categories(X_train, category_columns, infrequent_categories_set)
X_test = collapse_infrequent_categories(X_test, category_columns, infrequent_categories_set)

X_train.head()

Unnamed: 0,paper_title,paper_abstract,reference_title,reference_abstract,reference_category_1,reference_category_2,reference_citation_count,reference_influential_citation_count
164793,residual pathway priors for soft equivariance ...,there is often a trade-off between building de...,mdp homomorphic networks: group symmetries in ...,this paper introduces mdp homomorphic networks...,cs.lg,stat.ml,116.0,9.0
377136,clusterability as an alternative to anchor poi...,"the label noise transition matrix, characteriz...",skip-clip: self-supervised spatiotemporal repr...,deep neural networks require collecting and an...,cs.cv,,15.0,0.0
158387,characterizing attacks on deep reinforcement l...,recent studies show that deep reinforcement le...,playing atari with deep reinforcement learning,we present the first deep learning model to su...,cs.lg,,10374.0,1320.0
69588,scalable algorithms for physics-informed neura...,physics-informed machine learning (piml) has e...,nvidia simnet^{tm}: an ai-accelerated multi-ph...,"we present simnet, an ai-driven multi-physics ...",other,cs.lg,116.0,13.0
514587,robust pdf document conversion using recurrent...,the number of published pdf documents has incr...,oxford handbook on ai ethics book chapter on r...,from massive face-recognition-based surveillan...,other,cs.ai,29.0,0.0


In [10]:
# Preview the reference metadata table loaded from train_references_metadata.
metadata_df.head()

Unnamed: 0,arxiv_id,date,title,authors,abstract,category_1,category_2,citation_count,influential_citation_count,tldr
0,1906.05059,2019-06-12 11:07:10,higher-order ranking and link prediction: from...,"ryan a. rossi, anup rao, sungchul kim, eunyee ...","in this paper, we introduce the notion of moti...",cs.lg,cs.ir,17.0,2.0,Higher-order network motifs are investigated a...
1,1906.05774,2019-10-08 14:44:59,deep unfolding for communications systems: a s...,"alexios balatsoukas-stimming, christoph studer",deep unfolding is a method of growing populari...,eess.sp,cs.it,155.0,7.0,This survey summarizes the principle of deep u...
2,1801.05772,2018-01-17 17:44:48,ranking data with continuous labels through or...,"stephan clemencon, mastane achab","we formulate a supervised learning problem, re...",stat.ml,,6.0,4.0,
3,1906.05795,2019-06-13 16:29:30,topological data analysis for arrhythmia detec...,"meryll dindin, yuhei umeda, frederic chazal",this paper presents an innovative and generic ...,cs.lg,eess.sp,37.0,2.0,An innovative and generic deep learning approa...
4,1906.05797,2019-06-13 16:29:58,the replica dataset: a digital replica of indo...,"julian straub, thomas whelan, lingni ma, yufan...","we introduce replica, a dataset of 18 highly p...",cs.cv,cs.gr,511.0,117.0,"Replica, a dataset of 18 highly photo-realisti..."


In [11]:
from citations_searcher.models import CustomCatboostClassifier

In [22]:
# The category columns are excluded from this run (the model below is built
# with category_features=[]). errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the columns are
# already gone.
category_columns_to_drop = ["reference_category_1", "reference_category_2"]
X_train = X_train.drop(columns=category_columns_to_drop, errors="ignore")
X_test = X_test.drop(columns=category_columns_to_drop, errors="ignore")

In [23]:
# Instantiate the project's CatBoost wrapper with no categorical features,
# matching the feature frames from which the category columns were dropped.
# NOTE(review): min_frequency=10_000 equals the `threshold` used when collapsing
# infrequent categories; presumably it plays the same role inside the model —
# confirm against CustomCatboostClassifier.
model = CustomCatboostClassifier(min_frequency=10_000, category_features=[])

In [24]:
# List the metadata columns available for building `fit_metadata`.
metadata_df.columns

Index(['arxiv_id', 'date', 'title', 'authors', 'abstract', 'category_1',
       'category_2', 'citation_count', 'influential_citation_count', 'tldr'],
      dtype='object')

In [25]:
# Metadata columns handed to the model. The category columns are deliberately
# left out, mirroring the feature frames. To bring them back, add "category_1"
# and "category_2" here, fill their NaNs with '' and replace the values found in
# `infrequent_categories_set` with 'other', as was done for X_train / X_test.
metadata_feature_columns = [
    "title",
    "abstract",
    "citation_count",
    "influential_citation_count",
]

# Only the first 5000 metadata rows are passed to `fit`.
# NOTE(review): the indices returned by model.predict are later looked up in
# this frame, so it appears to act as the candidate pool — confirm.
fit_metadata = metadata_df[metadata_feature_columns].iloc[:5000]

model.fit(X_train, y_train, fit_metadata, verbose=True)

[32m2024-05-05 23:33:03.764[0m | [1mINFO    [0m | [36mcitations_searcher.models.custom_catboost[0m:[36mfit[0m:[36m59[0m - [1mFitting the catboost model...[0m


Learning rate set to 0.024172
0:	learn: 0.6828847	total: 29.7ms	remaining: 29.7s
1:	learn: 0.6732400	total: 55.2ms	remaining: 27.5s
2:	learn: 0.6640338	total: 82.3ms	remaining: 27.3s
3:	learn: 0.6550546	total: 108ms	remaining: 26.8s
4:	learn: 0.6465147	total: 134ms	remaining: 26.7s
5:	learn: 0.6384968	total: 159ms	remaining: 26.3s
6:	learn: 0.6312760	total: 186ms	remaining: 26.3s
7:	learn: 0.6245106	total: 213ms	remaining: 26.4s
8:	learn: 0.6175249	total: 242ms	remaining: 26.7s
9:	learn: 0.6113969	total: 271ms	remaining: 26.9s
10:	learn: 0.6052578	total: 296ms	remaining: 26.6s
11:	learn: 0.5995575	total: 320ms	remaining: 26.4s
12:	learn: 0.5944428	total: 345ms	remaining: 26.2s
13:	learn: 0.5896460	total: 371ms	remaining: 26.1s
14:	learn: 0.5846290	total: 395ms	remaining: 25.9s
15:	learn: 0.5799356	total: 419ms	remaining: 25.8s
16:	learn: 0.5756637	total: 445ms	remaining: 25.7s
17:	learn: 0.5718973	total: 476ms	remaining: 26s
18:	learn: 0.5679462	total: 501ms	remaining: 25.9s
19:	learn:

[32m2024-05-05 23:34:16.704[0m | [1mINFO    [0m | [36mcitations_searcher.models.custom_catboost[0m:[36mfit[0m:[36m61[0m - [1mSuccessfully fitted![0m
[32m2024-05-05 23:34:16.705[0m | [1mINFO    [0m | [36mcitations_searcher.models.custom_catboost[0m:[36mfit[0m:[36m63[0m - [1mFitting vectorizer...[0m
[32m2024-05-05 23:34:17.667[0m | [1mINFO    [0m | [36mcitations_searcher.models.custom_catboost[0m:[36mfit[0m:[36m67[0m - [1mSuccessfully fitted![0m


In [26]:
# Build a one-row query from the first held-out pair: keep only the paper's
# text columns and rename them to the `title` / `abstract` names that the
# metadata frame uses.
query_columns = {"paper_title": "title", "paper_abstract": "abstract"}
test = X_test.iloc[:1][list(query_columns)].rename(columns=query_columns)

In [27]:
# Replace the row's original label (inherited from X_test) with a fresh 0-based
# index, then display the query.
test = test.reset_index(drop=True)
test

Unnamed: 0,title,abstract
0,analysing the noise model error for realistic ...,distant and weak supervision allow to obtain l...


In [28]:
# Ask the model for 10 predictions for the single query paper.
# NOTE(review): n_candidates=100 is presumably the size of the shortlist that is
# re-ranked before the top 10 are returned — confirm against the model code.
res = model.predict(test, n_predictions=10, n_candidates=100)
# Flatten the result and look the rows up in fit_metadata.
# NOTE(review): `.loc` is label-based. This matches positional indices only
# while fit_metadata keeps a default 0..N-1 index; if `predict` returns
# positions, `.iloc` would be the safer accessor — confirm.
fit_metadata.loc[res.reshape(-1,)]

Unnamed: 0,title,abstract,citation_count,influential_citation_count
3415,revisiting self-training for neural sequence g...,self-training is one of the earliest and simpl...,241.0,30.0
4991,eternal sunshine of the spotless net: selectiv...,we explore the problem of selectively forgetti...,260.0,57.0
3328,unsupervised domain adaptation through self-su...,this paper addresses unsupervised domain adapt...,225.0,19.0
3555,self: learning to filter noisy labels with sel...,deep neural networks (dnns) have been shown to...,252.0,20.0
1701,deep self-learning from noisy labels,convnets achieve good results when training fr...,238.0,17.0
137,does learning require memorization? a short ta...,state-of-the-art results on image recognition ...,345.0,34.0
4607,confident learning: estimating uncertainty in ...,"learning exists in the context of data, yet no...",483.0,71.0
2059,symmetric cross entropy for robust learning wi...,training accurate deep neural networks (dnns) ...,674.0,135.0
1002,generative modeling by estimating gradients of...,we introduce a new generative model where samp...,2164.0,283.0
4943,self-training with noisy student improves imag...,"we present noisy student training, a semi-supe...",2002.0,236.0


In [29]:
# Preview the held-out features; the first row is the pair the query above was
# built from.
X_test.head()

Unnamed: 0,paper_title,paper_abstract,reference_title,reference_abstract,reference_citation_count,reference_influential_citation_count
319141,analysing the noise model error for realistic ...,distant and weak supervision allow to obtain l...,performance analysis and comparison of distrib...,deep learning has permeated through many aspec...,23.0,0.0
77476,ncore: neural counterfactual representation le...,estimating an individual's potential response ...,causal effect inference with deep latent-varia...,learning individual-level causal effects from ...,589.0,130.0
348095,successor feature landmarks for long-horizon g...,operating in the real-world often requires age...,graphact: accelerating gcn training on cpu-fpg...,graph convolutional networks (gcns) have emerg...,101.0,9.0
200574,mime: mimicking centralized stochastic algorit...,federated learning (fl) is a challenging setti...,leaf: a benchmark for federated settings,"modern federated networks, such as those compr...",1073.0,276.0
482717,declarative machine learning systems,in the last years machine learning (ml) has mo...,normalization techniques in training dnns: met...,normalization techniques are essential for acc...,118.0,5.0


In [30]:
# Produce predictions for every held-out pair.
# NOTE(review): `_validation_predict` is underscore-prefixed, i.e. a private
# method of the model; consider exposing a public entry point for evaluation.
predictions = model._validation_predict(X_test)

[32m2024-05-05 23:34:18.145[0m | [1mINFO    [0m | [36mcitations_searcher.models.custom_catboost[0m:[36m_validation_predict[0m:[36m137[0m - [1mMaking validating predictions for 105778 objects...[0m
[32m2024-05-05 23:34:20.473[0m | [1mINFO    [0m | [36mcitations_searcher.models.custom_catboost[0m:[36m_validation_predict[0m:[36m139[0m - [1mValidating predictions generated![0m


In [31]:
from citations_searcher.metrics import evaluate_predictions

In [32]:
# Compute accuracy / precision / recall / F1 of the predictions on the held-out
# labels.
# NOTE(review): the helper logs these as "Train scores" although y_test is
# passed here. Also confirm the expected argument order (predictions first,
# labels second): swapping them would swap precision and recall.
evaluate_predictions(predictions, y_test)

[32m2024-05-05 23:34:20.592[0m | [1mINFO    [0m | [36mcitations_searcher.metrics.evaluate_predictions[0m:[36mevaluate_predictions[0m:[36m30[0m - [1mTrain scores. accuracy_score: 0.843; precision_score: 0.883; recall_score: 0.843; f1_score: 0.863; [0m


{'accuracy_score': 0.8428312125394695,
 'precision_score': 0.8831340178284477,
 'recall_score': 0.8431030584891644,
 'f1_score': 0.8626543847329505}