In [1]:
import ijson
import re

import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
# When True, the scan additionally keeps every 100th paper that did not match
# any AI keyword; when False only keyword-matching papers are retained.
is_xl_data = False

# Filled by the scan over the dump: one dict per retained paper.
selected_papers = []

# Lower-case topic phrases that mark a paper as AI-related. A paper matches
# when one of its (lower-cased) keywords equals a phrase exactly, or when its
# lower-cased title contains a phrase as a substring.
keywords = {
    "adversarial learning",
    "artificial intelligence",
    "computer vision",
    "convolutional neural networks",
    "data mining",
    "deep learning",
    "dimensionality reduction",
    "explainable ai",
    "graph neural networks",
    "hyperparameter tuning",
    "information extraction",
    "information retrieval",
    "machine learning",
    "natural language processing",
    "neural networks",
    "recurrent neural networks",
    "reinforcement learning",
    "speech recognition",
    "supervised learning",
    "unsupervised learning",
}

# One regex alternation over all phrases, each escaped so it is matched
# literally when searching titles.
joined_keywords = '|'.join(map(re.escape, keywords))

# number of records in this dataset
# Visit: https://www.aminer.org/citation
n_docs = 5_259_858

# Counts papers that did NOT match any AI keyword; only advanced (and only
# consulted) when is_xl_data is enabled.
idx = 0

# Compile the title pattern once up front rather than passing the pattern
# string to re.search() for every one of the ~5M records.
title_pattern = re.compile(joined_keywords)

with open("./dblp_v14.json", "rb") as f:
    # ijson yields the elements under the "item" prefix one at a time, so the
    # dump is never loaded into memory as a whole.
    for record in tqdm(ijson.items(f, "item"), total=n_docs):
        # Skip non-English records, records with an empty abstract, title or
        # year, and records that carry no "references" field.
        if (
            record["lang"] != "en"
            or record["abstract"] == ""
            or record["title"] == ""
            or record["year"] == ""
            or "references" not in record
        ):
            continue

        # AI paper: a keyword equals one of the phrases exactly, or the
        # lower-cased title contains one of them.
        record_kws = {kw.lower() for kw in record["keywords"]}
        should_keep = bool(
            record_kws.intersection(keywords)
            or title_pattern.search(record["title"].lower())
        )

        if not should_keep and is_xl_data:
            # Keep every 100th paper that is not in the AI category
            idx += 1
            should_keep = idx % 100 == 0

        if should_keep:
            selected_papers.append({
                "id": record["id"],
                "title": record["title"],
                "doi": record["doi"],
                "keywords": record["keywords"],
                "year": record["year"],
                "abstract": record["abstract"]
            })

100%|██████████| 5259858/5259858 [04:49<00:00, 18140.34it/s] 


In [3]:
# Build the frame in one chain: derive a datetime "date" column (January 1st
# of the publication year) and drop the raw "year" column, without mutating
# the frame in place. Column order is unchanged: "date" is appended last.
ai_papers_data = (
    pd.DataFrame(selected_papers)
    .assign(date=lambda papers: pd.to_datetime(papers["year"].astype(str) + "-01-01"))
    .drop(columns=["year"])
)

ai_papers_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226234 entries, 0 to 226233
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   id        226234 non-null  object        
 1   title     226234 non-null  object        
 2   doi       226234 non-null  object        
 3   keywords  226234 non-null  object        
 4   abstract  226234 non-null  object        
 5   date      226234 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 10.4+ MB


In [4]:
# Restrict to papers dated 2000-01-01 or later, then draw 6000 rows with a
# fixed seed so the same sample is produced on every run.
published_since_2000 = ai_papers_data["date"] >= "2000-01-01"
subset = ai_papers_data.loc[published_since_2000].sample(n=6000, random_state=420)
subset

Unnamed: 0,id,title,doi,keywords,abstract,date
68189,53e9aaf3b7602d9703477624,BioC: a minimalist approach to interoperabilit...,10.1093/database/bat064,"[biomedical research, natural language process...",A vast amount of scientific information is enc...,2013-01-01
118681,53e9b80fb7602d97043c220d,Bankruptcy prediction using support vector mac...,10.1016/j.eswa.2004.12.008,"[support vector machine, bankruptcy prediction...",Bankruptcy prediction has drawn a lot of resea...,2005-01-01
101484,53e9b38fb7602d9703e7197a,Multimodal Biometrics-Based Student Attendance...,10.1109/ISM.2009.25,"[face tracking, multimodal biometrics-based st...",In this paper we present a solution to obtain ...,2009-01-01
155002,557c560af66765fbb46afacb,Inferring social contexts from audio recording...,10.1109/MLSP.2014.6958853,"[audio recording, audio signal processing, fea...","In this paper, we investigate the problem of d...",2014-01-01
21855,53e99e0cb7602d97026d2608,Insight of the Signal Motif of GPI-(like)-anch...,,"[gpi lipid modification, ptm, svm., support ve...",Many proteins contain a signal sequence at the...,2006-01-01
...,...,...,...,...,...,...
6213,53e9999eb7602d97021e4547,A hybrid approach for indexing and retrieval o...,10.1007/978-3-642-15384-6_56,"[hybrid retrieval approach, different issue, c...",This paper focuses on the problem of archaeolo...,2010-01-01
16434,53e99c8bb7602d970253cbd4,Fuzzy Analysis of X-Ray Images for Automated D...,10.1007/978-3-540-30133-2_64,"[pattern recognition, linage segmentation, fuz...",This paper presents the design of a fuzzy deci...,2004-01-01
175867,55c354dd683a451f09d2c43a,VHR satellite image segmentation based on topo...,10.1109/MVA.2015.7153250,"[image resolution, image segmentation, learnin...",High spatial resolution satellite imagery has ...,2015-01-01
35948,53e9a1e1b7602d9702ae04e9,Construction of computer system for microobjec...,10.1007/978-3-540-75187-8_25,"[efficient approach, formal neuron, pollen gra...",We propose a new and efficient approach for so...,2007-01-01


In [5]:
# `subset` still carries the shuffled row labels left over from .sample().
# Older pandas releases (before 2.0) require a default RangeIndex for
# to_feather and raise ValueError otherwise, so reset the index on write.
# The discarded labels were only row positions in the parent frame; the "id"
# column remains the paper identifier.
subset.reset_index(drop=True).to_feather("./aminer-subset.feather")