In [1]:
%cd '/home/aris/projects/cs6784_research'

%load_ext autoreload
%autoreload 1

from IPython.display import display

/home/aris/projects/cs6784_research


In [2]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

import os
import sys
from pathlib import Path

import json
import pickle

from time import time
from tqdm.notebook import tqdm
from pprint import pprint

In [3]:
# Initialize pandarallel for parallel apply
# https://nalepae.github.io/pandarallel/
# https://nalepae.github.io/pandarallel/user_guide/

# NOTE that pandarallel is not supported on Windows,
# skip this cell if running on windows

from pandarallel import pandarallel

pandarallel.initialize(
    nb_workers=os.cpu_count(),
    progress_bar=False,
    verbose=2
)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
def show_df(df: pd.DataFrame):
    display(df.head())
    print(df.shape)

In [6]:
import pymongo
import getpass

# client = pymongo.MongoClient(
#     'localhost',
#     27017,
#     username='aris',
#     password=getpass.getpass(),
#     authSource='admin'
# )
client = pymongo.MongoClient(
    'oasis.ariseus.net',
    27888,
    username='cs6784_admin',
    password=getpass.getpass(),
    authSource='cs6784'
)

db = client.cs6784
dblpv13 = db.dblpv13

In [13]:
%%time

file_name = 'data/interim/ai_ids.json'

# projection: which columns to preserve
_proj = {
    'title': 1,
    'year': 1,
    'fos': 1,
    'keywords': 1,
    'pdf': 1,
    'doi': 1,
    'url': 1,
    # 'references': 1
}

results = db.dblpv13.find(
    # filter: the MongoDB filter to select data
    {
        'year': {'$gte': 2000, '$lt': 2022},
        'fos': 'Artificial intelligence',
        # 'fos': {'$in': ['Machine learning', 'Artificial intelligence']},
        'keywords': {
            '$exists': True,
            '$not': {'$size': 0}
        },
        # 'pdf': {'$exists': True}
    }, _proj
)

results = list(results)
print(len(results))

df_papers = pd.DataFrame.from_records(results, index='_id')

show_df(df_papers)

541722


Unnamed: 0_level_0,title,year,keywords,fos,doi,pdf,url
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
53e997ccb7602d9701fbee34,SAGE: A New Analysis and Optimization System f...,2000,"[optimization system, intelligent memory archi...","[Database-centric architecture, Architecture, ...",10.1007/3-540-44570-6_12,,[http://dx.doi.org/10.1007/3-540-44570-6_12]
53e997ccb7602d9701fbf738,Shape Description for Content-Based Image Retr...,2000,"[covariance matrix, support vector machines, s...","[Row, Computer vision, Pattern recognition, Co...",10.1007/3-540-40053-2_19,,"[http://dx.doi.org/10.1007/3-540-40053-2_19, h..."
53e997ccb7602d9701fc0025,Neuro-Architecture-Motivated ANNs and Cortical...,2000,"[phylogeny, backpropagation, neural networks, ...","[Architecture, Inheritance of acquired charact...",,,[http://doi.ieeecomputersociety.org/10.1109/IJ...
53e997d1b7602d9701fc348c,Automatic extraction of roads from aerial imag...,2000,"[automatic road extraction, aerial imagery, sn...","[Computer vision, Edge extraction, Computer sc...",10.1007/s001380050004,https://static.aminer.cn/upload/pdf/program/53...,"[http://dx.doi.org/10.1007/s001380050004, http..."
53e997d7b7602d9701fcd7f2,The ``Test and Select'' Approach to Ensemble C...,2000,"[validation set, ensemble creation, improved r...","[Internal combustion engine, Computer science,...",10.1007/3-540-45014-9_3,,"[http://dx.doi.org/10.1007/3-540-45014-9_3, ht..."


(541722, 7)
CPU times: user 8.91 s, sys: 253 ms, total: 9.17 s
Wall time: 25.2 s


In [14]:
%%time

def __fun(x: pd.Series):
    rng = np.random.default_rng()

    x['keywords'] = rng.choice(
        x['keywords'],
        rng.integers(1, 3, endpoint=True)
    )
    x['keywords'] = [
        e.lower().strip()
        for e in x['keywords']
    ]
    return x


# parallel_apply is not supported on Windows, use .apply() instead
# df_papers_proc = df_papers.apply(__fun, axis=1)
df_papers_proc = df_papers.parallel_apply(__fun, axis=1)

show_df(df_papers_proc)

Unnamed: 0_level_0,title,year,keywords,fos,doi,pdf,url
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
53e997ccb7602d9701fbee34,SAGE: A New Analysis and Optimization System f...,2000,[sage system],"[Database-centric architecture, Architecture, ...",10.1007/3-540-44570-6_12,,[http://dx.doi.org/10.1007/3-540-44570-6_12]
53e997ccb7602d9701fbf738,Shape Description for Content-Based Image Retr...,2000,"[present work, covariance matrix]","[Row, Computer vision, Pattern recognition, Co...",10.1007/3-540-40053-2_19,,"[http://dx.doi.org/10.1007/3-540-40053-2_19, h..."
53e997ccb7602d9701fc0025,Neuro-Architecture-Motivated ANNs and Cortical...,2000,"[ontogeny, epigenesis, neural nets]","[Architecture, Inheritance of acquired charact...",,,[http://doi.ieeecomputersociety.org/10.1109/IJ...
53e997d1b7602d9701fc348c,Automatic extraction of roads from aerial imag...,2000,"[aerial imagery, multi-scale]","[Computer vision, Edge extraction, Computer sc...",10.1007/s001380050004,https://static.aminer.cn/upload/pdf/program/53...,"[http://dx.doi.org/10.1007/s001380050004, http..."
53e997d7b7602d9701fcd7f2,The ``Test and Select'' Approach to Ensemble C...,2000,[diesel engine],"[Internal combustion engine, Computer science,...",10.1007/3-540-45014-9_3,,"[http://dx.doi.org/10.1007/3-540-45014-9_3, ht..."


(541722, 7)
CPU times: user 6.79 s, sys: 2.78 s, total: 9.57 s
Wall time: 14.9 s


In [12]:
all_keywords = set.union(*[
    set([
        k.lower().strip()
        for k in e
    ])
    for e in df_papers.keywords
])

print(len(all_keywords))
print(list(all_keywords)[:20])

1522701
['', 'disjunctive error', 'endangered austronesian language', 'encoding visual object', 'random projection ensemble', 'text cloud', 'proposed registration', 'user activity monitoring', 'multimedia semantic analysis soc', 'trec document collection', 'maximum-margin based discriminative information', 'high resolution remotely sensed image segmentation', 'local linear feature invariance', 'nonstationary ensemble learning', 'three layer markov random field', 'cross-language information delivery system', 'recursive gradient', 'crossing safety', 'head driven tree-to-tree translation', 'shift-convergence disparity remapping technique']
