In [1]:
%cd ..

%load_ext autoreload
%autoreload 2

from IPython.display import display

/home/aris/projects/cs6784_research


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import sys
from pathlib import Path
import pickle
from time import time
from tqdm.notebook import tqdm

In [3]:
def show_df(df: pd.DataFrame):
    display(df.head())
    print(df.shape)

In [4]:
import pymongo
import getpass

client = pymongo.MongoClient(
    'localhost',
    27017,
    username='aris',
    password=getpass.getpass(),
    authSource='admin'
)

db = client.cs6784
dblpv13 = db.dblpv13

In [5]:
# Total number of documents

n = db.command('collstats', 'dblpv13')['count']

print(f'DBLP V13 has {n:,} documents.')

DBLP V13 has 5,354,309 documents.


In [111]:
# Define projection attributes that we care

projection = {
    'title': 1,
    'year': 1,
    'keywords': 1,
    # 'authors._id': 1,
    'authors': {
        '$map': {
            'input': '$authors',
            'as': 'e',
            'in': '$$e._id'
        }
    },
    'references': 1
}

In [130]:
# Query Random sample of 1% data by MongoDB

# This process is not reproducible as we cannot set seed to MongoDB query
# https://stackoverflow.com/questions/36690714/manually-setting-the-seed-for-mongodb-sample

results = dblpv13.aggregate([
    {'$sample': {'size': n // 100}},
    {'$project': projection}
])

df_random = pd.DataFrame.from_records(results, index='_id')

show_df(df_random)

Unnamed: 0_level_0,title,year,keywords,references,authors
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
53e9b0deb7602d9703b5b22b,Analysis of Requirements Volatility during Sof...,2004,"[requirements volatility, requirements volatil...","[53e9bdeeb7602d9704aa0346, 53e99f48b7602d97028...","[53f4d351dabfaeedd8781193, 54059d0cdabfae8faa5..."
53e99fa9b7602d97028817dc,A new efficient approach to the design of para...,2006,"[dolph-chebyshev windows, dolph-chebyshev wind...","[53e9aa67b7602d97033ded65, 53e9a4a3b7602d9702d...","[53f43470dabfaeb2ac040679, 5406a790dabfae8faa6..."
5bdc318017c44a1f58a089d4,REGGAE: a novel approach for the identificatio...,2018,,"[53e9b098b7602d9703b002f9, 53e9a743b7602d97030...","[562c7a4045cedb3398c34318, None, None, 53f437e..."
573698826e3b12023e741cef,"Detection, Classification and Characterization...",2015,[],"[5550485145ce0a409eb6d63c, 53e99b71b7602d97024...","[None, 562d13f645cedb3398d4913c, 542a0e46dabfa..."
53e9a7ffb7602d970314459e,A Spectral Viscosity Method Based on Hermite F...,2008,"[nonlinear conservation laws, orthogonal basis...","[53e99b8db7602d9702431d90, 53e9a987b7602d97032...","[53f4560ddabfaec09f205502, 53f39f7fdabfae4b34a..."


(53543, 5)


In [6]:
results = dblpv13.find({
    'year': {'$gte': 1980, '$lt': 2025}
}, {'_id': 1})

_ids = [e['_id'] for e in results]

print(len(_ids))
print(_ids[:5])

5291975
['53e99784b7602d9701f3f8c3', '53e99784b7602d9701f3f71b', '53e99785b7602d9701f427b8', '53e99785b7602d9701f42886', '53e99785b7602d9701f42c6f']


In [7]:
rng = np.random.default_rng(42)

sample_ids = rng.choice(_ids, n // 100, False).tolist()

print(len(sample_ids))
print(sample_ids[:5])

53543
['53e9a55cb7602d9702e836ce', '5ff68c9bd4150a363cd2e19e', '55909ad20cf28af999b589ae', '53e9ad34b7602d9703717779', '5a4aef2617c44a2190f75a43']


In [8]:
results = dblpv13.find({
    '_id': {'$in': sample_ids}
})

df_samples = pd.DataFrame.from_records(results, index='_id')

show_df(df_samples)

Unnamed: 0_level_0,title,authors,venue,year,fos,page_start,page_end,url,references,keywords,n_citation,lang,volume,issue,issn,isbn,doi,pdf,abstract
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
53a7290120f7420be8bc149b,Derivation of Knowledge Structures for Distrib...,"[{'name': 'Luca Stefanutti', '_id': '53f45af4d...","{'_id': '53a728fe20f7420be8bc0b95', 'sid': 'co...",2005,"[Grid computing, Knowledge assessment, Adaptiv...",105,112,[http://www.booksonline.iospress.nl/Content/Vi...,"[53e998f0b7602d970212b0f0, 53e9b4f9b7602d97040...",,,,,,,,,,
53e99784b7602d9701f3f615,Foreword.,"[{'_id': '5405df6bdabfae450f3dc31d', 'name': '...","{'_id': '539078ef20f770854f5a84a4', 'type': 0,...",2013,,K0005,K0005,[http://dx.doi.org/10.5702/massspectrometry.K0...,,"[equity and social cohesion., justice]",4.0,en,2.0,Spec Iss,2187-137X,1-59140-106-2,10.5702/massspectrometry.K0005,https://static.aminer.cn/upload/pdf/program/53...,There is something seriously missing in a fiel...
53e99785b7602d9701f40603,GRISLEE,"[{'_id': '53f44a37dabfaee4dc7e0bed', 'name': '...","{'_id': '555036b77cea80f95414b7d2', 'raw': 'I....",2003,,603,616,,,[],0.0,en,22.0,7-8,,,,,
53e99785b7602d9701f414f4,Dishes,"[{'gid': '5b8692a9e1cd8e14a35a603d', 'oid': '5...","{'_id': '53a72b2d20f7420be8c1c5a8', 'raw': 'SI...",2009,,38,38,[http://doi.acm.org/10.1145/1665137.1665165],,"[traditional photography, twisted reality, pre...",,en,,,,,10.1145/1665137.1665165,,Dishes is a digitally manipulated photo that r...
53e99785b7602d9701f42c6f,Eradication,"[{'_id': '53f3a672dabfae4b34adc4eb', 'name': '...","{'_id': '555036e07cea80f95416334f', 'raw': 'J....",1980,,1203,1203,"[http://dx.doi.org/10.1016/j.jal.2011.08.001, ...",,[],4.0,en,210.0,4475,0036-8075,,10.1126/science.7434020,https://static.aminer.cn/upload/pdf/program/53...,Eradication is a radical form of contraction t...


(53543, 19)


In [145]:
# Initialize pandarallel for parallel apply
# https://nalepae.github.io/pandarallel/
# https://nalepae.github.io/pandarallel/user_guide/

from pandarallel import pandarallel

pandarallel.initialize(
    nb_workers=os.cpu_count(),
    progress_bar=False,
    verbose=2
)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [148]:
def __clean(x: pd.Series):
    # x['year'] = int(x['year'])

    # WARNING: Many authors are removed b/c they don't have an _id
    try:
        authors = [
            e for e in x['authors']
            if e is not None
        ]
        if len(authors) == 0:
            x['authors'] = np.nan
        else:
            x['authors'] = authors
    except TypeError:
        x['authors'] = np.nan

    try:
        if len(x['keywords']) == 0:
            x['keywords'] = np.nan
    except TypeError:
        x['keywords'] = np.nan

    return x


# Apply cleaning per row
# df_samples_proc = df_samples.apply(
df_samples_proc = df_samples.parallel_apply(
    __clean, axis=1
).sort_values(
    by='year', ascending=False
)

df_samples_proc.astype({
    'year': int
})

show_df(df_samples_proc)

Unnamed: 0_level_0,title,year,references,authors,keywords
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6085415991e01180c31e936c,Explainable Detection of Sarcasm in Social Media.,2021,,,
600d4944d4150a363c681c01,A New Approach to Mean Square Exponential Stab...,2021,"[53e9bd1eb7602d97049aae37, 53e9b53bb7602d97040...",,
600fe655d4150a363c202e04,6G-enabled IoT Home Environment control using ...,2021,,,
600fe644d4150a363c201214,A Small-Sample Faulty Line Detection Method Ba...,2021,,,
600fe63bd4150a363c200509,Attention-based contextual interaction asymmet...,2021,,,


(53543, 5)


In [9]:
# Same the processed samplings

with open('data/interim/df_samples.pkl', 'wb') as f:
    pickle.dump(df_samples, f)

In [4]:
with open('data/interim/df_samples.pkl', 'rb') as f:
    df_load = pickle.load(f)

show_df(df_load)

Unnamed: 0_level_0,title,authors,venue,year,fos,page_start,page_end,url,references,keywords,n_citation,lang,volume,issue,issn,isbn,doi,pdf,abstract
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
53a7290120f7420be8bc149b,Derivation of Knowledge Structures for Distrib...,"[{'name': 'Luca Stefanutti', '_id': '53f45af4d...","{'_id': '53a728fe20f7420be8bc0b95', 'sid': 'co...",2005,"[Grid computing, Knowledge assessment, Adaptiv...",105,112,[http://www.booksonline.iospress.nl/Content/Vi...,"[53e998f0b7602d970212b0f0, 53e9b4f9b7602d97040...",,,,,,,,,,
53e99784b7602d9701f3f615,Foreword.,"[{'_id': '5405df6bdabfae450f3dc31d', 'name': '...","{'_id': '539078ef20f770854f5a84a4', 'type': 0,...",2013,,K0005,K0005,[http://dx.doi.org/10.5702/massspectrometry.K0...,,"[equity and social cohesion., justice]",4.0,en,2.0,Spec Iss,2187-137X,1-59140-106-2,10.5702/massspectrometry.K0005,https://static.aminer.cn/upload/pdf/program/53...,There is something seriously missing in a fiel...
53e99785b7602d9701f40603,GRISLEE,"[{'_id': '53f44a37dabfaee4dc7e0bed', 'name': '...","{'_id': '555036b77cea80f95414b7d2', 'raw': 'I....",2003,,603,616,,,[],0.0,en,22.0,7-8,,,,,
53e99785b7602d9701f414f4,Dishes,"[{'gid': '5b8692a9e1cd8e14a35a603d', 'oid': '5...","{'_id': '53a72b2d20f7420be8c1c5a8', 'raw': 'SI...",2009,,38,38,[http://doi.acm.org/10.1145/1665137.1665165],,"[traditional photography, twisted reality, pre...",,en,,,,,10.1145/1665137.1665165,,Dishes is a digitally manipulated photo that r...
53e99785b7602d9701f42c6f,Eradication,"[{'_id': '53f3a672dabfae4b34adc4eb', 'name': '...","{'_id': '555036e07cea80f95416334f', 'raw': 'J....",1980,,1203,1203,"[http://dx.doi.org/10.1016/j.jal.2011.08.001, ...",,[],4.0,en,210.0,4475,0036-8075,,10.1126/science.7434020,https://static.aminer.cn/upload/pdf/program/53...,Eradication is a radical form of contraction t...


(53543, 19)
