In [1]:
import dr_util.file_utils as fu
from omegaconf import DictConfig, OmegaConf
from hydra import initialize, compose

import re
from collections import defaultdict

#import polars as pl
import pandas as pd

%load_ext autoreload
%autoreload 2

In [2]:
# Citing sources
# - Imported the data and simple plotting from here: https://www.kaggle.com/code/auhide/arxiv-paper-classification
# - Data cleaning (comment filtering) from here: https://www.kaggle.com/code/carlmcbrideellis/machine-learning-review-papers-on-arxiv-polars

## Constants

In [3]:
# Maps arXiv category codes for CS and closely related fields to display names.
# Only the keys are used by this notebook to decide which papers to keep.
cs_field_map = {'cs.AI': 'Artificial Intelligence',
                'cs.AR': 'Hardware Architecture',
                'cs.CC': 'Computational Complexity',
                'cs.CE': 'Computational Engineering, Finance, and Science',
                'cs.CG': 'Computational Geometry',
                'cs.CL': 'Computation and Language',
                'cs.CR': 'Cryptography and Security',
                'cs.CV': 'Computer Vision and Pattern Recognition',
                'cs.CY': 'Computers and Society',
                'cs.DB': 'Databases',
                'cs.DC': 'Distributed, Parallel, and Cluster Computing',
                'cs.DL': 'Digital Libraries',
                'cs.DM': 'Discrete Mathematics',
                'cs.DS': 'Data Structures and Algorithms',
                'cs.ET': 'Emerging Technologies',
                'cs.FL': 'Formal Languages and Automata Theory',
                'cs.GL': 'General Literature',
                'cs.GR': 'Graphics',
                'cs.GT': 'Computer Science and Game Theory',
                'cs.HC': 'Human-Computer Interaction',
                'cs.IR': 'Information Retrieval',
                'cs.IT': 'Information Theory',
                'cs.LG': 'Machine Learning',
                'cs.LO': 'Logic in Computer Science',
                'cs.MA': 'Multiagent Systems',
                'cs.MM': 'Multimedia',
                'cs.MS': 'Mathematical Software',
                'cs.NA': 'Numerical Analysis',
                'cs.NE': 'Neural and Evolutionary Computing',
                'cs.NI': 'Networking and Internet Architecture',
                'cs.OH': 'Other Computer Science',
                'cs.OS': 'Operating Systems',
                'cs.PF': 'Performance',
                'cs.PL': 'Programming Languages',
                'cs.RO': 'Robotics',
                'cs.SC': 'Symbolic Computation',
                'cs.SD': 'Sound',
                'cs.SE': 'Software Engineering',
                'cs.SI': 'Social and Information Networks',
                'cs.SY': 'Systems and Control',
                'eess.AS': 'Audio and Speech Processing',
                'eess.IV': 'Image and Video Processing',
                'eess.SP': 'Signal Processing',
                'physics.comp-ph': 'Computational Physics',
                'physics.data-an': 'Data Analysis, Statistics and Probability',
                'q-bio.QM': 'Quantitative Methods',
                'q-fin.CP': 'Computational Finance',
                'stat.AP': 'Applications',
                'stat.CO': 'Computation',
                'stat.ME': 'Methodology',
                'stat.ML': 'Machine Learning',
                'stat.OT': 'Other Statistics',
                'stat.TH': 'Statistics Theory'}

In [4]:
# Maps arXiv category codes across the sciences (physics, math, CS, statistics,
# quantitative biology/finance, ...) to display names.
sci_field_map = {'astro-ph': 'Astrophysics',
                'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
                'astro-ph.EP': 'Earth and Planetary Astrophysics',
                'astro-ph.GA': 'Astrophysics of Galaxies',
                'astro-ph.HE': 'High Energy Astrophysical Phenomena',
                'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
                'astro-ph.SR': 'Solar and Stellar Astrophysics',
                'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
                'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
                'cond-mat.mtrl-sci': 'Materials Science',
                'cond-mat.other': 'Other Condensed Matter',
                'cond-mat.quant-gas': 'Quantum Gases',
                'cond-mat.soft': 'Soft Condensed Matter',
                'cond-mat.stat-mech': 'Statistical Mechanics',
                'cond-mat.str-el': 'Strongly Correlated Electrons',
                'cond-mat.supr-con': 'Superconductivity',
                'cs.AI': 'Artificial Intelligence',
                'cs.AR': 'Hardware Architecture',
                'cs.CC': 'Computational Complexity',
                'cs.CE': 'Computational Engineering, Finance, and Science',
                'cs.CG': 'Computational Geometry',
                'cs.CL': 'Computation and Language',
                'cs.CR': 'Cryptography and Security',
                'cs.CV': 'Computer Vision and Pattern Recognition',
                'cs.CY': 'Computers and Society',
                'cs.DB': 'Databases',
                'cs.DC': 'Distributed, Parallel, and Cluster Computing',
                'cs.DL': 'Digital Libraries',
                'cs.DM': 'Discrete Mathematics',
                'cs.DS': 'Data Structures and Algorithms',
                'cs.ET': 'Emerging Technologies',
                'cs.FL': 'Formal Languages and Automata Theory',
                'cs.GL': 'General Literature',
                'cs.GR': 'Graphics',
                'cs.GT': 'Computer Science and Game Theory',
                'cs.HC': 'Human-Computer Interaction',
                'cs.IR': 'Information Retrieval',
                'cs.IT': 'Information Theory',
                'cs.LG': 'Machine Learning',
                'cs.LO': 'Logic in Computer Science',
                'cs.MA': 'Multiagent Systems',
                'cs.MM': 'Multimedia',
                'cs.MS': 'Mathematical Software',
                'cs.NA': 'Numerical Analysis',
                'cs.NE': 'Neural and Evolutionary Computing',
                'cs.NI': 'Networking and Internet Architecture',
                'cs.OH': 'Other Computer Science',
                'cs.OS': 'Operating Systems',
                'cs.PF': 'Performance',
                'cs.PL': 'Programming Languages',
                'cs.RO': 'Robotics',
                'cs.SC': 'Symbolic Computation',
                'cs.SD': 'Sound',
                'cs.SE': 'Software Engineering',
                'cs.SI': 'Social and Information Networks',
                'cs.SY': 'Systems and Control',
                'econ.EM': 'Econometrics',
                'eess.AS': 'Audio and Speech Processing',
                'eess.IV': 'Image and Video Processing',
                'eess.SP': 'Signal Processing',
                'gr-qc': 'General Relativity and Quantum Cosmology',
                'hep-ex': 'High Energy Physics - Experiment',
                'hep-lat': 'High Energy Physics - Lattice',
                'hep-ph': 'High Energy Physics - Phenomenology',
                'hep-th': 'High Energy Physics - Theory',
                'math.AC': 'Commutative Algebra',
                'math.AG': 'Algebraic Geometry',
                'math.AP': 'Analysis of PDEs',
                'math.AT': 'Algebraic Topology',
                'math.CA': 'Classical Analysis and ODEs',
                'math.CO': 'Combinatorics',
                'math.CT': 'Category Theory',
                'math.CV': 'Complex Variables',
                'math.DG': 'Differential Geometry',
                'math.DS': 'Dynamical Systems',
                'math.FA': 'Functional Analysis',
                'math.GM': 'General Mathematics',
                'math.GN': 'General Topology',
                'math.GR': 'Group Theory',
                'math.GT': 'Geometric Topology',
                'math.HO': 'History and Overview',
                'math.IT': 'Information Theory',
                'math.KT': 'K-Theory and Homology',
                'math.LO': 'Logic',
                'math.MG': 'Metric Geometry',
                'math.MP': 'Mathematical Physics',
                'math.NA': 'Numerical Analysis',
                'math.NT': 'Number Theory',
                'math.OA': 'Operator Algebras',
                'math.OC': 'Optimization and Control',
                'math.PR': 'Probability',
                'math.QA': 'Quantum Algebra',
                'math.RA': 'Rings and Algebras',
                'math.RT': 'Representation Theory',
                'math.SG': 'Symplectic Geometry',
                'math.SP': 'Spectral Theory',
                'math.ST': 'Statistics Theory',
                'math-ph': 'Mathematical Physics',
                'nlin.AO': 'Adaptation and Self-Organizing Systems',
                'nlin.CD': 'Chaotic Dynamics',
                'nlin.CG': 'Cellular Automata and Lattice Gases',
                'nlin.PS': 'Pattern Formation and Solitons',
                'nlin.SI': 'Exactly Solvable and Integrable Systems',
                'nucl-ex': 'Nuclear Experiment',
                'nucl-th': 'Nuclear Theory',
                'physics.acc-ph': 'Accelerator Physics',
                'physics.ao-ph': 'Atmospheric and Oceanic Physics',
                'physics.app-ph': 'Applied Physics',
                'physics.atm-clus': 'Atomic and Molecular Clusters',
                'physics.atom-ph': 'Atomic Physics',
                'physics.bio-ph': 'Biological Physics',
                'physics.chem-ph': 'Chemical Physics',
                'physics.class-ph': 'Classical Physics',
                'physics.comp-ph': 'Computational Physics',
                'physics.data-an': 'Data Analysis, Statistics and Probability',
                'physics.ed-ph': 'Physics Education',
                'physics.flu-dyn': 'Fluid Dynamics',
                'physics.gen-ph': 'General Physics',
                'physics.geo-ph': 'Geophysics',
                'physics.hist-ph': 'History and Philosophy of Physics',
                'physics.ins-det': 'Instrumentation and Detectors',
                'physics.med-ph': 'Medical Physics',
                'physics.optics': 'Optics',
                'physics.plasm-ph': 'Plasma Physics',
                'physics.pop-ph': 'Popular Physics',
                'physics.soc-ph': 'Physics and Society',
                'physics.space-ph': 'Space Physics',
                'q-bio.BM': 'Biomolecules',
                'q-bio.CB': 'Cell Behavior',
                'q-bio.GN': 'Genomics',
                'q-bio.MN': 'Molecular Networks',
                'q-bio.NC': 'Neurons and Cognition',
                'q-bio.OT': 'Other Quantitative Biology',
                'q-bio.PE': 'Populations and Evolution',
                'q-bio.QM': 'Quantitative Methods',
                'q-bio.SC': 'Subcellular Processes',
                'q-bio.TO': 'Tissues and Organs',
                'q-fin.CP': 'Computational Finance',
                'q-fin.EC': 'Economics',
                'q-fin.GN': 'General Finance',
                'q-fin.MF': 'Mathematical Finance',
                'q-fin.PM': 'Portfolio Management',
                'q-fin.PR': 'Pricing of Securities',
                'q-fin.RM': 'Risk Management',
                'q-fin.ST': 'Statistical Finance',
                'q-fin.TR': 'Trading and Market Microstructure',
                'quant-ph': 'Quantum Physics',
                'stat.AP': 'Applications',
                'stat.CO': 'Computation',
                'stat.ME': 'Methodology',
                'stat.ML': 'Machine Learning',
                'stat.OT': 'Other Statistics',
                'stat.TH': 'Statistics Theory'}

## Load Metadata

In [5]:
# Compose the Hydra config that holds the data paths used by this notebook.
with initialize(version_base=None, config_path="../configs/"):
    cfg = compose(config_name="paper_data.yaml")

# Show the config with all interpolations resolved, then the file_utils help text.
resolved_cfg = OmegaConf.to_container(cfg, resolve=True)
print(OmegaConf.to_yaml(resolved_cfg))
print(fu.fu_help())

data_dir: /Users/daniellerothermel/drotherm/data/
raw_pdf_dir: /Users/daniellerothermel/drotherm/data/raw_pdfs/
parsed_pdf_dir: /Users/daniellerothermel/drotherm/data/parsed_pdfs/
metadata_dir: /Users/daniellerothermel/drotherm/data/parsed_pdfs/
author_data_dir: /Users/daniellerothermel/drotherm/data/author_data/
author_summaries_dir: /Users/daniellerothermel/drotherm/data/author_data/summaries/
arxiv_dir: /Users/daniellerothermel/drotherm/data/arxiv/
author_info_file: /Users/daniellerothermel/drotherm/data/author_data/manual_profiles.json
arxiv_metadata_file: /Users/daniellerothermel/drotherm/data/arxiv/arxiv-metadata-oai-2024-10-16.json
prof_pattern: (?P<professor_name>[\w_]+)
file_type_pattern: (?P<file_type>\w+)
version_pattern: v(?P<version>\d+)
author_summary_file_pattern: (?P<professor_name>[\w_]+)\.(?P<file_type>\w+)\.v(?P<version>\d+)


:: Help for dr_util.file_utils ::
 --------------------------------- 

 For pathlib helpers: fu.pathlib_help()

 Main Functions
  - load_file(

## Pandas Processing

### Utils

In [6]:
from typing import Dict, Any, Generator, Tuple, Optional
import json

In [7]:
def create_dataframe(generator: Generator, keep_cats=None, max_papers=None) -> pd.DataFrame:
    """Collect paper-metadata rows into a DataFrame, optionally filtered by category.

    Args:
        generator: Iterable yielding one dict per paper. When ``keep_cats`` is
            given, each row must have a 'categories' entry holding either a
            list of category codes or one whitespace-separated string of codes.
        keep_cats: Optional collection of category codes. Only rows sharing at
            least one code with it (compared case-insensitively) are kept.
            ``None`` keeps every row.
        max_papers: Optional cap on the number of kept rows. ``None`` means no cap.

    Returns:
        A DataFrame with one row per kept paper. The 'versions', 'authors_parsed',
        'report-no', 'license' and 'submitter' fields are dropped, and
        'versions_dates' is replaced by a 'date' column holding its first entry.
    """
    cols_to_drop = {'versions', 'authors_parsed', 'report-no', 'license', 'submitter'}

    # None means "no category filter"; an empty collection still filters everything out.
    lower_cats = None if keep_cats is None else {cat.lower() for cat in keep_cats}

    rows_to_cols = defaultdict(list)
    num_kept = 0  # explicit counter: indexing the defaultdict to count would insert a bogus column

    for row in generator:
        if max_papers is not None and num_kept >= max_papers:
            break

        if lower_cats is not None:
            row_cats = row['categories']
            if isinstance(row_cats, str):
                # Raw arXiv dumps store categories as one space-separated string.
                row_cats = row_cats.split()
            if lower_cats.isdisjoint(c.lower() for c in row_cats):
                continue

        for k, v in row.items():
            if k in cols_to_drop:
                continue

            if k == 'versions_dates':
                # Keep only the first listed version date.
                rows_to_cols['date'].append(v[0])
            else:
                rows_to_cols[k].append(v)
        num_kept += 1

    return pd.DataFrame.from_dict(rows_to_cols)


In [8]:
def get_dataset_generator(path: str) -> Generator:
    """Lazily yield one parsed JSON object per line of a JSON Lines file.

    Args:
        path: Path to a file containing one JSON document per line.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as fp:
        for line in fp:
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            if not line.strip():
                continue
            yield json.loads(line)

In [20]:
# Initially from: https://www.kaggle.com/code/vbookshelf/part-1-build-an-arxiv-rag-search-system-w-faiss
def clean_text(x):
    """Put a text on a single line.

    Every newline character is replaced by one space, then leading and
    trailing whitespace is stripped. Runs of spaces inside the text are kept.
    """
    single_line = " ".join(x.split("\n"))
    return single_line.strip()

### Run

In [10]:
# Lower-cased category codes to keep (the keys of cs_field_map).
kc = {cat.lower() for cat in cs_field_map}

# Stream the arXiv metadata file and keep only papers in one of those categories.
dataset_generator = get_dataset_generator(path=cfg.arxiv_metadata_file)
df = create_dataframe(dataset_generator, keep_cats=kc)
print(f">> Initial num pages after load: {df.shape[0]}")

# Parse the date column, then require a value in every core column.
df["date"] = pd.to_datetime(df["date"])
required_cols = ['title', 'authors', 'date', 'abstract', 'categories']
df = df.dropna(subset=required_cols)
print(f">>   after dropping nulls: {df.shape[0]}")

>> Initial num pages after load: 82892
>>   after dropping nulls: 82892


In [None]:
#df = df[df['comments'].str.contains(r'(?i)page', regex=True)]
#print(f">>   after dropping papers without page counts: {df.shape[0]}")
#df["url"] = "https://arxiv.org/abs/" + df["id"]
#df["pdf_url"] = "https://arxiv.org/pdf/" + df["id"]

In [24]:
df['abstract'] = df['abstract'].apply(clean_text)

In [11]:
# Write the filtered frame as JSON Lines next to the source metadata file; the
# output path is the input path with ".json" replaced by "_cs_cleaned.jsonl".
# NOTE(review): the execution counts show this export ran (In [11]) before the
# abstract-cleaning cell above (In [24]); re-run it after cleaning if the saved
# file is meant to contain the cleaned abstracts.
df.to_json(cfg.arxiv_metadata_file.replace(".json", "_cs_cleaned.jsonl"), orient='records', lines=True)

In [12]:
df['date'].max(), df['date'].min()

(Timestamp('2018-07-18 05:34:49'), Timestamp('2007-03-31 23:52:33'))

In [16]:
df['abstract'][0]

'  In this paper, we introduce the on-line Viterbi algorithm for decoding hidden\nMarkov models (HMMs) in much smaller than linear space. Our analysis on\ntwo-state HMMs suggests that the expected maximum memory used to decode\nsequence of length $n$ with $m$-state HMM can be as low as $\\Theta(m\\log n)$,\nwithout a significant slow-down compared to the classical Viterbi algorithm.\nClassical Viterbi algorithm requires $O(mn)$ space, which is impractical for\nanalysis of long DNA sequences (such as complete human genome chromosomes) and\nfor continuous data streams. We also experimentally demonstrate the performance\nof the on-line Viterbi algorithm on a simple HMM for gene finding on both\nsimulated and real DNA sequences.\n'

In [18]:
print(df['abstract'][0])

  In this paper, we introduce the on-line Viterbi algorithm for decoding hidden
Markov models (HMMs) in much smaller than linear space. Our analysis on
two-state HMMs suggests that the expected maximum memory used to decode
sequence of length $n$ with $m$-state HMM can be as low as $\Theta(m\log n)$,
without a significant slow-down compared to the classical Viterbi algorithm.
Classical Viterbi algorithm requires $O(mn)$ space, which is impractical for
analysis of long DNA sequences (such as complete human genome chromosomes) and
for continuous data streams. We also experimentally demonstrate the performance
of the on-line Viterbi algorithm on a simple HMM for gene finding on both
simulated and real DNA sequences.



In [23]:
df['authors']

0        Rastislav \v{S}r\'amek, Bro\v{n}a Brejov\'a, T...
1                                                I. Grabec
2                                             Sergey Gubin
3                    Ketan D. Mulmuley Hariharan Narayanan
4                                        Ketan D. Mulmuley
                               ...                        
82887                         Wenfei Du and Rob Tibshirani
82888    Ranju Mandal and Partha Pratim Roy and Umapada...
82889                 Leying Guan, Xi Chen, Wing Hung Wong
82890                                       Amirsina Torfi
82891    Yuan Liu, Yuancheng Wang, Nan Li, Xu Cheng, Yi...
Name: authors, Length: 82892, dtype: object

In [21]:
clean_text(df['abstract'][0])

'In this paper, we introduce the on-line Viterbi algorithm for decoding hidden Markov models (HMMs) in much smaller than linear space. Our analysis on two-state HMMs suggests that the expected maximum memory used to decode sequence of length $n$ with $m$-state HMM can be as low as $\\Theta(m\\log n)$, without a significant slow-down compared to the classical Viterbi algorithm. Classical Viterbi algorithm requires $O(mn)$ space, which is impractical for analysis of long DNA sequences (such as complete human genome chromosomes) and for continuous data streams. We also experimentally demonstrate the performance of the on-line Viterbi algorithm on a simple HMM for gene finding on both simulated and real DNA sequences.'

In [22]:
print(clean_text(df['abstract'][0]))

In this paper, we introduce the on-line Viterbi algorithm for decoding hidden Markov models (HMMs) in much smaller than linear space. Our analysis on two-state HMMs suggests that the expected maximum memory used to decode sequence of length $n$ with $m$-state HMM can be as low as $\Theta(m\log n)$, without a significant slow-down compared to the classical Viterbi algorithm. Classical Viterbi algorithm requires $O(mn)$ space, which is impractical for analysis of long DNA sequences (such as complete human genome chromosomes) and for continuous data streams. We also experimentally demonstrate the performance of the on-line Viterbi algorithm on a simple HMM for gene finding on both simulated and real DNA sequences.


# Experimental

## Processing abstracts with Spacy

In [None]:
# Import NLP libraries and the spaCy package to preprocess the abstract text
import spacy
from spacy.lang.en.stop_words import STOP_WORDS #import commen list of stopword
import en_core_web_sm  # import downlaoded model

In [None]:
# Load the small English spaCy pipeline used for tokenising and lemmatising.
parser = en_core_web_sm.load()
# Set the parser's max_length (presumably its per-text size limit -- confirm against the spaCy docs).
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    """Normalise a text into a space-joined string of lower-cased lemmas.

    Each token is replaced by its stripped, lower-cased lemma (or by the
    lower-cased token text when the lemma is "-PRON-"). Tokens found in the
    module-level `stopwords` or `punctuations` are dropped; both names must be
    defined before this function is called.
    """
    kept = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            normalised = token.lower_
        else:
            normalised = token.lemma_.lower().strip()
        # Skip stop words and anything matched by the punctuation check.
        if normalised in stopwords or normalised in punctuations:
            continue
        kept.append(normalised)
    return " ".join(kept)

In [None]:
import string

# Globals read by spacy_tokenizer; this cell must run before that function is called.
# string.punctuation is a str, so the `in` test in spacy_tokenizer is a substring
# check (it also matches the empty string), not list membership.
punctuations = string.punctuation
# Stop words as a list so a sample can be sliced for display below.
stopwords = list(STOP_WORDS)
stopwords[:10]

In [None]:
df["processed_text"] = df["abstract"].apply(spacy_tokenizer)

## Analysis

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [None]:
df['abstract'].describe(include='all')
# If any were not unique
# df.drop_duplicates(['abstract',], inplace=True)

In [None]:
# Approximate each abstract's length by splitting on whitespace. That is good
# enough for a rough distribution; a spaCy or WordPiece tokenizer would give
# different counts.
df["abstract_len"] = df["abstract"].apply(lambda text: len(text.strip().split()))

# Histogram of raw paper counts per length bin, drawn on an explicit Axes so the
# labels cannot land on a stale figure.
fig, ax = plt.subplots()
df["abstract_len"].hist(ax=ax)
ax.set_ylabel("# Papers")
ax.set_title("Abstract size distribution")
ax.set_xlabel("Abstract Number Words")
plt.show()

## Polars to Process

In [None]:
# NOTE(review): `pl` is not defined in the notebook as shown -- the
# `import polars as pl` line in the first cell is commented out. Re-enable it
# before running this section.
pl.Config(fmt_str_lengths=150);

In [None]:
# Per-column Polars dtypes passed to pl.read_ndjson in the next cell: plain
# string columns, plus list-of-string columns for the multi-valued fields.
dtypes = {
    'id': pl.Utf8,
    'submitter': pl.Utf8,
    'authors': pl.Utf8,
    'title': pl.Utf8,
    'comments': pl.Utf8,
    'journal-ref': pl.Utf8,
    'doi': pl.Utf8,
    'abstract': pl.Utf8,
    'report-no': pl.Utf8,
    'categories': pl.List(pl.Utf8),
    'versions': pl.List(pl.Utf8),
    'versions_dates': pl.List(pl.Utf8),
}

In [None]:
# Read the arXiv metadata file as newline-delimited JSON using the explicit dtypes above.
# NOTE(review): md_df2 is not referenced by any later cell shown here; the cells
# below work on md_df, which is read without dtypes.
md_df2 = pl.read_ndjson(
    cfg.arxiv_metadata_file,
    dtypes=dtypes,
)

In [None]:
md_df = pl.read_ndjson(cfg.arxiv_metadata_file)
md_df.head(1)

In [None]:
cats_df = pl.Series(list(cs_field_map.keys()))
cats_df.head(3)

In [None]:
md_df = md_df.with_columns(pl.col("categories").cast(pl.List(pl.Utf8)))

In [None]:
md_df.head(2)

In [None]:
# Intended to keep rows whose `categories` list contains at least one code from cats_df.
# NOTE(review): the trailing `.any()` is applied outside `eval`; verify that it
# reduces per row rather than over the whole column, and that the `.arr`
# namespace is the right one for a list column in the installed Polars version.
cs_df = md_df.filter(
    pl.col("categories").arr.eval(pl.element().is_in(cats_df)).any()
)
cs_df.shape

In [None]:
# Alternative filter attempt; it overwrites md_df, so this cell is not safe to re-run.
# NOTE(review): `categories` was cast to a list-of-strings column in an earlier
# cell, so `is_in` against a flat list of codes may not test per-element
# membership -- confirm before relying on the resulting shape.
md_df = md_df.filter(
    pl.col('categories').is_in(list(cs_field_map.keys()))
)
md_df.shape

In [None]:

    & pl.col('comments').str.contains("(?i)page")