In [67]:
import logging
# Raise the "spacy" logger's threshold to ERROR so lower-severity records are not emitted.
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import dask.bag as db
from dask.bag import Item
import json
import pandas as pd

# Location of the arXiv metadata snapshot on the local machine.
ARXIV_SNAPSHOT_PATH = '/Users/issa/code/clairefiltz/litreview/raw_data/arxiv-metadata-oai-snapshot.json'

# Read the file as a dask bag of text lines and parse every line with json.loads.
docs = db.read_text(ARXIV_SNAPSHOT_PATH).map(json.loads)

In [68]:
# Prototype on a subset of the data so it is easier to handle.
# This procedure was recommended in the arXiv dataset itself.

def get_latest_version(x):
    """Return the 'created' value of the last entry in x['versions']."""
    return x['versions'][-1]['created']


def trim(x):
    """Return a dict holding only the metadata fields needed downstream.

    'categories' is split on single spaces and stored under 'category';
    the remaining fields are copied unchanged.
    """
    record = {key: x[key] for key in ('id', 'authors', 'title', 'doi')}
    record['category'] = x['categories'].split(' ')
    record['abstract'] = x['abstract']
    return record
# Keep only papers whose latest version was created in 2019 or later
# (the year is the 4th space-separated token of the 'created' string).
columns = ['id','category','abstract']


def _created_after_2018(x):
    """True when the year of the record's latest version is greater than 2018."""
    return int(get_latest_version(x).split(' ')[3]) > 2018


recent_records = docs.filter(_created_after_2018).map(trim).compute()

# convert to pandas
docs_df = pd.DataFrame(recent_records)

In [69]:
# (rows, columns) of the filtered metadata frame
docs_df.shape

(541338, 6)

In [70]:
# Mean of len(abstract) over all rows
docs_df['abstract'].apply(len).mean()


1046.829548267441

In [71]:
# Smallest len(abstract) over all rows
docs_df['abstract'].apply(len).min()

6

In [72]:
# Minimum abstract length (len of the string) for a paper to be kept.
# 1046 is the rounded-down mean abstract length shown by an earlier cell.
MIN_ABSTRACT_LEN = 1046

# Keep rows with a boolean mask rather than dropping by index label:
# the mask is positional, so it stays correct even if index labels repeat,
# and re-running the cell leaves the frame unchanged.
docs_df = docs_df[docs_df['abstract'].str.len() >= MIN_ABSTRACT_LEN]

In [73]:
# (rows, columns) after dropping abstracts shorter than 1046
docs_df.shape

(265819, 6)

In [74]:
# Smallest len(abstract) remaining after the length filter
docs_df['abstract'].apply(len).min()

1046

In [9]:
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

Collecting nltk
  Using cached nltk-3.6.5-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Using cached tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
Collecting regex>=2021.8.3
  Downloading regex-2021.11.10-cp38-cp38-macosx_10_9_x86_64.whl (288 kB)
     |████████████████████████████████| 288 kB 3.3 MB/s            
[?25hCollecting click
  Downloading click-8.0.3-py3-none-any.whl (97 kB)
     |████████████████████████████████| 97 kB 13.3 MB/s            
[?25hInstalling collected packages: tqdm, regex, click, nltk
Successfully installed click-8.0.3 nltk-3.6.5 regex-2021.11.10 tqdm-4.62.3


[nltk_data] Downloading package stopwords to /Users/issa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/issa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/issa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [85]:
# Sample a smaller subset of 1,000 papers.
# A fixed random_state makes the sample (and every output derived from it)
# reproducible across kernel restarts.
docs_1k = docs_df.sample(n=1000, random_state=42)

In [78]:
import string
from nltk.corpus import stopwords 
from nltk import word_tokenize

# Maps every character of string.punctuation to a single space.
_PUNCTUATION_TO_SPACE = str.maketrans({mark: ' ' for mark in string.punctuation})


def remove_punctuation(text):
    """Return text with each character of string.punctuation replaced by a space."""
    return text.translate(_PUNCTUATION_TO_SPACE)

def lowercase(text):
    """Return a lower-cased copy of text."""
    return text.lower()

def remove_numbers(text):
    """Return text without the characters for which str.isdigit() is true."""
    kept = []
    for char in text:
        if char.isdigit():
            continue
        kept.append(char)
    return ''.join(kept)


# English stop words, held in a set for membership tests.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Tokenize text with word_tokenize and return the tokens not in stop_words.

    The result is a list of tokens, not a string.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens


In [86]:
def _clean_abstract(raw_abstract):
    """Apply the cleaning helpers in order: punctuation, lowercase, digits, stop words."""
    no_punctuation = remove_punctuation(raw_abstract)
    lowered = lowercase(no_punctuation)
    no_digits = remove_numbers(lowered)
    return remove_stopwords(no_digits)


# One pass over the abstracts; each value becomes the list returned by remove_stopwords.
docs_1k['clean_abstract'] = docs_1k['abstract'].apply(_clean_abstract)
docs_1k.head(5)

Unnamed: 0,id,authors,title,doi,category,abstract,clean_abstract
178481,1911.02749,Tong Zhang and Fatih Porikli,Sparse Coding on Cascaded Residuals,,"[cs.CV, cs.LG, eess.IV]",This paper seeks to combine dictionary learn...,"[paper, seeks, combine, dictionary, learning, ..."
331163,2009.12524,Zanyar Zohourianshahzadi (UCCS) and Jugal Kuma...,Neural Twins Talk,10.1109/HCCAI49649.2020.00009,[cs.CV],Inspired by how the human brain employs more...,"[inspired, human, brain, employs, neural, path..."
188514,1911.12784,"Xueying Zhang, Wenlong Cai, Mengxing Wang, Kai...",Spin-torque memristors based on perpendicular ...,,[physics.app-ph],Spin-torque memristors were proposed in 2009...,"[spin, torque, memristors, proposed, could, pr..."
281705,2006.10055,"Oliver H.E. Philcox, Elena Massara, and David ...",What does the Marked Power Spectrum Measure? I...,10.1103/PhysRevD.102.043516,"[astro-ph.CO, astro-ph.GA, astro-ph.IM, hep-ph...",The marked power spectrum is capable of plac...,"[marked, power, spectrum, capable, placing, fa..."
66392,1902.08145,"Remco Duits, Etienne St-Onge, Jim Portegies, B...",Total Variation and Mean Curvature PDEs on $\m...,,"[math.AP, math.DG]",Total variation regularization and total var...,"[total, variation, regularization, total, vari..."


In [88]:
# Join each token list into one space-separated string.
# Values that are already strings are left alone, so re-running this cell
# does not split the text into space-separated characters.
docs_1k['clean_abstract'] = docs_1k['clean_abstract'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)
docs_1k.head(5)

Unnamed: 0,id,authors,title,doi,category,abstract,clean_abstract
178481,1911.02749,Tong Zhang and Fatih Porikli,Sparse Coding on Cascaded Residuals,,"[cs.CV, cs.LG, eess.IV]",This paper seeks to combine dictionary learn...,paper seeks combine dictionary learning hierar...
331163,2009.12524,Zanyar Zohourianshahzadi (UCCS) and Jugal Kuma...,Neural Twins Talk,10.1109/HCCAI49649.2020.00009,[cs.CV],Inspired by how the human brain employs more...,inspired human brain employs neural pathways i...
188514,1911.12784,"Xueying Zhang, Wenlong Cai, Mengxing Wang, Kai...",Spin-torque memristors based on perpendicular ...,,[physics.app-ph],Spin-torque memristors were proposed in 2009...,spin torque memristors proposed could provide ...
281705,2006.10055,"Oliver H.E. Philcox, Elena Massara, and David ...",What does the Marked Power Spectrum Measure? I...,10.1103/PhysRevD.102.043516,"[astro-ph.CO, astro-ph.GA, astro-ph.IM, hep-ph...",The marked power spectrum is capable of plac...,marked power spectrum capable placing far tigh...
66392,1902.08145,"Remco Duits, Etienne St-Onge, Jim Portegies, B...",Total Variation and Mean Curvature PDEs on $\m...,,"[math.AP, math.DG]",Total variation regularization and total var...,total variation regularization total variation...


In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the cleaned abstracts.
vec = TfidfVectorizer()
text = docs_1k['clean_abstract']

# Convert the fit_transform result to an array with toarray().
X = vec.fit_transform(text).toarray()
X.shape

(1000, 13278)

In [90]:
# Label each TF-IDF column with the feature name reported by the vectorizer.
feature_names = vec.get_feature_names_out()
# NOTE(review): X_df gets a default 0..n-1 index, not docs_1k's index — confirm
# before joining or aligning the two frames.
X_df = pd.DataFrame(X, columns=feature_names)
X_df.head(5)

Unnamed: 0,aa,aavso,ab,abaqus,abbott,abbreviated,abcd,abelian,aberration,abilities,...,zobov,zone,zones,zou,zpl,zpp,zr,zsl,zwick,zwicky
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
