## In this work we removed old indexes (before 2007-04) because of very non understandable format specifications.

In [1]:
import re
import sys
import json
import subprocess
import traceback
import xml.etree.ElementTree as ET

import csv
from collections import defaultdict

import json
import ast

from functools import reduce

In [2]:
manifest_file = 'tools/arXiv_pdf_manifest_newindex.xml'
meta_snap_file = 'tools/arxiv-metadata-oai-snapshot.json'
meta_cat_file = 'tools/arxiv-metadata-ext-category.csv'
meta_tax_file = 'tools/arxiv-metadata-ext-taxonomy.csv'

in_categories = 'categories.txt'

year_choice = '2020'
month_choice = '09'
seq_nums = range(1, 30)

out_dir = 'data'
mode = 'pdf'
log_file_path = 'logs/log.txt'

In [3]:
if mode != 'pdf' and mode != 'src':
    raise Exception('mode should be "pdf" or "src"')

# s3cmd
First thing is to set s3cmd environment with your personal configuration keys. You will need to set up a AWS account, and set the payment card in order to pay for the arXiv download. In fact, the arXiv buckets are configurated as "requester-pay", so you'll be charged for each download. Not be intimidated: you'll pay around 0,02€ for each GB you download, but remember that the whole arXive is around 2TB that costs you a bit less than 50€.

You need to set s3cmd with the following command:

$ s3cmd --configure

and enter ACCESS_KEY and SECRET_KEY. After this, you are right to go.


In [4]:
def get_file(fname, out_dir):
    cmd = ['s3cmd', 'get', '--requester-pays',
           's3://arxiv/%s' % fname, './%s' % out_dir]
    print(' '.join(cmd))
    # subprocess.call(' '.join(cmd), shell=True)    

# Utils

We need some utils function, so we create, first of all, the functions we need in order to manage the data and save the useful files for further uses.

In [5]:
def categories_to_ordered_list(path_to_file):
  """""
  Arguments:
    - path_to_file: path to file 'arxiv-metadata-ext-taxonomy.csv' composed by a list of all categories that forms the arXiv taxonomy.

  Output:
    - meta_ls: list of all categories and description, grouped by archive without intra group divisions (ex. just 'cs' and not 'cs.AI', etc)
    - unordered: list of all categories in the order are proveded in 'arxiv-metadata-ext-taxonomy.csv' file.
  """""

  meta_ls = []
  unordered = []
  with open(path_to_file, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    header = next(csvreader)
    for row in csvreader:
      meta_ls.append([row[4].split('.')[0] if '.' in row[4] else row[4], row[1]])
      unordered.append(row[4])
  meta_ls = [list(item) for item in set(tuple(row) for row in meta_ls)]
  unordered = list(set(unordered))
  return sorted(meta_ls), unordered

In [6]:
def input_snapshot_to_json(path_to_file, meta_tax_unordered):
  """""
  Arguments:
    - path_to_file: path to file 'arxiv-metadata-oai-snapshot.json' composed by a list of all papers with id, title, authors, abstract and others properties.

  Output:
    - df_data_id: dictionary with "id" key and dictionary values composed by "id, nid (modified id to represent old and new id in one single representation), and categories"
    - df_data_nid: dictionary with "nid" key and dictionary values composed by "id, nid (modified id to represent old and new id in one single representation), and categories"
    - ls_data: list of ordered "nid" values in order to obtain a sub list in between two valued (necessary to comprehend which pdf are in a .tar file, and of which category).
  """""
  df_data_id = {}
  df_data_nid = {}
  ls_data = []
  last = 0
  with open(path_to_file) as json_file:
    for j,line in enumerate(json_file):
      single_dict = {}
      # 1382795
      # 1796911
      if j > 1382793:
        break

      if int(j/1382795*100) != last:
        last = int(j/1382795*100)
        print(' ['+ str(last) +'%] '+str(j)+'/1382795')
      row = json.loads(line)
      single_dict['id'] = row['id']
      single_dict['nid'] = '20'+row['id'] if '.' in row['id'] else ('20'+str(row['id'][-7:-3]) if str(row['id'][-7:-3]) < "50" else '19'+str(row['id'][-7:-3]))+'.'+get_number(meta_tax_unordered, row['id'])+str(row['id'][-3:])
      single_dict['categories'] = row['categories']
      df_data_id[single_dict['id']] = single_dict
      df_data_nid[single_dict['nid']] = single_dict
      ls_data.append(single_dict['nid'])
  return df_data_id, df_data_nid, sorted(ls_data)

Once that we have saved our useful files, we could start the script using the already created files instead of calsulate everithing from scratch, again. So we'll try to load the saved files, if something goes wrong, we will proceed creating those again, but if everithing goes right, we will proceed stright forward with the loaded files.

In [7]:
def read_dict_from_csv(path_to_file):
  meta_df = {}
  with open(path_to_file, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # header = next(csvreader)
    for row in csvreader:
      first, second = row[0], row[1]
      
      meta_df[first] = second
      
  return meta_df

def read_list_from_txt(path_to_file):
  meta_ls = []
  with open(path_to_file, 'r') as txtfile:
    for row in txtfile:
      meta_ls.append(row.strip())
  return meta_ls

Then we need the funtions to save our dictionaries and lists to file.

In [8]:
def save_dict_to_csv(file_dict, filt_to_save):
  w = csv.writer(open(filt_to_save, "w"))
  for key, val in file_dict.items():
        w.writerow([key, val])

def save_list_to_txt(file_list, filt_to_save, to_set=True, sort=True):
  unique_list = file_list
  if to_set: 
    unique_list = set(unique_list)
  if sort:
    unique_list = sorted(unique_list)
  with open(filt_to_save, 'w') as f:
    for item in unique_list:
        f.write("%s\n" % item)

In [9]:
log_file = open(log_file_path, 'a')

In [10]:
  try:
    print("Check if files exist")
    meta_snap_dict_id = read_dict_from_csv('backups/metadata_snap_dict_id.csv')
    meta_snap_dict_nid = read_dict_from_csv('backups/metadata_snap_dict_nid.csv')
    meta_cat_list = read_list_from_txt('backups/metadata_cat_list_nid.txt')
    meta_tax_list = read_list_from_txt('backups/selected_categories.txt')
    print("Files exist, load them correctly")
    
  except IOError:
    print("Files don't exist")
    print("Re make those")
    meta_tax_list, meta_tax_unordered = categories_to_ordered_list(meta_tax_file)
    meta_snap_dict_id, meta_snap_dict_nid, meta_cat_list = input_snapshot_to_json(meta_snap_file, meta_tax_unordered)

    print("Save files for future")
    save_dict_to_csv(meta_snap_dict_id, 'backups/metadata_snap_dict_id.csv')
    save_dict_to_csv(meta_snap_dict_nid, 'backups/metadata_snap_dict_nid.csv')
    save_list_to_txt(meta_cat_list, 'backups/metadata_cat_list_nid.txt')
    save_list_to_txt(meta_tax_list, 'backups/selected_categories.txt', False, False)


Check if files exist
Files exist, load them correctly


In [11]:
  
def input_categories_to_list(path_to_file):
  """""
  Arguments:
    - path_to_file: path to file 'categories.txt' composed by a list of all categories we are interested in. It can contain commented lines ("#") and empty lines ("\n") that will be avoided.

  Output:
    - in_ls: list of all categories we are interested in.
  """""
  in_ls = []
  with open(path_to_file, 'r') as txtfile:
    for row in txtfile:
      line = txtfile.readline()
      if line[0] != '#' and line != '\n':
        in_ls.append(line.strip())

  return sorted(in_ls)

In [12]:
def read_dict_list_from_csv(path_to_file):
  """""
  Arguments:
    - path_to_file: path to file 'arxiv-metadata-ext-category.csv' composed by a list of papers and its categories.

  Output:
    - meta_df: list of all categories for each paper.
  """""
  meta_df = {}
  with open(path_to_file, 'r') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
    # header = next(csvreader)
    for row in csvreader:
      first, second = row[0], row[1]
      if first in meta_df:
          meta_df[first].append(second)
      else:
          meta_df[first] = [second] 
      
  return meta_df

In [13]:
in_cat_list = input_categories_to_list(in_categories)
meta_tax_list = read_dict_list_from_csv('tools/arxiv-metadata-ext-category.csv')


In [14]:
def tar_categories_list(fitem, litem, meta_snap_dict_id, meta_snap_dict_nid, meta_cat_list):
  """""
  Arguments:
    - fitem: first item id in tar file
    - litem: last item id in tar file
    - meta_snap_dict_id: dictionary with keys "id" and values "dictionary with id, nid, and categories" for each paper
    - meta_snap_dict_nid: dictionary with keys "nid" and values "dictionary with id, nid, and categories" for each paper
    - meta_cat_list: list loaded from 'metadata_cat_list_nid.txt' composed by all nid index ordered by date

  Output:
    - step_seven [None]: list of all categories classes that occur at least once in the arxiv tar papers. It is None if first index is bigger than last index. It used to occure whith old format index so we removed old indexes (before 2007-04)
  """""
  fid = fitem if '.' in fitem else str(fitem[:-7])+'/'+str(fitem[-7:])
  lid = litem if '.' in litem else str(litem[:-7])+'/'+str(litem[-7:])
  
  fnid = ast.literal_eval(meta_snap_dict_id[fid])
  lnid = ast.literal_eval(meta_snap_dict_id[lid])

  findex = meta_cat_list.index(fnid['nid'])
  lindex = meta_cat_list.index(lnid['nid'])

  step_one = [meta_snap_dict_nid[meta_cat] for meta_cat in meta_cat_list[findex:lindex+1]]
  
  step_two = [ast.literal_eval(meta_snap)['categories'] for meta_snap in step_one]
  
  step_three = [ele.split(' ') for ele in step_two]
  
  step_four = [[el.split('.')[0] if '.' in el else el for el in ele ] for ele in step_three]
  
  step_five = reduce(lambda x,y: x+y, step_four)
  
  step_six = set(step_five)

  step_seven = list(step_six)

  return step_seven if findex < lindex else None


In [22]:
def intersection(lst1, lst2): 
  lst3 = [value for value in lst1 if value in lst2] 
  return lst3 

In [23]:
def paper_to_download(cat_list, in_cat_list):
  inter = intersection(cat_list, in_cat_list)
  return False if not inter else True, inter

In [24]:
total_size = 0

try:
  for event, elem in ET.iterparse(manifest_file):
    
    tag = elem.tag
    value = elem.text

    if event == 'end':
          
      if tag != 'file':
        if value == None:
          raise Exception('None occured in start when tag != file at: ', fname, fitem, litem, tag)

        elif tag == 'filename' :
          fname = value 
        elif tag == 'first_item' :
          fitem = value 
        elif tag == 'last_item' :
          litem = value 
        elif tag == 'size' :
          size = int(value) 

      elif tag == 'file':

        cat_list = tar_categories_list(fitem, litem, meta_snap_dict_id, meta_snap_dict_nid, meta_cat_list)
        
        if cat_list == None:
          raise Exception('None occured in categories list: first index > last index. ** If you are working also with old index formatted papers (before 2007-04) have a look to the Introduction at the beginnig. **')

        
        choice, intersection = paper_to_download(cat_list, in_cat_list)

        total_size += size

        # get_file(fname, out_dir='%s/%s/' % (out_dir, mode))
        
        log_file.write(str(fname) + '\t' + str(choice) + '\t' + str(intersection) + '\t' + str(size / 1073741824)+' GB' + '\n')

        print(str(total_size / 1073741824)+' GB')
    
    elem.clear()
except:
  traceback.print_exc()

print('Finished')


0.5260372646152973 GB
Finished
Traceback (most recent call last):
  File "<ipython-input-24-491a2e005694>", line 32, in <module>
    choice, intersection = paper_to_download(cat_list, in_cat_list)
  File "<ipython-input-23-9100a4ea0e3d>", line 2, in paper_to_download
    inter = intersection(cat_list, in_cat_list)
TypeError: 'list' object is not callable
