# Splitting out data for garrettgoon.com/arxiv-vixra-quiz

In [1]:
# numpy: linear algebra.
import numpy as np

# pandas: tabular data.
import pandas as pd
pd.set_option('float_format', '{:f}'.format)

# pytorch: ML.
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# matplotlib and seaborn: plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
%matplotlib inline

In [2]:
# Give the notebook access to the rest of your google drive files.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Enter the relevant foldername
FOLDERNAME = '/content/drive/My Drive/ML/arxiv_vixra/'
# NOTE(review): FOLDERNAME is a hard-coded string, so this check cannot fail as
# written; it only guards against someone replacing the value with None.
assert FOLDERNAME is not None, "[!] Enter the foldername."
# For importing .py modules stored in FOLDERNAME or a subdirectory:
import sys
sys.path.append(FOLDERNAME)

Mounted at /content/drive


Copy the data to the local directory

In [3]:
# Quick check if data is already mounted.
!ls

drive  sample_data


In [4]:
# Feather file to load and its full path under FOLDERNAME on the mounted Drive.
data_file = 'balanced_filtered_data_train.feather'
data_path = FOLDERNAME + 'data/data_splits/' + data_file
# Copy the file into the current working directory. The single quotes keep the
# space in 'My Drive' from splitting the path into two shell arguments.
!cp '{data_path}' .
# Read the local copy (not the Drive original) and preview the first rows.
balanced_filtered_data = pd.read_feather(data_file)
balanced_filtered_data.head()

Unnamed: 0,title,abstract,file_name,source,title_char_len,abstract_char_len,title_word_count,abstract_word_count,abstract_avg_word_len,title_avg_word_len,abstract_numerical_fraction,abstract_word_len_var,abstract_common_char_frac,title_common_char_frac
0,REITS: Reflective Surface for Intelligent Tran...,Autonomous vehicles are predicted to dominat...,arxiv.2010.13986,arxiv,58,938,7,154,6.090909,8.285714,0.003198,11.043684,0.986289,1.0
1,Mirror Images and Division by Zero Calculus,Very classical results on the mirror images of...,vixra.2009.0051,vixra,37,198,7,41,4.829268,5.285714,0.0,8.044021,1.0,1.0
2,Electromagnetic Waves in an Expanding 5D Universe,Electromagnetism is analyzed in a 5D expanding...,vixra.1301.0086,vixra,43,491,7,89,5.516854,6.142857,0.004073,13.71039,1.0,1.0
3,Atomic-Scale Erector,To design buildings that can withstand the lar...,vixra.1907.0094,vixra,19,453,2,89,5.089888,9.5,0.013245,9.52001,0.990792,1.0
4,Temperature dependence of microwave and THz di...,"The microwave, near-millimetre and infrared ...",arxiv.cond-mat.0401425,arxiv,78,468,11,77,6.077922,7.090909,0.042735,11.94198,0.985375,0.988889


# JSON Exporting for Website

Take an equal number of samples from each source, map the `source` column onto 0/1 (arxiv → 1, vixra → 0), and append a `url` column.

In [11]:
# Number of papers drawn from each source; the exported set has 2 * num_samples rows.
num_samples = 5000
# Fixed seed so that Restart & Run All reproduces the same exported sample.
sample_seed = 42

is_arxiv = balanced_filtered_data['source'] == 'arxiv'
is_vixra = balanced_filtered_data['source'] == 'vixra'
website_arxiv_data = balanced_filtered_data[is_arxiv].sample(num_samples, random_state=sample_seed)
website_vixra_data = balanced_filtered_data[is_vixra].sample(num_samples, random_state=sample_seed)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the two samples the same way and keeps the original row labels.
website_data = pd.concat([website_arxiv_data, website_vixra_data])
# Shuffle so the two sources are interleaved rather than stacked.
website_data = website_data.sample(frac=1, random_state=sample_seed)
# Keep only the columns used below; .copy() makes the column assignment that
# follows act on an independent frame instead of a slice of the shuffled one.
website_data = website_data[['source', 'title', 'abstract', 'file_name']].copy()
# Encode the label as an integer: 'arxiv' -> 1, anything else -> 0.
website_data['source'] = (website_data['source'] == 'arxiv').astype(int)
website_data.head()

Unnamed: 0,source,title,abstract,file_name
19262,1,On the Residual Finiteness Growths of Particul...,We give a quantification of residual finiten...,arxiv.1412.6835
8781,1,Bloch Solutions of Periodic Dirac Equations in...,We provide the representation of quasi-perio...,arxiv.1006.3596
12712,1,Deflagration to Detonation,Thermonuclear explosions of Type Ia supernov...,arxiv.astro-ph.9910454
19708,1,Non-linear Nyquist theorem: A conjecture,Thermodynamics of equilibrium states is well...,arxiv.1409.6461
8988,1,Mixed Direct-Iterative Methods for Boundary In...,This paper describes a mixed direct-iterativ...,arxiv.chem-ph.9510002


Convert the file names to the appropriate urls

In [6]:
import re

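# File names are of the form arxiv.1706.03183.txt or arxiv.supr-con.9608008.txt or vixra.0702.0059.txt
# so strip the .txt's and arxiv.'s and vixra.'s, then turn the dot after a category
# name (e.g. supr-con) into a slash while keeping the dot between the digit groups.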

def slash_to_dot(match_obj):
    """Return the text of a regex match with every '/' replaced by '.'.

    Usable as the replacement callback of re.sub: it receives the match
    object and hands back the substitute string.
    """
    matched_text = match_obj.group()
    if matched_text is None:
        return None
    return matched_text.replace('/', '.')

def file_to_url(file_name, source):
    """Build the abstract-page URL for a paper from its data-set file name.

    Parameters
    ----------
    file_name : str
        Name such as 'arxiv.1706.03183.txt', 'arxiv.supr-con.9608008.txt' or
        'vixra.0702.0059.txt'. The trailing '.txt' is optional.
    source : int or bool
        Truthy selects the arXiv base URL, falsy the viXra one.

    Returns
    -------
    str
        'https://arxiv.org/abs/<id>' or 'https://www.vixra.org/abs/<id>'.
    """
    # Strip only the leading source tag and the trailing extension; an
    # unanchored str.replace would also delete these substrings mid-name.
    paper_id = re.sub(r'^(?:arxiv|vixra)\.', '', file_name)
    paper_id = re.sub(r'\.txt$', '', paper_id)
    # A dot with a digit on both sides is part of the identifier itself
    # (1706.03183) and is kept. Any other dot separates a category from the
    # number and becomes a slash (supr-con.9608008 -> supr-con/9608008).
    # Doing this in one lookaround pass avoids the dot -> slash -> dot round
    # trip, whose non-overlapping matches miss adjacent digit groups.
    paper_id = re.sub(r'(?<!\d)\.|\.(?!\d)', '/', paper_id)
    base_url = 'https://arxiv.org/abs/' if source else 'https://www.vixra.org/abs/'
    return base_url + paper_id
    
# Sanity check on one example of each file-name pattern, one URL per line.
example_urls = [
    file_to_url('arxiv.1706.03183.txt', 1),
    file_to_url('arxiv.supr-con.9608008.txt', 1),
    file_to_url('vixra.0702.0059.txt', 0),
]
print('\n'.join(example_urls))

https://arxiv.org/abs/1706.03183
https://arxiv.org/abs/supr-con/9608008
https://www.vixra.org/abs/0702.0059


Create url field

In [7]:
# Build each row's URL from its file name and 0/1 source label in a single pass,
# instead of seeding the column with file names and overwriting it once per
# source with two copy-pasted statements.
website_data['url'] = [
    file_to_url(file_name, source)
    for file_name, source in zip(website_data['file_name'], website_data['source'])
]

In [8]:
# Preview the first rows to compare the new `url` column against `file_name`.
website_data.head()

Unnamed: 0,source,title,abstract,file_name,url
10789,0,Energy-Efficient Quantum Electronics,Scientists have made a breakthrough in the dev...,vixra.2003.0519,https://www.vixra.org/abs/2003.0519
26964,0,Packaged Drinking Water Quality Characteristic...,The inadequacy of protected water supplies in ...,vixra.1405.0111,https://www.vixra.org/abs/1405.0111
18358,1,Heegard-Berger and Cascade Source Coding Probl...,"For the HB problem with the CR constraint, t...",arxiv.1112.1762,https://arxiv.org/abs/1112.1762
8825,0,"Electron Carries ""Hidden"" 31,6 GW Field Energy...","An electron is enveloped by a ""hidden"" electr...",vixra.1808.0179,https://www.vixra.org/abs/1808.0179
21284,0,What Quantum Symmetry Should be,This two page note summarises the quantum grav...,vixra.1908.0481,https://www.vixra.org/abs/1908.0481


In [9]:
# Write the frame to Drive as JSON; orient='index' keys each record by its
# DataFrame index label.
website_json_path = FOLDERNAME + 'data/website_data.json'
website_data.to_json(website_json_path, orient='index')