# Splitting out data for garrettgoon.com/arxiv-vixra-quiz

In [1]:
# numpy: linear algebra.
import numpy as np

# pandas: tabular data.
import pandas as pd
pd.set_option('float_format', '{:f}'.format)

# pytorch: ML.
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# matplotlib and seaborn: plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
%matplotlib inline

In [2]:
# Fixed seed handed to every df.sample call (as random_state) so the draws are reproducible.
pd_random_state = 1

In [3]:
# Give the notebook access to the rest of your google drive files.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# Enter the relevant foldername.
# Later cells build paths by plain string concatenation (FOLDERNAME + 'data/...'),
# so the trailing slash is required.
FOLDERNAME = '/content/drive/My Drive/ML/arxiv_vixra/'
assert FOLDERNAME is not None, "[!] Enter the foldername."
# For importing .py modules stored in FOLDERNAME or a subdirectory:
import sys
sys.path.append(FOLDERNAME)

Mounted at /content/drive


Copy the data file to the local directory and load it

In [4]:
# Quick check: list the current working directory to see what is already available locally.
!ls

drive  sample_data


In [7]:
# Name of the feather file to load, and its full location under FOLDERNAME.
data_file = 'balanced_filtered_data_train.feather'
data_path = FOLDERNAME + 'data/balanced_data_splits/' + data_file
# Copy the file into the current working directory. The single quotes keep the
# space in the path (FOLDERNAME contains 'My Drive') from splitting the argument.
!cp '{data_path}' .
# Read the local copy (relative path), not the original under FOLDERNAME.
balanced_filtered_data = pd.read_feather(data_file)
balanced_filtered_data.head()

Unnamed: 0,title,abstract,file_name,source,title_char_len,abstract_char_len,title_word_count,abstract_word_count,abstract_avg_word_len,title_avg_word_len,abstract_numerical_fraction,abstract_word_len_var,abstract_common_char_frac,title_common_char_frac
0,Energy Mass Equivalence in Celestial Systems,E=mc^2 \r\nwe could take this relation to cele...,vixra.1810.0020,vixra,39,428,6,107,4.0,6.5,0.014019,6.0,0.988868,1.0
1,Controllable optical response and tunable sens...,We study the self interference effect of a r...,arxiv.2010.09319,arxiv,88,873,14,149,5.85906,6.285714,0.0,12.080807,0.986328,0.990291
2,CLTG Emergent and Fission Classifications of M...,Part I of Emergent Theory elaborated many obje...,vixra.1809.0085,vixra,88,449,13,86,5.22093,6.769231,0.0,8.497701,0.988785,1.0
3,Stochastic Three-Composite Convex Minimization,We propose a stochastic optimization method ...,arxiv.1701.09033,arxiv,43,749,4,127,5.897638,10.75,0.0,12.627317,0.986333,1.0
4,A Global SU(5) F-theory model with Wilson line...,We engineer compact SU(5) Grand Unified Theo...,arxiv.1206.6132,arxiv,47,919,9,161,5.708075,5.222222,0.003264,9.486208,0.986137,1.0


# JSON Exporting for Website

Take an equal number of samples, map the `source` column onto 0/1, and append a `url` column.

In [8]:
# Number of papers drawn from each source, so the exported set is evenly split.
website_samples_of_each = 5000

# Draw the same number of rows from each source with the shared seed.
website_arxiv_data = balanced_filtered_data[balanced_filtered_data['source'] == 'arxiv'].sample(
    website_samples_of_each, random_state=pd_random_state)
website_vixra_data = balanced_filtered_data[balanced_filtered_data['source'] == 'vixra'].sample(
    website_samples_of_each, random_state=pd_random_state)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and likewise keeps the original row labels.
website_data = pd.concat([website_arxiv_data, website_vixra_data])

# Shuffle so the two sources are interleaved rather than stacked.
website_data = website_data.sample(frac=1, random_state=pd_random_state)

# Keep only the columns this notebook exports. The explicit .copy() makes this an
# independent frame, so the column assignment below never writes to a slice.
website_data = website_data[['source', 'title', 'abstract', 'file_name']].copy()

# Encode the label: 'arxiv' -> 1, anything else -> 0.
website_data['source'] = website_data['source'].eq('arxiv').astype('int64')
website_data.head()

Unnamed: 0,source,title,abstract,file_name
2469,0,Is Laser Quantum Generator?,In the scientific articles and the publication...,vixra.1401.0004
30978,1,Equivariant cellular homology and its applicat...,In this work we develop a cellular equivaria...,arxiv.math.0112182
28950,1,Mitigating Inter-network Interference in LoRa ...,Long Range (LoRa) is a popular technology us...,arxiv.1611.00688
17519,1,Optomechanical trapping and cooling of partial...,We consider the radiative trapping and cooli...,arxiv.0708.4078
17103,0,Communicative Message as Nuclear Thinking of a...,The paper is positioned in the Fundamentals of...,vixra.1511.0069


Convert the file names to the appropriate urls

In [9]:
import re

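# File names are of the form arxiv.1706.03183.txt or arxiv.supr-con.9608008.txt or vixra.0702.0059.txt
# so strip the .txt's and arxiv.'s and vixra.'s, turn the remaining dots into slashes,
# and then turn any slash sitting between two digits back into a dot (1706/03183 -> 1706.03183).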

def slash_to_dot(match_obj):
    """Replacement callback for re.sub: return the matched text with '/' swapped for '.'.

    Parameters
    ----------
    match_obj : re.Match
        The match handed over by re.sub.

    Returns
    -------
    str or None
        The full matched text with every '/' replaced by '.', or None if the
        match reports no text.
    """
    matched_text = match_obj.group()
    if matched_text is None:
        return None
    return matched_text.replace('/', '.')

def file_to_url(file_name, source):
    """Build the abstract-page url for a paper from its data file name.

    Parameters
    ----------
    file_name : str
        Name such as 'arxiv.1706.03183.txt', 'arxiv.supr-con.9608008.txt' or
        'vixra.0702.0059.txt'. The '.txt' extension is optional.
    source : int or bool
        Truthy for an arXiv paper, falsy for a viXra paper.

    Returns
    -------
    str
        'https://arxiv.org/abs/<id>' or 'https://www.vixra.org/abs/<id>'.
    """
    paper_id = file_name

    # Strip only a *leading* source tag and a *trailing* extension. A blanket
    # str.replace would also eat these substrings in the middle of an identifier.
    for source_tag in ('arxiv.', 'vixra.'):
        if paper_id.startswith(source_tag):
            paper_id = paper_id[len(source_tag):]
            break
    if paper_id.endswith('.txt'):
        paper_id = paper_id[:-len('.txt')]

    # A dot after a subject prefix becomes a path separator (supr-con.9608008 -> supr-con/9608008) ...
    paper_id = paper_id.replace('.', '/')
    # ... but a separator between two digits was part of a numeric ID, so restore it
    # (1706/03183 -> 1706.03183).
    paper_id = re.sub(r"(\d)/(\d)", r"\1.\2", paper_id)

    base_url = 'https://arxiv.org/abs/' if source else 'https://www.vixra.org/abs/'
    return base_url + paper_id
    
# Spot-check the three file-name forms: numeric arXiv ID, arXiv ID with a
# subject prefix, and viXra ID. One url is printed per line.
print(
    *(
        file_to_url(example_name, example_source)
        for example_name, example_source in (
            ('arxiv.1706.03183.txt', 1),
            ('arxiv.supr-con.9608008.txt', 1),
            ('vixra.0702.0059.txt', 0),
        )
    ),
    sep='\n',
)

https://arxiv.org/abs/1706.03183
https://arxiv.org/abs/supr-con/9608008
https://www.vixra.org/abs/0702.0059


Create url field

In [10]:
# Derive each row's url from its file_name; the 0/1 source label selects the
# viXra or arXiv base address inside file_to_url.
website_data['url'] = [
    file_to_url(paper_file, paper_source)
    for paper_file, paper_source in zip(website_data['file_name'], website_data['source'])
]

In [11]:
website_data.head()

Unnamed: 0,source,title,abstract,file_name,url
2469,0,Is Laser Quantum Generator?,In the scientific articles and the publication...,vixra.1401.0004,https://www.vixra.org/abs/1401.0004
30978,1,Equivariant cellular homology and its applicat...,In this work we develop a cellular equivaria...,arxiv.math.0112182,https://arxiv.org/abs/math/0112182
28950,1,Mitigating Inter-network Interference in LoRa ...,Long Range (LoRa) is a popular technology us...,arxiv.1611.00688,https://arxiv.org/abs/1611.00688
17519,1,Optomechanical trapping and cooling of partial...,We consider the radiative trapping and cooli...,arxiv.0708.4078,https://arxiv.org/abs/0708.4078
17103,0,Communicative Message as Nuclear Thinking of a...,The paper is positioned in the Fundamentals of...,vixra.1511.0069,https://www.vixra.org/abs/1511.0069


In [12]:
  # Write the table as JSON under FOLDERNAME; orient='index' nests each
  # row's fields under that row's index label.
  website_data.to_json(path_or_buf=FOLDERNAME + 'data/website_data.json', orient='index')