In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
import tensorflow_hub as tf_hub
from tensorflow.keras import mixed_precision
from official.nlp import optimization  # to create AdamW optimizer

# Global run configuration for the whole notebook.
# Use 'CPU' to hide every GPU from TensorFlow; 'GPU' leaves the device list untouched.
device_to_use = 'GPU'

USE_SOFTMAX = False  # True: two-unit softmax head; False: single sigmoid unit
TRAIN_MODEL = False  # True: fine-tune and save weights; False: load previously saved weights

# Optional GPU speed-ups, intentionally left disabled:
#   mixed_precision.set_global_policy('mixed_float16')  # the model could not be saved with fp16
#   tf.config.optimizer.set_jit(True)                   # XLA; fails intermittently without a clear cause
if device_to_use != 'GPU':
    # Hide GPU from visible devices
    tf.config.set_visible_devices([], 'GPU')

print(tf.config.get_visible_devices())

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2022-02-03 10:18:53.842890: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-03 10:18:53.875166: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-03 10:18:53.875360: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [2]:
def read_json_data(filename):
  """Load the dataset JSON and return its first three groups as lists.

  The file must contain a JSON object whose values are themselves objects
  (one per split). Each split is flattened into a list of its field values,
  in file order, and the first, second and third splits are returned as the
  train, validation and test sets respectively.

  Raises IndexError when the file holds fewer than three groups.
  """
  import json
  with open(filename, 'r') as handle:
    raw_groups = json.load(handle)

  # Field names are dropped: callers index the fields positionally.
  splits = [list(fields.values()) for fields in raw_groups.values()]

  train_set, val_set, test_set = splits[0], splits[1], splits[2]
  return train_set,val_set,test_set

def remove_copyright(text):
  """Return `text` cut off just before its first '©' character.

  Text that contains no '©' is returned unchanged.
  """
  before_symbol, _, _ = text.partition('©')
  return before_symbol


def create_tf_dataset(text,labels,preprocess_model=None,softmax=False,shuffle=True,
                      batch_size=8,shuffle_buffer_size=32768,num_classes=2):
  """Build a batched, prefetched tf.data pipeline from texts and optional labels.

  Args:
    text: sequence of input strings.
    labels: sequence of integer labels aligned with `text`, or None for an
      unlabelled dataset (elements are then the inputs only).
    preprocess_model: optional callable applied to each *batch* of text.
    softmax: when True and labels are given, labels are one-hot encoded with
      depth `num_classes`.
    shuffle: shuffle elements before batching.
    batch_size: number of elements per batch (default 8, the previous fixed value).
    shuffle_buffer_size: size of the shuffle buffer (default 32768, the
      previous fixed value).
    num_classes: one-hot depth used when `softmax` is True (default 2).

  Returns:
    A tf.data.Dataset yielding `(inputs, labels)` batches, or `inputs` batches
    when `labels` is None.
  """
  has_labels = labels is not None

  if softmax and has_labels:
    labels = tf.one_hot(labels,num_classes)

  if has_labels:
    dataset = tf.data.Dataset.from_tensor_slices( (text,labels) )
  else:
    dataset = tf.data.Dataset.from_tensor_slices( text )

  if shuffle:
    dataset = dataset.shuffle(shuffle_buffer_size)

  dataset = dataset.batch(batch_size)

  # Preprocessing runs after batching, so the model receives whole batches.
  if preprocess_model is not None:
    if has_labels:
      dataset = dataset.map(lambda x,y: (preprocess_model(x), y))
    else:
      dataset = dataset.map(lambda x: preprocess_model(x))

  dataset = dataset.prefetch(8)

  return dataset

def get_datasets_and_metadata(filename):
  """Read the dataset file and strip copyright trailers from every abstract.

  Each split returned by `read_json_data` keeps its abstracts at index 0;
  that entry is replaced by a new list with `remove_copyright` applied to
  every abstract. All other fields are passed through untouched.

  Returns the (train_set, val_set, test_set) tuple.
  """
  splits = read_json_data(filename)

  for split in splits:
    split[0] = [remove_copyright(abstract) for abstract in split[0]]

  train_set, val_set, test_set = splits
  return train_set,val_set,test_set

In [3]:
# Name of the BERT variant used throughout the notebook; it is the lookup key
# for both handle tables below.
bert_model_name = 'small_bert/bert_en_uncased_L-6_H-768_A-12'

# Model name -> TF-Hub handle of the encoder.
map_name_to_handle = {
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1'
}

# Model name -> TF-Hub handle of the matching text preprocessing model.
map_model_to_preprocess = {
    'small_bert/bert_en_uncased_L-6_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

# Resolved handles; later cells read these module-level names directly.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

# NOTE(review): this layer is not referenced anywhere else in the visible
# notebook (preprocessing is built by build_preprocess_model) — confirm it is still needed.
bert_preprocess_model = tf_hub.KerasLayer(tfhub_handle_preprocess)

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


2022-02-03 10:19:07.016493: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-03 10:19:07.017331: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-03 10:19:07.017598: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-03 10:19:07.017773: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [4]:
def build_preprocess_model(sentence_features, seq_length=512):
  """Build a Keras model that turns raw text segments into packed BERT inputs.

  Args:
    sentence_features: names of the string inputs, one per text segment.
    seq_length: sequence length passed to the preprocessor's packing step.

  Returns:
    A tf.keras.Model mapping the string inputs to the packer's output.
  """
  # One scalar string input per named segment.
  input_segments = []
  for feature_name in sentence_features:
    input_segments.append(
        tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name))

  # Tokenize the text to word pieces.
  preprocessor = tf_hub.load(tfhub_handle_preprocess)
  tokenize_layer = tf_hub.KerasLayer(preprocessor.tokenize, name='tokenizer')
  tokenized_segments = [tokenize_layer(segment) for segment in input_segments]

  # Pack inputs. The details (start/end token ids, dict of output tensors)
  # are model-dependent, so this gets loaded from the SavedModel.
  pack_layer = tf_hub.KerasLayer(
      preprocessor.bert_pack_inputs,
      arguments=dict(seq_length=seq_length),
      name='packer')
  packed_inputs = pack_layer(tokenized_segments)

  return tf.keras.Model(input_segments, packed_inputs)

def build_classifier_model(num_classes=2, softmax=False):
    """Build the BERT-based classifier used by this notebook.

    Args:
        num_classes: number of output units of the softmax head; only used
            when `softmax` is True.
        softmax: when True the head is a `num_classes`-unit softmax layer,
            otherwise a single sigmoid unit.

    Returns:
        An uncompiled `Classifier` (tf.keras.Model subclass) instance.
    """
    
    # NOTE(review): weights saved under './bert_weights3/' are loaded back into
    # this model later in the notebook; presumably renaming the attributes or
    # changing the layer structure would break restoring them — confirm before refactoring.
    class Classifier(tf.keras.Model):
        def __init__(self):
            super(Classifier, self).__init__(name="prediction")
            # Trainable TF-Hub encoder; the handle comes from the module-level
            # `tfhub_handle_encoder`.
            self.encoder = tf_hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')

            # `softmax` and `num_classes` are captured from the enclosing function.
            if(softmax): 
               self.dense = tf.keras.layers.Dense(num_classes,activation='softmax')
            else:
                self.dense = tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')

        def call(self, preprocessed_text):
            # Only the encoder's "pooled_output" entry is fed to the head.
            encoder_outputs = self.encoder(preprocessed_text)
            pooled_output = encoder_outputs["pooled_output"]
            x = self.dense(pooled_output)
            return x

    model = Classifier()
    return model

def get_model_predictions(dataset,model,softmax=False):
    """Run `model.predict` on `dataset`, print the positive fraction, return the scores.

    Args:
        dataset: anything accepted by `model.predict`.
        model: object exposing `predict(dataset)` that returns a NumPy array.
        softmax: when True the predictions are read as two columns
            (column 0 = "no" score, column 1 = "yes" score); otherwise as
            sigmoid scores thresholded at 0.5.

    Returns:
        The raw prediction array returned by `model.predict`.

    The printed value is positives / (positives + negatives). In softmax mode
    exact ties count as neither. When nothing was decided (empty dataset, or
    only ties) `nan` is printed instead of raising ZeroDivisionError.
    """
    predictions = model.predict(dataset)

    if (softmax):
        yes_scores = predictions[:,1]
        no_scores = predictions[:,0]

        positive = np.count_nonzero(yes_scores>no_scores)
        negative = np.count_nonzero(yes_scores<no_scores)
    else:
        positive = np.count_nonzero(predictions>=0.5)
        negative = np.count_nonzero(predictions<0.5)

    decided = positive + negative
    # Guard the ratio: an empty prediction array would otherwise divide by zero.
    print(float(positive)/decided if decided else float('nan'))

    return predictions

In [5]:
# Load the three splits; index 0 of each split holds the abstracts. For the
# train/val splits index 1 is passed below as the labels.
train_set,val_set,test_set = get_datasets_and_metadata("AI_Paper_Classification_Dataset2.json")

# Single-segment preprocessing model (one string input named "input 1").
preprocess_model = build_preprocess_model(["input 1"])

# Labelled pipelines; both use create_tf_dataset's default shuffle=True.
train_dataset = create_tf_dataset(train_set[0],train_set[1],preprocess_model,softmax=USE_SOFTMAX)
val_dataset = create_tf_dataset(val_set[0],val_set[1],preprocess_model,softmax=USE_SOFTMAX)

classifier_model = build_classifier_model(softmax=USE_SOFTMAX)

In [6]:
# The loss has to match the classifier head selected by USE_SOFTMAX.
loss = (tf.keras.losses.CategoricalCrossentropy()
        if USE_SOFTMAX
        else tf.keras.losses.BinaryCrossentropy(from_logits=False))

# Metric objects are created in this exact order; evaluate() reports them in the same order.
metrics = [
    metric_cls() for metric_cls in (
        tf.keras.metrics.CategoricalAccuracy if USE_SOFTMAX else tf.keras.metrics.BinaryAccuracy,
        tf.keras.metrics.AUC,
        tf.keras.metrics.FalsePositives,
        tf.keras.metrics.FalseNegatives,
        tf.keras.metrics.TruePositives,
        tf.keras.metrics.TrueNegatives,
        tf.keras.metrics.Precision,
        tf.keras.metrics.Recall,
    )
]

# Learning-rate schedule: the first 10% of all training steps are warm-up.
epochs = 1
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

optimizer = optimization.create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    end_lr=1e-6,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Either reuse the stored weights or fine-tune from scratch and store the result.
if not TRAIN_MODEL:
    classifier_model.load_weights('./bert_weights3/')
else:
    history = classifier_model.fit(
        x=train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
    )
    classifier_model.save_weights('./bert_weights3/')

In [24]:
# Evaluate on the validation split. The result lists the loss first, followed
# by the metrics in the order they were passed to compile(): accuracy, AUC,
# false positives, false negatives, true positives, true negatives, precision, recall.
classifier_model.evaluate(val_dataset)



[0.13383325934410095,
 0.964496910572052,
 0.9908130168914795,
 2581.0,
 1165.0,
 41959.0,
 59807.0,
 0.9420520663261414,
 0.9729849100112915]

In [None]:
# Unlabelled test split. shuffle=False keeps the original order so that
# predictions[i] corresponds to test_set[0][i] in the cells that follow.
test_dataset = create_tf_dataset(test_set[0],None,preprocess_model,shuffle=False,softmax=USE_SOFTMAX)
# Forward the softmax flag: without it a two-column softmax output would be
# thresholded element-wise as if it were a sigmoid score.
predictions = get_model_predictions(test_dataset,classifier_model,softmax=USE_SOFTMAX)

In [7]:
# External spreadsheets to score, as display name -> file path.
test_files = {
            'Statistical Methodology.xlsx':'Statistical Methodology.xlsx',
            'Agricultural Systems.xlsx':'Agricultural Systems.xlsx',
            'Robotics.xlsx':'Robotics.xlsx',
            'Signal Processing.xlsx':'Signal Processing.xlsx',
            'Molecular Biology.xlsx':'Molecular Biology.xlsx',
            'Economics.xlsx':'Economics.xlsx',
            'Medicine.xlsx':'Medicine.xlsx',
            'Neurology.xlsx':'Neurology.xlsx',
            'Network.xlsx':'Network.xlsx',
            'Archaeology.xlsx':'Archaeology.xlsx',
            'Political Psychology.xlsx':'Political Psychology.xlsx'
            }

# For every spreadsheet, print its name followed by the fraction of abstracts
# the classifier predicts as positive.
for key,value in test_files.items():
    # Select the column as a Series so a one-row sheet still yields a sequence
    # (DataFrame.to_numpy().squeeze() would collapse it to a 0-d array).
    abstracts_column = pd.read_excel(value,sheet_name = 'Sheet1')['Abstract']

    # Empty cells are read as NaN floats, which remove_copyright cannot process;
    # drop them. The loop variable is deliberately not called `abs`, which would
    # shadow the builtin for the rest of the notebook.
    cleaned_abstracts = [remove_copyright(abstract_text)
                         for abstract_text in abstracts_column.dropna().astype(str)]

    file_dataset = create_tf_dataset(cleaned_abstracts,None,preprocess_model,softmax=USE_SOFTMAX,shuffle=False)

    print(key)
    get_model_predictions(file_dataset,classifier_model,softmax=USE_SOFTMAX)



Statistical Methodology.xlsx


2022-02-01 11:13:38.614584: I tensorflow/compiler/xla/service/service.cc:171] XLA service 0x7f16b400bf00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-02-01 11:13:38.614623: I tensorflow/compiler/xla/service/service.cc:179]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2022-02-01 11:13:38.671466: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:237] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-02-01 11:13:43.070009: I tensorflow/compiler/jit/xla_compilation_cache.cc:351] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
2022-02-01 11:13:46.873318: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


0.3375565610859729
Agricultural Systems.xlsx
0.01001001001001001
Robotics.xlsx
0.6155
Signal Processing.xlsx
0.37104072398190047
Molecular Biology.xlsx
0.011182108626198083
Economics.xlsx
0.023099850968703428
Medicine.xlsx
0.0042643923240938165
Neurology.xlsx
0.023578363384188627
Network.xlsx
0.24266365688487584
Archaeology.xlsx
0.009013520280420632
Political Psychology.xlsx
0.010771992818671455


2022-02-03 10:19:42.403051: I tensorflow/stream_executor/cuda/cuda_blas.cc:1774] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


0.03999166506379432


In [23]:
def extract_data(data_sorted,predictions_sorted,softmax=False):
  """Print, for each run of equal values, the fraction of positive predictions.

  Args:
    data_sorted: group value of every sample; equal values are expected to be
      adjacent, because a new group starts whenever the value changes.
    predictions_sorted: predictions aligned with `data_sorted`.
    softmax: when True a prediction is positive if its element 1 exceeds its
      element 0; otherwise if it is >= 0.5.

  Prints each group value followed by its positive fraction; returns None.
  """
  def _is_positive(pred):
    return pred[1]>pred[0] if softmax else pred>=0.5

  values = [data_sorted[0]]
  percent_AI_per_value = []
  run_total = 0
  run_positive = 0

  for position,pred in enumerate(predictions_sorted):
    current_value = data_sorted[position]
    if current_value != values[-1]:
      # The value changed: close the finished run and start a new one.
      values.append(current_value)
      percent_AI_per_value.append(float(run_positive)/run_total)
      run_total = 0
      run_positive = 0

    run_total += 1
    if _is_positive(pred):
      run_positive += 1

  # Close the last run.
  percent_AI_per_value.append(float(run_positive)/run_total)

  for value,fraction in zip(values,percent_AI_per_value):
    print(value)
    print(fraction)



# Group the test predictions by subject area (index 1 of the test split).
subjects = np.array(test_set[1])
# Use the builtin `str`: the `np.str` alias was deprecated and later removed from NumPy.
subject_sort_idx = np.argsort(subjects.astype(str))
subj_sorted = subjects[subject_sort_idx]
predictions_subj_sort = predictions[subject_sort_idx]
# Forward the softmax flag so the positive test matches the model head.
extract_data(subj_sorted,predictions_subj_sort,softmax=USE_SOFTMAX)

Agricultural_and_Biological_Sciences
0.00979706088173548
Arts_and_humanities
0.034565409360413774
Biochemistry, Genetics and Molecular Biology
0.014830508474576272
Business_management_accounting
0.03364692861095739
Chemical_Engineering
0.01562027612617152
Chemistry
0.024121709538584727
Computer_Science
0.24453704644780974
Earth_and_Planetary_sciences
0.028452814904819764
Engineering
0.0671808402679115
Environmental_Sciences
0.016875505254648343
Health_professions
0.049488583531356545
Immunology_and_Microbiology
0.006179196704428424
Material_Science
0.010258473297797445
Mathematics
0.11201381131309028
Medicine
0.008932324506094998
Neuroscience
0.03946297803091944
Nursing
0.004123022063739693
Pharmacology_Toxicology,Pharmaceutics
0.012663357309289839
Physics_and_Astronomy
0.02299082383785419
Psychology
0.01858736059479554
Social_Sciences
0.03219978746014878
decision_science
0.17195956295312978
dentistry
0.00314828418511911
economics_econometrics
0.017331407621375403
energy
0.029370348018

In [13]:
# Print the first `abstracts_to_show` Computer_Science abstracts predicted as positive.
abstracts_to_show = 100
abstracts_shown = 0
cnt = 0  # running total of positive Computer_Science predictions

for idx,prediction in enumerate(predictions_subj_sort):
    if not (subj_sorted[idx]=='Computer_Science' and prediction>=0.5):
        continue

    if cnt>=0 and abstracts_shown<abstracts_to_show:
        # subject_sort_idx maps the sorted position back to the original test-set row.
        print(test_set[0][subject_sort_idx[idx]])
        print("")
        abstracts_shown+=1

    cnt+=1


This paper presents a novel optimization-based approach to compute time-optimal trajectories for robotic systems operating in an environment with the presence of obstacles under kinodynamic constraints. The proposed approach employs a modified rapid exploring random tree algorithm (RRT) to generate a geometrical sub-optimal path inside a feasible safe region. Subsequently, a trajectory is parametrized by fourth order non-uniform B-splines and is optimized along the path with respect to kinodynamic constraints by an interior point optimizer. The optimization process is performed in the safe region without any further collision checking, which is very effective in extremely confined and complex environments. Finally, the potential and efficiency of the approach is illustrated and compared with the notable RRT* algorithm in state space by numerical simulations. 

Protein-Protein Interactions (PPIs) play a vital role in most cellular processes. Although many efforts have been devoted to de

In [22]:
#create excel with abstracts predicted as AI per subject area
# One sheet per subject area, holding every abstract with prediction >= 0.5.
# Predictions are walked in subject-sorted order, so a change of subject closes a sheet.
current_subj = subj_sorted[0]
abstracts_list = []

# The context manager saves and closes the workbook exactly once, on exit.
with pd.ExcelWriter('AI_Predictions_per_Subject_Area.xlsx', engine = 'openpyxl') as writer:
    for idx,prediction in enumerate(predictions_subj_sort):
        if subj_sorted[idx] != current_subj:
            pd.DataFrame(abstracts_list).to_excel(writer, sheet_name = current_subj)
            abstracts_list = []
            current_subj = subj_sorted[idx]

        if(prediction>=0.5):
            abstracts_list.append(test_set[0][subject_sort_idx[idx]])

    # Write the final subject as well: no subject change follows it, so the
    # loop above never flushes it.
    pd.DataFrame(abstracts_list).to_excel(writer, sheet_name = current_subj)

In [21]:
# Dump the test-set abstracts to a single-sheet workbook.
# The context manager saves and closes the file; `ExcelWriter.save()` and the
# `encoding` argument of `to_excel` are no longer available in pandas 2.x.
with pd.ExcelWriter('AI_Paper_Classification_Dataset2.xlsx', engine = 'openpyxl') as writer:
    df = pd.DataFrame(test_set[0])
    df.to_excel(writer, sheet_name = '1', index=False)

In [None]:
# Learn a WordPiece vocabulary from a text dataset with tensorflow_text.
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]", '[CLS]', '[SEP]', '[MASK]']

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 20000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

# NOTE(review): `dataset2` is read here before any visible cell assigns it, so
# this cell fails on a fresh kernel. Presumably it held the raw abstracts —
# confirm and define it explicitly. The name is also rebound from the raw data
# to a tf.data.Dataset, so the cell cannot be run twice in a row.
dataset2 = tf.data.Dataset.from_tensor_slices(dataset2)

# The vocabulary is learned from batches of 1000 elements.
pt_vocab = bert_vocab.bert_vocab_from_dataset(
    dataset2.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [None]:
# Show the head of the learned vocabulary in three windows, then its last ten entries.
for vocab_window in (slice(None, 100), slice(100, 200), slice(200, 400), slice(-10, None)):
    print(pt_vocab[vocab_window])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '£', '¥', '¦', '§', '©', '¬', '®', '°', '±', '·', '»', '¿', '×', 'æ', 'ð', '÷', 'ø', 'þ', 'đ', 'ħ', 'ı', 'ĸ', 'ł', 'œ', 'ɑ', 'ɛ', 'ɤ', 'ʹ']
['ʼ', 'ˆ', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'ρ', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ϭ', 'ћ', 'ԑ', '‐', '‒', '–', '—', '‖', '‘', '’', '“', '”', '„', '†', '‡', '•', '‰', '′', '‹', '›', '⁄', '⁎', '€', '℘', '→', '↔', '⇒', '∀', '∂', '∃', '∅', '∆', '∇', '∈', '∏', '∑', '−', '∓', '∕', '∗', '∘', '∙', '√', '∝', '∞', '∣', '∥', '∧', '∨', '∩', '∪', '∫', '∶', '∼', '≃', '≈', '≊', '≔', '≡', '≤', '≥', '≧', '≪', '≫', '≲', '≳', '⊂', '⊃', '⊆', '⊕',

In [43]:
# add subject area column and merge all excels in the folder
import os

directory = "./no_excels"
merged_filename = "no_labels.xlsx"

excels_to_merge = []

# Sorted so the merged row order does not depend on the file-system listing order.
for filename in sorted(os.listdir(directory)):
    # Skip the merged output of a previous run (it lives in the same folder and
    # would otherwise be merged into itself), Excel lock files and anything
    # that is not a workbook.
    if filename == merged_filename or filename.startswith("~$") or not filename.endswith(".xlsx"):
        continue

    excel = pd.read_excel(os.path.join(directory, filename),sheet_name = 0)
    print(filename)

    # File names look like "No_<subject>.xlsx": strip the 3-character prefix and
    # the ".xlsx" suffix. Assigning by column name works whatever the number of
    # columns in the sheet.
    excel["Subject Area"] = filename[3:-5]

    excels_to_merge.append(excel)

main_excel = pd.concat(excels_to_merge)
main_excel.to_excel(os.path.join(directory, merged_filename), sheet_name = "1", index = False)

No_Computer_Science.xlsx
No_Chemical_Engineering.xlsx
No_Mathematics.xlsx
No_Neuroscience.xlsx
No_Engineering.xlsx
No_Chemistry.xlsx
No_Material_Science.xlsx
No_dentistry.xlsx
No_Social_Sciences.xlsx
No_Agricultural_and_Biological_Sciences.xlsx
No_Physics_and_Astronomy.xlsx
No_Immunology_and_Microbiology.xlsx
No_Environmental_Sciences.xlsx
No_economics_econometrics.xlsx
No_Psychology.xlsx
No_Biochemistry, Genetics and Molecular Biology.xlsx
No_Health_professions.xlsx
No_Earth_and_Planetary_sciences.xlsx
No_Pharmacology_Toxicology,Pharmaceutics.xlsx
No_Business_management_accounting.xlsx
No_Nursing.xlsx
No_decision_science.xlsx
No_energy.xlsx
No_veterinary.xlsx
No_Medicine.xlsx
No_Arts_and_humanities.xlsx


In [30]:
def get_arxiv_metadata(file_loc):
    """Lazily yield the lines of the file at `file_loc`, one at a time.

    The file is opened on the first iteration and closed once the generator
    is exhausted or discarded.
    """
    with open(file_loc) as handle:
        yield from handle

def _arxiv_year_from_id(paper_id):
    """Return the four-digit year encoded in an arXiv identifier, as a string.

    The first two consecutive digits of the id are read as the two-digit year
    (both "0704.0001" and "hep-th/9901001" start their numeric part with it).
    arXiv's earliest submissions date from 1991, so 91-99 map to 19xx and
    everything else to 20xx. Returns '' when the id holds no two consecutive digits.
    """
    import re

    match = re.search(r'[0-9]{2}', paper_id)
    if match is None:
        return ''

    two_digit_year = match.group()
    century = '19' if int(two_digit_year) >= 91 else '20'
    return century + two_digit_year


def get_arxiv_AI_ML_papers(arxiv_file):
    """Collect the cs.AI / cs.LG papers from an arXiv metadata snapshot.

    Args:
        arxiv_file: path to the snapshot, one JSON object per line, each with
            the keys 'id', 'categories', 'abstract' and 'journal-ref'.

    Returns:
        A tuple of four aligned NumPy arrays: abstracts, subjects
        ("cs.AI", "cs.LG" or "cs.AI;cs.LG"), years (four-digit strings, ''
        when the id carries none) and journal references.
    """
    import json

    arxiv_AI_ML_abstracts = []
    arxiv_AI_ML_subject = []
    arxiv_AI_ML_years = []
    arxiv_AI_ML_journals = []

    # Read the file passed in by the caller, streaming it line by line.
    with open(arxiv_file) as metadata:
        for line in metadata:
            if not line.strip():
                continue  # tolerate blank lines in the snapshot

            paper = json.loads(line)
            categories = paper['categories']
            is_ai = 'cs.AI' in categories
            is_ml = 'cs.LG' in categories

            if not (is_ai or is_ml):
                continue

            if is_ai and is_ml:
                subject = "cs.AI;cs.LG"
            elif is_ai:
                subject = "cs.AI"
            else:
                subject = "cs.LG"

            # Exactly one entry is appended to every list per kept paper, so
            # the four returned arrays always stay aligned.
            arxiv_AI_ML_abstracts.append(paper['abstract'])
            arxiv_AI_ML_subject.append(subject)
            arxiv_AI_ML_journals.append(paper['journal-ref'])
            arxiv_AI_ML_years.append(_arxiv_year_from_id(paper['id']))

    return (np.array(arxiv_AI_ML_abstracts), np.array(arxiv_AI_ML_subject), np.array(arxiv_AI_ML_years), np.array(arxiv_AI_ML_journals))

def encode_numpy_utf(input):
    """UTF-8 encode every element of `input`, processing the array in two halves.

    Each half is cast to `str`, encoded with `np.char.encode`, and written into
    a result array allocated with `np.empty_like(input)`.
    """
    midpoint = np.shape(input)[0] // 2

    result = np.empty_like(input)
    for half in (slice(None, midpoint), slice(midpoint, None)):
        result[half] = np.char.encode(input[half].astype(str), 'utf-8')

    return result


def get_excel_data(filename,columns_to_get,sheet=0):
    """Read selected columns of an Excel sheet as 1-D NumPy arrays.

    Args:
        filename: path of the workbook.
        columns_to_get: column names to extract; a None entry yields a None
            placeholder at the same position in the result.
        sheet: sheet name or index handed to `pd.read_excel`.

    Returns:
        A list aligned with `columns_to_get`, holding one 1-D array per
        requested column (or None for placeholders).

    Raises:
        KeyError: if a requested column does not exist in the sheet.
    """
    excel = pd.read_excel(filename,sheet_name=sheet)
    result_list = []

    for column in columns_to_get:
        if column is None:
            result_list.append(None)
        else:
            # Select the Series directly: it is always 1-D, whereas squeezing a
            # one-row, one-column frame collapses to a 0-d array and breaks
            # len() in callers.
            result_list.append(excel[column].to_numpy())

    return result_list

def find_nan(array):
    """Print every non-string entry of `array` followed by its index.

    Debugging aid for spotting missing values in text columns. Stops after
    101 entries have been reported.
    """
    reported = 0
    for position,item in enumerate(array):
        if isinstance(item,str):
            continue

        print(item)
        print(position)
        reported += 1
        if reported > 100:
            break

def read_split_data(arxiv=True):
    """Assemble the train / validation / test splits from the source spreadsheets.

    Positive ("yes") rows come from the 'yes' sheet of
    'AI_Classification_Toy_Dataset_2.xlsx', optionally extended with arXiv
    cs.AI / cs.LG papers; negative ("no") rows come from 'no_labels.xlsx'.

    Args:
        arxiv: when True, append the arXiv papers to the positive rows.

    Returns:
        (train_list, val_list, test_list). train_list and val_list each hold
        five parallel arrays: abstract, subject area, year, source title,
        label (1 = yes, 0 = no). test_list holds the first four only and is
        built exclusively from negative rows.

    NOTE(review): np.random is not seeded in this function, so the splits
    differ from run to run unless the caller seeds it — confirm whether that is intended.
    """
    # Columns to read per source; None marks a slot that is filled in below.
    yes_columns = ['Abstract',None,None,'Source title']
    no_columns = ['Abstract','Subject Area',None,'Source title']
    yes_list = get_excel_data('AI_Classification_Toy_Dataset_2.xlsx',yes_columns,'yes')
    # NOTE(review): read from the working directory, while the merge cell writes
    # './no_excels/no_labels.xlsx' — confirm the file is copied over.
    no_list = get_excel_data('no_labels.xlsx',no_columns)

    # Every positive row gets the subject area 'AI'.
    yes_list[1] = np.array(len(yes_list[0]) * ['AI'])

    # Placeholder year for all spreadsheet rows.
    yes_list[2] = np.array(len(yes_list[0]) * ['2000']) #TEMP UNTIL I GET DEM YEARS
    no_list[2] = np.array(len(no_list[0]) * ['2000']) #TEMP UNTIL I GET DEM YEARS


    if(arxiv):
        # arxiv_data is (abstracts, subjects, years, journals), the same field
        # order as yes_list, so it can be appended slot by slot.
        arxiv_data = get_arxiv_AI_ML_papers('arxiv-metadata-oai-snapshot.json')
        
        for i in range(len(yes_list)):
            yes_list[i] = np.append(yes_list[i],arxiv_data[i])


    # Shuffle each class; one permutation is applied to all of its parallel
    # arrays so the rows stay aligned.
    p = np.random.permutation(len(no_list[0]))
    for i in range(len(no_list)):
        no_list[i] = no_list[i][p]
    
    p = np.random.permutation(len(yes_list[0]))
    for i in range(len(yes_list)):
        yes_list[i] = yes_list[i][p]

    
    test_list = [None] * len(yes_columns)

    # The second half of the shuffled negatives becomes the (unlabelled) test
    # set; the first half stays available for training and validation.
    split_index_no = int( len(no_list[0])*0.5 )
    for i in range(len(no_list)):
        test_list[i] = no_list[i][split_index_no:]
        no_list[i] = no_list[i][:split_index_no]


    # Append the label array as a fifth slot (index 4): 1 for yes, 0 for no.
    yes_list.append( np.ones(len(yes_list[0]),dtype=np.int32) )
    no_list.append( np.zeros(len(no_list[0]),dtype=np.int32) )

        

    train_list = [None] * len(yes_list)
    val_list = [None] * len(yes_list)

    # 75% of each class goes to training, the remaining 25% to validation.
    split_index_no = int( len(no_list[0])*0.75 )
    split_index_yes = int( len(yes_list[0])*0.75 )
    for i in range(len(yes_list)):
        train_list[i] = np.append(yes_list[i][:split_index_yes],no_list[i][:split_index_no])
        val_list[i] = np.append(yes_list[i][split_index_yes:],no_list[i][split_index_no:])

    # Mix positives and negatives within each split, again with one shared
    # permutation per split.
    p = np.random.permutation(len(train_list[0]))
    for i in range(len(train_list)):
        train_list[i] = train_list[i][p]
    
    p = np.random.permutation(len(val_list[0]))
    for i in range(len(val_list)):
        val_list[i] = val_list[i][p]

    return train_list,val_list,test_list


def create_dataset_file(arxiv=True):
    """Build the splits and write them to 'AI_Paper_Classification_Dataset2.json'.

    The file holds the groups 'train_set', 'val_set' and 'test_set', in that
    order. Labelled groups store abstract, labels, Subject Area, Year, Journal;
    the test group stores the same fields without labels. Key order is kept
    deliberately because the reader indexes fields by position.

    Args:
        arxiv: forwarded to `read_split_data`.
    """
    import json

    train_list,val_list,test_list = read_split_data(arxiv)

    def _metadata_fields(split):
        # Fields shared by every group; slots 1-3 of a split.
        return {'Subject Area':split[1].tolist(),'Year':split[2].tolist(),'Journal':split[3].tolist()}

    def _labelled_group(split):
        # Abstracts and labels first, then the shared metadata.
        return {'abstract':split[0].tolist(),'labels':split[4].tolist(),**_metadata_fields(split)}

    dataset_dict = {
        'train_set': _labelled_group(train_list),
        'val_set': _labelled_group(val_list),
        'test_set': {'abstract':test_list[0].tolist(),**_metadata_fields(test_list)},
    }

    with open("AI_Paper_Classification_Dataset2.json", 'w') as json_file:
        json.dump(dataset_dict, json_file)



In [3]:
# Regenerate 'AI_Paper_Classification_Dataset2.json' from the source spreadsheets
# and the arXiv snapshot (arxiv defaults to True).
create_dataset_file()
print("done")
#arxiv_abstracts,arxiv_subjects,arxiv_years,arxiv_journals = get_arxiv_AI_ML_papers('arxiv-metadata-oai-snapshot.json')

done


In [29]:
# Size and number of non-zero labels for each labelled split, then the test-set size.
for labelled_split in (train_set, val_set):
    print(len(labelled_split[0]))
    print(np.count_nonzero( labelled_split[1] ))

print(len(test_set[0]))

316535
129371
105512
43124
249552
