In [None]:
import pandas as pd
import numpy as np
import json
import tensorflow as tf
import tensorflow_text as tf_text
import tensorflow_hub as tf_hub
from tensorflow.keras import mixed_precision
from official.nlp import optimization  # to create AdamW optimizer

# --- Global run configuration ---
# Device selection: 'GPU' leaves TensorFlow's device list untouched, any other
# value (the default 'CPU') hides every GPU from TensorFlow.
device_to_use = 'CPU'
#device_to_use = 'GPU'

# True  -> softmax output head + categorical cross-entropy
# False -> single sigmoid output + binary cross-entropy
USE_SOFTMAX = False
# True -> fine-tune and save the weights, False -> load previously saved weights
TRAIN_MODEL = False


if device_to_use != 'GPU':
    # Hide GPU from visible devices
    tf.config.set_visible_devices([], 'GPU')
# GPU-only speed-ups, intentionally left disabled:
#   mixed_precision.set_global_policy('mixed_float16')  # TF bug: the model cannot be saved when using fp16
#   tf.config.optimizer.set_jit(True)                   # XLA: sometimes fails without a usable error message

print(tf.config.get_visible_devices())

In [None]:
def read_json_data(filename):
  """Load the train / validation / test splits from a JSON file.

  The file must contain a top-level JSON object whose values are themselves
  JSON objects (one per split). The keys of both levels are ignored; only the
  order matters: the first three top-level entries are returned as the train,
  validation and test split respectively.

  Args:
    filename: path of the JSON file to read.

  Returns:
    A tuple ``(train_set, val_set, test_set)``. Each split is a list holding
    the values of the corresponding JSON object, in file order.

  Raises:
    IndexError: if the file contains fewer than three top-level entries.
  """
  # Read explicitly as UTF-8: the abstracts contain non-ASCII characters
  # (e.g. the copyright sign) and the platform default encoding is not
  # guaranteed to decode them correctly.
  with open(filename, 'r', encoding='utf-8') as json_file:
    dataset_dict = json.load(json_file)

  # One list per split, dropping the keys of both nesting levels.
  dataset = [list(group.values()) for group in dataset_dict.values()]

  train_set = dataset[0]
  val_set = dataset[1]
  test_set = dataset[2]

  return train_set,val_set,test_set

def remove_copyright(text):
  """Return `text` cut off just before its first '©' character.

  Everything from the first copyright sign to the end of the string is
  dropped; text without a copyright sign is returned unchanged.
  """
  # partition() yields the whole string as the head when '©' is absent
  head, _, _ = text.partition('©')
  return head


def create_tf_dataset(text,labels,preprocess_model=None,softmax=False,shuffle=True,
                      batch_size=8,shuffle_buffer=32768,prefetch_batches=8):
  """Build a batched, prefetched tf.data pipeline from texts and optional labels.

  Args:
    text: the input texts, sliced along their first axis.
    labels: labels aligned with `text`, or None for unlabeled data. When None
      the dataset yields inputs only instead of (input, label) pairs.
    preprocess_model: optional callable applied to every batch of texts.
    softmax: if True (and labels are given) the labels are one-hot encoded
      with depth 2, as needed by categorical cross-entropy.
    shuffle: whether to shuffle the examples before batching.
    batch_size: number of examples per batch (previously hard-coded to 8).
    shuffle_buffer: size of the shuffle buffer (previously hard-coded to 32768).
    prefetch_batches: number of batches to prefetch (previously hard-coded to 8).

  Returns:
    A `tf.data.Dataset` of batches.
  """
  has_labels = labels is not None

  #if categorical cross entropy loss is used, one-hot encode labels
  if softmax and has_labels:
    labels = tf.one_hot(labels,2)

  if has_labels:
    dataset = tf.data.Dataset.from_tensor_slices( (text,labels) )
  else:
    dataset = tf.data.Dataset.from_tensor_slices( text )

  if shuffle:
    dataset = dataset.shuffle(shuffle_buffer)

  dataset = dataset.batch(batch_size)

  #apply pre-process model to each batch (labels, when present, pass through)
  if preprocess_model is not None:
    if has_labels:
      dataset = dataset.map(lambda x,y: (preprocess_model(x), y))
    else:
      dataset = dataset.map(lambda x: preprocess_model(x))

  #prefetch batches so that the gpu doesnt starve
  dataset = dataset.prefetch(prefetch_batches)

  return dataset

def get_datasets_and_metadata(filename):
  """Read the three splits from `filename` and strip copyright notices.

  The first entry of every split (the abstracts) is replaced by a new list in
  which each abstract went through `remove_copyright`; all other entries of
  the splits are returned as read.

  Returns:
    The tuple ``(train_set, val_set, test_set)``.
  """
  train_set,val_set,test_set = read_json_data(filename)

  # same clean-up for the train, validation and test abstracts
  for split in (train_set, val_set, test_set):
    split[0] = [remove_copyright(abstract) for abstract in split[0]]

  return train_set,val_set,test_set

In [None]:
# Model selection: this key picks both the encoder and its matching
# preprocessing SavedModel from the two lookup tables below.
bert_model_name = 'small_bert/bert_en_uncased_L-6_H-768_A-12'

# model name -> TF Hub handle of the encoder
map_name_to_handle = {'small_bert/bert_en_uncased_L-6_H-768_A-12': 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1'}

# model name -> TF Hub handle of the preprocessing model
map_model_to_preprocess = {'small_bert/bert_en_uncased_L-6_H-768_A-12': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'}

tfhub_handle_encoder, tfhub_handle_preprocess = (
    map_name_to_handle[bert_model_name],
    map_model_to_preprocess[bert_model_name],
)

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

# Load the preprocessing model from TF Hub as a Keras layer.
# NOTE(review): build_preprocess_model loads the same handle itself, so this
# layer may be unused - confirm before removing.
bert_preprocess_model = tf_hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
#define models

#abstracts are usually between 250-500 words, so a sequence length of 512 tokens is used to keep truncation to a minimum
def build_preprocess_model(sentence_features, seq_length=512):
  """Build a Keras model that turns raw strings into packed BERT inputs.

  Args:
    sentence_features: names of the string inputs; one scalar-string Keras
      input is created per name.
    seq_length: sequence length passed to the packing step.

  Returns:
    A `tf.keras.Model` mapping the string inputs to the output of the
    preprocessing SavedModel's `bert_pack_inputs`.
  """
  text_inputs = [
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name)
      for feature_name in sentence_features]

  # Both steps come from the preprocessing SavedModel selected at module
  # level (tfhub_handle_preprocess).
  preprocessor = tf_hub.load(tfhub_handle_preprocess)

  # Step 1: tokenize every input segment.
  tokenize = tf_hub.KerasLayer(preprocessor.tokenize, name='tokenizer')
  tokenized_segments = [tokenize(text_input) for text_input in text_inputs]

  # Step 2: pack the tokenized segments into the fixed-length model inputs.
  pack_inputs = tf_hub.KerasLayer(
      preprocessor.bert_pack_inputs,
      arguments={'seq_length': seq_length},
      name='packer')

  return tf.keras.Model(text_inputs, pack_inputs(tokenized_segments))

def build_classifier_model(num_classes=2, softmax=False):
    """Create the classifier: a trainable BERT encoder plus one dense layer.

    Args:
        num_classes: number of output units of the softmax head; ignored when
            `softmax` is False.
        softmax: True for a `num_classes`-unit softmax head, False for a
            single sigmoid output.

    Returns:
        An uncompiled `tf.keras.Model` that expects preprocessed BERT inputs.
    """

    class Classifier(tf.keras.Model):
        """Encoder followed by the prediction layer."""

        def __init__(self):
            super().__init__(name="prediction")
            # NOTE(review): saved weights are presumably tied to the attribute
            # names `encoder` / `dense` - confirm before renaming them.
            self.encoder = tf_hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
            self.dense = (
                tf.keras.layers.Dense(num_classes,activation='softmax')
                if softmax
                else tf.keras.layers.Dense(1, activation='sigmoid', name='classifier')
            )

        def call(self, preprocessed_text):
            # classify from the encoder's pooled output
            return self.dense(self.encoder(preprocessed_text)["pooled_output"])

    return Classifier()


def get_model_predictions(dataset,model,softmax=False):
    """Run `model.predict` on `dataset` and print the share of positive predictions.

    Args:
        dataset: input passed unchanged to `model.predict`.
        model: object exposing `predict(dataset)` that returns a NumPy array.
        softmax: if True, column 1 of the predictions is the positive score
            and column 0 the negative score, and the larger one wins (ties
            count for neither side). If False, a prediction is positive when
            it is >= 0.5.

    Returns:
        The raw predictions returned by `model.predict`.
    """
    predictions = model.predict(dataset)

    if softmax:
        yes_scores = predictions[:,1]
        no_scores = predictions[:,0]

        positive = np.count_nonzero(yes_scores>no_scores)
        negative = np.count_nonzero(yes_scores<no_scores)
    else:
        positive = np.count_nonzero(predictions>=0.5)
        negative = np.count_nonzero(predictions<0.5)

    decided = positive + negative
    if decided == 0:
        # Empty dataset (or only softmax ties): the ratio is undefined and the
        # unguarded division used to raise ZeroDivisionError.
        print("No predictions to summarise")
    else:
        print(float(positive)/decided)

    return predictions

In [None]:
# Load the three splits, build the preprocessing model, wrap the train and
# validation splits into tf.data pipelines and create the classifier.

train_set,val_set,test_set = get_datasets_and_metadata("AI_Paper_Classification_Dataset.json")

preprocess_model = build_preprocess_model(["input 1"])

# entry 0 of a split is passed as the texts, entry 1 as the labels
train_dataset, val_dataset = (
    create_tf_dataset(split[0],split[1],preprocess_model,softmax=USE_SOFTMAX)
    for split in (train_set, val_set)
)

classifier_model = build_classifier_model(softmax=USE_SOFTMAX)

In [None]:
# Compile the classifier, then either fine-tune it or restore saved weights.

# the loss has to match the output head selected by USE_SOFTMAX
loss = (
    tf.keras.losses.CategoricalCrossentropy()
    if USE_SOFTMAX
    else tf.keras.losses.BinaryCrossentropy(from_logits=False)
)

# accuracy variant depends on the head; the remaining metrics are shared
metrics = [
    tf.keras.metrics.CategoricalAccuracy() if USE_SOFTMAX else tf.keras.metrics.BinaryAccuracy(),
    tf.keras.metrics.AUC(),
    tf.keras.metrics.FalsePositives(),
    tf.keras.metrics.FalseNegatives(),
    tf.keras.metrics.TruePositives(),
    tf.keras.metrics.TrueNegatives(),
    tf.keras.metrics.Precision(),
    tf.keras.metrics.Recall(),
]

# learning-rate schedule: the first 10% of all training steps are warm-up
epochs = 1
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

optimizer = optimization.create_optimizer(
    init_lr=3e-5,
    num_train_steps=num_train_steps,
    end_lr=1e-6,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

if not TRAIN_MODEL:
    # reuse the weights written by an earlier training run
    classifier_model.load_weights('./bert_weights/')
else:
    history = classifier_model.fit(
        x=train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
    )
    classifier_model.save_weights('./bert_weights/')

In [None]:
# Report the loss and the compiled metrics on the validation split
classifier_model.evaluate(x=val_dataset)

In [None]:
#get prediction on the test set
# No labels are passed (None) and shuffling is off, so predictions[i] stays
# aligned with test_set[0][i].
test_dataset = create_tf_dataset(test_set[0],None,preprocess_model,shuffle=False,softmax=USE_SOFTMAX)
# Pass the head type through: without it a softmax model's two output columns
# were thresholded element-wise at 0.5 and the printed positive ratio was wrong.
predictions = get_model_predictions(test_dataset,classifier_model,softmax=USE_SOFTMAX)

In [None]:
def extract_data(data_sorted,predictions_sorted,softmax=False):
  """Print, per unique value, the fraction of items predicted as AI.

  Use this to get statistics such as the share of papers predicted as AI per
  subject, per year, per journal, etc.

  The inputs must be grouped by value beforehand: given X subjects and X
  predictions, argsort the subjects, reorder both subjects and predictions
  with that index, and pass the reordered sequences in. Equal values have to
  be adjacent, because a new group is started whenever the value changes.

  Args:
    data_sorted: the grouping values (e.g. subjects), sorted.
    predictions_sorted: the predictions, in the same order as `data_sorted`.
    softmax: if True each prediction is a pair and counts as positive when
      element 1 is greater than element 0; otherwise a prediction counts as
      positive when it is >= 0.5.

  Returns:
    None. For every unique value, the value and then its positive fraction
    are printed on separate lines. Nothing is printed for empty input (the
    previous version raised IndexError / ZeroDivisionError there).
  """
  # parallel lists: one entry per run of equal values
  values = []
  pos_counts = []
  total_counts = []

  for idx,prediction in enumerate(predictions_sorted):
    value = data_sorted[idx]

    # value changed (or very first item) -> open a new group
    if not values or value != values[-1]:
      values.append(value)
      pos_counts.append(0)
      total_counts.append(0)

    if softmax:
      is_positive = prediction[1]>prediction[0]
    else:
      is_positive = prediction>=0.5

    total_counts[-1] += 1
    if is_positive:
      pos_counts[-1] += 1

  for value,pos_count,total_count in zip(values,pos_counts,total_counts):
    print(value)
    print(float(pos_count)/total_count)



# Order the test predictions by subject so that equal subjects are adjacent,
# as extract_data requires.
subjects = np.array(test_set[1])
# `np.str` was only an alias of the builtin `str` and was removed from NumPy;
# the builtin works on every NumPy version.
subject_sort_idx = np.argsort(subjects.astype(str))
subj_sorted = subjects[subject_sort_idx]
predictions_subj_sort = predictions[subject_sort_idx]
# forward the head type so softmax predictions are compared column-wise
extract_data(subj_sorted,predictions_subj_sort,softmax=USE_SOFTMAX)

In [None]:
#print abstracts of a certain category predicted as AI

cnt = 0                  # number of matching abstracts found (not only the printed ones)
abstracts_to_show = 10   # print at most this many abstracts
category = 'Computer_Science'

for idx,prediction in enumerate(predictions_subj_sort):
    if subj_sorted[idx] != category:
        continue

    # Same decision rule as the rest of the notebook; the plain 0.5 threshold
    # fails on the two-column predictions of a softmax model.
    if USE_SOFTMAX:
        predicted_ai = prediction[1] > prediction[0]
    else:
        predicted_ai = prediction >= 0.5
    if not predicted_ai:
        continue

    if cnt < abstracts_to_show:
        # subject_sort_idx maps the sorted position back to the original test-set row
        print(test_set[0][subject_sort_idx[idx]])
        print("")
    cnt += 1


In [None]:
#create excel with abstracts predicted as AI per subject area

# Collect the positively predicted abstracts of every subject first. The dict
# keeps the (sorted) order in which the subjects are first seen, and a subject
# without positive predictions still gets an (empty) sheet.
abstracts_by_subject = {}
for idx,prediction in enumerate(predictions_subj_sort):
    subject_abstracts = abstracts_by_subject.setdefault(str(subj_sorted[idx]), [])

    if USE_SOFTMAX:
        predicted_ai = prediction[1] > prediction[0]
    else:
        predicted_ai = prediction >= 0.5

    if predicted_ai:
        # subject_sort_idx maps the sorted position back to the original test-set row
        subject_abstracts.append(test_set[0][subject_sort_idx[idx]])

# Write one sheet per subject. Fixes over the previous loop:
#  - the last subject is written too (it was only flushed on a subject change),
#  - the workbook is saved once, when the `with` block exits, instead of via
#    writer.save() inside the loop (that method no longer exists in pandas 2.x),
#  - the file is closed even if writing a sheet fails.
# NOTE(review): Excel limits sheet names to 31 characters - confirm that no
# subject name is longer.
if abstracts_by_subject:
    with pd.ExcelWriter('AI_Predictions_per_Subject_Area.xlsx', engine = 'openpyxl') as writer:
        for current_subj,abstracts_list in abstracts_by_subject.items():
            df = pd.DataFrame(abstracts_list)
            df.to_excel(writer, sheet_name = current_subj)