In [0]:
#%tensorflow_version 2.x

"""
CONNECTING TO LOCAL RUNTIME:

jupyter notebook \
  --NotebookApp.allow_origin='https://colab.research.google.com' \
  --port=8888 \
  --NotebookApp.port_retries=0
"""

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import keras
import random
import os
import json

from keras.models import Model,Sequential
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import adam
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense,Flatten,Input,Reshape
from keras.utils import plot_model

import random

# Turn on eager execution through the TF1 compatibility API so tensors can be
# created and printed immediately in the cells below.
# NOTE(review): the commented-out "%tensorflow_version 2.x" magic at the top
# suggests TF 2.x, where eager mode is presumably already the default - confirm
# whether this call is still needed.
tf.compat.v1.enable_eager_execution()

# Initialize "wandb" Logging to a New Run

In [0]:
import wandb
#wandb.init(project="geneus", magic=True)

# Import Dataset

In [0]:
"""
RUNNING WITH TEST DATA

with open('~/Desktop/Geneus/input_datas/dummy/dummy__data.json') as f:
    data = json.load(f)
    
with open('~/Desktop/Geneus/input_datas/dummy/dummy__labels.json') as f:
    labels = json.load(f)
    
dataTF = tf.convert_to_tensor(data, dtype=tf.int16)    
labelsTF = tf.convert_to_tensor(labels, dtype=tf.int16)
"""

def _loadJsonRecords(filePath):
  """Best-effort read of one JSON file; returns its records as a list, or [] on failure.

  A missing or unreadable file is reported on stdout instead of raising, so a
  single bad file does not abort loading of the remaining ones.
  """
  try:
    with open(filePath) as f:
      return list(json.load(f))
  except FileNotFoundError:
    print("File not found: " + filePath)
  except (OSError, ValueError, TypeError) as err:
    # ValueError covers json.JSONDecodeError; TypeError covers a JSON document
    # whose top-level value is not a sequence of records.
    print("Could not load " + filePath + ": " + str(err))
  return []


def loadJsonIntoTF(path, dataFileNames, labelFileNames, returnTF=True, testing_data_ratio=0.2):
  """Load paired data/label JSON files, shuffle them together and split into train/test.

  Args:
    path: Prefix that is string-concatenated with every file name (so it must
      already end with a path separator).
    dataFileNames: File names holding the samples; each file is a JSON list.
    labelFileNames: File names holding the labels; entry ``i`` is loaded
      alongside ``dataFileNames[i]``.
    returnTF: If True return TensorFlow tensors (data as float32, labels as
      int16); otherwise return NumPy arrays.
    testing_data_ratio: Fraction of the samples placed in the test split.
      ``1.0`` puts every sample in the test split.

  Returns:
    Tuple ``(data_train, data_test, labels_train, labels_test)``. The same
    random permutation (drawn from NumPy's global random state) is applied to
    data and labels, so row ``i`` of a data split matches row ``i`` of the
    corresponding label split.

  Raises:
    IndexError: If ``labelFileNames`` is shorter than ``dataFileNames``.
    ValueError: If the number of loaded samples and labels differ (for
      example because one file of a pair could not be read).
  """
  allData = []
  allLabels = []
  for idx, dataFileName in enumerate(dataFileNames):
    allData += _loadJsonRecords(path + dataFileName)
    allLabels += _loadJsonRecords(path + labelFileNames[idx])

  # Data and labels are indexed with the same permutation below, so they must
  # have the same length; fail with a clear message instead of an opaque
  # out-of-bounds error from NumPy.
  if len(allData) != len(allLabels):
    raise ValueError(
        "Loaded " + str(len(allData)) + " data samples but " + str(len(allLabels)) +
        " labels from " + path + "; every data file needs a matching label file.")

  #Split into training and testing data
  numSamples = len(allData)
  splitIdx = int(numSamples*(1 - testing_data_ratio))

  allDataNp = np.array(allData)
  allLabelsNp = np.array(allLabels)

  #Shuffles indices
  indices = np.random.permutation(numSamples)
  training_idx, test_idx = indices[:splitIdx], indices[splitIdx:]

  #Training and testing dataset
  data_train, data_test = allDataNp[training_idx], allDataNp[test_idx]
  labels_train, labels_test = allLabelsNp[training_idx], allLabelsNp[test_idx]

  if not returnTF:
    return (data_train, data_test, labels_train, labels_test)

  dataTF_train = tf.convert_to_tensor(data_train, dtype=tf.float32)
  dataTF_test = tf.convert_to_tensor(data_test, dtype=tf.float32)
  labelsTF_train = tf.convert_to_tensor(labels_train, dtype=tf.int16)
  labelsTF_test = tf.convert_to_tensor(labels_test, dtype=tf.int16)
  return (dataTF_train, dataTF_test, labelsTF_train, labelsTF_test)


# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a single configurable data directory.
path = '/home/andrew/Desktop/Geneus/input_datas/'
dataFileNames = ["keyword1_data.json","keyword2_data.json","keyword3_data.json","keyword4_data.json","keyword5_data.json", "keyword6_data.json","keyword_data.json"]
labelFileNames = ["keyword1_labels.json","keyword2_labels.json","keyword3_labels.json","keyword4_labels.json","keyword5_labels.json", "keyword6_labels.json","keyword_labels.json"]

pathB = '/home/andrew/Desktop/bert_data/'
dataFileNamesB = ["bert_data.json"]
labelFileNamesB = ["bert_labels.json"]

pathPhylumLabels = '/home/andrew/Desktop/Geneus/input_datas/'
phylumLabelNames = ['class_labels.json']

# loadJsonIntoTF shuffles with np.random.permutation on every call. The phylum
# labels loaded below are later used as training targets for both the keyword
# and the BERT features, so all three loads must draw the SAME permutation;
# otherwise features and phylum labels are paired at random. Re-seeding before
# each call gives identical permutations whenever the sample counts are equal,
# and also makes the train/test split reproducible across kernel restarts.
# NOTE(review): this assumes the three sources list the same samples in the
# same order and have the same number of rows - TODO confirm.
SPLIT_SEED = 42

np.random.seed(SPLIT_SEED)
(dataTF_train, dataTF_test, labelsTF_train, labelsTF_test) = loadJsonIntoTF(path, dataFileNames, labelFileNames)
np.random.seed(SPLIT_SEED)
(dataTFB_train, dataTFB_test, labelsTFB_train, labelsTFB_test) = loadJsonIntoTF(pathB, dataFileNamesB, labelFileNamesB)

#Getting phylum labels
# The same file is passed as both "data" and "labels"; only the label half of
# the result is used, returned as NumPy arrays so it can be column-sliced.
np.random.seed(SPLIT_SEED)
(_, _, phylum_labels_train, phylum_labels_test) = loadJsonIntoTF(pathPhylumLabels, phylumLabelNames, phylumLabelNames, False)

# Keep columns 3..13 (11 columns).
# NOTE(review): presumably these are the 11 phylum classes - confirm against
# the layout of class_labels.json.
phylum_labels_train = phylum_labels_train[:, 3:14]
phylum_labels_test = phylum_labels_test[:, 3:14]

phylum_labelsTF_train = tf.convert_to_tensor(phylum_labels_train, dtype=tf.int16)
phylum_labelsTF_test = tf.convert_to_tensor(phylum_labels_test, dtype=tf.int16)

#Printout TF to verify correct shapes and types
print(dataTF_train)
print(dataTF_test)
print(labelsTF_train)
print(labelsTF_test)
print("\n")
print(dataTFB_train)
print(dataTFB_test)
print(labelsTFB_train)
print(labelsTFB_test)
print("\n")
print(phylum_labelsTF_train)
print(phylum_labelsTF_test)

In [0]:
#print(phylum_labels_train.shape)
#for col in range(11):
#  indices = np.where(phylum_labels_train[:, col] == 1) 
#  print(len(indices[0]))

#print("\n\n")

#print(phylum_labels_test.shape)
#for col in range(11):
#  indices = np.where(phylum_labels_test[:, col] == 1) 
#  print(len(indices[0]))


#(_, _, labels_train, labels_test) = loadJsonIntoTF(path, dataFileNames, labelFileNames, False)
#(_, _, labelsB_train, labelsB_test) = loadJsonIntoTF(pathB, dataFileNamesB, labelFileNamesB, False)
#for col in range(3):
#  indices = np.where(labels_train[:, col] == 1) 
#  indices1 = np.where(labelsB_train[:, col] == 1) 
#  indices2 = np.where(labels_test[:, col] == 1) 
#  indices3 = np.where(labelsB_test[:, col] == 1) 
#  print(len(indices[0]))
#  print(len(indices1[0]))
#  print(len(indices2[0]))
#  print(len(indices3[0]))
#  print("\n\n")

# Model Definition

In [0]:
def _make_dense_classifier(inputSize, outputSize, name):
  """Build the two-layer dense softmax classifier shared by both model factories.

  The network is Dense(32, relu) followed by Dense(outputSize, softmax); the
  layers are named 'Dense_Layer_1' and 'Dense_Layer_2'. The model is returned
  uncompiled.
  """
  model = tf.keras.Sequential(name = name)

  #Model Params
  layerSizes = [32, outputSize]
  activations = ['relu', 'softmax']

  assert len(layerSizes) == len(activations)
  for idx, (numNeurons, activation) in enumerate(zip(layerSizes, activations)):
    # A distinct local name keeps the model's `name` argument from being shadowed.
    layerName = 'Dense_Layer_' + str(idx+1)
    # Only the first layer declares the input width; later layers infer it.
    inputKwargs = {'input_dim': inputSize} if idx == 0 else {}
    model.add(layers.Dense(numNeurons, activation=activation, name=layerName, **inputKwargs))

  return model

def make_bert_model(inputSize, outputSize, name="BERT_Model"):
  """Return an uncompiled dense classifier for BERT feature vectors.

  Args:
    inputSize: Width of each input vector (input shape is (None, inputSize),
      e.g. 1024).
    outputSize: Number of softmax outputs (output shape is (None, outputSize),
      e.g. 3).
    name: Name given to the Keras model.
  """
  return _make_dense_classifier(inputSize, outputSize, name)

def make_keywords_model(inputSize, outputSize, name="Keywords_Model"):
  """Return an uncompiled dense classifier for keyword feature vectors.

  Args:
    inputSize: Width of each input vector (input shape is (None, inputSize),
      e.g. 300).
    outputSize: Number of softmax outputs (output shape is (None, outputSize),
      e.g. 3).
    name: Name given to the Keras model.
  """
  return _make_dense_classifier(inputSize, outputSize, name)

#Print Model Summaries
# Model.summary() prints the table itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line per model.
bertModel = make_bert_model(1024, 3)
bertModel.summary()

kwModel = make_keywords_model(300, 3)
kwModel.summary()

bertPhylumModel = make_bert_model(1024, 11, name="BERT_Model_Phylum")
bertPhylumModel.summary()

kwPhylumModel = make_keywords_model(300, 11, name="Keywords_Model_Phylum")
kwPhylumModel.summary()

# Compile and Train Model

In [0]:
#Compile the Models
for phylumModel in (bertPhylumModel, kwPhylumModel):
  phylumModel.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

#Train Model
# Each entry: (model, training features, validation features, saved file name).
# Both models are trained against the same phylum label tensors.
# NOTE(review): this requires the feature rows and the phylum label rows to be
# in the same order and of the same count - verify against the loading cell.
trainingRuns = (
  (bertPhylumModel, dataTFB_train, dataTFB_test, "bertModel.h5"),
  (kwPhylumModel, dataTF_train, dataTF_test, "kwModel.h5"),
)
for phylumModel, trainFeatures, testFeatures, savedFileName in trainingRuns:
  # A wandb run is initialised per model and the trained model is saved into
  # that run's directory.
  wandb.init(project="geneus", magic=True)
  phylumModel.fit(trainFeatures, phylum_labelsTF_train, epochs=100, batch_size=64, validation_data=(testFeatures, phylum_labelsTF_test))
  phylumModel.save(os.path.join(wandb.run.dir, savedFileName))

# Load Model Weights from a Defined "wandb" Run

In [0]:
#Load in the test data
# testing_data_ratio=1.0 makes the training split empty, so every sample ends
# up in the returned test tensors.
(_, bertTest_dataTF_test, _, bertTest_labelsTF_test) = loadJsonIntoTF('/home/andrew/Desktop/Geneus/input_datas/non_wiki/', ["bert_data.json"], ["bert_labels.json"], testing_data_ratio=1.0)
(_, test_dataTF_test, _, test_labelsTF_test) = loadJsonIntoTF('/home/andrew/Desktop/Geneus/input_datas/non_wiki/', ["keyword_data.json"], ["keyword_labels.json"], testing_data_ratio=1.0)

#pathToKwWeightFile = "/home/andrew/Desktop/Geneus/wandb/Keywords_Run1/kwModel.h5"
#pathToBertWeightFile = "/home/andrew/Desktop/Geneus/wandb/Bert_Run1/bertModel.h5"

#print(bertTest_labelsTF_test)
#print(test_labelsTF_test)
#print("\n")
#print(bertTest_dataTF_test)
#print(test_dataTF_test)

#Get the weights file
#kwWeightFile = wandb.restore('kwModel.h5', run_path="/wandb/Keywords_Run1/")
#bertWeightFile = wandb.restore('bertModel.h5', run_path="/wandb/Bert_Run1/")

#Load weights file into model
# NOTE(review): the weight files must come from runs that trained 3-output
# models; the training cell in this notebook saves 11-output phylum models
# under the same file names - confirm these runs match.
kwModel_loaded = make_keywords_model(300, 3)
kwModel_loaded.load_weights("/home/andrew/Desktop/Geneus/wandb/Keywords_Run1/kwModel.h5")
# Use the BERT factory for the BERT weights so the loaded model carries the
# BERT model name rather than the keywords one.
bertModel_loaded = make_bert_model(1024, 3)
bertModel_loaded.load_weights("/home/andrew/Desktop/Geneus/wandb/Bert_Run1/bertModel.h5")

#Predict results on testing dataset (for Proof of Concept)
# Predictions are per-sample class probabilities. The labels are reduced with
# axis=1 to give one class index per sample; the default axis=0 would instead
# reduce over the batch dimension and not be comparable to the predictions.
# NOTE(review): assumes labels are one-hot rows of shape (num_samples, 3).
print(kwModel_loaded.predict(test_dataTF_test))
print(tf.math.argmax(test_labelsTF_test, axis=1))
print("\n\n")
print(bertModel_loaded.predict(bertTest_dataTF_test))
print(tf.math.argmax(bertTest_labelsTF_test, axis=1))