# Build Classifier
## Load libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import sklearn 
import sys
import pickle
import transformers
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from keras import models, layers, metrics
import datasets
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM

## Load dataset

In [None]:
# Map each split name to the parquet file that holds it.
data_files = dict(
    train="data/classifier-train.parquet",
    valid="data/classifier-valid.parquet",
    test="data/classifier-test.parquet",
)

# Read the three splits through the generic "parquet" builder.
ds = load_dataset("parquet", data_files=data_files)
ds  # bare expression at the end of the cell so the notebook echoes the result

## Loading checkpoint and tokenizer

In [None]:
# Checkpoint identifier shared by the tokenizer, the config and the encoder.
checkpoint = 'jackaduma/SecBERT'

# Tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Checkpoint configuration with hidden-state output switched on.
config = transformers.BertConfig.from_pretrained(
    checkpoint,
    output_hidden_states=True,
)

# TensorFlow BERT encoder, loaded as trainable so that it is updated
# together with the classification head built on top of it.
secbert_tf = transformers.TFBertModel.from_pretrained(
    checkpoint,
    config=config,
    trainable=True,
)

### Tokenize text

In [None]:
def tokenize(batch):
    """Tokenize the "text" entries of *batch* with the module-level tokenizer.

    Every sequence is truncated or padded to exactly 512 tokens and the
    result is returned as TensorFlow tensors.
    """
    encode_options = {
        "max_length": 512,
        "padding": 'max_length',
        "truncation": True,
        "return_tensors": "tf",
    }
    texts = batch["text"]
    return tokenizer(texts, **encode_options)

In [None]:
# Tokenize every split; batch_size=None passes each split to `tokenize`
# as one single batch.
ds_encoded = ds.map(
    tokenize,
    batched=True,
    batch_size=None,
)
# Show which columns the training split has after tokenization.
print(ds_encoded["train"].column_names)

In [None]:
# Model inputs: token ids of each split as int32 arrays.
#
# `ds.map(tokenize, ...)` has already stored the token ids in `ds_encoded`,
# and `tokenize` uses the same settings as the per-text calls it replaces
# (max_length=512, padding='max_length', truncation=True). Reusing that
# column yields the same ids without tokenizing every text a second time,
# one string at a time.
X_tf_train = np.array(ds_encoded["train"]["input_ids"], dtype='int32')

X_tf_valid = np.array(ds_encoded["valid"]["input_ids"], dtype='int32')

In [None]:
# Targets: the "label" column of each split, converted to NumPy arrays.
y_tf_train = np.array(ds_encoded["train"]["label"])
y_tf_valid = np.array(ds_encoded["valid"]["label"])

## Build model

In [None]:
# Length of the token-id input; has to match the max_length the texts were
# padded/truncated to during tokenization.
MAX_SEQ_LEN = 512
# Width of the softmax output layer, i.e. the number of target classes.
NUM_CLASSES = 190

input_ids_in = layers.Input(shape=(MAX_SEQ_LEN,), name='input_token', dtype='int32')

# Take output [0] of the encoder and keep only position 0 of the sequence
# axis, so each example is represented by the vector of its first token.
# NOTE(review): no attention_mask is passed, so padding positions are
# presumably attended to like real tokens -- consider feeding the mask too.
x = secbert_tf(input_ids=input_ids_in)[0][:, 0, :]
x = layers.Dropout(0.2, name='dropout')(x)
x = layers.Dense(768, activation='relu', name='pre-classifier')(x)
x = layers.Dense(NUM_CLASSES, activation='softmax', name='classification')(x)

model_tf = models.Model(inputs=input_ids_in, outputs=x, name='ClassificationModelTF')

# The encoder is trainable, so the whole network is fine-tuned. The default
# Adam learning rate selected by optimizer='adam' (1e-3) is far above the
# 2e-5..5e-5 range normally used for fine-tuning BERT and tends to wreck the
# pretrained weights, so a small explicit rate is used instead.
model_tf.compile(
    optimizer=keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=[metrics.SparseCategoricalAccuracy()],
)
model_tf.summary()



## Start training

In [None]:

# Fit the classifier on the training arrays, using the validation arrays
# as validation data; the returned History object is kept in `history`.
history = model_tf.fit(
    X_tf_train,
    y_tf_train,
    validation_data=(X_tf_valid, y_tf_valid),
    epochs=12,
    batch_size=32,
    shuffle=True,
)

## Save model

In [None]:
# Write the trained model to a single .h5 file.
model_tf.save(filepath='output/classificator-secbert.h5')