# text-classification-model-v1
This notebook classifies website data using transfer learning, starting from an existing Hugging Face model
* Get a model checkpoint for an encoder model 
* Use fine-tuning to adapt the model to a new classification problem (EAGER website data) with limited new training data
* Apply new head of model to full EAGER corpus to come up with mixes of models

## Install libraries

In [None]:
# Remove the standalone `bson` package and install pymongo instead.
# pymongo ships its own `bson` module (imported below to decode the MongoDB dump);
# the standalone PyPI `bson` package is a different, incompatible project.
!pip uninstall --yes bson
!pip install pymongo

In [None]:
# install hugging face 
!pip install transformers
!pip install datasets
!pip install DatasetDict

In [None]:
import tensorflow as tf
import numpy as np
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification

import gzip
import tarfile
import shutil
import bson
import pandas as pd

## Canned Examples

In [None]:
# Sanity check that transformers works: build a sentiment-analysis pipeline without
# naming a model (so the library picks its default) and score a single sentence.
print(pipeline('sentiment-analysis')('This application looks promising'))

In [None]:
# Smoke test: load a pretrained checkpoint and run one training step on two
# hand-written sentences. `checkpoint` and `tokenizer` are reused by later cells.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = dict(tokenizer(sequences, padding=True, truncation=True, return_tensors="tf"))

# The classification head returns raw logits, so the loss has to apply the softmax
# itself (from_logits=True). The plain "sparse_categorical_crossentropy" string would
# treat the logits as probabilities. This matches the loss used in the later training cells.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
labels = tf.convert_to_tensor([1, 1])
model.train_on_batch(batch, labels)

## EAGER test

In [None]:
# EAGER data: the archive sits directly under `tar_dir` on DBFS.
tar_dir = '/dbfs/FileStore/eager/'
tar_file = f"{tar_dir}FirmDB_about2_20190131.tar"
print(tar_file)

In [None]:
# tar = tarfile.open(tar_file)
# tar.extractall(tar_dir)
# tar.close()

In [None]:
# Paths of the "about us" pages dump: the decompressed target and its .gz source.
ungzip_file = f"{tar_dir}FirmDB_about2_20190131/pages_ABOUT2.bson"
gzip_file = f"{ungzip_file}.gz"
print(gzip_file)

In [None]:
# Decompress the gzipped BSON dump to `ungzip_file`; copyfileobj copies in chunks
# rather than reading the whole archive at once.
with gzip.open(gzip_file, 'rb') as f_in, open(ungzip_file, 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
# Decode every document in the decompressed BSON dump into `about_pages`.
with open(ungzip_file, 'rb') as bson_file:
    about_pages = bson.decode_all(bson_file.read())

In [None]:
# Peek at the 'full_text' field of the first decoded page.
about_pages[0]['full_text']

### Zero-shot classification

In [None]:
# Read one firm's "about" page as a list of lines (readlines() keeps the trailing newlines).
firm_file_location = '/dbfs/FileStore/eager/about/Zygo.txt'
# A context manager closes the file handle as soon as the lines are read;
# the previous bare open() left it open for the life of the kernel.
with open(firm_file_location, "r") as txt_file:
    content_list = txt_file.readlines()
print(content_list)

In [None]:
# Show line 6 of the page, the first line sent to the zero-shot classifier below.
content_list[6]

In [None]:
# Build a zero-shot classification pipeline; no model is named, so the library's
# default for this task is used.
classifier = pipeline("zero-shot-classification")

In [None]:
candidate_labels = ["business", "marketing", "manufacturing", "research", "engineering"]

# Score lines 6 through 9 of the page against every candidate label.
for line_no in range(6, 10):
    print(classifier(content_list[line_no], candidate_labels))

In [None]:
# NOTE(review): when the cells run top to bottom, `classifier` is still the zero-shot
# pipeline here, and this call passes no candidate labels. The deliberately ungrammatical
# sentence suggests it was meant for the CoLA classifier created in the next cell —
# confirm the intended cell order.
classifier("I havern't a dog.")

In [None]:
# Replace the zero-shot pipeline with a text-classification model trained on CoLA
# (per the checkpoint name).
classifier = pipeline("text-classification", model = "textattack/distilbert-base-uncased-CoLA")

# Print every line of the page. Iterating the list directly replaces the index loop.
# The classification call itself is left disabled, as in the original.
for line in content_list:
    print(line)
    # res = classifier(line)
    # print(res)

### Garbage classifier
Labels (`keep_text` column): keep text == 1, discard == 0

In [None]:
import datasets
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load the labelled samples and list any rows whose text is missing.
firm_file_location = '/dbfs/FileStore/eager/garbage_classifier_input.csv'
input_df = pd.read_csv(firm_file_location)
missing_text = input_df['sample_text'].isnull()
input_df[missing_text]

Unnamed: 0,sample_text,keep_text


In [None]:
# Load the labelled CSV as a Hugging Face Dataset (everything lands in the "train" split).
dataset = datasets.load_dataset('csv', data_files=firm_file_location,  split='train')
# cast_column returns a new Dataset instead of modifying this one in place, so the
# result must be assigned back for the int8 label type to take effect.
dataset = dataset.cast_column("keep_text", datasets.Value('int8'))
# Echo the dataset summary, as the original cell did.
dataset

In [None]:
# 90% train, 10% held out for test + validation (test_size=0.1)
train_test_dataset = dataset.train_test_split(test_size=0.1)
# Split the held-out 10% again: 80% of it for validation, 20% of it for test (test_size=0.2)
test_valid_dataset = train_test_dataset['test'].train_test_split(test_size=0.2)
# Gather all three splits in a single DatasetDict. Note that 'valid' takes the larger
# ('train') side of the second split and 'test' the smaller one.
train_test_valid_dataset = datasets.DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_valid_dataset['test'],
    'valid': test_valid_dataset['train']})

In [None]:
# Echo the DatasetDict to inspect the three splits.
train_test_valid_dataset

In [None]:
def tokenize_function(x):
    """Tokenize the ``sample_text`` entries of ``x`` with the notebook-level tokenizer.

    Args:
        x: Mapping with a ``"sample_text"`` key (a batch of rows when called
            through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for those texts, truncated to at most 100 tokens.
    """
    texts = x["sample_text"]
    return tokenizer(texts, truncation=True, max_length=100)

In [None]:
# Tokenize every split, passing rows to tokenize_function in batches of up to 2000.
tokenized_dataset = train_test_valid_dataset.map(tokenize_function, batched=True, batch_size=2000)

In [None]:
# Pull the training split into plain Python lists and drop the raw-text column.
# NOTE(review): the CSV header echoed earlier shows no "idx" column, so only
# "sample_text" is actually removed here — confirm the filter list is what was intended.
samples = tokenized_dataset["train"].to_dict()
samples = {
    name: values
    for name, values in samples.items()
    if name not in ["idx", "sample_text"]
}
# Distinct token-sequence lengths before padding.
{len(ids) for ids in samples["input_ids"]}

In [None]:
from transformers import DataCollatorWithPadding

# Pad every batch to a fixed 100 tokens (the same limit tokenize_function truncates to)
# and return TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=100,
    return_tensors="tf",
)

In [None]:
# Collate the training samples once and show the resulting shape of each field.
batch = data_collator(samples)
{field: tensor.shape for field, tensor in batch.items()}

In [None]:
# Convert the tokenized splits into batched tf.data datasets for Keras.
# The three tokenizer output columns are the model inputs and "keep_text" is the label.
# Both use the padding collator and batches of 8; only the training split is shuffled.
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols="keep_text",
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

# Validation split: same columns and batching, order preserved (shuffle=False).
tf_validation_dataset = tokenized_dataset["valid"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols="keep_text",
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

In [None]:
# Echo the training dataset object to inspect it.
tf_train_dataset

In [None]:
# (Same class as imported at the top of the notebook; re-imported so this cell is self-contained.)
from transformers import TFAutoModelForSequenceClassification

# Reload the pretrained checkpoint with a 2-label classification head (keep vs. discard).
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Baseline run: compile with the string "adam" (Keras' default Adam settings) and a
# sparse cross-entropy loss applied to the model's raw logits (from_logits=True).
# NOTE(review): Keras' default Adam learning rate (1e-3) is far higher than the 5e-5
# schedule configured in the later cell, so treat this run as a baseline only.
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
# No `epochs` argument is given, so Keras trains for its default of one epoch.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
)

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay

# batch_size mirrors the value used when the tf.data datasets were built; it is not
# needed for the step count below because tf_train_dataset is already batched.
batch_size = 8
num_epochs = 5

# len() of the batched tf.data.Dataset is the number of batches (optimizer steps) per
# epoch, so the total number of training steps is that count times the number of epochs.
num_train_steps = num_epochs * len(tf_train_dataset)

# Decay the learning rate from 5e-5 down to 0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)
opt = Adam(learning_rate=lr_scheduler)

In [None]:
# Reload the pretrained checkpoint so weights changed by the earlier baseline fit
# are discarded, then compile with the scheduled Adam optimizer.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# The loss is applied to the model's raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

In [None]:
# Fine-tune for num_epochs epochs, the same count the learning-rate schedule was sized for.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs)