# garbage_classifier

This notebook classifies website text snippets as useful or not (i.e., garbage) using transfer learning, starting from an existing Hugging Face model
* Get a model checkpoint for an encoder model 
* Fine-tune the model on a new classification problem (EAGER website data) with limited new training data
* Apply new head of model to full EAGER corpus to come up with mixes of models
* Metrics and model registered through a combination of comet.ml and tensorboard 

## Install and import libraries

In [11]:
# check environment
# `sys` is imported here because this cell runs before the notebook's main
# import cell; without it the cell raises NameError on a fresh kernel.
import sys

# True only when the google.colab package has been loaded into this kernel.
IN_COLAB = 'google.colab' in sys.modules
print(IN_COLAB)

False


In [17]:
# colab file system setup 
if IN_COLAB: 
    !git clone https://github.com/euphonic/EAGER.git
    !pwd
    !mkdir /content/logs

In [18]:
# mount google drive if in colab
# Mount point; later cells build config-file paths by appending to this prefix.
drive_path = '/content/drive/'

if IN_COLAB:  
    # google.colab is imported inside the guard, so this cell is a no-op elsewhere.
    from google.colab import drive
    drive.mount(drive_path)

In [20]:
# NOTE(review): this cell imports comet_ml before the cell that imports
# transformers/tensorflow - presumably deliberate so Comet can instrument
# those frameworks; confirm before moving the import.
from comet_ml import Experiment
from dotenv import load_dotenv

# setup comet_ml experiment
if IN_COLAB: 
    # read env file from Google drive
    env_file = drive_path + 'MyDrive/raaste-config/.env'
    # NOTE(review): comet_config_file is assigned but not referenced by any
    # other visible cell - verify whether it is still needed.
    comet_config_file = drive_path + 'MyDrive/raaste-config/.comet.config'
    # Load the variables defined in the .env file into the process environment.
    load_dotenv(env_file)

# read config file from git repo
# Experiment() is called without arguments, so its settings are not taken
# from anything passed in this cell.
experiment = Experiment()

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/euphonic/general/374c3618792e47e0aea5b09cab9857a0



In [21]:
# install huggingface and other modules if in colab
if IN_COLAB: 
    !pip install transformers
    !pip install datasets
    !pip install python-dotenv

In [22]:
# ml libraries
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding
import datasets
from datasets import Dataset
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from sklearn.model_selection import train_test_split
import pandas as pd

# other
import sys
import numpy as np
import gzip
import tarfile
import datetime

In [23]:
# load tensorboard 
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


## Huggingface setup

In [24]:
# Name of the pretrained checkpoint; the same name is used later to load the
# classification model, so tokenizer and model stay matched.
checkpoint = "bert-base-uncased"
# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 7.89kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 570/570 [00:00<00:00, 221kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 226k/226k [00:00<00:00, 3.44MB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 455k/455k [00:00<00:00, 6.91MB/s]


### Garbage classifier
Labels (`of_interest`): keep text == 1, discard == 0

In [26]:
# Labelled snippets used to train the garbage classifier.
# NOTE(review): the path is hard-coded to the Colab checkout under /content;
# the recorded run of this cell raised FileNotFoundError. Consider deriving
# it from a configurable repository root.
firm_file_location = '/content/EAGER/data/modeling/garbage/garbage_classifier_input.csv'
input_df = pd.read_csv(firm_file_location)

FileNotFoundError: [Errno 2] No such file or directory: '/content/EAGER/data/modeling/garbage/garbage_classifier_input.csv'

In [None]:
# Keep only the rows that actually carry snippet text.
non_null_df = input_df.loc[input_df['sample_text'].notna()]
non_null_df

Unnamed: 0,sample_text,of_interest
0,Our Management,0
1,Latest Press Releases,0
2,On-Going Clinical Studies on Very Low Nicotine...,1
3,Links to the ‚ÄúMiracle Plant‚Äù,0
4,This advisory note presents the conclusions an...,1
...,...,...
5619,DLS,0
5620,Sign up to get the latest news from Socialx,1
5621,Our Distributors,0
5622,The motivation for starting was the frustratio...,1


In [None]:
# Wrap the cleaned frame as a Hugging Face dataset.
dataset = Dataset.from_pandas(non_null_df, split='train')
# cast_column returns a new Dataset instead of modifying the receiver, so the
# result has to be re-bound for the int8 label type to take effect.
dataset = dataset.cast_column("of_interest", datasets.Value('int8'))
dataset

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['sample_text', 'of_interest', '__index_level_0__'],
    num_rows: 5624
})

In [None]:
# 80% train, 20% held out for test + validation.
# A fixed seed makes the split reproducible across kernel restarts.
train_test_dataset = dataset.train_test_split(test_size=0.2, seed=42)
# Split the held-out 20% again: 80% of it becomes validation, 20% becomes test.
test_valid_dataset = train_test_dataset['test'].train_test_split(test_size=0.2, seed=42)
# Gather the three splits into a single DatasetDict.
train_test_valid_dataset = datasets.DatasetDict({
    'train': train_test_dataset['train'],
    'test': test_valid_dataset['test'],
    'valid': test_valid_dataset['train']})

In [None]:
# Display the DatasetDict to check the features and row count of each split.
train_test_valid_dataset

DatasetDict({
    train: Dataset({
        features: ['sample_text', 'of_interest', '__index_level_0__'],
        num_rows: 4499
    })
    test: Dataset({
        features: ['sample_text', 'of_interest', '__index_level_0__'],
        num_rows: 225
    })
    valid: Dataset({
        features: ['sample_text', 'of_interest', '__index_level_0__'],
        num_rows: 900
    })
})

In [None]:
def tokenize_function(x):
    """Tokenize the ``sample_text`` entry of ``x`` with the notebook tokenizer.

    Inputs longer than 100 tokens are truncated; no padding is applied here.

    Args:
        x: Mapping that provides the text (or texts) under ``"sample_text"``.

    Returns:
        The tokenizer's output for that text.
    """
    encoded = tokenizer(x["sample_text"], truncation=True, max_length=100)
    return encoded

In [None]:
# Tokenize every split, handing the tokenizer batches of up to 2000 rows.
tokenized_dataset = train_test_valid_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=2000,
)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Column-wise dict of the tokenized training split, minus the "idx" and
# "sample_text" columns.
samples = {
    name: column
    for name, column in tokenized_dataset["train"].to_dict().items()
    if name not in ["idx", "sample_text"]
}
# Distinct token-sequence lengths present before padding.
{len(ids) for ids in samples["input_ids"]}

{2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100}

In [None]:
# Collator that pads every sequence to a fixed 100 tokens and returns
# TensorFlow tensors.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=100,
    return_tensors="tf",
)

In [None]:
# Collate the whole training sample once and show the shape of each tensor.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

{'__index_level_0__': TensorShape([4499]),
 'attention_mask': TensorShape([4499, 100]),
 'input_ids': TensorShape([4499, 100]),
 'of_interest': TensorShape([4499]),
 'token_type_ids': TensorShape([4499, 100])}

In [None]:
# Settings shared by the training and validation tf.data pipelines.
_tf_dataset_kwargs = dict(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols="of_interest",
    collate_fn=data_collator,
    batch_size=8,
)

# Only the training split is shuffled.
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(shuffle=True, **_tf_dataset_kwargs)
tf_validation_dataset = tokenized_dataset["valid"].to_tf_dataset(shuffle=False, **_tf_dataset_kwargs)

In [None]:
# Display the dataset to inspect its element spec (feature and label shapes/dtypes).
tf_train_dataset

<PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(8, None), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(8, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(8, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(8,), dtype=tf.int64, name=None))>

In [None]:
# Load the pretrained checkpoint with a two-label sequence-classification head.
# TFAutoModelForSequenceClassification is already imported in the notebook's
# main import cell, so it is not re-imported here.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Baseline run: Adam selected by name, integer labels scored against raw logits.
baseline_loss = SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer="adam", loss=baseline_loss, metrics=["accuracy"])

# No `epochs` argument is given, so Keras' default number of epochs is used.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset)



<keras.callbacks.History at 0x7f1245ce5390>

In [None]:
# NOTE(review): batch_size is not referenced by the visible cells; the tf
# datasets were built with their own batch size - confirm which is intended.
batch_size = 32
num_epochs = 50

# len() of the batched training dataset is taken as the number of steps per
# epoch, so the total step count is that length times the number of epochs.
num_train_steps = num_epochs * len(tf_train_dataset)

# Decay the learning rate from 5e-5 down to 0 over the whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps,
)
opt = Adam(learning_rate=lr_scheduler)

In [None]:
# One timestamped TensorBoard log directory per run.
log_dir = "/content/logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Start again from the pretrained checkpoint so this run does not continue
# from the weights produced by the earlier training run.
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Keras callbacks are an argument of `fit`, not of `compile`; passing
# `callbacks=` to `compile` is not supported, so it is omitted here.
model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Keras takes callbacks in fit(); hand over the TensorBoard callback so the
# run is actually written to the log directory.
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    callbacks=[tensorboard_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe8fabb2d50>

In [None]:
# Save the trained model under /content/EAGER/models.
# NOTE(review): hard-coded Colab path - this presumably only works when the
# repository was cloned to /content/EAGER; confirm for local runs.
model.save('/content/EAGER/models/garbage_classifier_v1')



INFO:tensorflow:Assets written to: /content/EAGER/models/garbage_classifier_v1/assets


INFO:tensorflow:Assets written to: /content/EAGER/models/garbage_classifier_v1/assets


In [None]:
# Mount Google Drive again, but only on Colab: the google.colab package is
# not importable elsewhere, so an unguarded import would break local runs.
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')