# DistilBERT Model

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [18]:
import pandas as pd
import numpy as np
import os
# os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
# os.environ['CUDA_VISIBLE_DEVICES']='0'

from preproc_text import process_tweets
from preproc_abbv import abbreviations
from preproc_class import TextPreprocess

from tensorflow.keras.preprocessing.text import text_to_word_sequence

from sklearn.model_selection import train_test_split

import ktrain
from ktrain import text

## Import Data

In [4]:
# Load the raw tweet CSV from disk.
# The data directory can be overridden with the SENTIMENT140_DIR environment
# variable so the notebook is not tied to one machine; the original absolute
# path is kept as the fallback, so behaviour is unchanged when it is unset.
# NOTE(review): the file is read with pandas' default encoding — confirm that
# matches the file on disk.
data_dir = os.environ.get('SENTIMENT140_DIR', '/home/sbyhung/code/danm91/le_kingmakers/raw_data')
csv_path = os.path.join(data_dir, 'training.1600000.processed.noemoticon.csv')
# header=None: the first row is treated as data and columns are numbered from 0.
# read_csv already returns a fresh frame, so no extra .copy() is needed.
df = pd.read_csv(csv_path, header=None)

In [27]:
# Draw a small, reproducible subset of the full frame for quick experiments.
SAMPLE_FRACTION = 0.0001  # share of rows to keep
sample_size = int(len(df) * SAMPLE_FRACTION)
data_sample = df.sample(n=sample_size, random_state=0)
data_sample.shape

(160, 6)

In [28]:
# Separate the inputs (column 5) from the labels (column 0) and show their shapes.
y = data_sample.iloc[:, 0]
X = data_sample.iloc[:, 5]
for series in (X, y):
    display(series.shape)

(160,)

(160,)

In [29]:
# Recode the labels from {0, 4} to {0, 1} (positive 4 becomes 1).
# Built from the untouched label column of data_sample rather than from y
# itself, so re-running this cell gives the same result: mapping y onto itself
# a second time would turn every 1 into NaN, because 1 is not a key of the mapping.
y = data_sample.iloc[:, 0].map({0: 0, 4: 1})

In [30]:
# Clean every entry with the project's process_tweets function.
# Applied to the raw column 5 of data_sample (not to X itself) so that
# re-running this cell never feeds already-cleaned text through the cleaner again.
X = data_sample.iloc[:, 5].apply(process_tweets)

In [31]:
# Two-stage stratified split giving train:test:val = 60:20:20.
# Stage 1 holds out 20% as the test set; stage 2 takes 25% of the remaining
# 80% (i.e. 20% of the total) as the validation set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=0, stratify=y_temp
)
for split in (X_train, X_test, X_val, y_train, y_test, y_val):
    display(split.shape)

(96,)

(32,)

(32,)

(96,)

(32,)

(32,)

In [32]:
# Turn each pandas Series split into a plain NumPy array.
X_train, X_test, X_val = (np.array(part) for part in (X_train, X_test, X_val))
y_train, y_test, y_val = (np.array(part) for part in (y_train, y_test, y_val))

In [33]:
# Confirm the shape of every split.
for arr in (X_train, X_test, X_val, y_train, y_test, y_val):
    display(arr.shape)

(96,)

(32,)

(32,)

(96,)

(32,)

(32,)

## Processing Data

In [34]:
#### Tokenise words

In [35]:
# Show the distinct label values present in y.
y.unique()

array([0, 1])

## Define Model & Preprocess Data

In [36]:
# Pretrained checkpoint name and the class labels handed to ktrain.
model_name = 'distilbert-base-uncased'
# Series.unique() returns values in order of appearance, which depends on the
# sampled rows; sort so the class order is deterministic (0 before 1).
# NOTE(review): ktrain is assumed to pair integer label i with class_names[i] — confirm.
class_names = np.sort(y.unique())

In [37]:
# Build the ktrain Transformer preprocessor for the chosen checkpoint and classes.
# NOTE(review): maxlen is 512, yet the preprocessing log further down reports a
# 99th-percentile sequence length of about 15 — a smaller maxlen may be enough; confirm.
trans = text.Transformer(model_name, maxlen=512, class_names=class_names)


In [39]:
# Preprocess each split with the Transformer preprocessor.
# The training split goes through preprocess_train; the test and validation
# splits both go through preprocess_test.
train_data = trans.preprocess_train(X_train, y_train)
test_data = trans.preprocess_test(X_test, y_test)
val_data = trans.preprocess_test(X_val, y_val)

preprocessing train...
language: en
train sequence lengths:
	mean : 7
	95percentile : 13
	99percentile : 15


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 7
	95percentile : 14
	99percentile : 15


preprocessing test...
language: en
test sequence lengths:
	mean : 6
	95percentile : 11
	99percentile : 13


## Instantiate Model & Find Learning Rate & Fit

In [40]:
# Instantiate the model using get_classifier on the Transformer preprocessor.
model = trans.get_classifier()

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

In [41]:
# Bundle the model with its training and validation data into a ktrain learner.
learner = ktrain.get_learner(
    model,
    train_data=train_data,
    val_data=val_data,
    batch_size=16,
)

In [42]:
# Find the best learning rate: simulate training over different learning rates
# (at most 10 epochs) and plot the result.
learner.lr_find(show_plot=True, max_epochs=10)

simulating training for different learning rates... this may take a few moments...
Epoch 1/10

In [None]:
# Train for 1 epoch with the one-cycle policy at a learning rate of 1e-4.
# (The cell previously passed `le-4` — the letter "l" — which Python reads as an
# undefined name `le` minus 4 and which raises NameError.)
learner.fit_onecycle(1e-4, 1)

## Evaluate Model Performance

In [None]:
# Get performance of the model on the learner's validation data.
# class_names comes from y.unique(), i.e. a NumPy array of integer labels;
# pass it as a plain list of strings so the report labels are well-formed and no
# truth-value check on an array can fail.
# NOTE(review): ktrain is understood to expect a list of strings here — confirm.
# NOTE(review): test_data is preprocessed earlier but not evaluated anywhere in
# the visible notebook — consider also validating on it.
learner.validate(class_names=[str(name) for name in class_names])

In [None]:
# View the 5 data points with the biggest losses.
learner.view_top_losses(n=5, preproc=trans)

## Predictions

In [None]:
# Create a predictor from the trained model and the same preprocessor used for training.
predictor = ktrain.get_predictor(learner.model, preproc=trans)

In [None]:
# Text to run through the predictor.
test_string = "some example string to be evaluated"

In [None]:
# Get the prediction result for test_string.
predictor.predict(test_string)

## Saving Model

In [None]:
#######  TODO: THE FOLLOWING CELL NEEDS TO BE ENTERED AND RUN TO SAVE THE MODEL ######

In [None]:
# Run the line below to save the predictor under the given name.
# NB: the saved output is very large (100s of MB).
predictor.save('name_of_model')