## Binary classification of tweets using tiny-BERT on a tokenized dataset of tweets (https://huggingface.co/huawei-noah/TinyBERT_General_4L_312D)

Imports and load data from URL then check for null values

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

# Only the tweet text and its target column are needed, so restrict the columns at read time.
df = pd.read_csv(
    "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/twitter_disaster_tweets.csv",
    usecols=["text", "target"],
)
df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [2]:
# Count missing values per column.
df.isnull().sum()


text      0
target    0
dtype: int64

Sample the data to effectively randomize the labeled entries

In [6]:
# Shuffle every row (frac=1) with a fixed seed so the row order, and therefore
# every number reported further down, is the same on each Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Rename the target column to 'label', the name used by the rest of the notebook.
df = df.rename(columns={'target': 'label'})
df.head()

Unnamed: 0,text,label
0,@VileLunar I trickshot with a regular controll...,0
1,@CTAZtrophe31 Everything must be OK because sh...,0
2,Two-vehicle collision at Fowlers Corners at Hw...,1
3,'I eat because it makes my mouth explode with ...,0
4,China's Stock Market Crash: Are There Gems In ...,0


Convert the pandas DataFrame into a Hugging Face `Dataset`, split it into train and test sets, and inspect the first training element. Then check the shape of each split.

In [7]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs so reported metrics are comparable.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

dataset['train'][0]

{'text': 'What you gonna do now puppies?! No more destroying my #iPhone Lightning cables! https://t.co/Z4jyHaRreW',
 'label': 0}

In [8]:
# (rows, columns) for each split.
dataset.shape

{'train': (6090, 2), 'test': (1523, 2)}

Create dictionaries to map the numerical labels to categories

In [10]:
# Class index -> human-readable name.
id2label = {0: 'general', 1: 'disaster'}
# Inverse mapping, derived from id2label so the two can never disagree.
label2id = {name: idx for idx, name in id2label.items()}

Tokenize data - AutoTokenizer ensures format is good for Tiny-BERT

In [9]:
from transformers import AutoTokenizer
import torch

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint shared by the tokenizer and, later, the classification model.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

def tokenize(batch):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Sequences are truncated to at most 100 tokens and padded within the batch.
    The tokenizer's output is returned unchanged.
    """
    return tokenizer(batch['text'], max_length=100, truncation=True, padding=True)

# batched=True passes tokenize a batch of rows at a time.
# NOTE(review): batch_size=None should make each split a single batch (see the
# datasets `map` docs), so padding=True would pad to the longest tweet in that split — confirm.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map: 100%|██████████| 6090/6090 [00:00<00:00, 6390.83 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 5498.02 examples/s]


Check first element (in tokenized form)

In [11]:
# Re-tokenizes the raw text of the first training row; the padding/truncation
# arguments used inside tokenize() are not passed here.
tokenizer(dataset['train'][0]['text'])

{'input_ids': [101, 2054, 2017, 6069, 2079, 2085, 26781, 13046, 1029, 999, 2053, 2062, 9846, 2026, 1001, 18059, 7407, 15196, 999, 16770, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 1062, 2549, 3501, 2100, 8167, 15603, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
# Show both splits and the columns they hold after tokenization.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1523
    })
})

In [13]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` is unpacked into (model outputs, reference labels). The
    predicted class is the argmax along axis 1 of the model outputs
    (presumably per-class scores of shape (n_samples, n_labels) — confirm).
    Returns whatever ``accuracy.compute`` returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Build model and run

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a classification head sized to the label maps;
# passing both maps stores the readable class names in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Training configuration; anything not set here falls back to the TrainingArguments defaults.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=5,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Run evaluation on eval_dataset once per epoch.
    # NOTE(review): newer transformers releases appear to call this argument
    # `eval_strategy` — confirm against the installed version before upgrading.
    evaluation_strategy='epoch'
)

# The held-out 'test' split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    # NOTE(review): newer transformers releases appear to prefer `processing_class`
    # over `tokenizer` here — confirm against the installed version.
    tokenizer=tokenizer
)

In [16]:
# Fine-tune the model; evaluation metrics are logged once per epoch.
trainer.train()

                                                 
 20%|██        | 191/955 [03:38<09:33,  1.33it/s]

{'eval_loss': 0.48747846484184265, 'eval_accuracy': 0.799080761654629, 'eval_runtime': 14.0149, 'eval_samples_per_second': 108.67, 'eval_steps_per_second': 3.425, 'epoch': 1.0}


                                                   
 40%|████      | 382/955 [06:59<08:15,  1.16it/s]

{'eval_loss': 0.44274550676345825, 'eval_accuracy': 0.8108995403808273, 'eval_runtime': 16.2008, 'eval_samples_per_second': 94.008, 'eval_steps_per_second': 2.963, 'epoch': 2.0}


 52%|█████▏    | 500/955 [08:50<06:17,  1.21it/s]

{'loss': 0.5203, 'grad_norm': 7.007883071899414, 'learning_rate': 4.764397905759163e-06, 'epoch': 2.62}


                                                 
 60%|██████    | 573/955 [10:15<05:22,  1.18it/s]

{'eval_loss': 0.4252675473690033, 'eval_accuracy': 0.8214051214707814, 'eval_runtime': 16.9492, 'eval_samples_per_second': 89.857, 'eval_steps_per_second': 2.832, 'epoch': 3.0}


                                                 
 80%|████████  | 764/955 [13:42<02:34,  1.23it/s]

{'eval_loss': 0.42995351552963257, 'eval_accuracy': 0.814182534471438, 'eval_runtime': 15.5594, 'eval_samples_per_second': 97.883, 'eval_steps_per_second': 3.085, 'epoch': 4.0}


                                                 
100%|██████████| 955/955 [17:22<00:00,  1.09s/it]

{'eval_loss': 0.42370468378067017, 'eval_accuracy': 0.8168089297439265, 'eval_runtime': 16.1491, 'eval_samples_per_second': 94.309, 'eval_steps_per_second': 2.972, 'epoch': 5.0}
{'train_runtime': 1042.9742, 'train_samples_per_second': 29.195, 'train_steps_per_second': 0.916, 'train_loss': 0.4760295927836633, 'epoch': 5.0}





TrainOutput(global_step=955, training_loss=0.4760295927836633, metrics={'train_runtime': 1042.9742, 'train_samples_per_second': 29.195, 'train_steps_per_second': 0.916, 'total_flos': 69927811858800.0, 'train_loss': 0.4760295927836633, 'epoch': 5.0})

In [17]:
# Evaluate the trained model on the eval_dataset given to the Trainer (the 'test' split).
trainer.evaluate()

100%|██████████| 48/48 [00:15<00:00,  3.12it/s]


{'eval_loss': 0.42370468378067017,
 'eval_accuracy': 0.8168089297439265,
 'eval_runtime': 15.6689,
 'eval_samples_per_second': 97.199,
 'eval_steps_per_second': 3.063,
 'epoch': 5.0}

In [18]:
# Save the fine-tuned model to the local 'tinybert-disaster-tweet' directory.
trainer.save_model('tinybert-disaster-tweet')

In [1]:
from transformers import pipeline
import torch

# Hand-written examples to sanity-check the saved model.
data = [
    'There is a fire in the building',
    'I am happy today',
    'I am sad today',
    'I am not feeling well',
    'There is a flood in the city, go to higher ground',
    'Oh my god its an earthquake!',
]

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model from the local 'tinybert-disaster-tweet' directory.
classifier = pipeline('text-classification', model='tinybert-disaster-tweet', device=device)

classifier(data)

  from .autonotebook import tqdm as notebook_tqdm


[{'label': 'disaster', 'score': 0.863773763179779},
 {'label': 'general', 'score': 0.8823919296264648},
 {'label': 'general', 'score': 0.8765028119087219},
 {'label': 'general', 'score': 0.8826189041137695},
 {'label': 'disaster', 'score': 0.8745247721672058},
 {'label': 'disaster', 'score': 0.5946744084358215}]

Upload model to AWS (set up bucket name as appropriate)

In [4]:
# Upload the saved model folder to the S3 bucket under ml-models/tinybert-disaster-tweet
import os
import boto3

# No credentials are passed here — presumably boto3 picks them up from the
# local AWS configuration; confirm before running.
s3 = boto3.client('s3')
# Target bucket; change this to a bucket you own.
bucket_name = 'mlops-dgw1974'

def upload_directory(directory_path, s3_prefix, dry_run=True):
    """Map every file under a local directory to an S3 key and optionally upload it.

    The key for each file is ``s3_prefix`` joined with the file's path relative
    to ``directory_path``, using forward slashes. Every key is printed.

    Args:
        directory_path: Local directory to walk recursively.
        s3_prefix: Key prefix that each relative file path is appended to.
        dry_run: When True (the default) the keys are only printed and nothing
            is sent to S3. Pass False to upload each file to the module-level
            ``bucket_name`` through the module-level ``s3`` client.
    """
    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file).replace("\\", "/")
            relpath = os.path.relpath(file_path, directory_path)
            # S3 keys always use "/" regardless of the local path separator.
            s3_key = os.path.join(s3_prefix, relpath).replace("\\", "/")
            print(s3_key)

            if not dry_run:
                s3.upload_file(file_path, bucket_name, s3_key)


# Prints the S3 key for every file under the saved model directory.
upload_directory('tinybert-disaster-tweet', 'ml-models/tinybert-disaster-tweet')

ml-models/tinybert-disaster-tweet/config.json
ml-models/tinybert-disaster-tweet/model.safetensors
ml-models/tinybert-disaster-tweet/special_tokens_map.json
ml-models/tinybert-disaster-tweet/tokenizer.json
ml-models/tinybert-disaster-tweet/tokenizer_config.json
ml-models/tinybert-disaster-tweet/training_args.bin
ml-models/tinybert-disaster-tweet/vocab.txt


In [22]:
# Where to read from (bucket + key prefix) and where to write locally.
bucket_name = "mlops-dgw1974"
s3_prefix = 'ml-models/tinybert-disaster-tweet/'
local_path = 'test_dir'

s3 = boto3.client('s3')
paginator = s3.get_paginator('list_objects_v2')
    

def download_dir(local_path, s3_prefix):
    """Download every object under ``s3_prefix`` into ``local_path``.

    Objects are listed page by page from the module-level ``bucket_name`` using
    the module-level ``s3`` client. Each object is written to ``local_path``
    joined with the key's path relative to ``s3_prefix``, so any nesting below
    the prefix is reproduced locally.

    Args:
        local_path: Local destination directory; created if missing.
        s3_prefix: Key prefix to list and download.
    """
    os.makedirs(local_path, exist_ok=True)
    paginator = s3.get_paginator('list_objects_v2')
    for result in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix):
        # A page carries no 'Contents' key when nothing matches the prefix.
        for entry in result.get('Contents', []):
            s3_key = entry['Key']

            # Keys ending in "/" are folder placeholders, not files to download.
            if s3_key.endswith('/'):
                continue

            local_file = os.path.join(local_path, os.path.relpath(s3_key, s3_prefix))
            # Nested keys need their parent directories to exist first;
            # download_file would otherwise fail on the missing directory.
            os.makedirs(os.path.dirname(local_file), exist_ok=True)

            s3.download_file(bucket_name, s3_key, local_file)

# Download every object under s3_prefix into local_path.
download_dir(local_path, s3_prefix)


In [21]:
# Scratch check: the local path download_dir builds for a single key.
os.path.join('test_dir', os.path.relpath('ml-models/tinybert-disaster-tweet/config.json', 'ml-models/tinybert-disaster-tweet/'))

'test_dir\\config.json'