In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

In [3]:
# Read only the `text` and `target` columns from the hosted CSV.
df = pd.read_csv(
    "https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/twitter_disaster_tweets.csv",
    usecols=['text', 'target'],
)

df.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# A bare expression in the middle of a cell is evaluated and thrown away, so
# print the per-column null counts to make this check actually visible.
print(df.isnull().sum())

# Shuffle with a fixed seed so Restart & Run All reproduces the same row
# order, then rename `target` to `label`.
df = (
    df.sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .rename(columns={'target': 'label'})
)

In [5]:
# Spot-check the frame after the shuffle and the target -> label rename.
df.head()

Unnamed: 0,text,label
0,Three Israeli soldiers wounded in West Bank te...,1
1,God damn it!!! I electrocuted myself ??,0
2,Investigators say a fatal Virgin Galactic spac...,1
3,#DnB #NewRelease EDGE Jimmy - Summer Rainstorm...,0
4,Jacksonville family bands together as memoria...,1


In [6]:
from datasets import Dataset

# Convert the shuffled DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation. The seed pins the split so the
# metrics reported further down can be reproduced after a kernel restart.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

dataset['train'][0]

{'text': '@estellasrevenge the first time i went swiming in it i was basically screaming WHY DOES IT SMELL/TASTE SO BAD',
 'label': 0}

In [7]:
# Number of rows and columns in each split.
dataset.shape

{'train': (6090, 2), 'test': (1523, 2)}

In [8]:
# Human-readable class names keyed by integer class id.
id2label = dict(enumerate(('general', 'disaster')))
# Reverse lookup, derived from id2label so the two mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

In [9]:
from transformers import AutoTokenizer
import torch

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Checkpoint id used to load the tokenizer here.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

def tokenize(batch):
    """Tokenize the `text` field of a batch.

    The tokenizer is called with truncation enabled, a maximum length of 100
    and padding enabled; its output is returned unchanged.
    """
    return tokenizer(
        batch['text'],
        max_length=100,
        truncation=True,
        padding=True,
    )

# batch_size=None hands each split to `tokenize` as a single batch.
dataset = dataset.map(tokenize, batch_size=None, batched=True)

Map: 100%|██████████| 6090/6090 [00:01<00:00, 5396.73 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 7308.45 examples/s]


In [10]:
# Encode one training example on its own to inspect the tokenizer's raw output.
tokenizer(dataset['train'][0]['text'])

{'input_ids': [101, 1030, 28517, 25816, 2890, 8159, 3351, 1996, 2034, 2051, 1045, 2253, 9880, 2075, 1999, 2009, 1045, 2001, 10468, 7491, 2339, 2515, 2009, 5437, 1013, 5510, 2061, 2919, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# Display the DatasetDict to confirm the tokenizer columns were added to both splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1523
    })
})

In [12]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions.

    `eval_pred` unpacks into (predictions, labels). The predictions are
    reduced along axis 1 to a class index and scored against the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint for sequence classification, with the number of labels
# and both label mappings taken from the dictionaries defined earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Training configuration: 5 epochs, batches of 32 per device for both training
# and evaluation, learning rate 1e-5, with evaluation run at every epoch.
# Outputs go to ./train_dir, overwriting whatever a previous run left there.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=5,
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch'
)

# Wire the model, the tokenized splits and the accuracy metric into a Trainer.
# The `test` split doubles as the evaluation set.
# NOTE(review): newer transformers releases appear to replace the `tokenizer`
# argument with `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [15]:
# Run fine-tuning; with evaluation_strategy='epoch' the metrics are reported after every epoch.
trainer.train()

                                                 
 20%|██        | 191/955 [03:53<11:01,  1.15it/s]

{'eval_loss': 0.4898892045021057, 'eval_accuracy': 0.7866053841103086, 'eval_runtime': 14.7325, 'eval_samples_per_second': 103.377, 'eval_steps_per_second': 3.258, 'epoch': 1.0}


                                                     
 40%|████      | 382/955 [18:42<09:40,  1.01s/it]

{'eval_loss': 0.46210426092147827, 'eval_accuracy': 0.8069599474720945, 'eval_runtime': 12.9316, 'eval_samples_per_second': 117.773, 'eval_steps_per_second': 3.712, 'epoch': 2.0}


 52%|█████▏    | 500/955 [21:02<09:39,  1.27s/it]

{'loss': 0.5034, 'grad_norm': 5.214336395263672, 'learning_rate': 4.764397905759163e-06, 'epoch': 2.62}


                                                 
 60%|██████    | 573/955 [23:10<06:55,  1.09s/it]

{'eval_loss': 0.47877442836761475, 'eval_accuracy': 0.7944845699277742, 'eval_runtime': 15.5256, 'eval_samples_per_second': 98.096, 'eval_steps_per_second': 3.092, 'epoch': 3.0}


                                                 
 80%|████████  | 764/955 [27:25<03:19,  1.04s/it]

{'eval_loss': 0.45250535011291504, 'eval_accuracy': 0.8063033486539725, 'eval_runtime': 16.91, 'eval_samples_per_second': 90.065, 'eval_steps_per_second': 2.839, 'epoch': 4.0}


                                                 
100%|██████████| 955/955 [31:03<00:00,  1.95s/it]

{'eval_loss': 0.45337241888046265, 'eval_accuracy': 0.8049901510177282, 'eval_runtime': 13.2374, 'eval_samples_per_second': 115.052, 'eval_steps_per_second': 3.626, 'epoch': 5.0}
{'train_runtime': 1863.6376, 'train_samples_per_second': 16.339, 'train_steps_per_second': 0.512, 'train_loss': 0.45754573482493455, 'epoch': 5.0}





TrainOutput(global_step=955, training_loss=0.45754573482493455, metrics={'train_runtime': 1863.6376, 'train_samples_per_second': 16.339, 'train_steps_per_second': 0.512, 'total_flos': 71633368245600.0, 'train_loss': 0.45754573482493455, 'epoch': 5.0})

In [17]:
# Score the final model on the evaluation split passed to the Trainer (dataset['test']).
trainer.evaluate()

100%|██████████| 48/48 [00:08<00:00,  5.84it/s]


{'eval_loss': 0.45337241888046265,
 'eval_accuracy': 0.8049901510177282,
 'eval_runtime': 8.4461,
 'eval_samples_per_second': 180.32,
 'eval_steps_per_second': 5.683,
 'epoch': 5.0}

In [18]:
# Write the fine-tuned model to a local directory; the inference pipeline and the S3 upload below read from it.
trainer.save_model('tinybert-disaster-tweet')

In [19]:
from transformers import pipeline
import torch

# Hand-written sentences for a quick smoke test of the saved model.
data = [
    'There is a fire in the building',
    'I am happy today',
    'I am sad today',
    'I am not feeling well',
    'There is a flood in the city, go to higher ground',
]

# Run inference on the GPU when torch can see one, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the model back from the local directory written by trainer.save_model.
classifier = pipeline('text-classification', model='tinybert-disaster-tweet', device=device)

classifier(data)

[{'label': 'disaster', 'score': 0.8670669198036194},
 {'label': 'general', 'score': 0.8888350129127502},
 {'label': 'general', 'score': 0.8771545886993408},
 {'label': 'general', 'score': 0.8869462609291077},
 {'label': 'disaster', 'score': 0.8596359491348267}]

In [23]:
# Upload the saved model folder to the S3 bucket under ml-models/tinybert-disaster-tweet
import os
import boto3

# S3 client created with boto3's defaults; no credentials are set in this cell.
s3 = boto3.client('s3')
# Destination bucket read by upload_directory.
bucket_name = 'mlops-dgw1974'

def upload_directory(directory_path, s3_prefix, bucket=None, client=None):
    """Upload every file under a local directory to S3, preserving its layout.

    Args:
        directory_path: Local directory to walk recursively.
        s3_prefix: Key prefix the files are placed under. Backslashes in the
            prefix are normalised to forward slashes.
        bucket: Destination bucket name. Defaults to the module-level
            `bucket_name`.
        client: Object exposing `upload_file(filename, bucket, key)`.
            Defaults to the module-level `s3` client.

    Raises:
        FileNotFoundError: If `directory_path` is not an existing directory.
            Without this check a mistyped path would upload nothing and
            still look like a success.
    """
    if not os.path.isdir(directory_path):
        raise FileNotFoundError(f"Directory not found: {directory_path!r}")

    # Fall back to the notebook-level globals only when no override is given.
    target_bucket = bucket_name if bucket is None else bucket
    s3_client = s3 if client is None else client

    prefix = s3_prefix.replace("\\", "/")
    if prefix and not prefix.endswith("/"):
        prefix += "/"

    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            # Keep the local path in native form: rewriting separators here
            # would corrupt POSIX file names that contain a backslash.
            local_path = os.path.join(root, file_name)
            relative = os.path.relpath(local_path, directory_path)
            # S3 keys always use "/", whatever the local separator is.
            s3_key = prefix + relative.replace(os.sep, "/")
            s3_client.upload_file(local_path, target_bucket, s3_key)


# Mirror the saved model directory into the bucket under the matching key prefix.
upload_directory('tinybert-disaster-tweet', 'ml-models/tinybert-disaster-tweet')

config.json
model.safetensors
special_tokens_map.json
tokenizer.json
tokenizer_config.json
training_args.bin
vocab.txt
