In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from datasets import Dataset

In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [5]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
label2id = {'negative':0, 'positive':1}
id2label = {0:'negative', 1:'positive'}
dataset = dataset.map(lambda x: {'label': label2id[x['sentiment']]})

Map: 100%|██████████| 35000/35000 [00:01<00:00, 33443.92 examples/s]
Map: 100%|██████████| 15000/15000 [00:00<00:00, 34390.67 examples/s]


In [7]:
dataset['train'][0]

{'review': 'I LOVE Jack\'s jokes like \'The cliché is...\' or "Over the top cliché guy, black, oily skin, kinda spooky...". He is just hilarious! Daniel\'s starting to catch up on him to! Good thing Jack\'s not on the team anymore (in a way) or else it would have been sarcasm mania!!!!I just love all the plots (season 8, a little less, I have to admit), the characters are great, the actors are great, I\'m starting to pick up facial expressions (and more) from Jack, Daniel and Teal\'c...It just all theoretically possible and exciting...oops! Their I go again!!! Sorry, I\'m also starting to pick up traits from Carter, and all of this is driving my parents NUTZ!!!!!!! Well, to conclude, I think it\'s good for another three seasons or so, especially if they keep on packing the episodes with all this humor, drama, action and so forth!!!!!!!!!!!!!!!!',
 'sentiment': 'positive',
 'label': 1}

## Tokenizer

In [8]:
from transformers import AutoTokenizer
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

# tokenizer(dataset['train'][0]['review'])

def tokenize(batch):
    temp = tokenizer(batch['review'], padding=True, truncation=True, max_length=300)
    return temp

dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map: 100%|██████████| 35000/35000 [00:06<00:00, 5561.08 examples/s]
Map: 100%|██████████| 15000/15000 [00:02<00:00, 6192.13 examples/s]


## Model evaluation functions & model build

In [9]:
from datasets import DownloadMode

In [10]:
import evaluate
import numpy as np
import sklearn


accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, 
                                                           num_labels=len(label2id), 
                                                           label2id=label2id, 
                                                           id2label=id2label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
import accelerate

In [13]:
print(accelerate.__version__)

1.2.1


In [14]:
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    run_name="sentiment-analysis"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [15]:
trainer.train()

 15%|█▌        | 500/3282 [04:16<22:48,  2.03it/s]

{'loss': 0.4644, 'grad_norm': 7.062708854675293, 'learning_rate': 1.695307739183425e-05, 'epoch': 0.46}


 30%|███       | 1000/3282 [08:23<18:49,  2.02it/s]

{'loss': 0.3592, 'grad_norm': 9.574926376342773, 'learning_rate': 1.3906154783668494e-05, 'epoch': 0.91}


                                                   
 33%|███▎      | 1094/3282 [10:15<27:36,  1.32it/s]

{'eval_loss': 0.31849968433380127, 'eval_accuracy': 0.869, 'eval_runtime': 63.5185, 'eval_samples_per_second': 236.152, 'eval_steps_per_second': 7.384, 'epoch': 1.0}


 46%|████▌     | 1500/3282 [13:40<15:08,  1.96it/s]   

{'loss': 0.3084, 'grad_norm': 11.188454627990723, 'learning_rate': 1.0859232175502743e-05, 'epoch': 1.37}


 61%|██████    | 2000/3282 [17:48<10:26,  2.05it/s]

{'loss': 0.2926, 'grad_norm': 18.35076332092285, 'learning_rate': 7.81230956733699e-06, 'epoch': 1.83}


                                                   
 67%|██████▋   | 2188/3282 [20:18<08:29,  2.15it/s]

{'eval_loss': 0.2975030243396759, 'eval_accuracy': 0.8777333333333334, 'eval_runtime': 57.4561, 'eval_samples_per_second': 261.069, 'eval_steps_per_second': 8.163, 'epoch': 2.0}


 76%|███████▌  | 2500/3282 [22:53<06:24,  2.03it/s]  

{'loss': 0.2729, 'grad_norm': 7.3425068855285645, 'learning_rate': 4.765386959171238e-06, 'epoch': 2.29}


 91%|█████████▏| 3000/3282 [27:00<02:18,  2.04it/s]

{'loss': 0.2585, 'grad_norm': 8.516209602355957, 'learning_rate': 1.7184643510054846e-06, 'epoch': 2.74}


                                                   
100%|██████████| 3282/3282 [36:57<00:00,  1.48it/s]

{'eval_loss': 0.28963857889175415, 'eval_accuracy': 0.8828, 'eval_runtime': 456.1483, 'eval_samples_per_second': 32.884, 'eval_steps_per_second': 1.028, 'epoch': 3.0}
{'train_runtime': 2217.0625, 'train_samples_per_second': 47.36, 'train_steps_per_second': 1.48, 'train_loss': 0.3202110792644717, 'epoch': 3.0}





TrainOutput(global_step=3282, training_loss=0.3202110792644717, metrics={'train_runtime': 2217.0625, 'train_samples_per_second': 47.36, 'train_steps_per_second': 1.48, 'total_flos': 882184338000000.0, 'train_loss': 0.3202110792644717, 'epoch': 3.0})

In [17]:
trainer.evaluate()

100%|██████████| 469/469 [00:56<00:00,  8.28it/s]


{'eval_loss': 0.28963857889175415,
 'eval_accuracy': 0.8828,
 'eval_runtime': 56.8808,
 'eval_samples_per_second': 263.709,
 'eval_steps_per_second': 8.245,
 'epoch': 3.0}

In [18]:
trainer.save_model('tinybert-movie-review-sentiment')


In [20]:
from transformers import pipeline
import torch

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

data = ['this was a terrible film, i hated it', 'this film was ace!', 'not sure about this film but it might have been good']

classifier = pipeline('text-classification', model='tinybert-movie-review-sentiment', device=device)

classifier(data)

[{'label': 'negative', 'score': 0.9880494475364685},
 {'label': 'positive', 'score': 0.9416130185127258},
 {'label': 'negative', 'score': 0.6329503059387207}]

## Upload movie sentiment analysis model to S3 instance (with boto3)

In [22]:
import boto3

In [31]:
#ec2 = boto3.client('ec2', region_name='us-east-1')

ec2.describe_instances()



{'Reservations': [],
 'ResponseMetadata': {'RequestId': '028b40ff-6def-4050-ac55-7f81bd0ab77a',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '028b40ff-6def-4050-ac55-7f81bd0ab77a',
   'cache-control': 'no-cache, no-store',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'content-type': 'text/xml;charset=UTF-8',
   'content-length': '219',
   'date': 'Fri, 10 Jan 2025 17:04:16 GMT',
   'server': 'AmazonEC2'},
  'RetryAttempts': 0}}

In [28]:
# Just run this once - creates and saves key pair
#resp = ec2.create_key_pair(KeyName='boomer')
file = open('creds/boomer.pem', 'w')
file.write(resp['KeyMaterial'])
file.close()

In [26]:
resp

{'KeyFingerprint': 'bf:7c:e5:3f:81:dd:8d:4c:99:ce:f7:f1:6d:e6:61:7d:e9:3d:36:46',
 'KeyMaterial': '-----BEGIN RSA PRIVATE KEY-----\nMIIEpAIBAAKCAQEA6aStg0Puo+550ROjZ3ryUd9yASMpaDFET+rY5nd1LBa62acy\nBEhXYDy2fMBRmdRoyF8RQ+OKEW4MaxjO2AmPvvKEErTtOE5PxmINGgaEKhqsHm4e\nZkr33MH/sUJL7R+qmGeUtGoFQV5Rya05nyt5dHGmibtKx67bNwetVuLenHnhSFmw\nTMy4PyXq7qmSHpU7nwucxG5oCJ2eWsKG7E0pcdnIUOMYplCH/NBgUxD13k0Dy1+/\nCYB6VrDjIQ57WlLsOM/Ql5eGSckEH+vSuWIUIx5LknP2FRTEa7ZuAaf7pLYxIgmu\n3bkZRdsy0kcRMCijZ2c9AgnSIJnTUmkdFOpQUQIDAQABAoIBAQCkV5M868GmSY4i\nBGBB33cnI7a7GhJ8GXtlg1mB8rDWjD5t7m7+GCxtowbxhQ1g2MnDdytbx2dMj03o\n3xBPYZpcw5Um22Co5Fy0vC9fCQpZ210KIob7iOwDuemys2FYr5d9kA5HQPPlYQTC\niTgov5m00GRGKRmWj2XetWW14eqXwMMVrlpWcgFBB5a0KIR8bAvP72M6fdse6CGx\ng7mj0Ebvj+P/49Zh0DB12IL7oGxIRcCPSe0/E0KV3Z2ZWyVwPkjDLOsJ7YLcL3Ch\nx2nH/LEiMPaCcfA1yUA2XmOZzx60eRueaWAZ0zo6hAjCGkoe/+a3rtpB0wC84hke\nPqny9TABAoGBAPnVtzBJCEXgxkxcDcCq2pHCAqYn1Lmw5Oknrd1jx0yc45gUdK8K\nIhjdNIJ0GsjsLGofNZNeiwLxGYPprGtbYg7v1HlpqSN1uAyUWt+aTA6LLBKFVCTE\nVC1xnFWSME

In [33]:
import boto3

s3 = boto3.client('s3')

bucket_name = 'movie-sentiment-analysis'

def create_bucket(bucket_name):
    response = s3.list_buckets()
    buckets = [buck['Name'] for buck in response['Buckets']]
    if bucket_name not in buckets:
        s3.create_bucket(Bucket=bucket_name)
        print("Bucket is created")

    else:
        print("Bucket already exists in your account!!! Feel free to use it.")

create_bucket(bucket_name)

Bucket is created


In [85]:
# upload model folder to s3 bucket ml-models/tinybert-sentiment-analysis
import os
import boto3

#s3 = boto3.client('s3')

def upload_directory(directory_path, s3_prefix):
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(root, file).replace("\\", "/")
            relpath = os.path.relpath(file_path, directory_path)
            s3_key = os.path.join(s3_prefix, relpath).replace("\\", "/")
            
            # s3.upload_file(file_path, bucket_name, s3_key)
            print(file_path, bucket_name, s3_key)


upload_directory('tinybert-movie-review-sentiment', 'ml-models/tinybert-sentiment-analysis')


tinybert-movie-review-sentiment/model.safetensors movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/model.safetensors
tinybert-movie-review-sentiment/tokenizer_config.json movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/tokenizer_config.json
tinybert-movie-review-sentiment/special_tokens_map.json movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/special_tokens_map.json
tinybert-movie-review-sentiment/config.json movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/config.json
tinybert-movie-review-sentiment/tokenizer.json movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/tokenizer.json
tinybert-movie-review-sentiment/training_args.bin movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/training_args.bin
tinybert-movie-review-sentiment/vocab.txt movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/vocab.txt


In [86]:
# Recreate the 'upload_directory' function with pathlib
#
#  THIS FUNCTION DOES THE SAME AS THE ONE ABOVE, BUT USES PATHLIB INSTEAD OF OS
#

from pathlib import Path

# s3_prefix = Path('ml-models/tinybert-sentiment-analysis')

def upload_directory_pathlib(directory_path, s3_prefix):
    directory = Path(directory_path)
    for file_path in directory.rglob("*"):
        if file_path.is_file():
            relpath = file_path.relative_to(directory).as_posix()
            s3_key = f"{s3_prefix}/{relpath}"

            #s3.upload_file(file_path, bucket_name, s3_key)
            print(file_path, bucket_name, s3_key)

upload_directory_pathlib('tinybert-movie-review-sentiment', 'ml-models/tinybert-sentiment-analysis')

tinybert-movie-review-sentiment/model.safetensors movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/model.safetensors
tinybert-movie-review-sentiment/tokenizer_config.json movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/tokenizer_config.json
tinybert-movie-review-sentiment/special_tokens_map.json movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/special_tokens_map.json
tinybert-movie-review-sentiment/config.json movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/config.json
tinybert-movie-review-sentiment/tokenizer.json movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/tokenizer.json
tinybert-movie-review-sentiment/training_args.bin movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/training_args.bin
tinybert-movie-review-sentiment/vocab.txt movie-sentiment-analysis ml-models/tinybert-sentiment-analysis/vocab.txt
