## Sentiment Classification Using TinyBERT

In [None]:
import warnings
import pandas as pd
from datasets import Dataset

# Ignore every DeprecationWarning raised from here on.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
from transformers import AutoTokenizer
import torch

# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
import pandas as pd
from datasets import Dataset

# Load the reviews; later cells read the 'review' and 'sentiment' columns.
data = pd.read_csv('data.csv')
# Preview the first rows.
data.head()

In [None]:
# Wrap the DataFrame in a Hugging Face Dataset and hold out 30% of the rows
# for evaluation. The fixed seed keeps the split identical across notebook
# re-runs, so accuracy figures from different runs stay comparable.
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

In [None]:
# Count how many rows carry each sentiment value.
data['sentiment'].value_counts()

In [None]:
# String sentiment -> integer id, and the inverse mapping derived from it.
label2id = {'negative': 0, 'positive': 1}
id2label = {idx: name for name, idx in label2id.items()}


def _add_label(example):
    """Return the integer ``label`` value for one example's ``sentiment``."""
    return {'label': label2id[example['sentiment']]}


# Add a 'label' column to every split.
dataset = dataset.map(_add_label)

In [None]:
# Display the first training example.
dataset['train'][0]

### Tokenizer

In [None]:
# Checkpoint name shared by the tokenizer here and the model loaded later.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'

# Ask for the fast tokenizer implementation of that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

In [None]:
# Tokenize the first training review, list the fields the tokenizer returns,
# then display the full encoding as the cell output.
sample_encoding = tokenizer(dataset['train'][0]['review'])
print(sample_encoding.keys())

sample_encoding

In [None]:
def tokenize(batch):
    """Tokenize the ``review`` entries of ``batch`` with the module tokenizer.

    The tokenizer is called with ``padding=True``, ``truncation=True`` and
    ``max_length=300``.

    Args:
        batch: Mapping with a ``review`` key whose value is handed straight
            to the tokenizer.

    Returns:
        Whatever the tokenizer returns for the given reviews.
    """
    return tokenizer(
        batch['review'],
        max_length=300,
        truncation=True,
        padding=True,
    )


# Run `tokenize` over every split, passing examples in batches.
dataset = dataset.map(tokenize, batched=True)

In [None]:
# Display the dataset after the mapping steps above.
dataset

### Building Model

In [None]:
import numpy as np
import evaluate

# Load the "accuracy" metric; compute_metrics below calls its compute().
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(predictions, labels)``.
            ``predictions`` is presumably a 2-D array of per-class scores;
            confirm against the Trainer's output.

    Returns:
        The result of ``accuracy.compute`` on the arg-max class ids and the
        reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a sequence-classification head. The number of
# output labels is taken from label2id, so the head size cannot drift from
# the label maps if a class is added or removed.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

In [None]:
# Display the model object.
model

In [None]:
# Training configuration: 3 epochs, learning rate 2e-5, batches of 32 per
# device for both training and evaluation, evaluation once per epoch.
# Outputs go to 'train_dir', overwriting what is already there.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed
# version before upgrading.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch'
)

# Wire the model, the train/test splits, the accuracy callback and the
# tokenizer into a Trainer.
# NOTE(review): newer transformers releases appear to prefer
# `processing_class` over `tokenizer` here - confirm against the installed
# version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
# Run the training loop configured above.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the 'test' split).
trainer.evaluate()

### Model Inference

In [None]:
# Save the trained model under 'tinybert-sentiment-analysis'; the pipeline
# and the S3 upload below both read from this path.
trainer.save_model('tinybert-sentiment-analysis')

In [None]:
# Hand-written reviews passed to the classifier below as a quick check.
# NOTE: this rebinds `data`, which earlier held the DataFrame read from
# data.csv; re-running earlier cells that use `data` will need a reload.
data = [
    'this movie was horrible, the plot was really boring. acting was okay',
    'the movie is really sucked. there is not plot and acting was bad',
    'what a beautiful movie. great plot. acting was good. will see it again'
]

In [None]:
from transformers import pipeline
import torch

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the model directory saved above.
classifier = pipeline(
    task='text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

# Classify the sample reviews and display the predictions.
classifier(data)

### Push Model to AWS S3

In [None]:
import boto3

# S3 client created with no explicit arguments beyond the service name.
s3 = boto3.client('s3')

# Bucket used for the model upload below.
bucket_name = 'mlops-gaurav98094mona'

def create_bucket(bucket_name):
    """Create the S3 bucket ``bucket_name`` unless this account already lists it.

    Existence is decided by looking for the name among the buckets returned
    by ``s3.list_buckets()`` on the module-level client. A status message is
    printed either way.

    Args:
        bucket_name: Name of the bucket to look up or create.
    """
    owned = {entry['Name'] for entry in s3.list_buckets()['Buckets']}
    if bucket_name in owned:
        print("Bucket already exists in your account!!! Feel free to use it.")
        return

    # S3 rejects a create request that lacks a LocationConstraint when the
    # client targets any region other than us-east-1, and rejects one that
    # names us-east-1 explicitly, so only send the constraint when needed.
    region = s3.meta.region_name
    create_kwargs = {'Bucket': bucket_name}
    if region and region != 'us-east-1':
        create_kwargs['CreateBucketConfiguration'] = {
            'LocationConstraint': region,
        }

    s3.create_bucket(**create_kwargs)
    print("Bucket is created")

# Create the bucket if it is not already listed for this account.
create_bucket(bucket_name)

In [None]:
import os
import boto3

# Same client and bucket name as the previous cell, repeated here
# (presumably so this cell can be run on its own).
s3 = boto3.client('s3')
bucket_name = 'mlops-gaurav98094mona'

def upload_directory(directory_path, s3_prefix, bucket=None):
    """Upload every file under ``directory_path`` to S3 beneath ``s3_prefix``.

    The directory is walked recursively. Each file's key is ``s3_prefix``
    joined with the file's path relative to ``directory_path``, using ``/``
    as the separator.

    Args:
        directory_path: Local directory whose files are uploaded.
        s3_prefix: Key prefix placed in front of each relative file path.
        bucket: Destination bucket. Defaults to the module-level
            ``bucket_name`` when omitted.

    Raises:
        NotADirectoryError: If ``directory_path`` is not an existing
            directory.
    """
    import posixpath

    if not os.path.isdir(directory_path):
        # os.walk() yields nothing for a missing path, which would make a
        # mistyped directory look like a successful (empty) upload.
        raise NotADirectoryError(f"Not a directory: {directory_path!r}")

    target_bucket = bucket_name if bucket is None else bucket

    for root, _dirs, filenames in os.walk(directory_path):
        for filename in filenames:
            # Keep the local path in native form: rewriting backslashes here
            # would break filenames that legitimately contain one on POSIX.
            file_path = os.path.join(root, filename)
            relpath = os.path.relpath(file_path, directory_path)
            # S3 keys use '/', so translate only the OS path separator.
            s3_key = posixpath.join(s3_prefix, relpath.replace(os.sep, "/"))

            s3.upload_file(file_path, target_bucket, s3_key)


# Upload the saved model directory under the 'ml-models/' key prefix.
upload_directory('tinybert-sentiment-analysis', 'ml-models/tinybert-sentiment-analysis')