In [None]:
# Mount Google Drive at /content/drive. The dataset path and the save path
# used further down in this cell both live under this mount point, so this
# has to run before anything is loaded or saved.
from google.colab import drive
drive.mount('/content/drive')

# Import libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, IterableDataset
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from torch.cuda.amp import autocast, GradScaler
import os
import json

# Download the NLTK resources used by TextIterableDataset.preprocess_text:
#   punkt     -> word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
# NOTE(review): some newer NLTK releases look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

class TextIterableDataset(IterableDataset):
    """
    A custom iterable dataset for text classification.

    Each (text, label) pair is cleaned with NLTK, tokenized with the
    'bert-base-uncased' tokenizer and yielded lazily as a dictionary of
    tensors, so the encoded samples are never all held in memory at once.

    Args:
        texts (list): A list of input texts.
        labels (list): A list of corresponding labels.

    Attributes:
        texts (list): A list of input texts.
        labels (list): A list of corresponding labels.
        tokenizer (BertTokenizerFast): The tokenizer used for text preprocessing.
        stop_words (frozenset): English stop words, loaded once per dataset.
        lemmatizer (WordNetLemmatizer): Lemmatizer shared by all samples.

    Methods:
        preprocess_text(text): Preprocesses the input text by tokenizing, removing stopwords, and lemmatizing.
        process_data(text, label): Preprocesses the input text and returns the processed data as a dictionary.
        __iter__(): Iterates over the texts and labels, yielding the processed data for each pair.

    NOTE(review): __iter__ does not shard by DataLoader worker. This is fine
    with the default num_workers=0; confirm the behaviour before raising it.

    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        # Build these once per dataset instead of once per sample:
        # preprocess_text runs for every item on every epoch.
        self.stop_words = frozenset(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """
        Preprocesses the input text by tokenizing, removing stopwords, and lemmatizing.

        Tokens that are not purely alphabetic are dropped. The stop-word test
        is applied to the lowercased token, so capitalised stop words such as
        "The" or "I" are removed as well.

        Args:
            text (str): The input text to be preprocessed.

        Returns:
            str: The preprocessed text (lowercased lemmas joined by single spaces).

        """
        kept = []
        for token in word_tokenize(text):
            lowered = token.lower()
            # The NLTK English stop-word list is lowercase, so compare the
            # lowercased form rather than the raw token.
            if token.isalpha() and lowered not in self.stop_words:
                kept.append(self.lemmatizer.lemmatize(lowered))
        return ' '.join(kept)

    def process_data(self, text, label):
        """
        Preprocesses the input text and returns the processed data as a dictionary.

        Args:
            text (str): The input text to be preprocessed.
            label (int): The corresponding label for the input text.

        Returns:
            dict: The processed data as a dictionary with keys 'input_ids', 'attention_mask', and 'labels'.

        """
        text = self.preprocess_text(text)
        # Every sample is padded/truncated to 512 tokens so that collate_fn
        # can stack samples without any further padding.
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
        return {
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # each sample is a flat sequence.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label),
        }

    def __iter__(self):
        """
        Iterates over the texts and labels, yielding the processed data for each pair.

        Iteration stops at the shorter of ``texts`` and ``labels``.

        Yields:
            dict: The processed data as a dictionary with keys 'input_ids', 'attention_mask', and 'labels'.

        """
        for text, label in zip(self.texts, self.labels):
            yield self.process_data(text, label)

def collate_fn(batch):
    """
    Merge individual samples into one batch dictionary.

    Args:
        batch (list): Sample dictionaries, each holding 'input_ids',
            'attention_mask' and 'labels'.

    Returns:
        dict: 'input_ids' and 'attention_mask' stacked along a new leading
        axis, and 'labels' gathered into a single tensor.
    """
    def _stacked(key):
        # torch.stack requires every sample's tensor under `key` to have the
        # same shape.
        return torch.stack([sample[key] for sample in batch])

    collated = {name: _stacked(name) for name in ('input_ids', 'attention_mask')}
    collated['labels'] = torch.tensor([sample['labels'] for sample in batch])
    return collated

def load_dataset(file_path):
    """
    Loads the dataset from a CSV file.

    The file is read as UTF-8 first; if that fails to decode, it is re-read
    as ISO-8859-1. Rows with a missing 'text' or 'label', or with a label
    that cannot be parsed as a number, are discarded.

    Args:
        file_path (str): The path to the CSV file. It must contain 'text' and
            'label' columns.

    Returns:
        tuple: A tuple containing the list of texts and the list of integer labels.

    Raises:
        KeyError: If the 'text' or 'label' column is missing.

    """
    try:
        df = pd.read_csv(file_path, dtype=str, low_memory=False)
    except UnicodeDecodeError:
        # ISO-8859-1 (alias "latin1") maps every byte to a character, so this
        # retry cannot raise UnicodeDecodeError itself; no further fallback
        # is needed.
        df = pd.read_csv(file_path, dtype=str, low_memory=False, encoding='ISO-8859-1')

    # Unparseable labels become NaN and are filtered together with missing ones.
    numeric_labels = pd.to_numeric(df['label'], errors='coerce')
    keep = df['text'].notna() & numeric_labels.notna()
    texts = df.loc[keep, 'text'].tolist()
    labels = numeric_labels[keep].astype(int).tolist()
    return texts, labels

def train(model, train_loader, optimizer, device, scaler):
    """
    Run one pass over the training data, updating the model in place.

    Args:
        model (BertForSequenceClassification): Model to optimise; it must
            return an object exposing ``.loss`` when called with labels.
        train_loader (DataLoader): Yields batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer (AdamW): Optimizer stepping the model's parameters.
        device (torch.device): Device each batch is moved to.
        scaler (GradScaler): Gradient scaler for mixed precision training.

    """
    model.train()
    tensor_keys = ('input_ids', 'attention_mask', 'labels')
    for batch in train_loader:
        optimizer.zero_grad()
        on_device = {key: batch[key].to(device) for key in tensor_keys}
        # Only the forward pass and loss computation run under autocast.
        with autocast():
            result = model(
                on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
                labels=on_device['labels'],
            )
            step_loss = result.loss
        # Backward on the scaled loss, then let the scaler drive the
        # optimizer step and refresh its scale factor.
        scaler.scale(step_loss).backward()
        scaler.step(optimizer)
        scaler.update()

def evaluate(model, data_loader, device):
    """
    Score the model on a validation or test loader.

    Args:
        model (BertForSequenceClassification): Model to evaluate; switched to
            eval mode here.
        data_loader (DataLoader): Yields batches with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device (torch.device): Device each batch is moved to.

    Returns:
        tuple: (accuracy, precision, recall, f1), where precision, recall
        and F1 are computed with weighted averaging.

    """
    model.eval()
    predicted = []
    expected = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask=attention_mask).logits
            # The predicted class is the index of the largest logit.
            predicted += torch.argmax(logits, -1).tolist()
            expected += labels.tolist()
    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(expected, predicted, average='weighted')
    return accuracy, precision, recall, f1

def save_model_and_tokenizer(model, tokenizer, model_path, tokenizer_path):
    """
    Saves the model and tokenizer to the specified paths.

    Both target directories are created if they do not exist yet; existing
    directories are reused.

    Args:
        model (BertForSequenceClassification): The model to be saved.
        tokenizer (BertTokenizerFast): The tokenizer to be saved.
        model_path (str): The directory to save the model in.
        tokenizer_path (str): The directory to save the tokenizer in.

    """
    # exist_ok=True replaces the separate exists() check, so there is no
    # window between checking and creating the directory.
    os.makedirs(model_path, exist_ok=True)
    model.save_pretrained(model_path)
    os.makedirs(tokenizer_path, exist_ok=True)
    tokenizer.save_pretrained(tokenizer_path)

def save_test_data_as_csv(test_texts, test_labels, file_path):
    """
    Write the held-out test split to ``test_dataset.csv``.

    Args:
        test_texts (list): The test texts, written to the 'text' column.
        test_labels (list): The test labels, written to the 'label' column.
        file_path (str): Directory the CSV file is written into. Despite the
            name, this is a directory, not the file itself.

    """
    columns = {'text': test_texts, 'label': test_labels}
    destination = f"{file_path}/test_dataset.csv"
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(columns).to_csv(destination, index=False)

def save_evaluation_metrics_as_json(metrics, file_path):
    """
    Write the evaluation metrics to ``evaluation_metrics.json``.

    Args:
        metrics (tuple): The evaluation metrics; a tuple is serialised as a
            JSON array in the same order.
        file_path (str): Directory the JSON file is written into. Despite
            the name, this is a directory, not the file itself.

    """
    destination = f"{file_path}/evaluation_metrics.json"
    with open(destination, 'w') as handle:
        json.dump(metrics, handle)

# Main function: load the CSV, split it 80/10/10, fine-tune for three epochs,
# report validation metrics per epoch and test metrics at the end, then save
# the model, tokenizer, test split and metrics to Drive.
if __name__ == "__main__":
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load dataset
    file_path = '/content/drive/My Drive/Colab Notebooks/Fine-tuning TinyBERT for Sentiment Analysis/train_dataset.csv'
    texts, labels = load_dataset(file_path)

    # Split dataset: 80% train, then the remaining 20% is halved into
    # validation and test (10% each). The fixed random_state keeps the
    # split reproducible.
    train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)
    val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)

    # Create datasets and data loaders
    train_dataset = TextIterableDataset(train_texts, train_labels)
    val_dataset = TextIterableDataset(val_texts, val_labels)
    test_dataset = TextIterableDataset(test_texts, test_labels)

    train_loader = DataLoader(train_dataset, batch_size=16, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

    # Train and evaluate model
    # NOTE(review): the save folder is named "TinyBERT" but the checkpoint
    # loaded here is 'bert-base-uncased' -- confirm which one is intended.
    # NOTE(review): num_labels=3 presumably assumes labels are 0, 1 and 2;
    # verify against the dataset.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3).to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    scaler = GradScaler()

    for epoch in range(3):
        train(model, train_loader, optimizer, device, scaler)
        accuracy, precision, recall, f1 = evaluate(model, val_loader, device)
        print(f'Epoch {epoch+1}, Val Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

    metrics = evaluate(model, test_loader, device)
    print(f'Test Accuracy: {metrics[0]:.4f}, Precision: {metrics[1]:.4f}, Recall: {metrics[2]:.4f}, F1: {metrics[3]:.4f}')

    # Save model, tokenizer, test data, and evaluation metrics
    model_save_path = '/content/drive/My Drive/Colab Notebooks/Fine-tuning TinyBERT for Sentiment Analysis'
    # Save the tokenizer instance the datasets actually used instead of
    # loading a fresh copy: this avoids a redundant load and guarantees the
    # saved tokenizer matches the one used for training.
    save_model_and_tokenizer(model, train_dataset.tokenizer, model_path=model_save_path, tokenizer_path=model_save_path)
    save_test_data_as_csv(test_texts, test_labels, model_save_path)
    save_evaluation_metrics_as_json(metrics, model_save_path)


Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Val Accuracy: 0.7710, Precision: 0.7692, Recall: 0.7710, F1: 0.7662
Epoch 2, Val Accuracy: 0.7789, Precision: 0.7779, Recall: 0.7789, F1: 0.7750
Epoch 3, Val Accuracy: 0.7845, Precision: 0.7851, Recall: 0.7845, F1: 0.7806
Test Accuracy: 0.7822, Precision: 0.7824, Recall: 0.7822, F1: 0.7785


In [3]:
# Import libraries
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
import numpy as np

def preprocess_text(text, tokenizer, max_len=512):
    """
    Tokenize the input text with the provided tokenizer.

    The tokenizer is asked to pad and truncate to ``max_len`` and to return
    PyTorch tensors.

    Args:
        text (str): The input text to be tokenized.
        tokenizer (Tokenizer): Callable tokenizer object.
        max_len (int, optional): Padded/truncated sequence length. Defaults to 512.

    Returns:
        The tokenizer's output, unchanged. predict_sentiment reads its
        'input_ids' and 'attention_mask' entries.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': "pt",
    }
    return tokenizer(text, **tokenizer_options)

def predict_sentiment(text, model, tokenizer):
    """
    Classify the sentiment of ``text`` with the given model and tokenizer.

    NOTE(review): the training cell cleans text with NLTK (stop-word removal
    and lemmatization) before tokenizing, while this path tokenizes the raw
    text -- confirm whether that mismatch is intended.

    Args:
        text (str): The input text to analyze.
        model: The fine-tuned classification model; inputs are moved to
            ``model.device``.
        tokenizer: The tokenizer used to encode the text.

    Returns:
        predicted_class (numpy.ndarray): Index of the most probable class.
        confidence (numpy.ndarray): Probability of that most probable class.
        probabilities (numpy.ndarray): Softmax probabilities over all classes.
    """
    encoded = preprocess_text(text, tokenizer)
    target_device = model.device
    input_ids = encoded['input_ids'].to(target_device)
    attention_mask = encoded['attention_mask'].to(target_device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        # The maximum over the class axis gives the top probability and its index.
        confidence, predicted_class = torch.max(probabilities, dim=-1)

    return (
        predicted_class.cpu().numpy(),
        confidence.cpu().numpy(),
        probabilities.cpu().numpy(),
    )

# Load the fine-tuned sentiment analysis model and tokenizer saved by the
# training cell.
model_path = '/content/drive/My Drive/Colab Notebooks/Fine-tuning TinyBERT for Sentiment Analysis'
# Fall back to CPU when no GPU is attached, matching the training cell;
# a hard-coded 'cuda' would raise on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
# Make inference mode explicit (disables dropout).
model.eval()
tokenizer = BertTokenizerFast.from_pretrained(model_path)

# Class index -> display name. Defined once, outside the input loop.
sentiment_labels = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}

# Perform sentiment analysis on user input
while True:
    # Get user input
    text = input("\nEnter text for sentiment analysis (or type 'exit' to stop): ")

    # Check if the user wants to exit (surrounding whitespace is ignored)
    if text.strip().lower() == 'exit':
        print("Exiting sentiment analysis.")
        break

    # Predict the sentiment of the input text
    predicted_class, confidence, probabilities = predict_sentiment(text, model, tokenizer)

    # Display the predicted sentiment and confidence
    print(f"\nSentiment: {sentiment_labels[predicted_class.item()]}")
    print(f"Confidence: {confidence.item() * 100:.2f}%")

    # Display the probability distribution over all sentiment classes.
    # probabilities[0] is the row for the single input text; the dict's
    # insertion order (0, 1, 2) lines the names up with the class indices.
    print("Probabilities:")
    for sentiment, probability in zip(sentiment_labels.values(), probabilities[0]):
        print(f"  {sentiment}: {probability.item() * 100:.2f}%")



Enter text for sentiment analysis (or type 'exit' to stop): I love you

Sentiment: Positive
Confidence: 96.54%
Probabilities:
  Negative: 1.33%
  Neutral: 2.13%
  Positive: 96.54%

Enter text for sentiment analysis (or type 'exit' to stop): I hate you

Sentiment: Negative
Confidence: 96.94%
Probabilities:
  Negative: 96.94%
  Neutral: 2.14%
  Positive: 0.92%

Enter text for sentiment analysis (or type 'exit' to stop): I'm fine

Sentiment: Neutral
Confidence: 52.93%
Probabilities:
  Negative: 8.62%
  Neutral: 52.93%
  Positive: 38.45%

Enter text for sentiment analysis (or type 'exit' to stop): Everything's gonna be alright

Sentiment: Positive
Confidence: 72.23%
Probabilities:
  Negative: 10.12%
  Neutral: 17.66%
  Positive: 72.23%

Enter text for sentiment analysis (or type 'exit' to stop): exit
Exiting sentiment analysis.
